kubernetes device-plugin源碼分析

1. 基本信息


2. DeviceManager


// ManagerImpl is the structure in charge of managing Device Plugins.
type ManagerImpl struct {
	socketname string
	socketdir  string

	endpoints map[string]endpointInfo // Key is ResourceName
	mutex     sync.Mutex

	server *grpc.Server
	wg     sync.WaitGroup

	// activePods is a method for listing active pods on the node
	// so the amount of pluginResources requested by existing pods
	// could be counted when updating allocated devices
	// 獲得Node上所有存在pods的列表
	activePods ActivePodsFunc

	// sourcesReady provides the readiness of kubelet configuration sources such as apiserver update readiness.
	// We use it to determine when we can purge inactive pods from checkpointed state.
	// sourcesReady提供了kubelet各配置源(例如apiserver)的就緒狀態,以此決定何時可以從檢查點狀態中清除不活躍的pod
	sourcesReady config.SourcesReady

	// callback is used for updating devices' states in one time call.
	// e.g. a new device is advertised, two old devices are deleted and a running device fails.
	// 更新某resourceName的所有設備狀態
	callback monitorCallback

	// healthyDevices contains all of the registered healthy resourceNames and their exported device IDs.
	healthyDevices map[string]sets.String

	// unhealthyDevices contains all of the unhealthy devices and their exported device IDs.
	unhealthyDevices map[string]sets.String

	// allocatedDevices contains allocated deviceIds, keyed by resourceName.
	// 已被分配的設備列表
	allocatedDevices map[string]sets.String

	// podDevices contains pod to allocated device mapping.
	// type podDevices map[string]containerDevices ,記錄了哪個pod分配了哪些設備
	podDevices        podDevices
	checkpointManager checkpointmanager.CheckpointManager


type ManagerImpl struct {
	socketname string	//kubelet
	socketdir  string	//kubelet.sock的路徑
	// type endpointInfo struct {
	// 	// defined in ./endpoint.go ,endpoint can run ,stop ,allocate prestartContainer ,etc.
	// 	e    endpoint   
	// 	// defined in apis ,only have a item
	// 	opts *pluginapi.DevicePluginOptions
	// }
	endpoints map[string]endpointInfo	//保存了所有endpoint的信息
	mutex     sync.Mutex	//互斥鎖,因爲endpoint是單獨起的協程,有一些臨界區代碼
	server *grpc.Server	//保存了register grpc的server
	wg     sync.WaitGroup //防止協程未關閉
	// 獲得Node上所有存在active pods的列表
	activePods ActivePodsFunc
	// sourcesReady提供了kubelet各配置源(例如apiserver)的就緒狀態,以此決定何時可以從檢查點狀態中清除不活躍的pod
	sourcesReady config.SourcesReady
	// 更新某resource的所有設備狀態
	callback monitorCallback
	// healthyDevices contains all of the registered healthy resourceNames and their exported device IDs.
	healthyDevices map[string]sets.String
	// unhealthyDevices contains all of the unhealthy devices and their exported device IDs.
	unhealthyDevices map[string]sets.String
	// allocatedDevices contains allocated deviceIds, keyed by resourceName.
	allocatedDevices map[string]sets.String
	// podDevices contains pod to allocated device mapping.
	// type podDevices map[string]containerDevices ,記錄了哪個pod分配了哪些設備
	podDevices        podDevices
	checkpointManager checkpointmanager.CheckpointManager


newManagerImpl(socketPath string) (*ManagerImpl, error) {
	... // 檢查是否已經有已存在的manager
	// 創建manager
	manager := &ManagerImpl{
		endpoints: make(map[string]endpointInfo),

		socketname:       file,
		socketdir:        dir,
		healthyDevices:   make(map[string]sets.String),
		unhealthyDevices: make(map[string]sets.String),
		allocatedDevices: make(map[string]sets.String),
		podDevices:       make(podDevices),
	// 將healthyDevice,unhealthyDevice列表清空,後根據最新的列表重新生成設備列表,並寫入檢查點
	manager.callback = manager.genericDeviceUpdateCallback 
	// The following structs are populated with real implementations in manager.Start()
	// Before that, initializes them to perform no-op operations.
	manager.activePods = func() []*v1.Pod { return []*v1.Pod{} }
	manager.sourcesReady = &sourcesReadyStub{} // add resource or return if is allready
	checkpointManager, err := checkpointmanager.NewCheckpointManager(dir)
	if err != nil {
		return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err)
	manager.checkpointManager = checkpointManager
	return manager, nil


func (m *ManagerImpl) genericDeviceUpdateCallback(resourceName string, devices []pluginapi.Device) {
	m.healthyDevices[resourceName] = sets.NewString()
	m.unhealthyDevices[resourceName] = sets.NewString()
	for _, dev := range devices {
		if dev.Health == pluginapi.Healthy {
		} else {


3. Register()函數


// 該方法由device-plugin通過grpc調用,將自身信息註冊進來
// 1. 判斷api版本是否支持
// 2. 判斷resourceName是否合法
// 3. 創建對應該資源的endpoint並調用listandwatch持續檢測該資源的設備健康
func (m *ManagerImpl) Register(ctx context.Context, r *pluginapi.RegisterRequest) (*pluginapi.Empty, error) {
	klog.Infof("Got registration request from device plugin with resource name %q", r.ResourceName)
	var versionCompatible bool
	for _, v := range pluginapi.SupportedVersions {
		if r.Version == v {
			versionCompatible = true
	if !versionCompatible {
		errorString := fmt.Sprintf(errUnsupportedVersion, r.Version, pluginapi.SupportedVersions)
		klog.Infof("Bad registration request from device plugin with resource name %q: %s", r.ResourceName, errorString)
		return &pluginapi.Empty{}, fmt.Errorf(errorString)
	if !v1helper.IsExtendedResourceName(v1.ResourceName(r.ResourceName)) {
		errorString := fmt.Sprintf(errInvalidResourceName, r.ResourceName)
		klog.Infof("Bad registration request from device plugin: %s", errorString)
		return &pluginapi.Empty{}, fmt.Errorf(errorString)

	// TODO: for now, always accepts newest device plugin. Later may consider to
	// add some policies here, e.g., verify whether an old device plugin with the
	// same resource name is still alive to determine whether we want to accept
	// the new registration.
	// 大致意思是會可能會添加新老版本plugin更替的邏輯功能
	// 爲該Resource添加一個的endpoint
	go m.addEndpoint(r)

	return &pluginapi.Empty{}, nil

這裏用 go m.addEndpoint(r) 爲每一個resource單獨起了一個協程做處理,m.runEndpoint(r.ResourceName, new)也是單獨起的協程。

// 1. newEndpointImpl創建新的endpoint實例,並返回與plugin的連接conn
// 2. m.registerEndpoint將resourceName保存在m.endpoints[resourceName]中
// 3. go m.runEndpoint啓動一個endpoint,調用其run方法,通過listandwatch grpc獲取plugin發來的最新設備信息
//    ,並實時更新該資源設備的狀態
func (m *ManagerImpl) addEndpoint(r *pluginapi.RegisterRequest) {
	// 和device-plugin提供的endpoint建立連接,並返回client
	new, err := newEndpointImpl(filepath.Join(m.socketdir, r.Endpoint), r.ResourceName, m.callback)
	if err != nil {
		klog.Errorf("Failed to dial device plugin with request %v: %v", r, err)
	// 將resourceName保存在m.endpoints[resourceName]中
	m.registerEndpoint(r.ResourceName, r.Options, new)
	go func() {
		m.runEndpoint(r.ResourceName, new)

下面看一下m.runEndpoint(r.ResourceName, new)

func (m *ManagerImpl) runEndpoint(resourceName string, e endpoint) {
	// 調用plugin的ListAndWatch,並持續檢測
	// 僅關閉了和plugin的grpc連接

	defer m.mutex.Unlock()
	// 如果該endpoint從run方法中跳出來並且停止了,便將該資源標定爲不健康
	if old, ok := m.endpoints[resourceName]; ok && old.e == e {
		// 將對應該resource的所有設備健康狀態標記爲不健康,resource本身不做改變

	klog.V(2).Infof("Endpoint (%s, %v) became unhealthy", resourceName, e)


func (e *endpointImpl) run() {
	stream, err := e.client.ListAndWatch(context.Background(), &pluginapi.Empty{})
	if err != nil {
		klog.Errorf(errListAndWatch, e.resourceName, err)

	// 持續檢測plugin發來的設備信息,並通過調用e.callback(實際調用爲manager的genericDeviceUpdateCallback函數)來更新設備列表
	// 但是從邏輯上來看調用callback時所有的endpoint都進入了一個臨界區代碼,resource如果太多的話可能需要等待
	// 每次更新設備信息都會把之前的該resource下的內容清空
	for {
		response, err := stream.Recv()
		if err != nil {
			klog.Errorf(errListAndWatch, e.resourceName, err)

		devs := response.Devices
		klog.V(2).Infof("State pushed for device plugin %s", e.resourceName)

		var newDevs []pluginapi.Device
		for _, d := range devs {
			newDevs = append(newDevs, *d)

		e.callback(e.resourceName, newDevs)

e.callback(e.resourceName, newDevs)實際上是調用了func (m *ManagerImpl) genericDeviceUpdateCallback(resourceName string, devices []pluginapi.Device)

// 將healthyDevice,unhealthyDevice列表清空,後根據最新的列表重新生成設備列表,並寫入檢查點
func (m *ManagerImpl) genericDeviceUpdateCallback(resourceName string, devices []pluginapi.Device) {
	m.healthyDevices[resourceName] = sets.NewString()
	m.unhealthyDevices[resourceName] = sets.NewString()
	for _, dev := range devices {
		if dev.Health == pluginapi.Healthy {
		} else {


4. Allocate() 函數


// Allocate is the call that you can use to allocate a set of devices
// from the registered device plugins.
// Allocate用於從已註冊的device plugin中分配一組設備
// PodAdmitAttributes is the context for a pod admission decision.被批准分配設備的pod
// type PodAdmitAttributes struct {
// 	// the pod to evaluate for admission
// 	Pod *v1.Pod
// 	// all pods bound to the kubelet excluding the pod being evaluated
// 	OtherPods []*v1.Pod
// }
// 傳入nodeInfo,PodAdmitAttributes,根據node信息和該pod的container組成如何分配設備?
func (m *ManagerImpl) Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
	pod := attrs.Pod
	devicesToReuse := make(map[string]sets.String) //保存可重用的設備
	// 下面的這個for循環實現了爲init-container分配設備,init-container設備先於container執行,依次執行完畢後纔會執行container
	for _, container := range pod.Spec.InitContainers {
		// allocateContainerResources嘗試和plugin做通信爲init-container分配設備
		if err := m.allocateContainerResources(pod, &container, devicesToReuse); err != nil {
			return err
		// 將init-container中分配的設備加入可重用設備中,
		// 如果爲一個 Pod 指定了多個 Init 容器,那些容器會按順序一次運行一個。 每個 Init 容器必須運行成功,
		// 下一個才能夠運行。故這些設備是在所有init-container中傳遞的,邏輯上可能有多個init-container用的是同一個設備
		m.podDevices.addContainerAllocatedResources(string(pod.UID), container.Name, devicesToReuse)
	// 下面的這個for循環實現了爲container分配設備
	for _, container := range pod.Spec.Containers {
		if err := m.allocateContainerResources(pod, &container, devicesToReuse); err != nil {
			return err
		m.podDevices.removeContainerAllocatedResources(string(pod.UID), container.Name, devicesToReuse)

	defer m.mutex.Unlock()

	// quick return if no pluginResources requested
	if _, podRequireDevicePluginResource := m.podDevices[string(pod.UID)]; !podRequireDevicePluginResource {
		return nil

	return nil

然後我們看一下其中的幾個關鍵函數,首先是m.allocateContainerResources(pod, &container, devicesToReuse),該函數嘗試和plugin做通信並爲這個pod的某個container/init-container分配設備,最終的結果是更新了manager的podDevices成員信息,該成員記錄着該node下所有pod的所有container在各個resource上分配到的設備和分配方式,例如env,mount等等。

// allocateContainerResources attempts to allocate all of required device
// plugin resources for the input container, issues an Allocate rpc request
// for each new device resource requirement, processes their AllocateResponses,
// and updates the cached containerDevices on success.
// allocateContainerResources嘗試爲輸入容器分配所有必需的設備插件資源,
// 爲每個新設備資源需求發出Allocate rpc請求,處理其AllocateResponses,
// 並在成功時更新緩存的containerDevices。並將結果插入m.podDevices中,記錄哪個pod分配了哪些設備,和plugin返回的設備分配方式,mount,env,etc.
// 可以推測得知下面函數的輸入變量爲待分配設備的pod名,裏面某一container,和deviceToReuse這個map
func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Container, devicesToReuse map[string]sets.String) error {
	podUID := string(pod.UID)
	contName := container.Name
	allocatedDevicesUpdated := false
	// Extended resources are not allowed to be overcommitted.
	// Since device plugin advertises extended resources,
	// therefore Requests must be equal to Limits and iterating
	// over the Limits should be sufficient.
	// Limits是yaml文件中對於擴展資源的標記字段,形如:
	// resources:
	// 	limits:
	// 	nvidia.com/gpu: 1
	// 不允許過度使用擴展資源。
	// 由於設備插件負責擴展資源分配,因此請求必須等於Limits,並且迭代限制應該足夠。
	// 以此可以推斷下面的k是擴展資源名,v是請求數量
	// 對於請求的每一類資源
	for k, v := range container.Resources.Limits {
		resource := string(k)
		needed := int(v.Value())
		klog.V(3).Infof("needs %d %s", needed, resource)
		// 判斷是否是註冊了的擴展資源
		if !m.isDevicePluginResource(resource) {
		// Updates allocatedDevices to garbage collect any stranded resources
		// before doing the device plugin allocation.
		// 在向device plugin請求分配之前先更新allocatedDevices,以回收所有滯留(stranded)的資源
		// 在分配新設備之前首先將能回收資源的pod釋放掉
		if !allocatedDevicesUpdated {
			// 釋放所有終結狀態pod所分配的device,做垃圾收集用
			// 從m.podDevices刪除掉將要被移除的pod,同時也就將該pod中的device變爲可以分配的了
			allocatedDevicesUpdated = true
		// 返回一個爲該container分配的設備列表,如果是contianer restart則重新mount原設備,如果有可重用設備則用可重用設備,否則在當前可分配設備列表裏取前幾個,
		// 並將這幾個加入已分配設備中
		allocDevices, err := m.devicesToAllocate(podUID, contName, resource, needed, devicesToReuse[resource])
		if err != nil {
			return err
		if allocDevices == nil || len(allocDevices) <= 0 {

		startRPCTime := time.Now()
		// Manager.Allocate involves RPC calls to device plugin, which
		// could be heavy-weight. Therefore we want to perform this operation outside
		// mutex lock. Note if Allocate call fails, we may leave container resources
		// partially allocated for the failed container. We rely on updateAllocatedDevices()
		// to garbage collect these resources later. Another side effect is that if
		// we have X resource A and Y resource B in total, and two containers, container1
		// and container2 both require X resource A and Y resource B. Both allocation
		// requests may fail if we serve them in mixed order.
		// TODO: may revisit this part later if we see inefficient resource allocation
		// in real use as the result of this. Should also consider to parallize device
		// plugin Allocate grpc calls if it becomes common that a container may require
		// resources from multiple device plugins.
		// 每個endpoint都保存了一個可以和device-plugin交互的位置和客戶端
		eI, ok := m.endpoints[resource]
		// 如果找不到該resource對應的endpoint,則將已分配設備恢復爲m.podDevices中記錄的實際狀態
		if !ok {
			m.allocatedDevices = m.podDevices.devices()
			return fmt.Errorf("Unknown Device Plugin %s", resource)

		devs := allocDevices.UnsortedList()
		// TODO: refactor this part of code to just append a ContainerAllocationRequest
		// in a passed in AllocateRequest pointer, and issues a single Allocate call per pod.
		klog.V(3).Infof("Making allocation request for devices %v for device plugin %s", devs, resource)
		// 遠程調用grpc向plugin請求分配設備
		resp, err := eI.e.allocate(devs)
		if err != nil {
			// In case of allocation failure, we want to restore m.allocatedDevices
			// to the actual allocated state from m.podDevices.
			m.allocatedDevices = m.podDevices.devices()
			return err

		if len(resp.ContainerResponses) == 0 {
			return fmt.Errorf("No containers return in allocation response %v", resp)

		// Update internal cached podDevices state.
		// type podDevices map[string]containerDevices ,記錄了哪個pod的哪個container分配了哪些設備
		m.podDevices.insert(podUID, contName, resource, allocDevices, resp.ContainerResponses[0])

	// Checkpoints device to container allocation information.
	return m.writeCheckpoint()

updateAllocatedDevices(activePods []*v1.Pod)函數的功能是從m.podDevices中刪除所有處於終結狀態的pod,並回收其設備,反映到代碼上是用m.podDevices.devices()重新生成一份已分配設備列表。

// updateAllocatedDevices gets a list of active pods and then frees any Devices that are bound to
// terminated pods. Returns error on failure.
// 釋放所有終結狀態pod所分配的device,做垃圾收集用,刪除m.podDevices下對應的pod全部內容,然後用m.podDevices.devices()
// 重新生成m.allocatedDevices的內容
func (m *ManagerImpl) updateAllocatedDevices(activePods []*v1.Pod) {
	if !m.sourcesReady.AllReady() {
	defer m.mutex.Unlock()
	activePodUids := sets.NewString()
	for _, pod := range activePods {
	allocatedPodUids := m.podDevices.pods()
	// 已分配設備pod-activatepod = podsToRemoved
	podsToBeRemoved := allocatedPodUids.Difference(activePodUids)
	if len(podsToBeRemoved) <= 0 {
	klog.V(3).Infof("pods to be removed: %v", podsToBeRemoved.List())
	// 從m.podDevices刪除掉將要被移除的pod
	// Regenerated allocatedDevices after we update pod allocation information.
	// 將resourceName作爲鍵值返回所有被跟蹤pod的設備列表,即所有當前已被分配的資源設備列表
	m.allocatedDevices = m.podDevices.devices()

m.devicesToAllocate(podUID, contName, resource, needed, devicesToReuse[resource])函數用來生成需要向plugin請求的設備列表,如果可重用設備已經夠用或者沒有設備需求時則不向plugin請求分配新的設備,否則調用grpc向plugin申請分配新的設備。
設備分配的邏輯是:首先看該container是否已經分配過設備,如果已分配的數量恰好滿足需求則返回nil(數量不一致則直接報錯);否則優先從reusableDevices中取設備;若仍然不夠,則根據最終缺少的設備量,從healthyDevices - devicesInUse(即m.allocatedDevices[resource])的差集中取前needed個。這便是其分配設備的策略。

// Returns list of device Ids we need to allocate with Allocate rpc call.
// Returns empty list in case we don't need to issue the Allocate rpc call.
// 返回一個爲該container分配的設備列表,如果是contianer restart則重新mount原設備,如果有可重用設備則用可重用設備,否則在當前可分配設備列表裏取前幾個,
// 並將這幾個加入已分配設備中
func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, required int, reusableDevices sets.String) (sets.String, error) {
	defer m.mutex.Unlock()
	needed := required
	// Gets list of devices that have already been allocated.
	// This can happen if a container restarts for example.
	// 如果是container重啓的時候,則依舊mount原設備
	devices := m.podDevices.containerDevices(podUID, contName, resource)
	if devices != nil {
		klog.V(3).Infof("Found pre-allocated devices for resource %s container %q in Pod %q: %v", resource, contName, podUID, devices.List())
		needed = needed - devices.Len()
		// A pod's resource is not expected to change once admitted by the API server,
		// so just fail loudly here. We can revisit this part if this no longer holds.
		if needed != 0 {
			return nil, fmt.Errorf("pod %q container %q changed request for resource %q from %d to %d", podUID, contName, resource, devices.Len(), required)
	if needed == 0 {
		// No change, no work.
		return nil, nil
	klog.V(3).Infof("Needs to allocate %d %q for pod %q container %q", needed, resource, podUID, contName)
	// Needs to allocate additional devices.
	if _, ok := m.healthyDevices[resource]; !ok {
		return nil, fmt.Errorf("can't allocate unregistered device %s", resource)
	devices = sets.NewString()

	// Allocates from reusableDevices list first.
	// 優先利用reusableDevices裏面的的設備
	for device := range reusableDevices {
		if needed == 0 {
			return devices, nil
	// Needs to allocate additional devices.
	if m.allocatedDevices[resource] == nil {
		m.allocatedDevices[resource] = sets.NewString()
	// Gets Devices in use.
	devicesInUse := m.allocatedDevices[resource]
	// Gets a list of available devices.
	// 每一次的可用設備都由所有健康設備減去已分配設備,這樣便可以將剛剛從終結狀態釋放掉的設備利用起來
	available := m.healthyDevices[resource].Difference(devicesInUse)
	if int(available.Len()) < needed {
		return nil, fmt.Errorf("requested number of devices unavailable for %s. Requested: %d, Available: %d", resource, needed, available.Len())
	// 將被分配的設備是直接取前面的needed個,如果想實現擴展資源的自定義分配只需要改下面的代碼就可以了
	allocated := available.UnsortedList()[:needed]
	// Updates m.allocatedDevices with allocated devices to prevent them
	// from being allocated to other pods/containers, given that we are
	// not holding lock during the rpc call.
	// 加入已被分配的的設備之中
	for _, device := range allocated {
	return devices, nil

然後是m.podDevices.addContainerAllocatedResources(string(pod.UID), container.Name, devicesToReuse)這個函數。

// Populates allocatedResources with the device resources allocated to the specified <podUID, contName>.
// 將某資源分配給某pod內的container
// type podDevices map[string]containerDevices保存了所有pod已分配的設備列表,每一項對應一個containerDevice
// 每個containerDevice包含了所有該container已分配的資源名稱,每個資源名稱下保存着設備列表
func (pdev podDevices) addContainerAllocatedResources(podUID, contName string, allocatedResources map[string]sets.String) {
	// 有沒有這個pod
	containers, exists := pdev[podUID]
	if !exists {
	resources, exists := containers[contName]
	if !exists {
	// 將已分配的設備全部添加進到該resource的allocatedResources中
	for resource, devices := range resources {
		allocatedResources[resource] = allocatedResources[resource].Union(devices.deviceIds)

然後是m.podDevices.removeContainerAllocatedResources(string(pod.UID), container.Name, devicesToReuse)函數,它實現了從可重用設備列表中刪除本次分配給該container的設備。但是如果init-container生成的devicesToReuse數量大於container的所需設備,那麼這些多餘的設備便還是處於已分配狀態,故其他的pod便無法利用這些設備,直到該pod進入終結狀態後纔會被釋放。舉個例子,如果init-container每次請求4個gpu而container總共請求2個,那麼剩餘的兩個gpu會被認爲還在這個pod中被利用,爲已分配狀態,那麼這個pod便會在運行時佔用4個gpu,但是實際上它只需要2個,另外兩個在pod終結前一直處於空閒狀態,而其他的pod無法再被分配這兩塊gpu。

// Removes the device resources allocated to the specified <podUID, contName> from allocatedResources.
// 將設備從該resource的已分配設備列表中刪除
func (pdev podDevices) removeContainerAllocatedResources(podUID, contName string, allocatedResources map[string]sets.String) {
	containers, exists := pdev[podUID]
	if !exists {
	resources, exists := containers[contName]
	if !exists {
	for resource, devices := range resources {
		// 返回差集,未被利用的設備
		allocatedResources[resource] = allocatedResources[resource].Difference(devices.deviceIds)


// sanitizeNodeAllocatable scans through allocatedDevices in the device manager
// and if necessary, updates allocatableResource in nodeInfo to at least equal to
// the allocated capacity. This allows pods that have already been scheduled on
// the node to pass GeneralPredicates admission checking even upon device plugin failure.
// 必要時調高nodeInfo中的allocatableResource,使其至少等於當前已分配的設備數量
func (m *ManagerImpl) sanitizeNodeAllocatable(node *schedulercache.NodeInfo) {
	var newAllocatableResource *schedulercache.Resource
	allocatableResource := node.AllocatableResource()
	if allocatableResource.ScalarResources == nil {
		allocatableResource.ScalarResources = make(map[v1.ResourceName]int64)
	// needed表明對於scheduler來說申請設備是一個增量更新,最大程度上利用已經被plugin分配的設備。如果當前node已經分配的設備量
	// 沒有scheduler申請記錄表裏的多,那麼就保持scheduler的申請記錄,否則更新該記錄。
	for resource, devices := range m.allocatedDevices {
		needed := devices.Len()
		quant, ok := allocatableResource.ScalarResources[v1.ResourceName(resource)]
		if ok && int(quant) >= needed {
		// Needs to update nodeInfo.AllocatableResource to make sure
		// NodeInfo.allocatableResource at least equal to the capacity already allocated.
		if newAllocatableResource == nil {
			newAllocatableResource = allocatableResource.Clone()
		newAllocatableResource.ScalarResources[v1.ResourceName(resource)] = int64(needed)
	if newAllocatableResource != nil {


5. kubelet和scheduler的資源調度

kubelet對於當前節點設備信息的獲取依賴func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
該函數返回三個值,分別叫做capacity、allocatable和deletedResources,依次對應當前node上的設備總量、健康(可分配)設備總量,以及被刪除的資源。刪除資源的意思是這個resource的endpoint已經不存在或者失效(超過了停止寬限期),所以需要報告給master節點處理。kubelet用這些信息更新節點狀態:Node.Status.Capacity記錄節點全部資源,Node.Status.Allocatable記錄其中可用於分配的資源。

// GetCapacity is expected to be called when Kubelet updates its node status.
// The first returned variable contains the registered device plugin resource capacity.
// The second returned variable contains the registered device plugin resource allocatable.
// The third returned variable contains previously registered resources that are no longer active.
// Kubelet uses this information to update resource capacity/allocatable in its node status.
// After the call, device plugin can remove the inactive resources from its internal list as the
// change is already reflected in Kubelet node status.
// Note in the special case after Kubelet restarts, device plugin resource capacities can
// temporarily drop to zero till corresponding device plugins re-register. This is OK because
// cm.UpdatePluginResource() run during predicate Admit guarantees we adjust nodeinfo
// capacity for already allocated pods so that they can continue to run. However, new pods
// requiring device plugin resources will not be scheduled till device plugin re-registers.
// 得到當前node上設備總量、健康設備總量和刪除資源,分別叫做capacity, allocatable, deletedResources,
// 這是kubelet的視角
func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string) {
	needsUpdateCheckpoint := false
	var capacity = v1.ResourceList{}
	var allocatable = v1.ResourceList{}
	deletedResources := sets.NewString()
	// 可分配資源等於healthyDevices數
	for resourceName, devices := range m.healthyDevices {
		eI, ok := m.endpoints[resourceName]
		// 如果不存在或endpoint失效,則刪除資源
		if (ok && eI.e.stopGracePeriodExpired()) || !ok {
			// The resources contained in endpoints and (un)healthyDevices
			// should always be consistent. Otherwise, we run with the risk
			// of failing to garbage collect non-existing resources or devices.
			if !ok {
				klog.Errorf("unexpected: healthyDevices and endpoints are out of sync")
			delete(m.endpoints, resourceName)
			delete(m.healthyDevices, resourceName)
			needsUpdateCheckpoint = true
		} else {
			capacity[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(devices.Len()), resource.DecimalSI)
			allocatable[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(devices.Len()), resource.DecimalSI)
	// 節點資源總數等於healthyDevices+unhealthyDevices數
	for resourceName, devices := range m.unhealthyDevices {
		eI, ok := m.endpoints[resourceName]
		if (ok && eI.e.stopGracePeriodExpired()) || !ok {
			if !ok {
				klog.Errorf("unexpected: unhealthyDevices and endpoints are out of sync")
			delete(m.endpoints, resourceName)
			delete(m.unhealthyDevices, resourceName)
			needsUpdateCheckpoint = true
		} else {
			capacityCount := capacity[v1.ResourceName(resourceName)]
			unhealthyCount := *resource.NewQuantity(int64(devices.Len()), resource.DecimalSI)
			capacity[v1.ResourceName(resourceName)] = capacityCount
	if needsUpdateCheckpoint {
	// 返回資源容量,可用資源數,刪除的資源數
	return capacity, allocatable, deletedResources.UnsortedList()

那麼scheduler是如何得知當前節點有多少設備可以被分配呢?還記得Allocate()裏有一個叫sanitizeNodeAllocatable(node *schedulercache.NodeInfo)的函數嗎?scheduler便是通過由該函數維護的schedulercache.NodeInfo可分配資源中的ScalarResources[v1.ResourceName(resource)]來獲取當前節點可用設備數的。

還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.