kube-scheduler源碼分析(二)之 registerAlgorithmProvider

本文個人博客地址:https://www.huweihuang.com/kubernetes-notes/code-analysis/kube-scheduler/registerAlgorithmProvider.html

kube-scheduler源碼分析(二)之 registerAlgorithmProvider

以下代碼分析基於 kubernetes v1.12.0 版本。

此部分主要介紹調度中使用的各種調度算法,包括調度算法的註冊部分。註冊部分的代碼主要在/pkg/scheduler/algorithmprovider中,具體的預選策略和優選策略的算法實現在/pkg/scheduler/algorithm中。

1. ApplyFeatureGates

註冊調度算法的調用入口在SchedulerCommand的Run函數中。

此部分代碼位於/cmd/kube-scheduler/app/server.go

// Run runs the Scheduler.
func Run(c schedulerserverconfig.CompletedConfig, stopCh <-chan struct{}) error {
	...
	// Apply algorithms based on feature gates.
	// TODO: make configurable?
	algorithmprovider.ApplyFeatureGates()
  ...
}  

ApplyFeatureGates的具體實現在pkg/scheduler/algorithmprovider的包中。

此部分代碼位於/pkg/scheduler/algorithmprovider/plugins.go

// ApplyFeatureGates applies algorithm by feature gates.
func ApplyFeatureGates() {
	defaults.ApplyFeatureGates()
}

ApplyFeatureGates具體實現如下:

此部分代碼位於/pkg/scheduler/algorithmprovider/defaults/defaults.go

根據feature移除部分調度策略。

// ApplyFeatureGates applies algorithm by feature gates.
func ApplyFeatureGates() {
	if utilfeature.DefaultFeatureGate.Enabled(features.TaintNodesByCondition) {
		// Remove "CheckNodeCondition", "CheckNodeMemoryPressure", "CheckNodePIDPressurePred"
		// and "CheckNodeDiskPressure" predicates
		factory.RemoveFitPredicate(predicates.CheckNodeConditionPred)
		factory.RemoveFitPredicate(predicates.CheckNodeMemoryPressurePred)
		factory.RemoveFitPredicate(predicates.CheckNodeDiskPressurePred)
		factory.RemoveFitPredicate(predicates.CheckNodePIDPressurePred)
		// Remove key "CheckNodeCondition", "CheckNodeMemoryPressure" and "CheckNodeDiskPressure"
		// from ALL algorithm provider
		// The key will be removed from all providers which in algorithmProviderMap[]
		// if you just want remove specific provider, call func RemovePredicateKeyFromAlgoProvider()
		factory.RemovePredicateKeyFromAlgorithmProviderMap(predicates.CheckNodeConditionPred)
		factory.RemovePredicateKeyFromAlgorithmProviderMap(predicates.CheckNodeMemoryPressurePred)
		factory.RemovePredicateKeyFromAlgorithmProviderMap(predicates.CheckNodeDiskPressurePred)
		factory.RemovePredicateKeyFromAlgorithmProviderMap(predicates.CheckNodePIDPressurePred)

		// Fit is determined based on whether a pod can tolerate all of the node's taints
		factory.RegisterMandatoryFitPredicate(predicates.PodToleratesNodeTaintsPred, predicates.PodToleratesNodeTaints)
		// Fit is determined based on whether a pod can tolerate unschedulable of node
		factory.RegisterMandatoryFitPredicate(predicates.CheckNodeUnschedulablePred, predicates.CheckNodeUnschedulablePredicate)
		// Insert Key "PodToleratesNodeTaints" and "CheckNodeUnschedulable" To All Algorithm Provider
		// The key will insert to all providers which in algorithmProviderMap[]
		// if you just want insert to specific provider, call func InsertPredicateKeyToAlgoProvider()
		factory.InsertPredicateKeyToAlgorithmProviderMap(predicates.PodToleratesNodeTaintsPred)
		factory.InsertPredicateKeyToAlgorithmProviderMap(predicates.CheckNodeUnschedulablePred)

		glog.Warningf("TaintNodesByCondition is enabled, PodToleratesNodeTaints predicate is mandatory")
	}

	// Prioritizes nodes that satisfy pod's resource limits
	if utilfeature.DefaultFeatureGate.Enabled(features.ResourceLimitsPriorityFunction) {
		factory.RegisterPriorityFunction2("ResourceLimitsPriority", priorities.ResourceLimitsPriorityMap, nil, 1)
	}

}

2. init

當函數邏輯調用到algorithmprovider包時,就會自動調用init的初始化函數,此部分主要包括對預選算法和優選算法的註冊。

此部分代碼位於/pkg/scheduler/algorithmprovider/defaults/defaults.go

func init() {
	// Register functions that extract metadata used by predicates and priorities computations.
	factory.RegisterPredicateMetadataProducerFactory(
		func(args factory.PluginFactoryArgs) algorithm.PredicateMetadataProducer {
			return predicates.NewPredicateMetadataFactory(args.PodLister)
		})
	factory.RegisterPriorityMetadataProducerFactory(
		func(args factory.PluginFactoryArgs) algorithm.PriorityMetadataProducer {
			return priorities.NewPriorityMetadataFactory(args.ServiceLister, args.ControllerLister, args.ReplicaSetLister, args.StatefulSetLister)
		})

	registerAlgorithmProvider(defaultPredicates(), defaultPriorities())

	// IMPORTANT NOTES for predicate developers:
	// We are using cached predicate result for pods belonging to the same equivalence class.
	// So when implementing a new predicate, you are expected to check whether the result
	// of your predicate function can be affected by related API object change (ADD/DELETE/UPDATE).
	// If yes, you are expected to invalidate the cached predicate result for related API object change.
	// For example:
	// https://github.com/kubernetes/kubernetes/blob/36a218e/plugin/pkg/scheduler/factory/factory.go#L422

	// Registers predicates and priorities that are not enabled by default, but user can pick when creating their
	// own set of priorities/predicates.

	// PodFitsPorts has been replaced by PodFitsHostPorts for better user understanding.
	// For backwards compatibility with 1.0, PodFitsPorts is registered as well.
	factory.RegisterFitPredicate("PodFitsPorts", predicates.PodFitsHostPorts)
	// Fit is defined based on the absence of port conflicts.
	// This predicate is actually a default predicate, because it is invoked from
	// predicates.GeneralPredicates()
	factory.RegisterFitPredicate(predicates.PodFitsHostPortsPred, predicates.PodFitsHostPorts)
	// Fit is determined by resource availability.
	// This predicate is actually a default predicate, because it is invoked from
	// predicates.GeneralPredicates()
	factory.RegisterFitPredicate(predicates.PodFitsResourcesPred, predicates.PodFitsResources)
	// Fit is determined by the presence of the Host parameter and a string match
	// This predicate is actually a default predicate, because it is invoked from
	// predicates.GeneralPredicates()
	factory.RegisterFitPredicate(predicates.HostNamePred, predicates.PodFitsHost)
	// Fit is determined by node selector query.
	factory.RegisterFitPredicate(predicates.MatchNodeSelectorPred, predicates.PodMatchNodeSelector)

	// ServiceSpreadingPriority is a priority config factory that spreads pods by minimizing
	// the number of pods (belonging to the same service) on the same node.
	// Register the factory so that it's available, but do not include it as part of the default priorities
	// Largely replaced by "SelectorSpreadPriority", but registered for backward compatibility with 1.0
	factory.RegisterPriorityConfigFactory(
		"ServiceSpreadingPriority",
		factory.PriorityConfigFactory{
			MapReduceFunction: func(args factory.PluginFactoryArgs) (algorithm.PriorityMapFunction, algorithm.PriorityReduceFunction) {
				return priorities.NewSelectorSpreadPriority(args.ServiceLister, algorithm.EmptyControllerLister{}, algorithm.EmptyReplicaSetLister{}, algorithm.EmptyStatefulSetLister{})
			},
			Weight: 1,
		},
	)
	// EqualPriority is a prioritizer function that gives an equal weight of one to all nodes
	// Register the priority function so that its available
	// but do not include it as part of the default priorities
	factory.RegisterPriorityFunction2("EqualPriority", core.EqualPriorityMap, nil, 1)
	// Optional, cluster-autoscaler friendly priority function - give used nodes higher priority.
	factory.RegisterPriorityFunction2("MostRequestedPriority", priorities.MostRequestedPriorityMap, nil, 1)
	factory.RegisterPriorityFunction2(
		"RequestedToCapacityRatioPriority",
		priorities.RequestedToCapacityRatioResourceAllocationPriorityDefault().PriorityMap,
		nil,
		1)
}

以下對init中的註冊進行拆分介紹。

2.1. registerAlgorithmProvider

此部分主要註冊默認的預選和優選策略。

// Register functions that extract metadata used by predicates and priorities computations.
factory.RegisterPredicateMetadataProducerFactory(
	func(args factory.PluginFactoryArgs) algorithm.PredicateMetadataProducer {
		return predicates.NewPredicateMetadataFactory(args.PodLister)
	})
factory.RegisterPriorityMetadataProducerFactory(
	func(args factory.PluginFactoryArgs) algorithm.PriorityMetadataProducer {
		return priorities.NewPriorityMetadataFactory(args.ServiceLister, args.ControllerLister, args.ReplicaSetLister, args.StatefulSetLister)
	})

registerAlgorithmProvider(defaultPredicates(), defaultPriorities())

registerAlgorithmProvider

註冊AlgorithmProvider,其中包括DefaultProviderClusterAutoscalerProvider

func registerAlgorithmProvider(predSet, priSet sets.String) {
	// Registers algorithm providers. By default we use 'DefaultProvider', but user can specify one to be used
	// by specifying flag.
	factory.RegisterAlgorithmProvider(factory.DefaultProvider, predSet, priSet)
	// Cluster autoscaler friendly scheduling algorithm.
	factory.RegisterAlgorithmProvider(ClusterAutoscalerProvider, predSet,
		copyAndReplace(priSet, "LeastRequestedPriority", "MostRequestedPriority"))
}

2.2. RegisterFitPredicate

在init部分註冊預選策略函數。

預選策略如下:

調度策略 函數 描述
PodFitsPorts PodFitsHostPorts PodFitsPorts已經被PodFitsHostPorts代替,此處主要是爲了兼容性。
PodFitsHostPortsPred PodFitsHostPorts 判斷是否與宿主機的端口衝突。
PodFitsResourcesPred PodFitsResources 判斷node資源是否充足。
HostNamePred PodFitsHost 判斷pod所指定調度的節點是否是當前的節點。
MatchNodeSelectorPred PodMatchNodeSelector 判斷pod指定的node selector是否匹配當前的node。

具體代碼如下:

// PodFitsPorts has been replaced by PodFitsHostPorts for better user understanding.
// For backwards compatibility with 1.0, PodFitsPorts is registered as well.
factory.RegisterFitPredicate("PodFitsPorts", predicates.PodFitsHostPorts)
// Fit is defined based on the absence of port conflicts.
// This predicate is actually a default predicate, because it is invoked from
// predicates.GeneralPredicates()
factory.RegisterFitPredicate(predicates.PodFitsHostPortsPred, predicates.PodFitsHostPorts)
// Fit is determined by resource availability.
// This predicate is actually a default predicate, because it is invoked from
// predicates.GeneralPredicates()
factory.RegisterFitPredicate(predicates.PodFitsResourcesPred, predicates.PodFitsResources)
// Fit is determined by the presence of the Host parameter and a string match
// This predicate is actually a default predicate, because it is invoked from
// predicates.GeneralPredicates()
factory.RegisterFitPredicate(predicates.HostNamePred, predicates.PodFitsHost)
// Fit is determined by node selector query.
factory.RegisterFitPredicate(predicates.MatchNodeSelectorPred, predicates.PodMatchNodeSelector)

2.3. RegisterPriorityFunction2

在init部分註冊優選策略函數。

// EqualPriority is a prioritizer function that gives an equal weight of one to all nodes
// Register the priority function so that its available
// but do not include it as part of the default priorities
factory.RegisterPriorityFunction2("EqualPriority", core.EqualPriorityMap, nil, 1)
// Optional, cluster-autoscaler friendly priority function - give used nodes higher priority.
factory.RegisterPriorityFunction2("MostRequestedPriority", priorities.MostRequestedPriorityMap, nil, 1)
factory.RegisterPriorityFunction2(
	"RequestedToCapacityRatioPriority",
	priorities.RequestedToCapacityRatioResourceAllocationPriorityDefault().PriorityMap,
	nil,
	1)

3. defaultPredicates

此部分爲默認預選策略的註冊函數。

默認的預選策略如下:

預選策略 函數 描述
NoVolumeZoneConflictPred NewVolumeZonePredicate 判斷pod使用到的volume是否有節點的要求。目前只支持pvc。
MaxEBSVolumeCountPred NewMaxPDVolumeCountPredicate 判斷pod使用EBSVolume在該節點上是否已經達到上限了。
MaxGCEPDVolumeCountPred NewMaxPDVolumeCountPredicate 判斷pod使用GCEPDVolume在該節點上是否已經達到上限了。
MaxAzureDiskVolumeCountPred NewMaxPDVolumeCountPredicate 判斷pod使用AzureDiskVolume在該節點上是否已經達到上限了。
MaxCSIVolumeCountPred NewCSIMaxVolumeLimitPredicate 判斷CSIVolume是否達到上限了。
MatchInterPodAffinityPred NewPodAffinityPredicate 匹配pod的親緣性。
NoDiskConflictPred NoDiskConflict 判斷是否有disk volumes的衝突。
GeneralPred GeneralPredicates 通用的預選策略
CheckNodeMemoryPressurePred CheckNodeMemoryPressurePredicate 判斷節點內存是否充足。
CheckNodeDiskPressurePred CheckNodeDiskPressurePredicate 判斷節點是否有磁盤壓力。
CheckNodePIDPressurePred CheckNodePIDPressurePredicate 判斷節點上的PID
CheckNodeConditionPred CheckNodeConditionPredicate 判斷node是否ready。
PodToleratesNodeTaintsPred PodToleratesNodeTaints 判斷pod是否可以容忍節點的taints。
CheckVolumeBindingPred NewVolumeBindingPredicate 判斷是否有volume拓撲的要求。

具體代碼如下:

func defaultPredicates() sets.String {
	return sets.NewString(
		// Fit is determined by volume zone requirements.
		factory.RegisterFitPredicateFactory(
			predicates.NoVolumeZoneConflictPred,
			func(args factory.PluginFactoryArgs) algorithm.FitPredicate {
				return predicates.NewVolumeZonePredicate(args.PVInfo, args.PVCInfo, args.StorageClassInfo)
			},
		),
		// Fit is determined by whether or not there would be too many AWS EBS volumes attached to the node
		factory.RegisterFitPredicateFactory(
			predicates.MaxEBSVolumeCountPred,
			func(args factory.PluginFactoryArgs) algorithm.FitPredicate {
				return predicates.NewMaxPDVolumeCountPredicate(predicates.EBSVolumeFilterType, args.PVInfo, args.PVCInfo)
			},
		),
		// Fit is determined by whether or not there would be too many GCE PD volumes attached to the node
		factory.RegisterFitPredicateFactory(
			predicates.MaxGCEPDVolumeCountPred,
			func(args factory.PluginFactoryArgs) algorithm.FitPredicate {
				return predicates.NewMaxPDVolumeCountPredicate(predicates.GCEPDVolumeFilterType, args.PVInfo, args.PVCInfo)
			},
		),
		// Fit is determined by whether or not there would be too many Azure Disk volumes attached to the node
		factory.RegisterFitPredicateFactory(
			predicates.MaxAzureDiskVolumeCountPred,
			func(args factory.PluginFactoryArgs) algorithm.FitPredicate {
				return predicates.NewMaxPDVolumeCountPredicate(predicates.AzureDiskVolumeFilterType, args.PVInfo, args.PVCInfo)
			},
		),
		factory.RegisterFitPredicateFactory(
			predicates.MaxCSIVolumeCountPred,
			func(args factory.PluginFactoryArgs) algorithm.FitPredicate {
				return predicates.NewCSIMaxVolumeLimitPredicate(args.PVInfo, args.PVCInfo)
			},
		),
		// Fit is determined by inter-pod affinity.
		factory.RegisterFitPredicateFactory(
			predicates.MatchInterPodAffinityPred,
			func(args factory.PluginFactoryArgs) algorithm.FitPredicate {
				return predicates.NewPodAffinityPredicate(args.NodeInfo, args.PodLister)
			},
		),

		// Fit is determined by non-conflicting disk volumes.
		factory.RegisterFitPredicate(predicates.NoDiskConflictPred, predicates.NoDiskConflict),

		// GeneralPredicates are the predicates that are enforced by all Kubernetes components
		// (e.g. kubelet and all schedulers)
		factory.RegisterFitPredicate(predicates.GeneralPred, predicates.GeneralPredicates),

		// Fit is determined by node memory pressure condition.
		factory.RegisterFitPredicate(predicates.CheckNodeMemoryPressurePred, predicates.CheckNodeMemoryPressurePredicate),

		// Fit is determined by node disk pressure condition.
		factory.RegisterFitPredicate(predicates.CheckNodeDiskPressurePred, predicates.CheckNodeDiskPressurePredicate),

		// Fit is determined by node pid pressure condition.
		factory.RegisterFitPredicate(predicates.CheckNodePIDPressurePred, predicates.CheckNodePIDPressurePredicate),

		// Fit is determined by node conditions: not ready, network unavailable or out of disk.
		factory.RegisterMandatoryFitPredicate(predicates.CheckNodeConditionPred, predicates.CheckNodeConditionPredicate),

		// Fit is determined based on whether a pod can tolerate all of the node's taints
		factory.RegisterFitPredicate(predicates.PodToleratesNodeTaintsPred, predicates.PodToleratesNodeTaints),

		// Fit is determined by volume topology requirements.
		factory.RegisterFitPredicateFactory(
			predicates.CheckVolumeBindingPred,
			func(args factory.PluginFactoryArgs) algorithm.FitPredicate {
				return predicates.NewVolumeBindingPredicate(args.VolumeBinder)
			},
		),
	)
}

4. defaultPriorities

此部分主要爲默認優選策略的註冊函數。

默認優選策略如下:

優選策略 函數 描述
SelectorSpreadPriority NewSelectorSpreadPriority 屬於相同service和rs下的pod儘量分佈在不同的node上。
InterPodAffinityPriority NewInterPodAffinityPriority 根據pod的親緣性,將相同拓撲域中的pod放在同一個節點
LeastRequestedPriority LeastRequestedPriorityMap 按最少請求的利用率對節點進行優先級排序。
BalancedResourceAllocation BalancedResourceAllocationMap 實現資源的平衡使用。
NodePreferAvoidPodsPriority CalculateNodePreferAvoidPodsPriorityMap 將此權重設置爲足以覆蓋所有其他優先級函數。
NodeAffinityPriority CalculateNodeAffinityPriorityMap pod指定label節點調度,來匹配node親緣性。
TaintTolerationPriority ComputeTaintTolerationPriorityMap pod有設置tolerate屬性來容忍node的taint。
ImageLocalityPriority ImageLocalityPriorityMap 根據節點上是否有該pod使用到的鏡像打分。

具體代碼實現如下:

func defaultPriorities() sets.String {
	return sets.NewString(
		// spreads pods by minimizing the number of pods (belonging to the same service or replication controller) on the same node.
		factory.RegisterPriorityConfigFactory(
			"SelectorSpreadPriority",
			factory.PriorityConfigFactory{
				MapReduceFunction: func(args factory.PluginFactoryArgs) (algorithm.PriorityMapFunction, algorithm.PriorityReduceFunction) {
					return priorities.NewSelectorSpreadPriority(args.ServiceLister, args.ControllerLister, args.ReplicaSetLister, args.StatefulSetLister)
				},
				Weight: 1,
			},
		),
		// pods should be placed in the same topological domain (e.g. same node, same rack, same zone, same power domain, etc.)
		// as some other pods, or, conversely, should not be placed in the same topological domain as some other pods.
		factory.RegisterPriorityConfigFactory(
			"InterPodAffinityPriority",
			factory.PriorityConfigFactory{
				Function: func(args factory.PluginFactoryArgs) algorithm.PriorityFunction {
					return priorities.NewInterPodAffinityPriority(args.NodeInfo, args.NodeLister, args.PodLister, args.HardPodAffinitySymmetricWeight)
				},
				Weight: 1,
			},
		),

		// Prioritize nodes by least requested utilization.
		factory.RegisterPriorityFunction2("LeastRequestedPriority", priorities.LeastRequestedPriorityMap, nil, 1),

		// Prioritizes nodes to help achieve balanced resource usage
		factory.RegisterPriorityFunction2("BalancedResourceAllocation", priorities.BalancedResourceAllocationMap, nil, 1),

		// Set this weight large enough to override all other priority functions.
		// TODO: Figure out a better way to do this, maybe at same time as fixing #24720.
		factory.RegisterPriorityFunction2("NodePreferAvoidPodsPriority", priorities.CalculateNodePreferAvoidPodsPriorityMap, nil, 10000),

		// Prioritizes nodes that have labels matching NodeAffinity
		factory.RegisterPriorityFunction2("NodeAffinityPriority", priorities.CalculateNodeAffinityPriorityMap, priorities.CalculateNodeAffinityPriorityReduce, 1),

		// Prioritizes nodes that marked with taint which pod can tolerate.
		factory.RegisterPriorityFunction2("TaintTolerationPriority", priorities.ComputeTaintTolerationPriorityMap, priorities.ComputeTaintTolerationPriorityReduce, 1),

		// ImageLocalityPriority prioritizes nodes that have images requested by the pod present.
		factory.RegisterPriorityFunction2("ImageLocalityPriority", priorities.ImageLocalityPriorityMap, nil, 1),
	)
}

參考:

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章