Kubernetes DaemonSet Source Code Study
I. Creating the DaemonSet controller object
Code path: /pkg/controller/daemon/daemon_controller.go
The NewDaemonSetsController function
Function signature:
func NewDaemonSetsController(
daemonSetInformer appsinformers.DaemonSetInformer,
historyInformer appsinformers.ControllerRevisionInformer,
podInformer coreinformers.PodInformer,
nodeInformer coreinformers.NodeInformer,
kubeClient clientset.Interface,
failedPodsBackoff *flowcontrol.Backoff,
) (*DaemonSetsController, error)
Main responsibilities:
- Register EventHandlers with the informers
- Wire up the core handler functions
1. Registering EventHandlers with the informers
The parameters include four types of informers, each of which gets its own handlers (a registration sketch follows this list):
- daemonSetInformer appsinformers.DaemonSetInformer
  - AddFunc
  - DeleteFunc
  - UpdateFunc
- historyInformer appsinformers.ControllerRevisionInformer
  - AddFunc
  - DeleteFunc
  - UpdateFunc
- podInformer coreinformers.PodInformer
  - AddFunc: addPod
  - UpdateFunc: updatePod
  - DeleteFunc: deletePod
- nodeInformer coreinformers.NodeInformer
  - AddFunc: addNode
  - UpdateFunc: updateNode
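As a reference, the registration for podInformer in NewDaemonSetsController looks roughly like this (a sketch using client-go's cache.ResourceEventHandlerFuncs; the surrounding controller fields are elided):
podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
	AddFunc:    dsc.addPod,
	UpdateFunc: dsc.updatePod,
	DeleteFunc: dsc.deletePod,
})
// The controller also keeps a lister and a cache-synced check per informer.
dsc.podLister = podInformer.Lister()
dsc.podStoreSynced = podInformer.Informer().HasSynced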
2. Wiring up the core handler functions
- dsc.syncHandler = dsc.syncDaemonSet: syncs the DaemonSet objects taken off the queue, covering replica management, UpdateStrategy-driven upgrades, and DaemonSet status updates.
- dsc.enqueueDaemonSet = dsc.enqueue
- dsc.enqueueDaemonSetRateLimited = dsc.enqueueRateLimited
- dsc.failedPodsBackoff = failedPodsBackoff
II. Starting the DaemonSet controller
1. The Run function
Code path: /pkg/controller/daemon/daemon_controller.go
Main responsibilities:
Start the worker goroutines; each worker takes a DaemonSet key off the queue and calls syncHandler (in practice syncDaemonSet, since dsc.syncHandler = dsc.syncDaemonSet) to sync it.
Start one failedPodsBackoff GC goroutine, which periodically garbage-collects the failed-pod backoff entries kept per DaemonSet/Node pair in the cluster.
func (dsc *DaemonSetsController) Run(workers int, stopCh <-chan struct{}) {
defer utilruntime.HandleCrash()
defer dsc.queue.ShutDown()
klog.Infof("Starting daemon sets controller")
defer klog.Infof("Shutting down daemon sets controller")
if !cache.WaitForNamedCacheSync("daemon sets", stopCh, dsc.podStoreSynced, dsc.nodeStoreSynced, dsc.historyStoreSynced, dsc.dsStoreSynced) {
return
}
for i := 0; i < workers; i++ {
//Start worker goroutines; each worker takes a DaemonSet key from the queue and syncs it.
go wait.Until(dsc.runWorker, time.Second, stopCh)
}
//Start one failedPodsBackoff GC goroutine; every BackoffGCInterval (1 minute) it garbage-collects stale failed-pod backoff entries.
go wait.Until(dsc.failedPodsBackoff.GC, BackoffGCInterval, stopCh)
<-stopCh
}
func (dsc *DaemonSetsController) runWorker() {
for dsc.processNextWorkItem() {
}
}
// processNextWorkItem deals with one key off the queue. It returns false when it's time to quit.
func (dsc *DaemonSetsController) processNextWorkItem() bool {
dsKey, quit := dsc.queue.Get()
if quit {
return false
}
defer dsc.queue.Done(dsKey)
err := dsc.syncHandler(dsKey.(string))
if err == nil {
dsc.queue.Forget(dsKey)
return true
}
utilruntime.HandleError(fmt.Errorf("%v failed with : %v", dsKey, err))
dsc.queue.AddRateLimited(dsKey)
return true
}
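For context, dsc.queue above is a rate-limited workqueue. In NewDaemonSetsController it is created essentially as follows (client-go's default controller rate limiter combines per-item exponential backoff with an overall token bucket, which is what the AddRateLimited and Forget calls above delegate to):
dsc := &DaemonSetsController{
	...
	queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "daemonset"),
}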
2. failedPodsBackoff *flowcontrol.Backoff
Code path: /vendor/k8s.io/client-go/util/flowcontrol/backoff.go
Main responsibilities:
Backoff entries that have not been updated for 2*maxDuration (2 × 15 min = 30 min) are removed by GC. Each time syncDaemonSet handles failed pods that should be deleted, the deletion is delayed according to a 1s, 2s, 4s, 8s, …, 15 min backoff sequence, which acts as flow control.
This prevents a hot loop: after the kubelet rejects a DaemonSet pod, the controller would otherwise recreate it immediately only for it to be rejected again, producing many useless cycles, so a backoff mechanism was added.
func (p *Backoff) GC() {
p.Lock()
defer p.Unlock()
now := p.Clock.Now()
for id, entry := range p.perItemBackoff {
if now.Sub(entry.lastUpdate) > p.maxDuration*2 {
// GC when entry has not been updated for 2*maxDuration
delete(p.perItemBackoff, id)
}
}
}
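A minimal, self-contained sketch of how this Backoff behaves (assuming the 1s initial / 15 min maximum parameters described above; the key is a hypothetical per-DaemonSet/node identifier):
package main

import (
	"fmt"
	"time"

	"k8s.io/client-go/util/flowcontrol"
)

func main() {
	backoff := flowcontrol.NewBackOff(1*time.Second, 15*time.Minute)
	key := "default/my-ds/node-1" // hypothetical backoff key

	for i := 0; i < 5; i++ {
		// Next sets the per-key delay to 1s on the first call, then
		// doubles it on each subsequent call, capped at 15 minutes.
		backoff.Next(key, backoff.Clock.Now())
		fmt.Printf("attempt %d: delay %v\n", i+1, backoff.Get(key)) // 1s, 2s, 4s, 8s, 16s
	}
	// Entries not updated for 2*maxDuration are dropped by GC.
	backoff.GC()
}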
III. Syncing a DaemonSet
1. The syncDaemonSet function
Code path: /pkg/controller/daemon/daemon_controller.go
Processing steps:
- Fetch the DaemonSet: look up the instance to process in dsLister (the local cache) by key
- Fetch the latest ControllerRevision and all old ControllerRevisions; if the latest ControllerRevision does not exist, create a new one
- Extract the hash used when creating pods from the labels of the latest ControllerRevision
- Iterate over all nodes, creating or deleting DaemonSet pods
- Once pod creation and deletion are done, run the upgrade or rollback logic
- Clean up surplus ControllerRevisions
- Update the DaemonSet status
func (dsc *DaemonSetsController) syncDaemonSet(key string) error {
startTime := time.Now()
defer func() {
klog.V(4).Infof("Finished syncing daemon set %q (%v)", key, time.Since(startTime))
}()
namespace, name, err := cache.SplitMetaNamespaceKey(key)
if err != nil {
return err
}
//Fetch the DaemonSet: look up the instance to process in dsLister (the local cache) by key
ds, err := dsc.dsLister.DaemonSets(namespace).Get(name)
if errors.IsNotFound(err) {
klog.V(3).Infof("daemon set has been deleted %v", key)
dsc.expectations.DeleteExpectations(key)
return nil
}
if err != nil {
return fmt.Errorf("unable to retrieve ds %v from store: %v", key, err)
}
nodeList, err := dsc.nodeLister.List(labels.Everything())
if err != nil {
return fmt.Errorf("couldn't get list of nodes when syncing daemon set %#v: %v", ds, err)
}
everything := metav1.LabelSelector{}
if reflect.DeepEqual(ds.Spec.Selector, &everything) {
dsc.eventRecorder.Eventf(ds, v1.EventTypeWarning, SelectingAllReason, "This daemon set is selecting all pods. A non-empty selector is required.")
return nil
}
dsKey, err := controller.KeyFunc(ds)
if err != nil {
return fmt.Errorf("couldn't get key for object %#v: %v", ds, err)
}
if ds.DeletionTimestamp != nil {
return nil
}
// Construct histories of the DaemonSet, and get the hash of current history
//Fetch the latest ControllerRevision and all old ones; create a new one if the latest does not exist
cur, old, err := dsc.constructHistory(ds)
if err != nil {
return fmt.Errorf("failed to construct revisions of DaemonSet: %v", err)
}
//Extract the hash used when creating pods from the latest ControllerRevision's labels
hash := cur.Labels[apps.DefaultDaemonSetUniqueLabelKey]
if !dsc.expectations.SatisfiedExpectations(dsKey) {
// Only update status. Don't raise observedGeneration since controller didn't process object of that generation.
return dsc.updateDaemonSetStatus(ds, nodeList, hash, false)
}
//Iterate over all nodes, creating or deleting DaemonSet pods
err = dsc.manage(ds, nodeList, hash)
if err != nil {
return err
}
// Process rolling updates if we're ready.
//Handle the rolling-update logic
if dsc.expectations.SatisfiedExpectations(dsKey) {
switch ds.Spec.UpdateStrategy.Type {
// With OnDelete, do nothing and wait for the user to delete old pods manually
case apps.OnDeleteDaemonSetStrategyType:
case apps.RollingUpdateDaemonSetStrategyType:
err = dsc.rollingUpdate(ds, nodeList, hash)
}
if err != nil {
return err
}
}
//Clean up surplus ControllerRevisions
err = dsc.cleanupHistory(ds, old)
if err != nil {
return fmt.Errorf("failed to clean up revisions of DaemonSet: %v", err)
}
//Update the DaemonSet status
return dsc.updateDaemonSetStatus(ds, nodeList, hash, true)
}
2. The manage function
Code path: /pkg/controller/daemon/daemon_controller.go
Main responsibilities: manage the DaemonSet's pods: compute the list of pods to delete and the list of nodes needing pods, then call syncNodes to create and delete pods in exponentially growing batches (1, 2, 4, 8, …). If some nodes' DaemonSet pods were found to be Failed before syncNodes ran, manage returns an error after syncNodes completes. syncNodes drives the add/del counters in the expectations down to zero or even negative.
Code:
// manage manages the scheduling and running of Pods of ds on nodes.
// After figuring out which nodes should run a Pod of ds but not yet running one and
// which nodes should not run a Pod of ds but currently running one, it calls function
// syncNodes with a list of pods to remove and a list of nodes to run a Pod of ds.
func (dsc *DaemonSetsController) manage(ds *apps.DaemonSet, nodeList []*v1.Node, hash string) error {
// Find out the pods which are created for the nodes by DaemonSet.
nodeToDaemonPods, err := dsc.getNodesToDaemonPods(ds)
if err != nil {
return fmt.Errorf("couldn't get node to daemon pod mapping for daemon set %q: %v", ds.Name, err)
}
// For each node, if the node is running the daemon pod but isn't supposed to, kill the daemon
// pod. If the node is supposed to run the daemon pod, but isn't, create the daemon pod on the node.
var nodesNeedingDaemonPods, podsToDelete []string
for _, node := range nodeList {
nodesNeedingDaemonPodsOnNode, podsToDeleteOnNode, err := dsc.podsShouldBeOnNode(
node, nodeToDaemonPods, ds)
if err != nil {
continue
}
nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, nodesNeedingDaemonPodsOnNode...)
podsToDelete = append(podsToDelete, podsToDeleteOnNode...)
}
// Remove unscheduled pods assigned to not existing nodes when daemonset pods are scheduled by scheduler.
// If node doesn't exist then pods are never scheduled and can't be deleted by PodGCController.
if utilfeature.DefaultFeatureGate.Enabled(features.ScheduleDaemonSetPods) {
podsToDelete = append(podsToDelete, getUnscheduledPodsWithoutNode(nodeList, nodeToDaemonPods)...)
}
// Label new pods using the hash label value of the current history when creating them
if err = dsc.syncNodes(ds, podsToDelete, nodesNeedingDaemonPods, hash); err != nil {
return err
}
return nil
}
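The exponential batching (1, 2, 4, 8, …) is implemented by a slowStartBatch helper in the same file. A simplified, sequential sketch of the idea follows; the upstream helper runs each batch concurrently and returns on the first failing batch:
// slowStartBatch calls fn up to count times, in batches that double in
// size. Stopping at the first error means a persistently failing create
// (e.g. a kubelet rejecting pods) costs one small batch instead of
// count calls.
func slowStartBatch(count int, initialBatchSize int, fn func() error) (int, error) {
	remaining := count
	successes := 0
	for batchSize := minInt(remaining, initialBatchSize); batchSize > 0; batchSize = minInt(2*batchSize, remaining) {
		for i := 0; i < batchSize; i++ {
			if err := fn(); err != nil {
				return successes, err
			}
			successes++
		}
		remaining -= batchSize
	}
	return successes, nil
}

func minInt(a, b int) int {
	if a < b {
		return a
	}
	return b
}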
3. The constructHistory function
Code path: /pkg/controller/daemon/update.go
Main responsibility: record the DaemonSet's history so that rollbacks are possible
Notes:
A pod rollback means switching the DaemonSet's Spec.Template back to an old version; a rollback can be thought of as a RollingUpdate upgrade to an older version. Before Spec.Template can be replaced with an old version, the old version's Spec.Template data must already have been saved. The data structure used to save Spec.Template is described first.
The ControllerRevision structure
The Spec.Template of every upgrade is stored in etcd as a ControllerRevision, whose structure is:
type ControllerRevision struct {
	metav1.TypeMeta
	metav1.ObjectMeta
	Data     runtime.RawExtension
	Revision int64
}
Data holds the serialized Spec.Template, and Revision is the version number of the upgrade: it starts at 1 and is incremented by 1 on every upgrade (a rollback also increments Revision).
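A hedged sketch of how a new revision gets written (modeled on dsc.snapshot in update.go; the patch construction and error handling here are simplified assumptions — upstream serializes the template as a strategic merge patch via getPatch):
// Serialize the current Spec.Template and save it, together with the
// hash label and the computed revision number, as a ControllerRevision.
patch, err := json.Marshal(ds.Spec.Template) // simplified; upstream uses getPatch(ds)
if err != nil {
	return nil, err
}
history := &apps.ControllerRevision{
	ObjectMeta: metav1.ObjectMeta{
		Name:      ds.Name + "-" + hash,
		Namespace: ds.Namespace,
		Labels:    labelsutil.CloneAndAddLabel(ds.Spec.Template.Labels, apps.DefaultDaemonSetUniqueLabelKey, hash),
	},
	Data:     runtime.RawExtension{Raw: patch},
	Revision: revision,
}
history, err = dsc.kubeClient.AppsV1().ControllerRevisions(ds.Namespace).Create(history)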
Code:
func (dsc *DaemonSetsController) constructHistory(ds *apps.DaemonSet) (cur *apps.ControllerRevision, old []*apps.ControllerRevision, err error) {
...
// The revision for the latest Spec.Template = highest old revision + 1
currRevision := maxRevision(old) + 1
switch len(currentHistories) {
case 0:
// When no current ControllerRevision exists, create a new one
cur, err = dsc.snapshot(ds, currRevision)
if err != nil {
return nil, nil, err
}
default:
cur, err = dsc.dedupCurHistories(ds, currentHistories)
if err != nil {
return nil, nil, err
}
// On a rollback, cur.Revision < currRevision can occur;
// in that case bump the ControllerRevision's Revision to currRevision
if cur.Revision < currRevision {
toUpdate := cur.DeepCopy()
toUpdate.Revision = currRevision
_, err = dsc.kubeClient.AppsV1().ControllerRevisions(ds.Namespace).Update(toUpdate)
if err != nil {
return nil, nil, err
}
}
}
return cur, old, err
}
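For completeness, the maxRevision helper used above returns the highest Revision among the old ControllerRevisions (0 when there are none); upstream it is essentially:
func maxRevision(histories []*apps.ControllerRevision) int64 {
	max := int64(0)
	for _, history := range histories {
		if history.Revision > max {
			max = history.Revision
		}
	}
	return max
}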
4. The rollingUpdate function
Code path: /pkg/controller/daemon/update.go
Main responsibility: implement the DaemonSet rolling update
Code:
func (dsc *DaemonSetsController) rollingUpdate(ds *apps.DaemonSet, nodeList []*v1.Node, hash string) error {
// Get the DS pods already running on each node
nodeToDaemonPods, err := dsc.getNodesToDaemonPods(ds)
if err != nil {
return fmt.Errorf("couldn't get node to daemon pod mapping for daemon set %q: %v", ds.Name, err)
}
// Collect all old pods
_, oldPods := dsc.getAllDaemonSetPods(ds, nodeToDaemonPods, hash)
// Get the maximum number of pods allowed to be unavailable and the current unavailable count
maxUnavailable, numUnavailable, err := dsc.getUnavailableNumbers(ds, nodeList, nodeToDaemonPods)
if err != nil {
return fmt.Errorf("Couldn't get unavailable numbers: %v", err)
}
// Split the old pods into available and unavailable
oldAvailablePods, oldUnavailablePods := util.SplitByAvailablePods(ds.Spec.MinReadySeconds, oldPods)
// All unavailable old pods go straight onto the delete list
var oldPodsToDelete []string
for _, pod := range oldUnavailablePods {
if pod.DeletionTimestamp != nil {
continue
}
oldPodsToDelete = append(oldPodsToDelete, pod.Name)
}
// Pick (maxUnavailable - numUnavailable) available old pods and add them to the delete list
for _, pod := range oldAvailablePods {
if numUnavailable >= maxUnavailable {
break
}
oldPodsToDelete = append(oldPodsToDelete, pod.Name)
numUnavailable++
}
// Delete the pods in oldPodsToDelete (keeping the number of available pods at or above the required floor)
return dsc.syncNodes(ds, oldPodsToDelete, []string{}, hash)
}
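For reference, maxUnavailable is resolved from the rolling-update spec with the intstr utility, roughly as below (a sketch of what getUnavailableNumbers does; desiredNumberScheduled is the number of nodes that should run the daemon pod):
maxUnavailable, err := intstrutil.GetValueFromIntOrPercent(
	ds.Spec.UpdateStrategy.RollingUpdate.MaxUnavailable,
	desiredNumberScheduled,
	true, // round percentages up
)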
IV. DaemonSet pod scheduling
1. The nodeShouldRunDaemonPod function
Code path: /pkg/controller/daemon/daemon_controller.go
Main responsibility: returns three bool values that describe whether the node needs, and is currently able, to run a DaemonSet pod.
Return values:
- wantToRun: (the ideal case) whether the node should run a DaemonSet pod; used mainly for updating the DaemonSet status.
- shouldSchedule: (the actual case) whether a DaemonSet pod can currently be scheduled onto the node.
- shouldContinueRunning: whether a DaemonSet pod already on the node may keep running (for example, when the node gains a NoExecute taint the pod does not tolerate, this is false and the pod must not keep running).
Under disk or memory pressure, volume conflicts, or insufficient resources (CPU, memory, and so on), wantToRun stays true while shouldSchedule becomes false: the pod is wanted but temporarily cannot be scheduled.
The relevant code:
case
predicates.ErrDiskConflict,
predicates.ErrVolumeZoneConflict,
predicates.ErrMaxVolumeCountExceeded,
predicates.ErrNodeUnderMemoryPressure,
predicates.ErrNodeUnderDiskPressure:
shouldSchedule = false // with any of the errors above, scheduling is temporarily impossible
...
if shouldSchedule && insufficientResourceErr != nil {
shouldSchedule = false // insufficient resources also block scheduling for now
}
2. The podsShouldBeOnNode function
Code path: /pkg/controller/daemon/daemon_controller.go
Main responsibility: podsShouldBeOnNode computes, for a given node, the DaemonSet pods that should be started there (nodesNeedingDaemonPods) and the DaemonSet pods that should be deleted there (podsToDelete), handling already-Failed DaemonSet pods on the node along the way.
Main logic:
- Case 1: the pod is wanted (wantToRun=true) but cannot be scheduled (shouldSchedule=false): put it on the suspended list for now.
- Case 2: the pod can be scheduled (shouldSchedule=true) and is not yet running: create it.
- Case 3: the pod may keep running (shouldContinueRunning=true): delete it if its phase is Failed; if more than one DaemonSet pod is running on the node, delete the surplus pods.
- Case 4: the pod may not keep running (shouldContinueRunning=false) but is running: delete it.
Code:
// podsShouldBeOnNode figures out the DaemonSet pods to be created and deleted on the given node:
// - nodesNeedingDaemonPods: the pods need to start on the node
// - podsToDelete: the Pods need to be deleted on the node
// - err: unexpected error
func (dsc *DaemonSetsController) podsShouldBeOnNode(
node *v1.Node,
nodeToDaemonPods map[string][]*v1.Pod,
ds *apps.DaemonSet,
) (nodesNeedingDaemonPods, podsToDelete []string, err error) {
wantToRun, shouldSchedule, shouldContinueRunning, err := dsc.nodeShouldRunDaemonPod(node, ds)
if err != nil {
return
}
daemonPods, exists := nodeToDaemonPods[node.Name]
dsKey, err := cache.MetaNamespaceKeyFunc(ds)
if err != nil {
utilruntime.HandleError(err)
return
}
dsc.removeSuspendedDaemonPods(node.Name, dsKey)
switch {
case wantToRun && !shouldSchedule:
//Case 1
// If daemon pod is supposed to run, but can not be scheduled, add to suspended list.
dsc.addSuspendedDaemonPods(node.Name, dsKey)
case shouldSchedule && !exists:
//Case 2
// If daemon pod is supposed to be running on node, but isn't, create daemon pod.
nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, node.Name)
case shouldContinueRunning:
//Case 3
// If a daemon pod failed, delete it
// If there's non-daemon pods left on this node, we will create it in the next sync loop
var daemonPodsRunning []*v1.Pod
for _, pod := range daemonPods {
if pod.DeletionTimestamp != nil {
continue
}
// The pod has terminated with phase Failed: delete it
if pod.Status.Phase == v1.PodFailed {
// This is a critical place where DS is often fighting with kubelet that rejects pods.
// We need to avoid hot looping and backoff.
backoffKey := failedPodsBackoffKey(ds, node.Name)
now := dsc.failedPodsBackoff.Clock.Now()
inBackoff := dsc.failedPodsBackoff.IsInBackOffSinceUpdate(backoffKey, now)
if inBackoff {
delay := dsc.failedPodsBackoff.Get(backoffKey)
klog.V(4).Infof("Deleting failed pod %s/%s on node %s has been limited by backoff - %v remaining",
pod.Namespace, pod.Name, node.Name, delay)
dsc.enqueueDaemonSetAfter(ds, delay)
continue
}
dsc.failedPodsBackoff.Next(backoffKey, now)
msg := fmt.Sprintf("Found failed daemon pod %s/%s on node %s, will try to kill it", pod.Namespace, pod.Name, node.Name)
klog.V(2).Infof(msg)
// Emit an event so that it's discoverable to users.
dsc.eventRecorder.Eventf(ds, v1.EventTypeWarning, FailedDaemonPodReason, msg)
podsToDelete = append(podsToDelete, pod.Name)
} else {
daemonPodsRunning = append(daemonPodsRunning, pod)
}
}
// If daemon pod is supposed to be running on node, but more than 1 daemon pod is running, delete the excess daemon pods.
// Sort the daemon pods by creation time, so the oldest is preserved.
// When more than one pod is running, delete every pod created after the oldest one
if len(daemonPodsRunning) > 1 {
sort.Sort(podByCreationTimestampAndPhase(daemonPodsRunning))
for i := 1; i < len(daemonPodsRunning); i++ {
podsToDelete = append(podsToDelete, daemonPodsRunning[i].Name)
}
}
case !shouldContinueRunning && exists:
// Case 4
// If daemon pod isn't supposed to run on node, but it is, delete all daemon pods on node.
for _, pod := range daemonPods {
if pod.DeletionTimestamp != nil {
continue
}
podsToDelete = append(podsToDelete, pod.Name)
}
}
return nodesNeedingDaemonPods, podsToDelete, nil
}
3. The syncNodes function
Because a DaemonSet runs at most one pod per node, there are two ways to create that pod:
Method 1: the pod bypasses kube-scheduler. The target node is set directly (pod.Spec.NodeName), which also means DaemonSet pods can start before the kube-scheduler component is even running.
Method 2: the pod goes through kube-scheduler. This matters mainly for preemption, where it is more reasonable to have kube-scheduler coordinate all pods. It is implemented by setting nodeAffinity so that the pod still ends up on the intended node. The code:
func (dsc *DaemonSetsController) syncNodes(ds *apps.DaemonSet, podsToDelete, nodesNeedingDaemonPods []string, hash string) error {
...
if utilfeature.DefaultFeatureGate.Enabled(features.ScheduleDaemonSetPods) {
// Method 2: set node affinity and let kube-scheduler place the pod
podTemplate = template.DeepCopy()
podTemplate.Spec.Affinity = util.ReplaceDaemonSetPodNodeNameNodeAffinity(
podTemplate.Spec.Affinity, nodesNeedingDaemonPods[ix])
podTemplate.Spec.Tolerations = util.AppendNoScheduleTolerationIfNotExist(podTemplate.Spec.Tolerations)
err = dsc.podControl.CreatePodsWithControllerRef(ds.Namespace, podTemplate, ds, metav1.NewControllerRef(ds, controllerKind))
} else {
// Method 1: set pod.Spec.NodeName directly, bypassing kube-scheduler
err = dsc.podControl.CreatePodsOnNode(nodesNeedingDaemonPods[ix], ds.Namespace, podTemplate, ds, metav1.NewControllerRef(ds, controllerKind))
}
...
As the code shows, in Kubernetes v1.11.0 method 2 requires enabling the features.ScheduleDaemonSetPods feature gate in the kube-controller-manager startup flags.
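To illustrate method 2: the affinity produced by util.ReplaceDaemonSetPodNodeNameNodeAffinity pins the pod to one node via the node's metadata.name field, roughly in this shape (a sketch — the real helper also merges with any affinity already on the template):
affinity := &v1.Affinity{
	NodeAffinity: &v1.NodeAffinity{
		RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
			NodeSelectorTerms: []v1.NodeSelectorTerm{{
				// Match the Node object's metadata.name field, so
				// kube-scheduler can only place the pod on this node.
				MatchFields: []v1.NodeSelectorRequirement{{
					Key:      "metadata.name",
					Operator: v1.NodeSelectorOpIn,
					Values:   []string{nodeName},
				}},
			}},
		},
	},
}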