Analyzing the Kubernetes EndpointController Source Code Through a Real-World Scenario

Scenario
I recently ran into a problem. On several machines of a Kubernetes cluster I had set up a Glusterfs cluster and, following the official guide, created the Glusterfs Volumes and PVs step by step. For each Volume I only created the Endpoints object and did not create the corresponding Service (the docs mention that creating a Service makes the Endpoints persistent, but I ignored that at the time). After a cluster restart I noticed that the Endpoints objects had not come back. This was puzzling: other Kubernetes objects such as Pods, Deployments, and Services were all restored, but the Endpoints were not. The official issues did not give a satisfying answer either; see the issue: Endpoints are not persistented.

  1. Exploring the Source Code
    1.1 Source Version

Based on Kubernetes release-1.13.
1.2 Source Directory Layout
Since we are focusing on the Endpoint part, we only look at the Endpoint-related source.
(directory tree of the Endpoint controller source omitted)
1.3 Endpoint Initialization
File: endpoints_controller.go

// NewEndpointController returns a new *EndpointController.
// Note that the constructor already registers event handlers on three informers:
// podInformer, serviceInformer and endpointsInformer.
func NewEndpointController(podInformer coreinformers.PodInformer, serviceInformer coreinformers.ServiceInformer,
    endpointsInformer coreinformers.EndpointsInformer, client clientset.Interface) *EndpointController {
    broadcaster := record.NewBroadcaster()
    broadcaster.StartLogging(klog.Infof)
    broadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: client.CoreV1().Events("")})
    recorder := broadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "endpoint-controller"})

    if client != nil && client.CoreV1().RESTClient().GetRateLimiter() != nil {
        metrics.RegisterMetricAndTrackRateLimiterUsage("endpoint_controller", client.CoreV1().RESTClient().GetRateLimiter())
    }
    e := &EndpointController{
        client:           client,
        queue:            workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "endpoint"),
        workerLoopPeriod: time.Second,
    }
    // Watch services and register the corresponding add/update/delete handlers.
    serviceInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
        // add: enqueue the service under its namespace/name key.
        AddFunc: e.enqueueService,
        // update: enqueue the updated service under its namespace/name key.
        UpdateFunc: func(old, cur interface{}) {
            e.enqueueService(cur)
        },
        // delete: enqueue the deleted service under its namespace/name key.
        DeleteFunc: e.enqueueService,
    })
    e.serviceLister = serviceInformer.Lister()
    e.servicesSynced = serviceInformer.Informer().HasSynced
    // Watch pods and register the corresponding add/update/delete handlers.
    podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
        AddFunc:    e.addPod,
        UpdateFunc: e.updatePod,
        DeleteFunc: e.deletePod,
    })
    e.podLister = podInformer.Lister()
    e.podsSynced = podInformer.Informer().HasSynced

    e.endpointsLister = endpointsInformer.Lister()
    e.endpointsSynced = endpointsInformer.Informer().HasSynced

    e.triggerTimeTracker = NewTriggerTimeTracker()
    e.eventBroadcaster = broadcaster
    e.eventRecorder = recorder

    return e
}
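For context, here is a minimal sketch (not the kube-controller-manager code itself) of how this constructor is typically wired up with a client-go SharedInformerFactory and started. The kubeconfig path, the resync period, and the import alias for the controller package are assumptions made for illustration.

package main

import (
    "time"

    "k8s.io/client-go/informers"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/clientcmd"
    endpointcontroller "k8s.io/kubernetes/pkg/controller/endpoint"
)

func main() {
    cfg, err := clientcmd.BuildConfigFromFlags("", "/root/.kube/config") // assumed kubeconfig location
    if err != nil {
        panic(err)
    }
    client := kubernetes.NewForConfigOrDie(cfg)

    // The three informers handed to NewEndpointController come from one shared factory.
    factory := informers.NewSharedInformerFactory(client, 30*time.Second)
    controller := endpointcontroller.NewEndpointController(
        factory.Core().V1().Pods(),
        factory.Core().V1().Services(),
        factory.Core().V1().Endpoints(),
        client,
    )

    stopCh := make(chan struct{})
    factory.Start(stopCh)     // start the underlying list/watch loops
    controller.Run(5, stopCh) // 5 matches the default --concurrent-endpoint-syncs
}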
Let's look at the functions registered as the pod event handlers.

1.3.1 e.addPod
func (e *EndpointController) addPod(obj interface{}) {
    // Type-assert the object to a *v1.Pod.
    pod := obj.(*v1.Pod)
    services, err := e.getPodServiceMemberships(pod)
    if err != nil {
        utilruntime.HandleError(fmt.Errorf("Unable to get pod %s/%s's service memberships: %v", pod.Namespace, pod.Name, err))
        return
    }
    // Enqueue every matching service under its namespace/name key.
    for key := range services {
        e.queue.Add(key)
    }
}

func (e *EndpointController) getPodServiceMemberships(pod *v1.Pod) (sets.String, error) {
    set := sets.String{}
    // Look up the services this pod belongs to.
    services, err := e.serviceLister.GetPodServices(pod)
    if err != nil {
        // don't log this error because this function makes pointless
        // errors when no services match.
        return set, nil
    }
    // The lookup compares each service's selector with the pod's labels:
    // if the selector is a subset of the pod's labels, the pod belongs to that service.
    for i := range services {
        key, err := controller.KeyFunc(services[i])
        if err != nil {
            return nil, err
        }
        set.Insert(key)
    }
    return set, nil
}
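The "selector is a subset of the pod's labels" rule is just a label-selector match. A small sketch of the per-service check behind GetPodServices, with made-up label values:

package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/labels"
)

// serviceSelectsPod mirrors the matching rule: a service selects a pod when
// every key/value in the service's selector is present in the pod's labels.
func serviceSelectsPod(svc *v1.Service, pod *v1.Pod) bool {
    if len(svc.Spec.Selector) == 0 {
        return false // selector-less services never match; the controller ignores them
    }
    return labels.Set(svc.Spec.Selector).AsSelectorPreValidated().Matches(labels.Set(pod.Labels))
}

func main() {
    svc := &v1.Service{Spec: v1.ServiceSpec{Selector: map[string]string{"app": "web"}}}
    pod := &v1.Pod{}
    pod.Labels = map[string]string{"app": "web", "tier": "frontend"} // superset of the selector
    fmt.Println(serviceSelectsPod(svc, pod))                         // true
}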
1.3.2 e.updatePod
func (e *EndpointController) updatePod(old, cur interface{}) {
    newPod := cur.(*v1.Pod)
    oldPod := old.(*v1.Pod)
    // Compare the ResourceVersion of the old and new pod; if they are equal
    // this is only a periodic resync, so return without enqueueing anything.
    if newPod.ResourceVersion == oldPod.ResourceVersion {
        // Periodic resync will send update events for all known pods.
        // Two different versions of the same pod will always have different RVs.
        return
    }
    // Check whether the pod changed in a way that matters for endpoints.
    podChangedFlag := podChanged(oldPod, newPod)

    // Check if the pod labels have changed, indicating a possible
    // change in the service membership
    labelsChanged := false
    // Check whether the labels differ, or the hostname/subdomain changed.
    if !reflect.DeepEqual(newPod.Labels, oldPod.Labels) ||
        !hostNameAndDomainAreEqual(newPod, oldPod) {
        labelsChanged = true
    }

    // If both the pod and labels are unchanged, no update is needed
    if !podChangedFlag && !labelsChanged {
        return
    }

    // Otherwise, look up the services the updated pod belongs to.
    services, err := e.getPodServiceMemberships(newPod)
    if err != nil {
        utilruntime.HandleError(fmt.Errorf("Unable to get pod %v/%v's service memberships: %v", newPod.Namespace, newPod.Name, err))
        return
    }

    if labelsChanged {
        oldServices, err := e.getPodServiceMemberships(oldPod)
        if err != nil {
            utilruntime.HandleError(fmt.Errorf("Unable to get pod %v/%v's service memberships: %v", oldPod.Namespace, oldPod.Name, err))
            return
        }
        services = determineNeededServiceUpdates(oldServices, services, podChangedFlag)
    }

    for key := range services {
        e.queue.Add(key)
    }
}
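determineNeededServiceUpdates (called above but not shown) decides which service keys still need a resync. A simplified reading of its logic using the same sets.String type, not the verbatim upstream function:

package main

import (
    "fmt"

    "k8s.io/apimachinery/pkg/util/sets"
)

// neededServiceUpdates captures the idea behind determineNeededServiceUpdates.
func neededServiceUpdates(oldServices, newServices sets.String, podChanged bool) sets.String {
    if podChanged {
        // The pod itself changed: every service it belonged to before or after
        // the update must be resynced.
        return newServices.Union(oldServices)
    }
    // Only labels/hostname changed: resync just the services the pod joined or
    // left, i.e. the symmetric difference of the two membership sets.
    return newServices.Difference(oldServices).Union(oldServices.Difference(newServices))
}

func main() {
    oldS := sets.NewString("default/a", "default/b")
    newS := sets.NewString("default/b", "default/c")
    fmt.Println(neededServiceUpdates(oldS, newS, false).List()) // [default/a default/c]
}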

// podChanged returns true when the pod changed in a way that is relevant to endpoints:
// it first compares the DeletionTimestamp of the old and new pod, then their readiness,
// and finally the EndpointAddress derived from each pod (IP, nodeName, namespace, name, UID).
// Only if all of these are equal does it return false. The caller stores the result in podChangedFlag.
func podChanged(oldPod, newPod *v1.Pod) bool {
    // If the pod's deletion timestamp is set, remove endpoint from ready address.
    if newPod.DeletionTimestamp != oldPod.DeletionTimestamp {
        return true
    }
    // If the pod's readiness has changed, the associated endpoint address
    // will move from the unready endpoints set to the ready endpoints.
    // So for the purposes of an endpoint, a readiness change on a pod
    // means we have a changed pod.
    if podutil.IsPodReady(oldPod) != podutil.IsPodReady(newPod) {
        return true
    }
    // Convert the pod to an EndpointAddress, clear inert fields,
    // and see if they are the same.
    newEndpointAddress := podToEndpointAddress(newPod)
    oldEndpointAddress := podToEndpointAddress(oldPod)
    // Ignore the ResourceVersion because it changes
    // with every pod update. This allows the comparison to
    // show equality if all other relevant fields match.
    newEndpointAddress.TargetRef.ResourceVersion = ""
    oldEndpointAddress.TargetRef.ResourceVersion = ""
    if reflect.DeepEqual(newEndpointAddress, oldEndpointAddress) {
        // The pod has not changed in any way that impacts the endpoints
        return false
    }
    return true
}
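For reference, the EndpointAddress that podChanged compares is built from the pod roughly as sketched below (an abridged reading of podToEndpointAddress, shown for illustration rather than copied verbatim):

package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
)

// Abridged sketch: the address carries the pod IP, the node name, and a
// TargetRef identifying the pod, which is why podChanged blanks
// TargetRef.ResourceVersion before comparing.
func podToEndpointAddressSketch(pod *v1.Pod) *v1.EndpointAddress {
    return &v1.EndpointAddress{
        IP:       pod.Status.PodIP,
        NodeName: &pod.Spec.NodeName,
        TargetRef: &v1.ObjectReference{
            Kind:            "Pod",
            Namespace:       pod.Namespace,
            Name:            pod.Name,
            UID:             pod.UID,
            ResourceVersion: pod.ResourceVersion,
        },
    }
}

func main() {
    pod := &v1.Pod{}
    pod.Name, pod.Namespace = "web-0", "default" // hypothetical pod
    pod.Status.PodIP = "10.244.1.5"
    fmt.Printf("%+v\n", podToEndpointAddressSketch(pod))
}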
1.4 EndpointController Core Logic
// Run will not return until stopCh is closed. workers determines how many
// endpoints will be handled in parallel.
func (e *EndpointController) Run(workers int, stopCh <-chan struct{}) {
    defer utilruntime.HandleCrash()
    defer e.queue.ShutDown()

    klog.Infof("Starting endpoint controller")
    defer klog.Infof("Shutting down endpoint controller")

    // Wait for the pod, service, and endpoints caches to sync.
    if !controller.WaitForCacheSync("endpoint", stopCh, e.podsSynced, e.servicesSynced, e.endpointsSynced) {
        return
    }
    // workers is set by the kube-controller-manager flag --concurrent-endpoint-syncs
    // (default 5); workerLoopPeriod is one second.
    for i := 0; i < workers; i++ {
        // Each worker loops forever, processing keys from the queue.
        go wait.Until(e.worker, e.workerLoopPeriod, stopCh)
    }

    go func() {
        defer utilruntime.HandleCrash()
        e.checkLeftoverEndpoints()
    }()

    <-stopCh
}

func (e *EndpointController) worker() {
    for e.processNextWorkItem() {
    }
}

func (e *EndpointController) processNextWorkItem() bool {
    eKey, quit := e.queue.Get()
    if quit {
        return false
    }
    defer e.queue.Done(eKey)

    err := e.syncService(eKey.(string))
    e.handleErr(err, eKey)

    return true
}
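Every item on the queue is a plain namespace/name string. A quick illustration of the key format syncService receives, using a hypothetical service named glusterfs-cluster and the standard cache key helpers:

package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/tools/cache"
)

func main() {
    svc := &v1.Service{ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "glusterfs-cluster"}}

    // enqueueService ultimately stores this kind of key in the workqueue.
    key, _ := cache.MetaNamespaceKeyFunc(svc)
    fmt.Println(key) // default/glusterfs-cluster

    // syncService splits it back into namespace and name.
    ns, name, _ := cache.SplitMetaNamespaceKey(key)
    fmt.Println(ns, name) // default glusterfs-cluster
}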

The main logic of the EndpointController lives in the syncService function.

func (e *EndpointController) syncService(key string) error {
    startTime := time.Now()
    defer func() {
        klog.V(4).Infof("Finished syncing service %q endpoints. (%v)", key, time.Since(startTime))
    }()

    // Split the key back into the service's namespace and name.
    namespace, name, err := cache.SplitMetaNamespaceKey(key)
    if err != nil {
        return err
    }
    service, err := e.serviceLister.Services(namespace).Get(name)
    if err != nil {
        // The service no longer exists, so its endpoints must be deleted as well.
        // Delete the corresponding endpoint, as the service has been deleted.
        // TODO: Please note that this will delete an endpoint when a
        // service is deleted. However, if we're down at the time when
        // the service is deleted, we will miss that deletion, so this
        // doesn't completely solve the problem. See #6877.
        err = e.client.CoreV1().Endpoints(namespace).Delete(name, nil)
        if err != nil && !errors.IsNotFound(err) {
            return err
        }
        e.triggerTimeTracker.DeleteEndpoints(namespace, name)
        return nil
    }
    // If the service has no .spec.selector, return immediately; the endpoint
    // controller does not manage endpoints for selector-less services.
    if service.Spec.Selector == nil {
        // services without a selector receive no endpoints from this controller;
        // these services will receive the endpoints that are created out-of-band via the REST API.
        return nil
    }

    klog.V(5).Infof("About to update endpoints for service %q", key)
    pods, err := e.podLister.Pods(service.Namespace).List(labels.Set(service.Spec.Selector).AsSelectorPreValidated())
    if err != nil {
        // Since we're getting stuff from a local cache, it is
        // basically impossible to get this error.
        return err
    }

    // If the user specified the older (deprecated) annotation, we have to respect it.
    tolerateUnreadyEndpoints := service.Spec.PublishNotReadyAddresses
    // If the service carries the service.alpha.kubernetes.io/tolerate-unready-endpoints
    // annotation, parse it as a bool; tolerateUnreadyEndpoints defaults to false.
    if v, ok := service.Annotations[TolerateUnreadyEndpointsAnnotation]; ok {
        b, err := strconv.ParseBool(v)
        if err == nil {
            tolerateUnreadyEndpoints = b
        } else {
            utilruntime.HandleError(fmt.Errorf("Failed to parse annotation %v: %v", TolerateUnreadyEndpointsAnnotation, err))
        }
    }

    // We call ComputeEndpointsLastChangeTriggerTime here to make sure that the state of the trigger
    // time tracker gets updated even if the sync turns out to be no-op and we don't update the
    // endpoints object.
    endpointsLastChangeTriggerTime := e.triggerTimeTracker.
        ComputeEndpointsLastChangeTriggerTime(namespace, name, service, pods)

    subsets := []v1.EndpointSubset{}
    var totalReadyEps int
    var totalNotReadyEps int
    // Iterate over the matching pods.
    for _, pod := range pods {
        // Skip pods that do not have a pod IP yet.
        if len(pod.Status.PodIP) == 0 {
            klog.V(5).Infof("Failed to find an IP for pod %s/%s", pod.Namespace, pod.Name)
            continue
        }
        // Skip pods that are being deleted (unless unready endpoints are tolerated).
        if !tolerateUnreadyEndpoints && pod.DeletionTimestamp != nil {
            klog.V(5).Infof("Pod is being deleted %s/%s", pod.Namespace, pod.Name)
            continue
        }

        // Build an EndpointAddress from the pod.
        epa := *podToEndpointAddress(pod)

        hostname := pod.Spec.Hostname
        // If the pod has a hostname and its subdomain matches the service name,
        // the resulting FQDN is hostname.subdomain.namespace.svc.cluster.local.
        if len(hostname) > 0 && pod.Spec.Subdomain == service.Name && service.Namespace == pod.Namespace {
            epa.Hostname = hostname
        }

        // Allow headless service not to have ports.
        if len(service.Spec.Ports) == 0 {
            if service.Spec.ClusterIP == api.ClusterIPNone {
                // 1. If tolerateUnreadyEndpoints is true, even unready pods are added to the
                //    Addresses list; if it is false, only ready pods go into Addresses.
                // 2. Otherwise the pod's restart policy is checked: with Never, a pod whose
                //    phase is neither Failed nor Succeeded goes into NotReadyAddresses; with
                //    OnFailure, a pod whose phase is not Succeeded goes into NotReadyAddresses;
                //    all remaining cases also put the pod into NotReadyAddresses.
                subsets, totalReadyEps, totalNotReadyEps = addEndpointSubset(subsets, pod, epa, nil, tolerateUnreadyEndpoints)
                // No need to repack subsets for headless service without ports.
            }
        } else {
            // Iterate over the service's ports.
            for i := range service.Spec.Ports {
                servicePort := &service.Spec.Ports[i]

                portName := servicePort.Name
                portProto := servicePort.Protocol
                portNum, err := podutil.FindPort(pod, servicePort)
                // If the service port cannot be resolved on this pod, skip it.
                if err != nil {
                    klog.V(4).Infof("Failed to find port for service %s/%s: %v", service.Namespace, service.Name, err)
                    continue
                }

                var readyEps, notReadyEps int
                epp := &v1.EndpointPort{Name: portName, Port: int32(portNum), Protocol: portProto}
                subsets, readyEps, notReadyEps = addEndpointSubset(subsets, pod, epa, epp, tolerateUnreadyEndpoints)
                totalReadyEps = totalReadyEps + readyEps
                totalNotReadyEps = totalNotReadyEps + notReadyEps
            }
        }
    }
    // Repack (normalize) the subsets.
    subsets = endpoints.RepackSubsets(subsets)
    // If the endpoints object does not exist yet (typically when a new service was just created),
    // prepare a fresh one; any other error is returned.
    // See if there's actually an update here.
    currentEndpoints, err := e.endpointsLister.Endpoints(service.Namespace).Get(service.Name)
    if err != nil {
        if errors.IsNotFound(err) {
            currentEndpoints = &v1.Endpoints{
                ObjectMeta: metav1.ObjectMeta{
                    Name:   service.Name,
                    Labels: service.Labels,
                },
            }
        } else {
            return err
        }
    }
    // An empty ResourceVersion on currentEndpoints means the endpoints object needs to be created.
    createEndpoints := len(currentEndpoints.ResourceVersion) == 0
    // If the existing subsets equal the freshly computed subsets and the labels match the
    // service's labels, skip this update.
    if !createEndpoints &&
        apiequality.Semantic.DeepEqual(currentEndpoints.Subsets, subsets) &&
        apiequality.Semantic.DeepEqual(currentEndpoints.Labels, service.Labels) {
        klog.V(5).Infof("endpoints are equal for %s/%s, skipping update", service.Namespace, service.Name)
        return nil
    }
    newEndpoints := currentEndpoints.DeepCopy()
    newEndpoints.Subsets = subsets
    newEndpoints.Labels = service.Labels
    if newEndpoints.Annotations == nil {
        newEndpoints.Annotations = make(map[string]string)
    }

    if !endpointsLastChangeTriggerTime.IsZero() {
        newEndpoints.Annotations[v1.EndpointsLastChangeTriggerTime] =
            endpointsLastChangeTriggerTime.Format(time.RFC3339Nano)
    } else { // No new trigger time, clear the annotation.
        delete(newEndpoints.Annotations, v1.EndpointsLastChangeTriggerTime)
    }

    klog.V(4).Infof("Update endpoints for %v/%v, ready: %d not ready: %d", service.Namespace, service.Name, totalReadyEps, totalNotReadyEps)
    if createEndpoints {
        // No endpoints object with the service's namespace/name exists yet, so create one.
        // No previous endpoints, create them
        _, err = e.client.CoreV1().Endpoints(service.Namespace).Create(newEndpoints)
    } else {
        // Pre-existing
        // An endpoints object with the service's namespace/name already exists, so update it.
        _, err = e.client.CoreV1().Endpoints(service.Namespace).Update(newEndpoints)
    }
    if err != nil {
        if createEndpoints && errors.IsForbidden(err) {
            // A request is forbidden primarily for two reasons:
            // 1. namespace is terminating, endpoint creation is not allowed by default.
            // 2. policy is misconfigured, in which case no service would function anywhere.
            // Given the frequency of 1, we log at a lower level.
            klog.V(5).Infof("Forbidden from creating endpoints: %v", err)
        }

        if createEndpoints {
            e.eventRecorder.Eventf(newEndpoints, v1.EventTypeWarning, "FailedToCreateEndpoint", "Failed to create endpoint for service %v/%v: %v", service.Namespace, service.Name, err)
        } else {
            e.eventRecorder.Eventf(newEndpoints, v1.EventTypeWarning, "FailedToUpdateEndpoint", "Failed to update endpoint %v/%v: %v", service.Namespace, service.Name, err)
        }

        return err
    }
    return nil
}
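podutil.FindPort, used in the port loop above, resolves the service's targetPort against the pod: a numeric targetPort is taken as-is, while a named targetPort must match a container port with the same name and protocol. A simplified sketch of that rule (not the verbatim upstream helper):

package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/util/intstr"
)

// findPortSketch mirrors the resolution rule: if no suitable port is found,
// an error is returned and syncService skips this service port for the pod.
func findPortSketch(pod *v1.Pod, svcPort *v1.ServicePort) (int, error) {
    switch svcPort.TargetPort.Type {
    case intstr.Int:
        return svcPort.TargetPort.IntValue(), nil
    case intstr.String:
        name := svcPort.TargetPort.StrVal
        for _, c := range pod.Spec.Containers {
            for _, p := range c.Ports {
                if p.Name == name && p.Protocol == svcPort.Protocol {
                    return int(p.ContainerPort), nil
                }
            }
        }
    }
    return 0, fmt.Errorf("no suitable port %q found in pod %s/%s", svcPort.TargetPort.String(), pod.Namespace, pod.Name)
}

func main() {
    pod := &v1.Pod{}
    pod.Spec.Containers = []v1.Container{{Ports: []v1.ContainerPort{{Name: "http", ContainerPort: 8080, Protocol: v1.ProtocolTCP}}}}
    svcPort := &v1.ServicePort{Protocol: v1.ProtocolTCP, TargetPort: intstr.FromString("http")}
    fmt.Println(findPortSketch(pod, svcPort)) // 8080 <nil>
}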
1.5 Endpoint Detection
So far we have covered what happens when a Service and its Endpoints are bound together and the Service or its Pods change. Back to our problem: if an Endpoints object exists on its own, how does Kubernetes detect and delete it?
Let's look again at this part of the Run function:

go func() {
    defer utilruntime.HandleCrash()
    e.checkLeftoverEndpoints()
}()

When Run starts, it launches a goroutine that inspects all existing Endpoints objects.

// checkLeftoverEndpoints lists all currently existing endpoints and adds their
// service to the queue. This will detect endpoints that exist with no
// corresponding service; these endpoints need to be deleted. We only need to
// do this once on startup, because in steady-state these are detected (but
// some stragglers could have been left behind if the endpoint controller
// reboots).
func (e *EndpointController) checkLeftoverEndpoints() {
    // List all existing endpoints objects.
    list, err := e.endpointsLister.List(labels.Everything())
    if err != nil {
        utilruntime.HandleError(fmt.Errorf("Unable to list endpoints (%v); orphaned endpoints will not be cleaned up. (They're pretty harmless, but you can restart this component if you want another attempt made.)", err))
        return
    }
    // Iterate over every endpoints object.
    for _, ep := range list {
        if _, ok := ep.Annotations[resourcelock.LeaderElectionRecordAnnotationKey]; ok {
            // when there are multiple controller-manager instances,
            // we observe that it will delete leader-election endpoints after 5min
            // and cause re-election
            // so skip the delete here
            // as leader-election only have endpoints without service
            continue
        }
        key, err := controller.KeyFunc(ep)
        if err != nil {
            utilruntime.HandleError(fmt.Errorf("Unable to get key for endpoint %#v", ep))
            continue
        }
        // The endpoints object's namespace/name is added to the queue as a key.
        // If no service with that name exists, syncService will fail to find it
        // in the service lister and will delete the orphaned endpoints object.
        e.queue.Add(key)
    }
}

  2. Summary
    In short, there are two ways to avoid the problem above:

When creating a Service, use a selector; the endpoint controller will then create and maintain the Endpoints object automatically.
When creating an Endpoints object yourself, also create a Service with the same name; only then is the Endpoints object persisted (see the sketch below).
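For the second option, the Endpoints object and a selector-less Service of the same name have to exist side by side. A hedged client-go sketch of what the Glusterfs setup could create; the names, IPs, port number, and kubeconfig path are all placeholders:

package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/util/intstr"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/clientcmd"
)

func main() {
    cfg, err := clientcmd.BuildConfigFromFlags("", "/root/.kube/config") // placeholder kubeconfig path
    if err != nil {
        panic(err)
    }
    client := kubernetes.NewForConfigOrDie(cfg)

    // Endpoints pointing at the Glusterfs nodes (placeholder IPs and port).
    ep := &v1.Endpoints{
        ObjectMeta: metav1.ObjectMeta{Name: "glusterfs-cluster", Namespace: "default"},
        Subsets: []v1.EndpointSubset{{
            Addresses: []v1.EndpointAddress{{IP: "10.0.0.11"}, {IP: "10.0.0.12"}},
            Ports:     []v1.EndpointPort{{Port: 1}},
        }},
    }

    // A Service with the same name and no selector: syncService finds this Service,
    // sees Spec.Selector == nil, and returns without touching the Endpoints, so
    // checkLeftoverEndpoints never causes the Endpoints to be deleted.
    svc := &v1.Service{
        ObjectMeta: metav1.ObjectMeta{Name: "glusterfs-cluster", Namespace: "default"},
        Spec: v1.ServiceSpec{
            Ports: []v1.ServicePort{{Port: 1, TargetPort: intstr.FromInt(1)}},
        },
    }

    if _, err := client.CoreV1().Endpoints("default").Create(ep); err != nil {
        panic(err)
    }
    if _, err := client.CoreV1().Services("default").Create(svc); err != nil {
        panic(err)
    }
    fmt.Println("created glusterfs-cluster Endpoints and selector-less Service")
}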
