Analysis of kube-proxy Working Modes (1) -- iptables

We all know that when a Service resource object is created, it is assigned a virtual IP address, and when we access that IP the request is forwarded to a concrete backend pod. In that sense a Service behaves like a reverse proxy. The key point is that this service IP is virtual -- so what mechanism actually makes it reachable? In many respects a Service is only a concept; the component that actually makes it work is kube-proxy. Only by understanding the principles and mechanisms of kube-proxy can we truly understand how Services are implemented.
This analysis is based on the Kubernetes release-1.9 source. In this version the supported proxy modes are iptables, ipvs, userspace, winkernel and winuserspace; this series focuses on the iptables and ipvs modes. For kube-proxy, the questions we care about boil down to:

1. Which Kubernetes resources does kube-proxy watch?
2. What is the processing flow when watch events arrive for those resources?
3. How does kube-proxy actually forward requests within the cluster?

With these questions in mind, let's start reading the kube-proxy source. Its layout is as follows:

cmd/kube-proxy
├── app
│   ├── BUILD
│   ├── server.go  // proxy initialization and startup logic
│   ├── server_other.go
│   ├── server_test.go
│   ├── server_windows.go
├── BUILD
├── proxy.go    // kube-proxy entry point
pkg/proxy
├── apis   // API definitions
├── config  // config package
├── healthcheck  // health checking
├── iptables  // iptables mode
│   ├── BUILD
│   ├── OWNERS
│   ├── proxier.go   // main logic of the iptables mode
│   ├── proxier_test.go
├── ...
├── BUILD
├── OWNERS
├── doc.go
├── types.go

Let's tackle the first question: which resources does kube-proxy watch?

	informerFactory := informers.NewSharedInformerFactory(s.Client, s.ConfigSyncPeriod)

	// Create configs (i.e. Watches for Services and Endpoints)
	// Note: RegisterHandler() calls need to happen before creation of Sources because sources
	// only notify on changes, and the initial update (on process start) may be lost if no handlers
	// are registered yet.
	serviceConfig := config.NewServiceConfig(informerFactory.Core().InternalVersion().Services(), s.ConfigSyncPeriod)
	serviceConfig.RegisterEventHandler(s.ServiceEventHandler)
	go serviceConfig.Run(wait.NeverStop)

	endpointsConfig := config.NewEndpointsConfig(informerFactory.Core().InternalVersion().Endpoints(), s.ConfigSyncPeriod)
	endpointsConfig.RegisterEventHandler(s.EndpointsEventHandler)
	go endpointsConfig.Run(wait.NeverStop)

	// This has to start after the calls to NewServiceConfig and NewEndpointsConfig because those
	// functions must configure their shared informer event handlers first.
	go informerFactory.Start(wait.NeverStop)

Here we can see the informerFactory, and it watches the Service and Endpoints resources -- that settles the first question. So what is the logic for these watched resources, i.e. how are add, update and delete events processed? Let's look at NewServiceConfig and NewEndpointsConfig from the code above.

// NewServiceConfig creates a new ServiceConfig.
func NewServiceConfig(serviceInformer coreinformers.ServiceInformer, resyncPeriod time.Duration) *ServiceConfig {
	result := &ServiceConfig{
		lister:       serviceInformer.Lister(),
		listerSynced: serviceInformer.Informer().HasSynced,
	}

	serviceInformer.Informer().AddEventHandlerWithResyncPeriod(
		cache.ResourceEventHandlerFuncs{
			AddFunc:    result.handleAddService,
			UpdateFunc: result.handleUpdateService,
			DeleteFunc: result.handleDeleteService,
		},
		resyncPeriod,
	)

	return result
}

// NewEndpointsConfig creates a new EndpointsConfig.
func NewEndpointsConfig(endpointsInformer coreinformers.EndpointsInformer, resyncPeriod time.Duration) *EndpointsConfig {
	result := &EndpointsConfig{
		lister:       endpointsInformer.Lister(),
		listerSynced: endpointsInformer.Informer().HasSynced,
	}

	endpointsInformer.Informer().AddEventHandlerWithResyncPeriod(
		cache.ResourceEventHandlerFuncs{
			AddFunc:    result.handleAddEndpoints,
			UpdateFunc: result.handleUpdateEndpoints,
			DeleteFunc: result.handleDeleteEndpoints,
		},
		resyncPeriod,
	)

	return result
}
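
As an aside, the same list-and-watch pattern can be reproduced outside the Kubernetes tree with the public client-go API. The sketch below is my own minimal example, not part of kube-proxy: it assumes a kubeconfig at the default location and uses the versioned V1 clients rather than the in-tree internal ones, and it simply watches Services and prints the events.

package main

import (
	"fmt"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Assumes ~/.kube/config; error handling is trimmed for brevity.
	cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(cfg)

	factory := informers.NewSharedInformerFactory(client, 30*time.Second)
	factory.Core().V1().Services().Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc:    func(obj interface{}) { fmt.Println("add:", obj.(*v1.Service).Name) },
		UpdateFunc: func(old, cur interface{}) { fmt.Println("update:", cur.(*v1.Service).Name) },
		DeleteFunc: func(obj interface{}) { fmt.Println("delete") },
	})

	stop := make(chan struct{})
	factory.Start(stop) // kicks off list-and-watch for every informer requested above
	cache.WaitForCacheSync(stop, factory.Core().V1().Services().Informer().HasSynced)
	select {} // keep watching until killed
}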

By now this should look familiar: most of the Kubernetes code base is built on the same structure, list-and-watch plus a local cache (see the "understanding the kubernetes tools/cache package" series on this blog). The informer appends the watched events to a queue, processes them, stores the objects in its store, and then dispatches them to the eventHandlerFuncs we registered earlier. Looking at the Service and Endpoints handler funcs, the add, update and delete logic is almost identical; the only difference is that the update handler also receives the previous object. Let's look at the update methods of serviceChangeMap and endpointsChangeMap.

func (scm *serviceChangeMap) update(namespacedName *types.NamespacedName, previous, current *api.Service) bool {
	scm.lock.Lock()
	defer scm.lock.Unlock()

	change, exists := scm.items[*namespacedName]
	if !exists {
		change = &serviceChange{}
		change.previous = serviceToServiceMap(previous)
		scm.items[*namespacedName] = change
	}
	change.current = serviceToServiceMap(current)
	if reflect.DeepEqual(change.previous, change.current) {
		delete(scm.items, *namespacedName)
	}
	return len(scm.items) > 0
}

func (ecm *endpointsChangeMap) update(namespacedName *types.NamespacedName, previous, current *api.Endpoints) bool {
	ecm.lock.Lock()
	defer ecm.lock.Unlock()

	change, exists := ecm.items[*namespacedName]
	if !exists {
		change = &endpointsChange{}
		change.previous = endpointsToEndpointsMap(previous, ecm.hostname)
		ecm.items[*namespacedName] = change
	}
	change.current = endpointsToEndpointsMap(current, ecm.hostname)
	if reflect.DeepEqual(change.previous, change.current) {
		delete(ecm.items, *namespacedName)
	}
	return len(ecm.items) > 0
}

The two functions above share the same logic: they use previous and current to work out whether this is an add, update or delete, and then update the cached map accordingly. serviceToServiceMap builds a serviceMap from the Service object, and endpointsToEndpointsMap builds the endpoints map from the Endpoints object. The proxyServiceMap and proxyEndpointsMap data structures look like this (a simplified sketch of how a Service turns into map entries follows the struct definitions):

type proxyServiceMap map[proxy.ServicePortName]*serviceInfo

// internal struct for string service information
type serviceInfo struct {
	clusterIP                net.IP
	port                     int
	protocol                 api.Protocol
	nodePort                 int
	loadBalancerStatus       api.LoadBalancerStatus
	sessionAffinityType      api.ServiceAffinity
	stickyMaxAgeSeconds      int
	externalIPs              []string
	loadBalancerSourceRanges []string
	onlyNodeLocalEndpoints   bool
	healthCheckNodePort      int
	// The following fields are computed and stored for performance reasons.
	serviceNameString        string
	servicePortChainName     utiliptables.Chain
	serviceFirewallChainName utiliptables.Chain
	serviceLBChainName       utiliptables.Chain
}

type proxyEndpointsMap map[proxy.ServicePortName][]*endpointsInfo

// internal struct for endpoints information
type endpointsInfo struct {
	endpoint string // TODO: should be an endpointString type
	isLocal  bool
	// The following fields we lazily compute and store here for performance
	// reasons. If the protocol is the same as you expect it to be, then the
	// chainName can be reused, otherwise it should be recomputed.
	protocol  string
	chainName utiliptables.Chain
}
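
To make the keying concrete, here is a stripped-down sketch of how a single Service with two ports becomes two entries keyed by namespace/name and port name. The types and field names below are simplified stand-ins of my own, not the real serviceToServiceMap.

package main

import "fmt"

// simplified stand-ins for proxy.ServicePortName and serviceInfo
type servicePortName struct{ namespace, name, port string }

type simpleServiceInfo struct {
	clusterIP string
	port      int
	protocol  string
	nodePort  int
}

type port struct {
	name     string
	port     int
	protocol string
	nodePort int
}

// one map entry per service port, keyed by the port's name
func serviceToMap(ns, name, clusterIP string, ports []port) map[servicePortName]*simpleServiceInfo {
	m := map[servicePortName]*simpleServiceInfo{}
	for _, p := range ports {
		key := servicePortName{namespace: ns, name: name, port: p.name}
		m[key] = &simpleServiceInfo{clusterIP: clusterIP, port: p.port, protocol: p.protocol, nodePort: p.nodePort}
	}
	return m
}

func main() {
	m := serviceToMap("default", "web", "10.0.0.10", []port{
		{"http", 80, "TCP", 30080},
		{"https", 443, "TCP", 30443},
	})
	for k, v := range m {
		fmt.Printf("%s/%s:%s -> clusterIP=%s port=%d nodePort=%d\n",
			k.namespace, k.name, k.port, v.clusterIP, v.port, v.nodePort)
	}
}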

Now that we know where the data comes from, what happens next? In OnServiceAdd (the update and delete handlers look similar) we see:

func (proxier *Proxier) OnServiceAdd(service *api.Service) {
	namespacedName := types.NamespacedName{Namespace: service.Namespace, Name: service.Name}
	if proxier.serviceChanges.update(&namespacedName, nil, service) && proxier.isInitialized() {
		proxier.syncRunner.Run()
	}
}

Every time an event arrives and the change cache has been updated, syncRunner.Run() is called. The function behind this runner is the syncProxyRules function that was registered when the proxy mode was selected:

	proxier.syncRunner = async.NewBoundedFrequencyRunner("sync-runner", proxier.syncProxyRules, minSyncPeriod, syncPeriod, burstSyncs)
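
BoundedFrequencyRunner itself lives in pkg/util/async; the toy below is my own simplification of the idea, not the real implementation: many Run() calls are coalesced into at most one pending sync, two syncs are never closer together than a minimum interval, and a full resync is forced after a maximum interval.

package main

import (
	"fmt"
	"time"
)

// toyRunner coalesces many Run() requests into rate-limited calls of fn,
// roughly mimicking what BoundedFrequencyRunner does for syncProxyRules.
type toyRunner struct {
	requests chan struct{}
	fn       func()
	minGap   time.Duration // never sync more often than this
	maxGap   time.Duration // force a periodic resync at least this often
}

func newToyRunner(fn func(), minGap, maxGap time.Duration) *toyRunner {
	return &toyRunner{requests: make(chan struct{}, 1), fn: fn, minGap: minGap, maxGap: maxGap}
}

// Run requests a sync; if one is already pending, the request is coalesced.
func (r *toyRunner) Run() {
	select {
	case r.requests <- struct{}{}:
	default:
	}
}

// Loop serves sync requests, enforcing the minimum gap between two syncs.
func (r *toyRunner) Loop(stop <-chan struct{}) {
	var last time.Time
	for {
		select {
		case <-stop:
			return
		case <-r.requests:
		case <-time.After(r.maxGap):
		}
		if wait := r.minGap - time.Since(last); wait > 0 {
			time.Sleep(wait)
		}
		r.fn()
		last = time.Now()
	}
}

func main() {
	runner := newToyRunner(func() { fmt.Println("sync at", time.Now().Format("15:04:05.000")) },
		500*time.Millisecond, 5*time.Second)
	go runner.Loop(make(chan struct{}))
	for i := 0; i < 20; i++ { // a burst of "events": far fewer syncs actually happen
		runner.Run()
		time.Sleep(50 * time.Millisecond)
	}
	time.Sleep(2 * time.Second)
}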

With that, the second question is answered. The third question is how all this data is used so that packets arriving on a node actually reach the backend pods; for that we need to look at how syncProxyRules is implemented.
Since this post is about kube-proxy's iptables mode, it works, as the name suggests, by manipulating the iptables rules on the host, so some iptables background is needed first; working through an "iptables explained in detail" style tutorial series is a good idea.
You also need to know the order in which iptables matches a packet. (The original post included a diagram of the packet flow through the tables and chains here.) In short: an incoming packet traverses PREROUTING (raw, mangle, nat); after the routing decision it goes either to INPUT, for traffic destined to a local process, or to FORWARD, and finally through POSTROUTING on its way out, while locally generated packets go through OUTPUT and then POSTROUTING.
Understanding iptables configuration and matching order is a huge help in understanding kube-proxy's iptables mode; bluntly put, the mode is simply an application of ordinary iptables.
Now let's analyze syncProxyRules. The function is long, so I will not paste every line of the iptables manipulation; instead we focus on the logic that prepares the data those iptables operations need.

	// We assume that if this was called, we really want to sync them,
	// even if nothing changed in the meantime. In other words, callers are
	// responsible for detecting no-op changes and not calling this function.
	serviceUpdateResult := updateServiceMap(
		proxier.serviceMap, &proxier.serviceChanges)
	endpointUpdateResult := updateEndpointsMap(
		proxier.endpointsMap, &proxier.endpointsChanges, proxier.hostname)

Looking at updateServiceMap and updateEndpointsMap, the two important helpers are merge and unmerge: merge folds the data from serviceChanges (or endpointsChanges) into the proxy map, while unmerge removes entries that have become stale; after every pass the change map is reset to empty. A toy illustration of the merge/unmerge idea follows the function below.

// <serviceMap> is updated by this function (based on the given changes).
// <changes> map is cleared after applying them.
func updateServiceMap(
	serviceMap proxyServiceMap,
	changes *serviceChangeMap) (result updateServiceMapResult) {
	result.staleServices = sets.NewString()

	func() {
		changes.lock.Lock()
		defer changes.lock.Unlock()
		for _, change := range changes.items {
			existingPorts := serviceMap.merge(change.current)
			serviceMap.unmerge(change.previous, existingPorts, result.staleServices)
		}
		changes.items = make(map[types.NamespacedName]*serviceChange)
	}()

	// TODO: If this will appear to be computationally expensive, consider
	// computing this incrementally similarly to serviceMap.
	result.hcServices = make(map[types.NamespacedName]uint16)
	for svcPortName, info := range serviceMap {
		if info.healthCheckNodePort != 0 {
			result.hcServices[svcPortName.NamespacedName] = uint16(info.healthCheckNodePort)
		}
	}

	return result
}
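
The shape of merge and unmerge can be summarised with a small toy (again my own simplification, not the real methods): merge copies every entry of the current snapshot into the live map and remembers which port names it saw, and unmerge then deletes any entry of the previous snapshot whose port was not seen, recording the cluster IPs of removed UDP services as stale.

package main

import "fmt"

type portName struct{ namespace, name, port string }

type info struct {
	clusterIP string
	protocol  string
}

type serviceMap map[portName]*info

// merge applies the "current" snapshot and returns the set of port names it contained
func (sm serviceMap) merge(current serviceMap) map[string]bool {
	existing := map[string]bool{}
	for k, v := range current {
		existing[k.port] = true
		sm[k] = v
	}
	return existing
}

// unmerge drops entries from the "previous" snapshot that no longer exist,
// collecting the cluster IPs of removed UDP services as stale conntrack candidates
func (sm serviceMap) unmerge(previous serviceMap, existing map[string]bool, stale map[string]bool) {
	for k := range previous {
		if existing[k.port] {
			continue
		}
		if old, ok := sm[k]; ok {
			if old.protocol == "UDP" {
				stale[old.clusterIP] = true
			}
			delete(sm, k)
		}
	}
}

func main() {
	live := serviceMap{}
	prev := serviceMap{{"default", "dns", "dns-udp"}: {"10.0.0.53", "UDP"}}
	cur := serviceMap{{"default", "web", "http"}: {"10.0.0.10", "TCP"}}

	live.merge(prev) // initial state
	stale := map[string]bool{}
	existing := live.merge(cur)
	live.unmerge(prev, existing, stale)
	fmt.Println("live entries:", len(live), "stale UDP IPs:", stale)
}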

With the data initialized, the next question is how it is used. kube-proxy's iptables mode first ensures that the following chains exist on the host:

// Create and link the kube services chain.
	{
		tablesNeedServicesChain := []utiliptables.Table{utiliptables.TableFilter, utiliptables.TableNAT}
		for _, table := range tablesNeedServicesChain {
			if _, err := proxier.iptables.EnsureChain(table, kubeServicesChain); err != nil {
				glog.Errorf("Failed to ensure that %s chain %s exists: %v", table, kubeServicesChain, err)
				return
			}
		}

		tableChainsNeedJumpServices := []struct {
			table utiliptables.Table
			chain utiliptables.Chain
		}{
			{utiliptables.TableFilter, utiliptables.ChainInput},
			{utiliptables.TableFilter, utiliptables.ChainOutput},
			{utiliptables.TableNAT, utiliptables.ChainOutput},
			{utiliptables.TableNAT, utiliptables.ChainPrerouting},
		}
		comment := "kubernetes service portals"
		args := []string{"-m", "comment", "--comment", comment, "-j", string(kubeServicesChain)}
		for _, tc := range tableChainsNeedJumpServices {
			if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, tc.table, tc.chain, args...); err != nil {
				glog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", tc.table, tc.chain, kubeServicesChain, err)
				return
			}
		}
	}

	// Create and link the kube postrouting chain.
	{
		if _, err := proxier.iptables.EnsureChain(utiliptables.TableNAT, kubePostroutingChain); err != nil {
			glog.Errorf("Failed to ensure that %s chain %s exists: %v", utiliptables.TableNAT, kubePostroutingChain, err)
			return
		}

		comment := "kubernetes postrouting rules"
		args := []string{"-m", "comment", "--comment", comment, "-j", string(kubePostroutingChain)}
		if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, utiliptables.TableNAT, utiliptables.ChainPostrouting, args...); err != nil {
			glog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", utiliptables.TableNAT, utiliptables.ChainPostrouting, kubePostroutingChain, err)
			return
		}
	}

	// Create and link the kube forward chain.
	{
		if _, err := proxier.iptables.EnsureChain(utiliptables.TableFilter, kubeForwardChain); err != nil {
			glog.Errorf("Failed to ensure that %s chain %s exists: %v", utiliptables.TableFilter, kubeForwardChain, err)
			return
		}

		comment := "kubernetes forward rules"
		args := []string{"-m", "comment", "--comment", comment, "-j", string(kubeForwardChain)}
		if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, utiliptables.TableFilter, utiliptables.ChainForward, args...); err != nil {
			glog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", utiliptables.TableFilter, utiliptables.ChainForward, kubeForwardChain, err)
			return
		}
	}

Inside syncProxyRules, kube-proxy first runs the equivalent of iptables-save -t nat / -t filter to fetch the current contents of those two tables, and then loops over proxier.serviceMap and proxier.endpointsMap to build the Kubernetes chains and rules from that data; an illustrative example of the generated chains follows the list below.
A few points are worth keeping in mind here:

  • If the Endpoints object behind a service has zero backend pods, the rule installed in iptables for that service is a REJECT rule;
  • If the service's sessionAffinity is set to "ClientIP", requests coming from the same client IP are always forwarded to the same backend pod;
  • If a service port defines a nodePort, then besides creating the iptables forwarding rules, kube-proxy also holds that nodePort open on the host; this prevents another process on the host from grabbing the same port and causing access conflicts;
  • If the service defines externalIPs and one of them belongs to this node, the handling is similar to nodePort, except that the service's own port is used for forwarding; that port is likewise held on the node to avoid clashing with other processes;
  • If the service's spec.loadBalancerSourceRanges is not empty (this goes with LoadBalancer services, where in addition to the cluster IP and the NodePort exposed on every node the cloud provider provisions an external load balancer that forwards traffic to the per-node <NodeIP>:NodePort endpoints), kube-proxy adds firewall rules so that only traffic from the listed source ranges is accepted;
  • When kube-proxy performs the iptables-restore, it writes only the chains and rules generated by Kubernetes and leaves out everything else, so the other iptables rules on the host are not affected.
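
To make the end result tangible, this is roughly what the generated NAT chains look like for a ClusterIP service 10.0.0.10:80 with two endpoints. The listing is heavily simplified with made-up chain hashes; the real rules also carry -m comment annotations and KUBE-MARK-MASQ jumps.

-A PREROUTING -j KUBE-SERVICES
-A KUBE-SERVICES -d 10.0.0.10/32 -p tcp --dport 80 -j KUBE-SVC-XXXXXXXXXXXXXXXX
-A KUBE-SVC-XXXXXXXXXXXXXXXX -m statistic --mode random --probability 0.5 -j KUBE-SEP-AAAAAAAAAAAAAAAA
-A KUBE-SVC-XXXXXXXXXXXXXXXX -j KUBE-SEP-BBBBBBBBBBBBBBBB
-A KUBE-SEP-AAAAAAAAAAAAAAAA -p tcp -j DNAT --to-destination 10.244.1.5:8080
-A KUBE-SEP-BBBBBBBBBBBBBBBB -p tcp -j DNAT --to-destination 10.244.2.7:8080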