Analysis of kube-proxy Working Modes (1) -- iptables

We all know that when a Service resource object is created, it is assigned a virtual IP address, and when we access that IP the request is forwarded to a concrete backend pod. In that sense a Service behaves like a reverse proxy. The key point is that this service IP is virtual -- so what mechanism actually makes it reachable? In many respects a Service is only a concept; the component that actually makes it work is kube-proxy. Only by understanding the principles and mechanisms of kube-proxy can we truly understand how Services are implemented.
This analysis is based on the Kubernetes release-1.9 source. In this version the supported proxy modes are iptables, ipvs, userspace, winkernel and winuserspace; this series focuses on the iptables and ipvs modes. For kube-proxy, the questions we care about boil down to:

1. Which Kubernetes resources does kube-proxy watch?
2. What is the processing flow when watch events arrive for those resources?
3. How does kube-proxy actually forward requests within the cluster?

With these questions in mind, let's start reading the kube-proxy source. Its layout is as follows:

cmd/kube-proxy
├── app
│   ├── BUILD
│   ├── server.go  // proxy initialization and startup logic
│   ├── server_other.go
│   ├── server_test.go
│   ├── server_windows.go
├── BUILD
├── proxy.go    // kube-proxy entry point
pkg/proxy
├── apis   // API definitions
├── config  // config package
├── healthcheck  // health checking
├── iptables  // iptables mode
│   ├── BUILD
│   ├── OWNERS
│   ├── proxier.go   // main logic of the iptables mode
│   ├── proxier_test.go
├── ...
├── BUILD
├── OWNERS
├── doc.go
├── types.go

Let's tackle the first question: which resources does kube-proxy watch?

	informerFactory := informers.NewSharedInformerFactory(s.Client, s.ConfigSyncPeriod)

	// Create configs (i.e. Watches for Services and Endpoints)
	// Note: RegisterHandler() calls need to happen before creation of Sources because sources
	// only notify on changes, and the initial update (on process start) may be lost if no handlers
	// are registered yet.
	serviceConfig := config.NewServiceConfig(informerFactory.Core().InternalVersion().Services(), s.ConfigSyncPeriod)
	serviceConfig.RegisterEventHandler(s.ServiceEventHandler)
	go serviceConfig.Run(wait.NeverStop)

	endpointsConfig := config.NewEndpointsConfig(informerFactory.Core().InternalVersion().Endpoints(), s.ConfigSyncPeriod)
	endpointsConfig.RegisterEventHandler(s.EndpointsEventHandler)
	go endpointsConfig.Run(wait.NeverStop)

	// This has to start after the calls to NewServiceConfig and NewEndpointsConfig because those
	// functions must configure their shared informer event handlers first.
	go informerFactory.Start(wait.NeverStop)

Here we can see the informerFactory, and it watches the Service and Endpoints resources -- that settles the first question. So what is the logic for these watched resources, i.e. how are add, update and delete events processed? Let's look at NewServiceConfig and NewEndpointsConfig from the code above.

// NewServiceConfig creates a new ServiceConfig.
func NewServiceConfig(serviceInformer coreinformers.ServiceInformer, resyncPeriod time.Duration) *ServiceConfig {
	result := &ServiceConfig{
		lister:       serviceInformer.Lister(),
		listerSynced: serviceInformer.Informer().HasSynced,
	}

	serviceInformer.Informer().AddEventHandlerWithResyncPeriod(
		cache.ResourceEventHandlerFuncs{
			AddFunc:    result.handleAddService,
			UpdateFunc: result.handleUpdateService,
			DeleteFunc: result.handleDeleteService,
		},
		resyncPeriod,
	)

	return result
}

// NewEndpointsConfig creates a new EndpointsConfig.
func NewEndpointsConfig(endpointsInformer coreinformers.EndpointsInformer, resyncPeriod time.Duration) *EndpointsConfig {
	result := &EndpointsConfig{
		lister:       endpointsInformer.Lister(),
		listerSynced: endpointsInformer.Informer().HasSynced,
	}

	endpointsInformer.Informer().AddEventHandlerWithResyncPeriod(
		cache.ResourceEventHandlerFuncs{
			AddFunc:    result.handleAddEndpoints,
			UpdateFunc: result.handleUpdateEndpoints,
			DeleteFunc: result.handleDeleteEndpoints,
		},
		resyncPeriod,
	)

	return result
}
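
As an aside, the same list-and-watch pattern can be reproduced outside the Kubernetes tree with the public client-go API. The sketch below is my own minimal example, not part of kube-proxy: it assumes a kubeconfig at the default location and uses the versioned V1 clients rather than the in-tree internal ones, and it simply watches Services and prints the events.

package main

import (
	"fmt"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Assumes ~/.kube/config; error handling is trimmed for brevity.
	cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(cfg)

	factory := informers.NewSharedInformerFactory(client, 30*time.Second)
	factory.Core().V1().Services().Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc:    func(obj interface{}) { fmt.Println("add:", obj.(*v1.Service).Name) },
		UpdateFunc: func(old, cur interface{}) { fmt.Println("update:", cur.(*v1.Service).Name) },
		DeleteFunc: func(obj interface{}) { fmt.Println("delete") },
	})

	stop := make(chan struct{})
	factory.Start(stop) // kicks off list-and-watch for every informer requested above
	cache.WaitForCacheSync(stop, factory.Core().V1().Services().Informer().HasSynced)
	select {} // keep watching until killed
}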

By now this should look familiar: most of the Kubernetes code base is built on the same structure, list-and-watch plus a local cache (see the "understanding the kubernetes tools/cache package" series on this blog). The informer appends the watched events to a queue, processes them, stores the objects in its store, and then dispatches them to the eventHandlerFuncs we registered earlier. Looking at the Service and Endpoints handler funcs, the add, update and delete logic is almost identical; the only difference is that the update handler also receives the previous object. Let's look at the update methods of serviceChangeMap and endpointsChangeMap.

func (scm *serviceChangeMap) update(namespacedName *types.NamespacedName, previous, current *api.Service) bool {
	scm.lock.Lock()
	defer scm.lock.Unlock()

	change, exists := scm.items[*namespacedName]
	if !exists {
		change = &serviceChange{}
		change.previous = serviceToServiceMap(previous)
		scm.items[*namespacedName] = change
	}
	change.current = serviceToServiceMap(current)
	if reflect.DeepEqual(change.previous, change.current) {
		delete(scm.items, *namespacedName)
	}
	return len(scm.items) > 0
}

func (ecm *endpointsChangeMap) update(namespacedName *types.NamespacedName, previous, current *api.Endpoints) bool {
	ecm.lock.Lock()
	defer ecm.lock.Unlock()

	change, exists := ecm.items[*namespacedName]
	if !exists {
		change = &endpointsChange{}
		change.previous = endpointsToEndpointsMap(previous, ecm.hostname)
		ecm.items[*namespacedName] = change
	}
	change.current = endpointsToEndpointsMap(current, ecm.hostname)
	if reflect.DeepEqual(change.previous, change.current) {
		delete(ecm.items, *namespacedName)
	}
	return len(ecm.items) > 0
}

The two functions above share the same logic: they use previous and current to work out whether this is an add, update or delete, and then update the cached map accordingly. serviceToServiceMap builds a serviceMap from the Service object, and endpointsToEndpointsMap builds the endpoints map from the Endpoints object. The proxyServiceMap and proxyEndpointsMap data structures look like this (a simplified sketch of how a Service turns into map entries follows the struct definitions):

type proxyServiceMap map[proxy.ServicePortName]*serviceInfo

// internal struct for string service information
type serviceInfo struct {
	clusterIP                net.IP
	port                     int
	protocol                 api.Protocol
	nodePort                 int
	loadBalancerStatus       api.LoadBalancerStatus
	sessionAffinityType      api.ServiceAffinity
	stickyMaxAgeSeconds      int
	externalIPs              []string
	loadBalancerSourceRanges []string
	onlyNodeLocalEndpoints   bool
	healthCheckNodePort      int
	// The following fields are computed and stored for performance reasons.
	serviceNameString        string
	servicePortChainName     utiliptables.Chain
	serviceFirewallChainName utiliptables.Chain
	serviceLBChainName       utiliptables.Chain
}

type proxyEndpointsMap map[proxy.ServicePortName][]*endpointsInfo

// internal struct for endpoints information
type endpointsInfo struct {
	endpoint string // TODO: should be an endpointString type
	isLocal  bool
	// The following fields we lazily compute and store here for performance
	// reasons. If the protocol is the same as you expect it to be, then the
	// chainName can be reused, otherwise it should be recomputed.
	protocol  string
	chainName utiliptables.Chain
}
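
To make the keying concrete, here is a stripped-down sketch of how a single Service with two ports becomes two entries keyed by namespace/name and port name. The types and field names below are simplified stand-ins of my own, not the real serviceToServiceMap.

package main

import "fmt"

// simplified stand-ins for proxy.ServicePortName and serviceInfo
type servicePortName struct{ namespace, name, port string }

type simpleServiceInfo struct {
	clusterIP string
	port      int
	protocol  string
	nodePort  int
}

type port struct {
	name     string
	port     int
	protocol string
	nodePort int
}

// one map entry per service port, keyed by the port's name
func serviceToMap(ns, name, clusterIP string, ports []port) map[servicePortName]*simpleServiceInfo {
	m := map[servicePortName]*simpleServiceInfo{}
	for _, p := range ports {
		key := servicePortName{namespace: ns, name: name, port: p.name}
		m[key] = &simpleServiceInfo{clusterIP: clusterIP, port: p.port, protocol: p.protocol, nodePort: p.nodePort}
	}
	return m
}

func main() {
	m := serviceToMap("default", "web", "10.0.0.10", []port{
		{"http", 80, "TCP", 30080},
		{"https", 443, "TCP", 30443},
	})
	for k, v := range m {
		fmt.Printf("%s/%s:%s -> clusterIP=%s port=%d nodePort=%d\n",
			k.namespace, k.name, k.port, v.clusterIP, v.port, v.nodePort)
	}
}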

Now that we know where the data comes from, what happens next? In OnServiceAdd (the update and delete handlers look similar) we see:

func (proxier *Proxier) OnServiceAdd(service *api.Service) {
	namespacedName := types.NamespacedName{Namespace: service.Namespace, Name: service.Name}
	if proxier.serviceChanges.update(&namespacedName, nil, service) && proxier.isInitialized() {
		proxier.syncRunner.Run()
	}
}

Every time an event arrives and the change cache has been updated, syncRunner.Run() is called. The function behind this runner is the syncProxyRules function that was registered when the proxy mode was selected:

	proxier.syncRunner = async.NewBoundedFrequencyRunner("sync-runner", proxier.syncProxyRules, minSyncPeriod, syncPeriod, burstSyncs)
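
BoundedFrequencyRunner itself lives in pkg/util/async; the toy below is my own simplification of the idea, not the real implementation: many Run() calls are coalesced into at most one pending sync, two syncs are never closer together than a minimum interval, and a full resync is forced after a maximum interval.

package main

import (
	"fmt"
	"time"
)

// toyRunner coalesces many Run() requests into rate-limited calls of fn,
// roughly mimicking what BoundedFrequencyRunner does for syncProxyRules.
type toyRunner struct {
	requests chan struct{}
	fn       func()
	minGap   time.Duration // never sync more often than this
	maxGap   time.Duration // force a periodic resync at least this often
}

func newToyRunner(fn func(), minGap, maxGap time.Duration) *toyRunner {
	return &toyRunner{requests: make(chan struct{}, 1), fn: fn, minGap: minGap, maxGap: maxGap}
}

// Run requests a sync; if one is already pending, the request is coalesced.
func (r *toyRunner) Run() {
	select {
	case r.requests <- struct{}{}:
	default:
	}
}

// Loop serves sync requests, enforcing the minimum gap between two syncs.
func (r *toyRunner) Loop(stop <-chan struct{}) {
	var last time.Time
	for {
		select {
		case <-stop:
			return
		case <-r.requests:
		case <-time.After(r.maxGap):
		}
		if wait := r.minGap - time.Since(last); wait > 0 {
			time.Sleep(wait)
		}
		r.fn()
		last = time.Now()
	}
}

func main() {
	runner := newToyRunner(func() { fmt.Println("sync at", time.Now().Format("15:04:05.000")) },
		500*time.Millisecond, 5*time.Second)
	go runner.Loop(make(chan struct{}))
	for i := 0; i < 20; i++ { // a burst of "events": far fewer syncs actually happen
		runner.Run()
		time.Sleep(50 * time.Millisecond)
	}
	time.Sleep(2 * time.Second)
}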

With that, the second question is answered. The third question is how all this data is used so that packets arriving on a node actually reach the backend pods; for that we need to look at how syncProxyRules is implemented.
Since this post is about kube-proxy's iptables mode, it works, as the name suggests, by manipulating the iptables rules on the host, so some iptables background is needed first; working through an "iptables explained in detail" style tutorial series is a good idea.
You also need to know the order in which iptables matches a packet. (The original post included a diagram of the packet flow through the tables and chains here.) In short: an incoming packet traverses PREROUTING (raw, mangle, nat); after the routing decision it goes either to INPUT, for traffic destined to a local process, or to FORWARD, and finally through POSTROUTING on its way out, while locally generated packets go through OUTPUT and then POSTROUTING.
Understanding iptables configuration and matching order is a huge help in understanding kube-proxy's iptables mode; bluntly put, the mode is simply an application of ordinary iptables.
Now let's analyze syncProxyRules. The function is long, so I will not paste every line of the iptables manipulation; instead we focus on the logic that prepares the data those iptables operations need.

	// We assume that if this was called, we really want to sync them,
	// even if nothing changed in the meantime. In other words, callers are
	// responsible for detecting no-op changes and not calling this function.
	serviceUpdateResult := updateServiceMap(
		proxier.serviceMap, &proxier.serviceChanges)
	endpointUpdateResult := updateEndpointsMap(
		proxier.endpointsMap, &proxier.endpointsChanges, proxier.hostname)

Looking at updateServiceMap and updateEndpointsMap, the two important helpers are merge and unmerge: merge folds the data from serviceChanges (or endpointsChanges) into the proxy map, while unmerge removes entries that have become stale; after every pass the change map is reset to empty. A toy illustration of the merge/unmerge idea follows the function below.

// <serviceMap> is updated by this function (based on the given changes).
// <changes> map is cleared after applying them.
func updateServiceMap(
	serviceMap proxyServiceMap,
	changes *serviceChangeMap) (result updateServiceMapResult) {
	result.staleServices = sets.NewString()

	func() {
		changes.lock.Lock()
		defer changes.lock.Unlock()
		for _, change := range changes.items {
			existingPorts := serviceMap.merge(change.current)
			serviceMap.unmerge(change.previous, existingPorts, result.staleServices)
		}
		changes.items = make(map[types.NamespacedName]*serviceChange)
	}()

	// TODO: If this will appear to be computationally expensive, consider
	// computing this incrementally similarly to serviceMap.
	result.hcServices = make(map[types.NamespacedName]uint16)
	for svcPortName, info := range serviceMap {
		if info.healthCheckNodePort != 0 {
			result.hcServices[svcPortName.NamespacedName] = uint16(info.healthCheckNodePort)
		}
	}

	return result
}
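
The shape of merge and unmerge can be summarised with a small toy (again my own simplification, not the real methods): merge copies every entry of the current snapshot into the live map and remembers which port names it saw, and unmerge then deletes any entry of the previous snapshot whose port was not seen, recording the cluster IPs of removed UDP services as stale.

package main

import "fmt"

type portName struct{ namespace, name, port string }

type info struct {
	clusterIP string
	protocol  string
}

type serviceMap map[portName]*info

// merge applies the "current" snapshot and returns the set of port names it contained
func (sm serviceMap) merge(current serviceMap) map[string]bool {
	existing := map[string]bool{}
	for k, v := range current {
		existing[k.port] = true
		sm[k] = v
	}
	return existing
}

// unmerge drops entries from the "previous" snapshot that no longer exist,
// collecting the cluster IPs of removed UDP services as stale conntrack candidates
func (sm serviceMap) unmerge(previous serviceMap, existing map[string]bool, stale map[string]bool) {
	for k := range previous {
		if existing[k.port] {
			continue
		}
		if old, ok := sm[k]; ok {
			if old.protocol == "UDP" {
				stale[old.clusterIP] = true
			}
			delete(sm, k)
		}
	}
}

func main() {
	live := serviceMap{}
	prev := serviceMap{{"default", "dns", "dns-udp"}: {"10.0.0.53", "UDP"}}
	cur := serviceMap{{"default", "web", "http"}: {"10.0.0.10", "TCP"}}

	live.merge(prev) // initial state
	stale := map[string]bool{}
	existing := live.merge(cur)
	live.unmerge(prev, existing, stale)
	fmt.Println("live entries:", len(live), "stale UDP IPs:", stale)
}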

With the data initialized, the next question is how it is used. kube-proxy's iptables mode first ensures that the following chains exist on the host:

// Create and link the kube services chain.
	{
		tablesNeedServicesChain := []utiliptables.Table{utiliptables.TableFilter, utiliptables.TableNAT}
		for _, table := range tablesNeedServicesChain {
			if _, err := proxier.iptables.EnsureChain(table, kubeServicesChain); err != nil {
				glog.Errorf("Failed to ensure that %s chain %s exists: %v", table, kubeServicesChain, err)
				return
			}
		}

		tableChainsNeedJumpServices := []struct {
			table utiliptables.Table
			chain utiliptables.Chain
		}{
			{utiliptables.TableFilter, utiliptables.ChainInput},
			{utiliptables.TableFilter, utiliptables.ChainOutput},
			{utiliptables.TableNAT, utiliptables.ChainOutput},
			{utiliptables.TableNAT, utiliptables.ChainPrerouting},
		}
		comment := "kubernetes service portals"
		args := []string{"-m", "comment", "--comment", comment, "-j", string(kubeServicesChain)}
		for _, tc := range tableChainsNeedJumpServices {
			if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, tc.table, tc.chain, args...); err != nil {
				glog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", tc.table, tc.chain, kubeServicesChain, err)
				return
			}
		}
	}

	// Create and link the kube postrouting chain.
	{
		if _, err := proxier.iptables.EnsureChain(utiliptables.TableNAT, kubePostroutingChain); err != nil {
			glog.Errorf("Failed to ensure that %s chain %s exists: %v", utiliptables.TableNAT, kubePostroutingChain, err)
			return
		}

		comment := "kubernetes postrouting rules"
		args := []string{"-m", "comment", "--comment", comment, "-j", string(kubePostroutingChain)}
		if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, utiliptables.TableNAT, utiliptables.ChainPostrouting, args...); err != nil {
			glog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", utiliptables.TableNAT, utiliptables.ChainPostrouting, kubePostroutingChain, err)
			return
		}
	}

	// Create and link the kube forward chain.
	{
		if _, err := proxier.iptables.EnsureChain(utiliptables.TableFilter, kubeForwardChain); err != nil {
			glog.Errorf("Failed to ensure that %s chain %s exists: %v", utiliptables.TableFilter, kubeForwardChain, err)
			return
		}

		comment := "kubernetes forward rules"
		args := []string{"-m", "comment", "--comment", comment, "-j", string(kubeForwardChain)}
		if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, utiliptables.TableFilter, utiliptables.ChainForward, args...); err != nil {
			glog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", utiliptables.TableFilter, utiliptables.ChainForward, kubeForwardChain, err)
			return
		}
	}

Inside syncProxyRules, kube-proxy first runs the equivalent of iptables-save -t nat / -t filter to fetch the current contents of those two tables, and then loops over proxier.serviceMap and proxier.endpointsMap to build the Kubernetes chains and rules from that data; an illustrative example of the generated chains follows the list below.
A few points are worth keeping in mind here:

  • If the Endpoints object behind a service has zero backend pods, the rule installed in iptables for that service is a REJECT rule;
  • If the service's sessionAffinity is set to "ClientIP", requests coming from the same client IP are always forwarded to the same backend pod;
  • If a service port defines a nodePort, then besides creating the iptables forwarding rules, kube-proxy also holds that nodePort open on the host; this prevents another process on the host from grabbing the same port and causing access conflicts;
  • If the service defines externalIPs and one of them belongs to this node, the handling is similar to nodePort, except that the service's own port is used for forwarding; that port is likewise held on the node to avoid clashing with other processes;
  • If the service's spec.loadBalancerSourceRanges is not empty (this goes with LoadBalancer services, where in addition to the cluster IP and the NodePort exposed on every node the cloud provider provisions an external load balancer that forwards traffic to the per-node <NodeIP>:NodePort endpoints), kube-proxy adds firewall rules so that only traffic from the listed source ranges is accepted;
  • When kube-proxy performs the iptables-restore, it writes only the chains and rules generated by Kubernetes and leaves out everything else, so the other iptables rules on the host are not affected.
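
To make the end result tangible, this is roughly what the generated NAT chains look like for a ClusterIP service 10.0.0.10:80 with two endpoints. The listing is heavily simplified with made-up chain hashes; the real rules also carry -m comment annotations and KUBE-MARK-MASQ jumps.

-A PREROUTING -j KUBE-SERVICES
-A KUBE-SERVICES -d 10.0.0.10/32 -p tcp --dport 80 -j KUBE-SVC-XXXXXXXXXXXXXXXX
-A KUBE-SVC-XXXXXXXXXXXXXXXX -m statistic --mode random --probability 0.5 -j KUBE-SEP-AAAAAAAAAAAAAAAA
-A KUBE-SVC-XXXXXXXXXXXXXXXX -j KUBE-SEP-BBBBBBBBBBBBBBBB
-A KUBE-SEP-AAAAAAAAAAAAAAAA -p tcp -j DNAT --to-destination 10.244.1.5:8080
-A KUBE-SEP-BBBBBBBBBBBBBBBB -p tcp -j DNAT --to-destination 10.244.2.7:8080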