kube-proxy Working Modes Explained (Part 1) -- iptables

As we all know, when a Service object is created it is assigned a virtual IP address, and requests sent to that IP are forwarded to concrete backend pods, so a Service effectively acts as a reverse proxy. The point to keep in mind is that this service IP is virtual; if it is virtual, what mechanism makes it reachable at all? In many ways a Service is just an abstraction, and the component that actually makes it work is kube-proxy. Only by understanding how kube-proxy operates can we really understand how Services are implemented.
This analysis is based on the Kubernetes 1.9 release, which supports the proxy modes iptables, ipvs, userspace, winkernel, and winuserspace; here we mainly look at the iptables and ipvs modes. For kube-proxy, the questions we care about boil down to:

1. Which Kubernetes resources does kube-proxy watch?
2. How does kube-proxy handle the events it receives for those resources?
3. By what mechanism does kube-proxy implement request forwarding inside the cluster?

With these questions in mind, let's start reading the kube-proxy source. Its layout is as follows:

cmd/kube-proxy
├── app
│   ├── BUILD
│   ├── server.go  //proxy initialization and startup logic
│   ├── server_other.go
│   ├── server_test.go
│   ├── server_windows.go
├── BUILD
├── proxy.go    //kube-proxy entry point
pkg/proxy
├── apis   //API definitions
├── config  //Service/Endpoints config watchers
├── healthcheck  //health checking
├── iptables  //iptables mode
│   ├── BUILD
│   ├── OWNERS
│   ├── proxier.go   //main logic of the iptables mode
│   ├── proxier_test.go
├── ...
├── BUILD
├── OWNERS
├── doc.go
├── types.go

Let's answer the first question: which resources does kube-proxy watch?

	informerFactory := informers.NewSharedInformerFactory(s.Client, s.ConfigSyncPeriod)

	// Create configs (i.e. Watches for Services and Endpoints)
	// Note: RegisterHandler() calls need to happen before creation of Sources because sources
	// only notify on changes, and the initial update (on process start) may be lost if no handlers
	// are registered yet.
	serviceConfig := config.NewServiceConfig(informerFactory.Core().InternalVersion().Services(), s.ConfigSyncPeriod)
	serviceConfig.RegisterEventHandler(s.ServiceEventHandler)
	go serviceConfig.Run(wait.NeverStop)

	endpointsConfig := config.NewEndpointsConfig(informerFactory.Core().InternalVersion().Endpoints(), s.ConfigSyncPeriod)
	endpointsConfig.RegisterEventHandler(s.EndpointsEventHandler)
	go endpointsConfig.Run(wait.NeverStop)

	// This has to start after the calls to NewServiceConfig and NewEndpointsConfig because those
	// functions must configure their shared informer event handlers first.
	go informerFactory.Start(wait.NeverStop)

Here we see an informerFactory that watches the Service and Endpoints resources, which settles the first question. The next question is what the processing logic for these resources looks like, i.e. what the flow is for add, update, and delete events. Let's look at NewServiceConfig and NewEndpointsConfig from the code above:

// NewServiceConfig creates a new ServiceConfig.
func NewServiceConfig(serviceInformer coreinformers.ServiceInformer, resyncPeriod time.Duration) *ServiceConfig {
	result := &ServiceConfig{
		lister:       serviceInformer.Lister(),
		listerSynced: serviceInformer.Informer().HasSynced,
	}

	serviceInformer.Informer().AddEventHandlerWithResyncPeriod(
		cache.ResourceEventHandlerFuncs{
			AddFunc:    result.handleAddService,
			UpdateFunc: result.handleUpdateService,
			DeleteFunc: result.handleDeleteService,
		},
		resyncPeriod,
	)

	return result
}

// NewEndpointsConfig creates a new EndpointsConfig.
func NewEndpointsConfig(endpointsInformer coreinformers.EndpointsInformer, resyncPeriod time.Duration) *EndpointsConfig {
	result := &EndpointsConfig{
		lister:       endpointsInformer.Lister(),
		listerSynced: endpointsInformer.Informer().HasSynced,
	}

	endpointsInformer.Informer().AddEventHandlerWithResyncPeriod(
		cache.ResourceEventHandlerFuncs{
			AddFunc:    result.handleAddEndpoints,
			UpdateFunc: result.handleUpdateEndpoints,
			DeleteFunc: result.handleDeleteEndpoints,
		},
		resyncPeriod,
	)

	return result
}
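
As an aside, the handleAddService/handleUpdateService/handleDeleteService callbacks registered above are thin wrappers: they type-assert the object delivered by the informer and fan it out to every handler registered through RegisterEventHandler (for the iptables mode that handler is the Proxier). A self-contained toy version of this dispatch pattern, with simplified stand-in types rather than the real Kubernetes ones:

package main

import "fmt"

// Toy model of the informer -> ServiceConfig -> handler dispatch path; the
// names mirror the real ones but the types are simplified stand-ins.
type Service struct{ Namespace, Name string }

// ServiceHandler is the interface the iptables Proxier implements.
type ServiceHandler interface {
	OnServiceAdd(service *Service)
	OnServiceUpdate(oldService, service *Service)
	OnServiceDelete(service *Service)
}

type ServiceConfig struct{ eventHandlers []ServiceHandler }

func (c *ServiceConfig) RegisterEventHandler(h ServiceHandler) {
	c.eventHandlers = append(c.eventHandlers, h)
}

// handleAddService is what the informer would call on an Add event: it just
// type-asserts the object and forwards it to every registered handler.
func (c *ServiceConfig) handleAddService(obj interface{}) {
	service, ok := obj.(*Service)
	if !ok {
		return
	}
	for i := range c.eventHandlers {
		c.eventHandlers[i].OnServiceAdd(service)
	}
}

// fakeProxier stands in for the iptables Proxier.
type fakeProxier struct{}

func (p *fakeProxier) OnServiceAdd(s *Service)       { fmt.Println("add", s.Namespace+"/"+s.Name) }
func (p *fakeProxier) OnServiceUpdate(o, s *Service) { fmt.Println("update", s.Namespace+"/"+s.Name) }
func (p *fakeProxier) OnServiceDelete(s *Service)    { fmt.Println("delete", s.Namespace+"/"+s.Name) }

func main() {
	cfg := &ServiceConfig{}
	cfg.RegisterEventHandler(&fakeProxier{})
	cfg.handleAddService(&Service{Namespace: "default", Name: "web"})
}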

This should look familiar by now: most of the Kubernetes code base follows this same structure of list-watch plus a local cache (see this blog's series on the kubernetes tools/cache package). The informer appends watch events to a queue, processes them, stores the objects in its store, and then dispatches them to the eventHandlerFuncs we registered. Looking at the handlers for services and endpoints, the add/update/delete logic is almost identical; the only difference is that the update handler also receives the previous object. Let's look at the update methods of serviceChangeMap and endpointsChangeMap:

func (scm *serviceChangeMap) update(namespacedName *types.NamespacedName, previous, current *api.Service) bool {
	scm.lock.Lock()
	defer scm.lock.Unlock()

	change, exists := scm.items[*namespacedName]
	if !exists {
		change = &serviceChange{}
		change.previous = serviceToServiceMap(previous)
		scm.items[*namespacedName] = change
	}
	change.current = serviceToServiceMap(current)
	if reflect.DeepEqual(change.previous, change.current) {
		delete(scm.items, *namespacedName)
	}
	return len(scm.items) > 0
}

func (ecm *endpointsChangeMap) update(namespacedName *types.NamespacedName, previous, current *api.Endpoints) bool {
	ecm.lock.Lock()
	defer ecm.lock.Unlock()

	change, exists := ecm.items[*namespacedName]
	if !exists {
		change = &endpointsChange{}
		change.previous = endpointsToEndpointsMap(previous, ecm.hostname)
		ecm.items[*namespacedName] = change
	}
	change.current = endpointsToEndpointsMap(current, ecm.hostname)
	if reflect.DeepEqual(change.previous, change.current) {
		delete(ecm.items, *namespacedName)
	}
	return len(ecm.items) > 0
}

The two update methods above are essentially the same: based on previous and current they work out whether the change is an add, update, or delete, and then refresh the cached map. serviceToServiceMap builds a per-port service map from the Service object, and endpointsToEndpointsMap builds the corresponding map from the Endpoints object. The data structures behind proxyServiceMap and proxyEndpointsMap look like this:

type proxyServiceMap map[proxy.ServicePortName]*serviceInfo

// internal struct for string service information
type serviceInfo struct {
	clusterIP                net.IP
	port                     int
	protocol                 api.Protocol
	nodePort                 int
	loadBalancerStatus       api.LoadBalancerStatus
	sessionAffinityType      api.ServiceAffinity
	stickyMaxAgeSeconds      int
	externalIPs              []string
	loadBalancerSourceRanges []string
	onlyNodeLocalEndpoints   bool
	healthCheckNodePort      int
	// The following fields are computed and stored for performance reasons.
	serviceNameString        string
	servicePortChainName     utiliptables.Chain
	serviceFirewallChainName utiliptables.Chain
	serviceLBChainName       utiliptables.Chain
}

type proxyEndpointsMap map[proxy.ServicePortName][]*endpointsInfo

// internal struct for endpoints information
type endpointsInfo struct {
	endpoint string // TODO: should be an endpointString type
	isLocal  bool
	// The following fields we lazily compute and store here for performance
	// reasons. If the protocol is the same as you expect it to be, then the
	// chainName can be reused, otherwise it should be recomputed.
	protocol  string
	chainName utiliptables.Chain
}
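
To make the shapes of these two maps concrete, here is a small self-contained illustration (with simplified stand-in types, not the real ones) of what they would hold for a Service default/web whose http port is backed by two pods:

package main

import "fmt"

// Simplified stand-ins for proxy.ServicePortName, serviceInfo and
// endpointsInfo -- just to show the shape of the two maps, not the real types.
type servicePortName struct{ Namespace, Name, Port string }

type serviceInfo struct {
	clusterIP string
	port      int
	protocol  string
}

type endpointsInfo struct {
	endpoint string // "ip:port" of a backend pod
	isLocal  bool
}

func main() {
	web := servicePortName{"default", "web", "http"}

	// One entry per service port...
	services := map[servicePortName]*serviceInfo{
		web: {clusterIP: "10.96.0.10", port: 80, protocol: "TCP"},
	}
	// ...and, under the same key, the list of backends taken from Endpoints.
	endpoints := map[servicePortName][]*endpointsInfo{
		web: {
			{endpoint: "10.244.1.5:8080", isLocal: true},
			{endpoint: "10.244.2.7:8080", isLocal: false},
		},
	}

	svc := services[web]
	fmt.Printf("%s/%s:%s -> cluster IP %s:%d/%s\n",
		web.Namespace, web.Name, web.Port, svc.clusterIP, svc.port, svc.protocol)
	for _, ep := range endpoints[web] {
		fmt.Printf("  backend %s (local=%v)\n", ep.endpoint, ep.isLocal)
	}
}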

Now we know where the data comes from; what happens next? In OnServiceAdd (the update and delete handlers follow the same pattern; see the sketch after this snippet) we find:

func (proxier *Proxier) OnServiceAdd(service *api.Service) {
	namespacedName := types.NamespacedName{Namespace: service.Namespace, Name: service.Name}
	if proxier.serviceChanges.update(&namespacedName, nil, service) && proxier.isInitialized() {
		proxier.syncRunner.Run()
	}
}
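
For comparison, the update and delete handlers differ only in what they pass to serviceChanges.update as previous/current. The following is reconstructed from the same pattern, so treat it as a sketch and consult proxier.go for the exact code:

func (proxier *Proxier) OnServiceUpdate(oldService, service *api.Service) {
	namespacedName := types.NamespacedName{Namespace: service.Namespace, Name: service.Name}
	if proxier.serviceChanges.update(&namespacedName, oldService, service) && proxier.isInitialized() {
		proxier.syncRunner.Run()
	}
}

func (proxier *Proxier) OnServiceDelete(service *api.Service) {
	namespacedName := types.NamespacedName{Namespace: service.Namespace, Name: service.Name}
	if proxier.serviceChanges.update(&namespacedName, service, nil) && proxier.isInitialized() {
		proxier.syncRunner.Run()
	}
}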

After each event has been folded into the change map, the proxier calls syncRunner.Run(). The function this runner eventually executes is the syncProxyRules function that was bound to it when the proxy mode was selected:

	proxier.syncRunner = async.NewBoundedFrequencyRunner("sync-runner", proxier.syncProxyRules, minSyncPeriod, syncPeriod, burstSyncs)
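
syncRunner is a BoundedFrequencyRunner (pkg/util/async): calls to Run() are coalesced, and syncProxyRules runs at most once per minSyncPeriod but at least once per syncPeriod, so a burst of Service/Endpoints events does not turn into a burst of full iptables rewrites. Below is a minimal, self-contained sketch of that coalescing idea, not the real implementation (which additionally applies a token-bucket rate limiter):

package main

import (
	"fmt"
	"time"
)

// boundedRunner illustrates the coalescing behaviour: many Run() calls in a
// short window cause a single invocation of fn, and fn also runs periodically
// even if nobody calls Run().
type boundedRunner struct {
	fn      func()
	trigger chan struct{}
	maxWait time.Duration
}

func newBoundedRunner(fn func(), maxWait time.Duration) *boundedRunner {
	return &boundedRunner{fn: fn, trigger: make(chan struct{}, 1), maxWait: maxWait}
}

// Run requests a sync; if one is already pending, the request is coalesced.
func (b *boundedRunner) Run() {
	select {
	case b.trigger <- struct{}{}:
	default: // a sync is already queued, drop the duplicate request
	}
}

// Loop services trigger requests and the periodic resync timer.
func (b *boundedRunner) Loop(stop <-chan struct{}) {
	t := time.NewTicker(b.maxWait)
	defer t.Stop()
	for {
		select {
		case <-stop:
			return
		case <-b.trigger:
			b.fn()
		case <-t.C:
			b.fn()
		}
	}
}

func main() {
	r := newBoundedRunner(func() { fmt.Println("syncProxyRules would run here") }, 30*time.Second)
	stop := make(chan struct{})
	go r.Loop(stop)
	r.Run()
	r.Run() // coalesced with the previous request
	time.Sleep(100 * time.Millisecond)
	close(stop)
}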

This answers the second question. The third question is how all this data is used so that a packet arriving at a node actually reaches a backend pod, which brings us to the implementation of syncProxyRules.
Since we are discussing the iptables mode of kube-proxy, it works, as the name suggests, by manipulating iptables on the host, so a bit of iptables background is needed first (the "iptables详解" blog series is a good reference).
You also need to know the order in which a packet traverses the iptables tables and chains:
[Figure: iptables packet traversal order through the raw/mangle/nat/filter tables and the PREROUTING/INPUT/FORWARD/OUTPUT/POSTROUTING chains]
In short: traffic arriving from the network hits PREROUTING first and is then routed either to INPUT (destined for the local host) or FORWARD (to be forwarded), locally generated traffic goes through OUTPUT, and everything leaving the host passes POSTROUTING; DNAT is typically applied in PREROUTING/OUTPUT and SNAT/masquerading in POSTROUTING. Being comfortable with this configuration and matching order is a huge help in understanding kube-proxy's iptables mode; frankly, the mode is nothing more than ordinary iptables machinery applied systematically.
Let's now analyze syncProxyRules. The function is long, so I will not paste every line that manipulates iptables; instead we focus on the logic it implements to prepare the data those iptables operations consume.

	// We assume that if this was called, we really want to sync them,
	// even if nothing changed in the meantime. In other words, callers are
	// responsible for detecting no-op changes and not calling this function.
	serviceUpdateResult := updateServiceMap(
		proxier.serviceMap, &proxier.serviceChanges)
	endpointUpdateResult := updateEndpointsMap(
		proxier.endpointsMap, &proxier.endpointsChanges, proxier.hostname)

Looking at updateServiceMap and updateEndpointsMap, the two key helpers are merge and unmerge: merge applies the accumulated serviceChanges or endpointsChanges to the proxy map, while unmerge deletes entries that have become stale. After the changes are applied, the change map is reset to empty.

// <serviceMap> is updated by this function (based on the given changes).
// <changes> map is cleared after applying them.
func updateServiceMap(
	serviceMap proxyServiceMap,
	changes *serviceChangeMap) (result updateServiceMapResult) {
	result.staleServices = sets.NewString()

	func() {
		changes.lock.Lock()
		defer changes.lock.Unlock()
		for _, change := range changes.items {
			existingPorts := serviceMap.merge(change.current)
			serviceMap.unmerge(change.previous, existingPorts, result.staleServices)
		}
		changes.items = make(map[types.NamespacedName]*serviceChange)
	}()

	// TODO: If this will appear to be computationally expensive, consider
	// computing this incrementally similarly to serviceMap.
	result.hcServices = make(map[types.NamespacedName]uint16)
	for svcPortName, info := range serviceMap {
		if info.healthCheckNodePort != 0 {
			result.hcServices[svcPortName.NamespacedName] = uint16(info.healthCheckNodePort)
		}
	}

	return result
}
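
merge and unmerge themselves are only a few lines; their net effect can be summarized with this simplified, self-contained sketch (plain string keys instead of ServicePortName, and without the bookkeeping of stale UDP cluster IPs that the real code keeps so their conntrack entries can later be flushed):

package main

import "fmt"

// merge copies the "current" snapshot of a change into the map and reports
// which keys it touched; unmerge removes keys that appeared in the "previous"
// snapshot but no longer exist.
func merge(serviceMap, current map[string]string) map[string]bool {
	existing := map[string]bool{}
	for name, info := range current {
		existing[name] = true
		serviceMap[name] = info
	}
	return existing
}

func unmerge(serviceMap, previous map[string]string, existing map[string]bool) {
	for name := range previous {
		if existing[name] {
			continue // still present in the current snapshot, keep it
		}
		delete(serviceMap, name) // the service (port) was removed
	}
}

func main() {
	serviceMap := map[string]string{"default/web:http": "10.96.0.10:80"}
	previous := map[string]string{"default/web:http": "10.96.0.10:80"}
	current := map[string]string{"default/web:https": "10.96.0.10:443"} // port renamed

	existing := merge(serviceMap, current)
	unmerge(serviceMap, previous, existing)
	fmt.Println(serviceMap) // map[default/web:https:10.96.0.10:443]
}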

With the data prepared, the next step is using it. The iptables mode first makes sure the following chains exist on the host and are linked from the standard chains:

// Create and link the kube services chain.
	{
		tablesNeedServicesChain := []utiliptables.Table{utiliptables.TableFilter, utiliptables.TableNAT}
		for _, table := range tablesNeedServicesChain {
			if _, err := proxier.iptables.EnsureChain(table, kubeServicesChain); err != nil {
				glog.Errorf("Failed to ensure that %s chain %s exists: %v", table, kubeServicesChain, err)
				return
			}
		}

		tableChainsNeedJumpServices := []struct {
			table utiliptables.Table
			chain utiliptables.Chain
		}{
			{utiliptables.TableFilter, utiliptables.ChainInput},
			{utiliptables.TableFilter, utiliptables.ChainOutput},
			{utiliptables.TableNAT, utiliptables.ChainOutput},
			{utiliptables.TableNAT, utiliptables.ChainPrerouting},
		}
		comment := "kubernetes service portals"
		args := []string{"-m", "comment", "--comment", comment, "-j", string(kubeServicesChain)}
		for _, tc := range tableChainsNeedJumpServices {
			if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, tc.table, tc.chain, args...); err != nil {
				glog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", tc.table, tc.chain, kubeServicesChain, err)
				return
			}
		}
	}

	// Create and link the kube postrouting chain.
	{
		if _, err := proxier.iptables.EnsureChain(utiliptables.TableNAT, kubePostroutingChain); err != nil {
			glog.Errorf("Failed to ensure that %s chain %s exists: %v", utiliptables.TableNAT, kubePostroutingChain, err)
			return
		}

		comment := "kubernetes postrouting rules"
		args := []string{"-m", "comment", "--comment", comment, "-j", string(kubePostroutingChain)}
		if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, utiliptables.TableNAT, utiliptables.ChainPostrouting, args...); err != nil {
			glog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", utiliptables.TableNAT, utiliptables.ChainPostrouting, kubePostroutingChain, err)
			return
		}
	}

	// Create and link the kube forward chain.
	{
		if _, err := proxier.iptables.EnsureChain(utiliptables.TableFilter, kubeForwardChain); err != nil {
			glog.Errorf("Failed to ensure that %s chain %s exists: %v", utiliptables.TableFilter, kubeForwardChain, err)
			return
		}

		comment := "kubernetes forward rules"
		args := []string{"-m", "comment", "--comment", comment, "-j", string(kubeForwardChain)}
		if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, utiliptables.TableFilter, utiliptables.ChainForward, args...); err != nil {
			glog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", utiliptables.TableFilter, utiliptables.ChainForward, kubeForwardChain, err)
			return
		}
	}

Inside syncProxyRules, kube-proxy first runs the equivalent of iptables-save for the nat and filter tables to capture their current contents, then loops over proxier.serviceMap and proxier.endpointsMap and uses that data to build the Kubernetes-specific chains and rules.
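
To make the result concrete, the nat-table rules generated for a hypothetical Service default/web with cluster IP 10.96.0.10 and two endpoints look roughly like the following; the KUBE-SVC-/KUBE-SEP- chain suffixes are hashes of the service and endpoint names, so the values below are invented and the exact match modules differ slightly from real output:

-A PREROUTING -m comment --comment "kubernetes service portals" -j KUBE-SERVICES
-A KUBE-SERVICES -d 10.96.0.10/32 -p tcp -m comment --comment "default/web:http cluster IP" -m tcp --dport 80 -j KUBE-SVC-XXXXXXXXXXXXXXXX
-A KUBE-SVC-XXXXXXXXXXXXXXXX -m statistic --mode random --probability 0.50000000000 -j KUBE-SEP-AAAAAAAAAAAAAAAA
-A KUBE-SVC-XXXXXXXXXXXXXXXX -j KUBE-SEP-BBBBBBBBBBBBBBBB
-A KUBE-SEP-AAAAAAAAAAAAAAAA -p tcp -m tcp -j DNAT --to-destination 10.244.1.5:8080
-A KUBE-SEP-BBBBBBBBBBBBBBBB -p tcp -m tcp -j DNAT --to-destination 10.244.2.7:8080

Each level corresponds to one of the loops in syncProxyRules: traffic entering PREROUTING (or OUTPUT, for clients on the node itself) is handed to KUBE-SERVICES, the per-service KUBE-SVC chain spreads it over the endpoints with the statistic module, and each KUBE-SEP chain DNATs to one concrete pod address taken from proxier.endpointsMap.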
A few points are worth understanding here:

  • If the Endpoints object behind a service contains zero backend pods, the rule installed in iptables is a REJECT rather than a forward (see the example rule after this list);
  • If sessionAffinity is set to "ClientIP", requests coming from the same client IP are forwarded to the same backend pod (implemented with the iptables recent match);
  • If a service port defines a nodePort, then besides creating the iptables forwarding rules, kube-proxy also holds that port open on the host, so that no other process can bind the nodePort and cause access conflicts;
  • If the service defines externalIPs and one of them belongs to the local node, the handling is similar to nodePort, except that the service's own port is used for forwarding; that port is likewise held open on the node to avoid clashes with other processes;
  • If spec.loadBalancerSourceRanges is set on a LoadBalancer-type service, then in addition to the cluster IP and the NodePort, a load balancer obtained from the cloud provider forwards traffic to the service exposed on every node as <NodeIP>:NodePort; the source ranges are used to restrict, via the per-service firewall chain (KUBE-FW-*), which client addresses may reach the service.
  • When kube-proxy writes the rules back with iptables-restore, it excludes chains and rules that were not generated by Kubernetes, so it does not disturb the other iptables rules on the host.
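
As an example of the first point, a service whose Endpoints object lists no addresses ends up with a rule of roughly this form in the filter table's KUBE-SERVICES chain (values invented, exact match modules may differ):

-A KUBE-SERVICES -d 10.96.0.10/32 -p tcp -m comment --comment "default/web:http has no endpoints" -m tcp --dport 80 -j REJECT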