kube-proxy Working Modes Explained (Part 1) -- iptables

As we all know, when a Service object is created it is assigned a virtual IP address, and requests sent to that IP are forwarded to concrete backend pods, so a Service effectively acts as a reverse proxy. The point to keep in mind is that this service IP is virtual; if it is virtual, what mechanism makes it reachable at all? In many ways a Service is just an abstraction, and the component that actually makes it work is kube-proxy. Only by understanding how kube-proxy operates can we really understand how Services are implemented.
This analysis is based on the Kubernetes 1.9 release, which supports the proxy modes iptables, ipvs, userspace, winkernel, and winuserspace; here we mainly look at the iptables and ipvs modes. For kube-proxy, the questions we care about boil down to:

1. Which Kubernetes resources does kube-proxy watch?
2. How does kube-proxy handle the events it receives for those resources?
3. By what mechanism does kube-proxy implement request forwarding inside the cluster?

With these questions in mind, let's start reading the kube-proxy source. Its layout is as follows:

cmd/kube-proxy
├── app
│   ├── BUILD
│   ├── server.go  //proxy initialization and startup logic
│   ├── server_other.go
│   ├── server_test.go
│   ├── server_windows.go
├── BUILD
├── proxy.go    //kube-proxy entry point
pkg/proxy
├── apis   //API definitions
├── config  //Service/Endpoints config watchers
├── healthcheck  //health checking
├── iptables  //iptables mode
│   ├── BUILD
│   ├── OWNERS
│   ├── proxier.go   //main logic of the iptables mode
│   ├── proxier_test.go
├── ...
├── BUILD
├── OWNERS
├── doc.go
├── types.go

Let's answer the first question: which resources does kube-proxy watch?

	informerFactory := informers.NewSharedInformerFactory(s.Client, s.ConfigSyncPeriod)

	// Create configs (i.e. Watches for Services and Endpoints)
	// Note: RegisterHandler() calls need to happen before creation of Sources because sources
	// only notify on changes, and the initial update (on process start) may be lost if no handlers
	// are registered yet.
	serviceConfig := config.NewServiceConfig(informerFactory.Core().InternalVersion().Services(), s.ConfigSyncPeriod)
	serviceConfig.RegisterEventHandler(s.ServiceEventHandler)
	go serviceConfig.Run(wait.NeverStop)

	endpointsConfig := config.NewEndpointsConfig(informerFactory.Core().InternalVersion().Endpoints(), s.ConfigSyncPeriod)
	endpointsConfig.RegisterEventHandler(s.EndpointsEventHandler)
	go endpointsConfig.Run(wait.NeverStop)

	// This has to start after the calls to NewServiceConfig and NewEndpointsConfig because those
	// functions must configure their shared informer event handlers first.
	go informerFactory.Start(wait.NeverStop)

Here we see an informerFactory that watches the Service and Endpoints resources, which settles the first question. The next question is what the processing logic for these resources looks like, i.e. what the flow is for add, update, and delete events. Let's look at NewServiceConfig and NewEndpointsConfig from the code above:

// NewServiceConfig creates a new ServiceConfig.
func NewServiceConfig(serviceInformer coreinformers.ServiceInformer, resyncPeriod time.Duration) *ServiceConfig {
	result := &ServiceConfig{
		lister:       serviceInformer.Lister(),
		listerSynced: serviceInformer.Informer().HasSynced,
	}

	serviceInformer.Informer().AddEventHandlerWithResyncPeriod(
		cache.ResourceEventHandlerFuncs{
			AddFunc:    result.handleAddService,
			UpdateFunc: result.handleUpdateService,
			DeleteFunc: result.handleDeleteService,
		},
		resyncPeriod,
	)

	return result
}

// NewEndpointsConfig creates a new EndpointsConfig.
func NewEndpointsConfig(endpointsInformer coreinformers.EndpointsInformer, resyncPeriod time.Duration) *EndpointsConfig {
	result := &EndpointsConfig{
		lister:       endpointsInformer.Lister(),
		listerSynced: endpointsInformer.Informer().HasSynced,
	}

	endpointsInformer.Informer().AddEventHandlerWithResyncPeriod(
		cache.ResourceEventHandlerFuncs{
			AddFunc:    result.handleAddEndpoints,
			UpdateFunc: result.handleUpdateEndpoints,
			DeleteFunc: result.handleDeleteEndpoints,
		},
		resyncPeriod,
	)

	return result
}
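
As an aside, the handleAddService/handleUpdateService/handleDeleteService callbacks registered above are thin wrappers: they type-assert the object delivered by the informer and fan it out to every handler registered through RegisterEventHandler (for the iptables mode that handler is the Proxier). A self-contained toy version of this dispatch pattern, with simplified stand-in types rather than the real Kubernetes ones:

package main

import "fmt"

// Toy model of the informer -> ServiceConfig -> handler dispatch path; the
// names mirror the real ones but the types are simplified stand-ins.
type Service struct{ Namespace, Name string }

// ServiceHandler is the interface the iptables Proxier implements.
type ServiceHandler interface {
	OnServiceAdd(service *Service)
	OnServiceUpdate(oldService, service *Service)
	OnServiceDelete(service *Service)
}

type ServiceConfig struct{ eventHandlers []ServiceHandler }

func (c *ServiceConfig) RegisterEventHandler(h ServiceHandler) {
	c.eventHandlers = append(c.eventHandlers, h)
}

// handleAddService is what the informer would call on an Add event: it just
// type-asserts the object and forwards it to every registered handler.
func (c *ServiceConfig) handleAddService(obj interface{}) {
	service, ok := obj.(*Service)
	if !ok {
		return
	}
	for i := range c.eventHandlers {
		c.eventHandlers[i].OnServiceAdd(service)
	}
}

// fakeProxier stands in for the iptables Proxier.
type fakeProxier struct{}

func (p *fakeProxier) OnServiceAdd(s *Service)       { fmt.Println("add", s.Namespace+"/"+s.Name) }
func (p *fakeProxier) OnServiceUpdate(o, s *Service) { fmt.Println("update", s.Namespace+"/"+s.Name) }
func (p *fakeProxier) OnServiceDelete(s *Service)    { fmt.Println("delete", s.Namespace+"/"+s.Name) }

func main() {
	cfg := &ServiceConfig{}
	cfg.RegisterEventHandler(&fakeProxier{})
	cfg.handleAddService(&Service{Namespace: "default", Name: "web"})
}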

This should look familiar by now: most of the Kubernetes code base follows this same structure of list-watch plus a local cache (see this blog's series on the kubernetes tools/cache package). The informer appends watch events to a queue, processes them, stores the objects in its store, and then dispatches them to the eventHandlerFuncs we registered. Looking at the handlers for services and endpoints, the add/update/delete logic is almost identical; the only difference is that the update handler also receives the previous object. Let's look at the update methods of serviceChangeMap and endpointsChangeMap:

func (scm *serviceChangeMap) update(namespacedName *types.NamespacedName, previous, current *api.Service) bool {
	scm.lock.Lock()
	defer scm.lock.Unlock()

	change, exists := scm.items[*namespacedName]
	if !exists {
		change = &serviceChange{}
		change.previous = serviceToServiceMap(previous)
		scm.items[*namespacedName] = change
	}
	change.current = serviceToServiceMap(current)
	if reflect.DeepEqual(change.previous, change.current) {
		delete(scm.items, *namespacedName)
	}
	return len(scm.items) > 0
}

func (ecm *endpointsChangeMap) update(namespacedName *types.NamespacedName, previous, current *api.Endpoints) bool {
	ecm.lock.Lock()
	defer ecm.lock.Unlock()

	change, exists := ecm.items[*namespacedName]
	if !exists {
		change = &endpointsChange{}
		change.previous = endpointsToEndpointsMap(previous, ecm.hostname)
		ecm.items[*namespacedName] = change
	}
	change.current = endpointsToEndpointsMap(current, ecm.hostname)
	if reflect.DeepEqual(change.previous, change.current) {
		delete(ecm.items, *namespacedName)
	}
	return len(ecm.items) > 0
}

The two update methods above are essentially the same: based on previous and current they work out whether the change is an add, update, or delete, and then refresh the cached map. serviceToServiceMap builds a per-port service map from the Service object, and endpointsToEndpointsMap builds the corresponding map from the Endpoints object. The data structures behind proxyServiceMap and proxyEndpointsMap look like this:

type proxyServiceMap map[proxy.ServicePortName]*serviceInfo

// internal struct for string service information
type serviceInfo struct {
	clusterIP                net.IP
	port                     int
	protocol                 api.Protocol
	nodePort                 int
	loadBalancerStatus       api.LoadBalancerStatus
	sessionAffinityType      api.ServiceAffinity
	stickyMaxAgeSeconds      int
	externalIPs              []string
	loadBalancerSourceRanges []string
	onlyNodeLocalEndpoints   bool
	healthCheckNodePort      int
	// The following fields are computed and stored for performance reasons.
	serviceNameString        string
	servicePortChainName     utiliptables.Chain
	serviceFirewallChainName utiliptables.Chain
	serviceLBChainName       utiliptables.Chain
}

type proxyEndpointsMap map[proxy.ServicePortName][]*endpointsInfo

// internal struct for endpoints information
type endpointsInfo struct {
	endpoint string // TODO: should be an endpointString type
	isLocal  bool
	// The following fields we lazily compute and store here for performance
	// reasons. If the protocol is the same as you expect it to be, then the
	// chainName can be reused, otherwise it should be recomputed.
	protocol  string
	chainName utiliptables.Chain
}
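
To make the shapes of these two maps concrete, here is a small self-contained illustration (with simplified stand-in types, not the real ones) of what they would hold for a Service default/web whose http port is backed by two pods:

package main

import "fmt"

// Simplified stand-ins for proxy.ServicePortName, serviceInfo and
// endpointsInfo -- just to show the shape of the two maps, not the real types.
type servicePortName struct{ Namespace, Name, Port string }

type serviceInfo struct {
	clusterIP string
	port      int
	protocol  string
}

type endpointsInfo struct {
	endpoint string // "ip:port" of a backend pod
	isLocal  bool
}

func main() {
	web := servicePortName{"default", "web", "http"}

	// One entry per service port...
	services := map[servicePortName]*serviceInfo{
		web: {clusterIP: "10.96.0.10", port: 80, protocol: "TCP"},
	}
	// ...and, under the same key, the list of backends taken from Endpoints.
	endpoints := map[servicePortName][]*endpointsInfo{
		web: {
			{endpoint: "10.244.1.5:8080", isLocal: true},
			{endpoint: "10.244.2.7:8080", isLocal: false},
		},
	}

	svc := services[web]
	fmt.Printf("%s/%s:%s -> cluster IP %s:%d/%s\n",
		web.Namespace, web.Name, web.Port, svc.clusterIP, svc.port, svc.protocol)
	for _, ep := range endpoints[web] {
		fmt.Printf("  backend %s (local=%v)\n", ep.endpoint, ep.isLocal)
	}
}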

Now we know where the data comes from; what happens next? In OnServiceAdd (the update and delete handlers follow the same pattern; see the sketch after this snippet) we find:

func (proxier *Proxier) OnServiceAdd(service *api.Service) {
	namespacedName := types.NamespacedName{Namespace: service.Namespace, Name: service.Name}
	if proxier.serviceChanges.update(&namespacedName, nil, service) && proxier.isInitialized() {
		proxier.syncRunner.Run()
	}
}
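
For comparison, the update and delete handlers differ only in what they pass to serviceChanges.update as previous/current. The following is reconstructed from the same pattern, so treat it as a sketch and consult proxier.go for the exact code:

func (proxier *Proxier) OnServiceUpdate(oldService, service *api.Service) {
	namespacedName := types.NamespacedName{Namespace: service.Namespace, Name: service.Name}
	if proxier.serviceChanges.update(&namespacedName, oldService, service) && proxier.isInitialized() {
		proxier.syncRunner.Run()
	}
}

func (proxier *Proxier) OnServiceDelete(service *api.Service) {
	namespacedName := types.NamespacedName{Namespace: service.Namespace, Name: service.Name}
	if proxier.serviceChanges.update(&namespacedName, service, nil) && proxier.isInitialized() {
		proxier.syncRunner.Run()
	}
}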

After each event has been folded into the change map, the proxier calls syncRunner.Run(). The function this runner eventually executes is the syncProxyRules function that was bound to it when the proxy mode was selected:

	proxier.syncRunner = async.NewBoundedFrequencyRunner("sync-runner", proxier.syncProxyRules, minSyncPeriod, syncPeriod, burstSyncs)
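
syncRunner is a BoundedFrequencyRunner (pkg/util/async): calls to Run() are coalesced, and syncProxyRules runs at most once per minSyncPeriod but at least once per syncPeriod, so a burst of Service/Endpoints events does not turn into a burst of full iptables rewrites. Below is a minimal, self-contained sketch of that coalescing idea, not the real implementation (which additionally applies a token-bucket rate limiter):

package main

import (
	"fmt"
	"time"
)

// boundedRunner illustrates the coalescing behaviour: many Run() calls in a
// short window cause a single invocation of fn, and fn also runs periodically
// even if nobody calls Run().
type boundedRunner struct {
	fn      func()
	trigger chan struct{}
	maxWait time.Duration
}

func newBoundedRunner(fn func(), maxWait time.Duration) *boundedRunner {
	return &boundedRunner{fn: fn, trigger: make(chan struct{}, 1), maxWait: maxWait}
}

// Run requests a sync; if one is already pending, the request is coalesced.
func (b *boundedRunner) Run() {
	select {
	case b.trigger <- struct{}{}:
	default: // a sync is already queued, drop the duplicate request
	}
}

// Loop services trigger requests and the periodic resync timer.
func (b *boundedRunner) Loop(stop <-chan struct{}) {
	t := time.NewTicker(b.maxWait)
	defer t.Stop()
	for {
		select {
		case <-stop:
			return
		case <-b.trigger:
			b.fn()
		case <-t.C:
			b.fn()
		}
	}
}

func main() {
	r := newBoundedRunner(func() { fmt.Println("syncProxyRules would run here") }, 30*time.Second)
	stop := make(chan struct{})
	go r.Loop(stop)
	r.Run()
	r.Run() // coalesced with the previous request
	time.Sleep(100 * time.Millisecond)
	close(stop)
}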

This answers the second question. The third question is how all this data is used so that a packet arriving at a node actually reaches a backend pod, which brings us to the implementation of syncProxyRules.
Since we are discussing the iptables mode of kube-proxy, it works, as the name suggests, by manipulating iptables on the host, so a bit of iptables background is needed first (the "iptables详解" blog series is a good reference).
You also need to know the order in which a packet traverses the iptables tables and chains:
[Figure: iptables packet traversal order through the raw/mangle/nat/filter tables and the PREROUTING/INPUT/FORWARD/OUTPUT/POSTROUTING chains]
In short: traffic arriving from the network hits PREROUTING first and is then routed either to INPUT (destined for the local host) or FORWARD (to be forwarded), locally generated traffic goes through OUTPUT, and everything leaving the host passes POSTROUTING; DNAT is typically applied in PREROUTING/OUTPUT and SNAT/masquerading in POSTROUTING. Being comfortable with this configuration and matching order is a huge help in understanding kube-proxy's iptables mode; frankly, the mode is nothing more than ordinary iptables machinery applied systematically.
Let's now analyze syncProxyRules. The function is long, so I will not paste every line that manipulates iptables; instead we focus on the logic it implements to prepare the data those iptables operations consume.

	// We assume that if this was called, we really want to sync them,
	// even if nothing changed in the meantime. In other words, callers are
	// responsible for detecting no-op changes and not calling this function.
	serviceUpdateResult := updateServiceMap(
		proxier.serviceMap, &proxier.serviceChanges)
	endpointUpdateResult := updateEndpointsMap(
		proxier.endpointsMap, &proxier.endpointsChanges, proxier.hostname)

Looking at updateServiceMap and updateEndpointsMap, the two key helpers are merge and unmerge: merge applies the accumulated serviceChanges or endpointsChanges to the proxy map, while unmerge deletes entries that have become stale. After the changes are applied, the change map is reset to empty.

// <serviceMap> is updated by this function (based on the given changes).
// <changes> map is cleared after applying them.
func updateServiceMap(
	serviceMap proxyServiceMap,
	changes *serviceChangeMap) (result updateServiceMapResult) {
	result.staleServices = sets.NewString()

	func() {
		changes.lock.Lock()
		defer changes.lock.Unlock()
		for _, change := range changes.items {
			existingPorts := serviceMap.merge(change.current)
			serviceMap.unmerge(change.previous, existingPorts, result.staleServices)
		}
		changes.items = make(map[types.NamespacedName]*serviceChange)
	}()

	// TODO: If this will appear to be computationally expensive, consider
	// computing this incrementally similarly to serviceMap.
	result.hcServices = make(map[types.NamespacedName]uint16)
	for svcPortName, info := range serviceMap {
		if info.healthCheckNodePort != 0 {
			result.hcServices[svcPortName.NamespacedName] = uint16(info.healthCheckNodePort)
		}
	}

	return result
}
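
merge and unmerge themselves are only a few lines; their net effect can be summarized with this simplified, self-contained sketch (plain string keys instead of ServicePortName, and without the bookkeeping of stale UDP cluster IPs that the real code keeps so their conntrack entries can later be flushed):

package main

import "fmt"

// merge copies the "current" snapshot of a change into the map and reports
// which keys it touched; unmerge removes keys that appeared in the "previous"
// snapshot but no longer exist.
func merge(serviceMap, current map[string]string) map[string]bool {
	existing := map[string]bool{}
	for name, info := range current {
		existing[name] = true
		serviceMap[name] = info
	}
	return existing
}

func unmerge(serviceMap, previous map[string]string, existing map[string]bool) {
	for name := range previous {
		if existing[name] {
			continue // still present in the current snapshot, keep it
		}
		delete(serviceMap, name) // the service (port) was removed
	}
}

func main() {
	serviceMap := map[string]string{"default/web:http": "10.96.0.10:80"}
	previous := map[string]string{"default/web:http": "10.96.0.10:80"}
	current := map[string]string{"default/web:https": "10.96.0.10:443"} // port renamed

	existing := merge(serviceMap, current)
	unmerge(serviceMap, previous, existing)
	fmt.Println(serviceMap) // map[default/web:https:10.96.0.10:443]
}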

With the data prepared, the next step is using it. The iptables mode first makes sure the following chains exist on the host and are linked from the standard chains:

// Create and link the kube services chain.
	{
		tablesNeedServicesChain := []utiliptables.Table{utiliptables.TableFilter, utiliptables.TableNAT}
		for _, table := range tablesNeedServicesChain {
			if _, err := proxier.iptables.EnsureChain(table, kubeServicesChain); err != nil {
				glog.Errorf("Failed to ensure that %s chain %s exists: %v", table, kubeServicesChain, err)
				return
			}
		}

		tableChainsNeedJumpServices := []struct {
			table utiliptables.Table
			chain utiliptables.Chain
		}{
			{utiliptables.TableFilter, utiliptables.ChainInput},
			{utiliptables.TableFilter, utiliptables.ChainOutput},
			{utiliptables.TableNAT, utiliptables.ChainOutput},
			{utiliptables.TableNAT, utiliptables.ChainPrerouting},
		}
		comment := "kubernetes service portals"
		args := []string{"-m", "comment", "--comment", comment, "-j", string(kubeServicesChain)}
		for _, tc := range tableChainsNeedJumpServices {
			if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, tc.table, tc.chain, args...); err != nil {
				glog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", tc.table, tc.chain, kubeServicesChain, err)
				return
			}
		}
	}

	// Create and link the kube postrouting chain.
	{
		if _, err := proxier.iptables.EnsureChain(utiliptables.TableNAT, kubePostroutingChain); err != nil {
			glog.Errorf("Failed to ensure that %s chain %s exists: %v", utiliptables.TableNAT, kubePostroutingChain, err)
			return
		}

		comment := "kubernetes postrouting rules"
		args := []string{"-m", "comment", "--comment", comment, "-j", string(kubePostroutingChain)}
		if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, utiliptables.TableNAT, utiliptables.ChainPostrouting, args...); err != nil {
			glog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", utiliptables.TableNAT, utiliptables.ChainPostrouting, kubePostroutingChain, err)
			return
		}
	}

	// Create and link the kube forward chain.
	{
		if _, err := proxier.iptables.EnsureChain(utiliptables.TableFilter, kubeForwardChain); err != nil {
			glog.Errorf("Failed to ensure that %s chain %s exists: %v", utiliptables.TableFilter, kubeForwardChain, err)
			return
		}

		comment := "kubernetes forward rules"
		args := []string{"-m", "comment", "--comment", comment, "-j", string(kubeForwardChain)}
		if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, utiliptables.TableFilter, utiliptables.ChainForward, args...); err != nil {
			glog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", utiliptables.TableFilter, utiliptables.ChainForward, kubeForwardChain, err)
			return
		}
	}

Inside syncProxyRules, kube-proxy first runs the equivalent of iptables-save for the nat and filter tables to capture their current contents, then loops over proxier.serviceMap and proxier.endpointsMap and uses that data to build the Kubernetes-specific chains and rules.
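
To make the result concrete, the nat-table rules generated for a hypothetical Service default/web with cluster IP 10.96.0.10 and two endpoints look roughly like the following; the KUBE-SVC-/KUBE-SEP- chain suffixes are hashes of the service and endpoint names, so the values below are invented and the exact match modules differ slightly from real output:

-A PREROUTING -m comment --comment "kubernetes service portals" -j KUBE-SERVICES
-A KUBE-SERVICES -d 10.96.0.10/32 -p tcp -m comment --comment "default/web:http cluster IP" -m tcp --dport 80 -j KUBE-SVC-XXXXXXXXXXXXXXXX
-A KUBE-SVC-XXXXXXXXXXXXXXXX -m statistic --mode random --probability 0.50000000000 -j KUBE-SEP-AAAAAAAAAAAAAAAA
-A KUBE-SVC-XXXXXXXXXXXXXXXX -j KUBE-SEP-BBBBBBBBBBBBBBBB
-A KUBE-SEP-AAAAAAAAAAAAAAAA -p tcp -m tcp -j DNAT --to-destination 10.244.1.5:8080
-A KUBE-SEP-BBBBBBBBBBBBBBBB -p tcp -m tcp -j DNAT --to-destination 10.244.2.7:8080

Each level corresponds to one of the loops in syncProxyRules: traffic entering PREROUTING (or OUTPUT, for clients on the node itself) is handed to KUBE-SERVICES, the per-service KUBE-SVC chain spreads it over the endpoints with the statistic module, and each KUBE-SEP chain DNATs to one concrete pod address taken from proxier.endpointsMap.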
A few points are worth understanding here:

  • If the Endpoints object behind a service contains zero backend pods, the rule installed in iptables is a REJECT rather than a forward (see the example rule after this list);
  • If sessionAffinity is set to "ClientIP", requests coming from the same client IP are forwarded to the same backend pod (implemented with the iptables recent match);
  • If a service port defines a nodePort, then besides creating the iptables forwarding rules, kube-proxy also holds that port open on the host, so that no other process can bind the nodePort and cause access conflicts;
  • If the service defines externalIPs and one of them belongs to the local node, the handling is similar to nodePort, except that the service's own port is used for forwarding; that port is likewise held open on the node to avoid clashes with other processes;
  • If spec.loadBalancerSourceRanges is set on a LoadBalancer-type service, then in addition to the cluster IP and the NodePort, a load balancer obtained from the cloud provider forwards traffic to the service exposed on every node as <NodeIP>:NodePort; the source ranges are used to restrict, via the per-service firewall chain (KUBE-FW-*), which client addresses may reach the service.
  • When kube-proxy writes the rules back with iptables-restore, it excludes chains and rules that were not generated by Kubernetes, so it does not disturb the other iptables rules on the host.
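
As an example of the first point, a service whose Endpoints object lists no addresses ends up with a rule of roughly this form in the filter table's KUBE-SERVICES chain (values invented, exact match modules may differ):

-A KUBE-SERVICES -d 10.96.0.10/32 -p tcp -m comment --comment "default/web:http has no endpoints" -m tcp --dport 80 -j REJECT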