NSQ Series (Part 2): nsqd Internals and Implementation

In the previous article we got an overview of NSQ's cluster architecture and interaction flow. Here we will dig into the code to see how some of nsqd's features are implemented, mainly:

  • nsqd and Producers: how nsqd handles messages published by a Producer, and how deferred (delayed) messages are handled
  • nsqd and Consumers: how nsqd handles Consumer client connections, how it delivers messages to Consumers, and what happens when a message's ack times out
  • nsqd and nsqlookupd: how nsqd periodically pings nsqlookupd and keeps it informed of Topic and Channel changes

Data structures

Let's start with the design of the data structures. There are three main parts: NSQD, Topic, and Channel.

NSQD

The NSQD struct is defined as follows:

type NSQD struct {
	// 64bit atomic vars need to be first for proper alignment on 32bit platforms
	clientIDSequence int64

	sync.RWMutex

	opts atomic.Value

	dl        *dirlock.DirLock
	isLoading int32
	errValue  atomic.Value
	startTime time.Time

	topicMap map[string]*Topic

	clientLock sync.RWMutex
	clients    map[int64]Client

	lookupPeers atomic.Value

	tcpListener   net.Listener
	httpListener  net.Listener
	httpsListener net.Listener
	tlsConfig     *tls.Config

	poolSize int

	notifyChan           chan interface{}
	optsNotificationChan chan struct{}
	exitChan             chan int
	waitGroup            util.WaitGroupWrapper

	ci *clusterinfo.ClusterInfo
}

Some of NSQD's key fields:

topicMap map[string]*Topic

stores all Topics, keyed by topic name.

clients    map[int64]Client

stores all clients currently connected to this nsqd, keyed by ClientID (nsqd assigns an ID to each client when the connection is established).

Topic

type Topic struct {
	// 64bit atomic vars need to be first for proper alignment on 32bit platforms
	messageCount uint64
	messageBytes uint64

	sync.RWMutex

	name              string
	// the Channels belonging to this Topic
	channelMap        map[string]*Channel
	// backend message store (currently disk-based)
	backend           BackendQueue
	// in-memory message store
	memoryMsgChan     chan *Message
	startChan         chan int
	exitChan          chan int
	// signals that the Topic's set of Channels has changed
	channelUpdateChan chan int
	waitGroup         util.WaitGroupWrapper
	exitFlag          int32
	idFactory         *guidFactory

	// whether this is an ephemeral Topic
	ephemeral      bool
	deleteCallback func(*Topic)
	deleter        sync.Once

	paused    int32
	pauseChan chan int

	ctx *context
}

Channel

In NSQ, a message is published by a Producer to a Topic, and the Topic then delivers it to each of its Channels. Like a Topic, a Channel stores messages both in memory and in a backend store, and the backend store is currently disk-based as well.

type Channel struct {
	// 64bit atomic vars need to be first for proper alignment on 32bit platforms
	requeueCount uint64
	messageCount uint64
	timeoutCount uint64

	sync.RWMutex

	topicName string
	name      string
	ctx       *context
	
	// backend (disk) store
	backend BackendQueue
	// in-memory store
	memoryMsgChan chan *Message
	exitFlag      int32
	exitMutex     sync.RWMutex

	// state tracking
	clients        map[int64]Consumer
	paused         int32
	// whether this is an ephemeral Channel
	ephemeral      bool
	deleteCallback func(*Channel)
	deleter        sync.Once

	// Stats tracking
	e2eProcessingLatencyStream *quantile.Quantile

	// TODO: these can be DRYd up
	// NSQ supports deferred (delayed) publishing; the Channel tracks deferred messages here
	// map of all messages waiting to be sent after their delay
	deferredMessages map[MessageID]*pqueue.Item
	// deferred queue: a heap ordered by the time each message becomes due
	deferredPQ       pqueue.PriorityQueue
	deferredMutex    sync.Mutex
	// map of messages currently in flight (delivered but not yet acknowledged)
	inFlightMessages map[MessageID]*Message
	// likewise a heap, ordered by each in-flight message's timeout
	inFlightPQ       inFlightPqueue
	inFlightMutex    sync.Mutex
}
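
To make the deferred/in-flight bookkeeping above more concrete, here is a condensed sketch of how a message becomes "in flight" when it is delivered to a client. It is based on my reading of the nsqd source and simplified (helper methods are inlined), so names may differ slightly between versions:

// Sketch: when messagePump delivers a message, the Channel records it as
// in flight so that queueScanLoop can later detect an ack timeout.
func (c *Channel) StartInFlightTimeout(msg *Message, clientID int64, timeout time.Duration) error {
	now := time.Now()
	msg.clientID = clientID
	msg.deliveryTS = now
	// pri is the absolute deadline; inFlightPQ is a min-heap ordered by it
	msg.pri = now.Add(timeout).UnixNano()

	c.inFlightMutex.Lock()
	defer c.inFlightMutex.Unlock()
	if _, ok := c.inFlightMessages[msg.ID]; ok {
		return errors.New("ID already in flight")
	}
	// index by ID so FIN/REQ/TOUCH can look the message up...
	c.inFlightMessages[msg.ID] = msg
	// ...and push it onto the timeout-ordered heap
	c.inFlightPQ.Push(msg)
	return nil
}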

Implementation

nsqd startup

Let's first look at how nsqd starts up:

func (p *program) Start() error {
	// load and validate the startup options
	opts := nsqd.NewOptions()
	...

	// load metadata
	err := p.nsqd.LoadMetadata()
	// persist metadata
	err = p.nsqd.PersistMetadata()
	// start the nsqd instance asynchronously
	go func() {
		err := p.nsqd.Main()
		if err != nil {
			p.Stop()
			os.Exit(1)
		}
	}()
	return nil
}

The code above loads and persists MetaData, which records the Topic and Channel information, so that after a crash nsqd can restart and continue serving just as it did before.
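
For reference, the metadata is a small JSON file (typically nsqd.dat in the data directory) listing the known Topics and Channels and whether they are paused. The Go types below are only an illustration of its rough shape (the struct name is mine, not from the source):

// Illustrative only: the approximate shape of the data that LoadMetadata
// reads and PersistMetadata writes, so Topics and Channels survive a restart.
type metadataFile struct {
	Version string `json:"version"`
	Topics  []struct {
		Name     string `json:"name"`
		Paused   bool   `json:"paused"`
		Channels []struct {
			Name   string `json:"name"`
			Paused bool   `json:"paused"`
		} `json:"channels"`
	} `json:"topics"`
}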

func (n *NSQD) Main() error {
	ctx := &context{n}

	// collect the first fatal error from any listener/server and exit
	exitCh := make(chan error)
	var once sync.Once
	exitFunc := func(err error) {
		once.Do(func() {
			if err != nil {
				n.logf(LOG_FATAL, "%s", err)
			}
			exitCh <- err
		})
	}

	tcpServer := &tcpServer{ctx: ctx}
	n.waitGroup.Wrap(func() {
		exitFunc(protocol.TCPServer(n.tcpListener, tcpServer, n.logf))
	})
	httpServer := newHTTPServer(ctx, false, n.getOpts().TLSRequired == TLSRequired)
	n.waitGroup.Wrap(func() {
		exitFunc(http_api.Serve(n.httpListener, httpServer, "HTTP", n.logf))
	})
	if n.tlsConfig != nil && n.getOpts().HTTPSAddress != "" {
		httpsServer := newHTTPServer(ctx, true, true)
		n.waitGroup.Wrap(func() {
			exitFunc(http_api.Serve(n.httpsListener, httpsServer, "HTTPS", n.logf))
		})
	}

	n.waitGroup.Wrap(n.queueScanLoop)
	n.waitGroup.Wrap(n.lookupLoop)
	if n.getOpts().StatsdAddress != "" {
		n.waitGroup.Wrap(n.statsdLoop)
	}

	err := <-exitCh
	return err
}

On startup, besides the TCP server and the HTTP server (nsq supports publishing messages over both TCP and HTTP, but consuming only over TCP), nsqd also starts three goroutine loops: queueScanLoop, lookupLoop, and statsdLoop.

Serving requests

In the startup section we saw that nsqd starts a TCP server, an HTTP server, and three goroutines. This section looks at what each of these servers and loops does.

TCP Server

nsq supports publishing and consuming messages, as well as creating and deleting Topics and Channels, over a TCP connection. The TCP server's Handle method handles each TCP connection established with nsqd. nsq defines a simple wire protocol (see https://nsq.io/clients/tcp_protocol_spec.html for the details), implemented in the code by protocolV2's IOLoop method.
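
To make the wire format concrete, the sketch below hand-rolls a tiny producer following the protocol spec linked above (in practice you would use an official client library such as go-nsq). It assumes an nsqd listening on the default TCP port at 127.0.0.1:4150 and keeps error handling minimal:

package main

import (
	"encoding/binary"
	"fmt"
	"io"
	"net"
)

func main() {
	conn, err := net.Dial("tcp", "127.0.0.1:4150")
	if err != nil {
		panic(err)
	}
	defer conn.Close()

	// 1. protocol magic: the four bytes "  V2"
	conn.Write([]byte("  V2"))

	// 2. PUB command: "PUB <topic>\n" + 4-byte big-endian body size + body
	body := []byte("hello nsq")
	conn.Write([]byte("PUB test_topic\n"))
	binary.Write(conn, binary.BigEndian, int32(len(body)))
	conn.Write(body)

	// 3. responses are framed as [4-byte size][4-byte frame type][data]
	var size, frameType int32
	binary.Read(conn, binary.BigEndian, &size)
	binary.Read(conn, binary.BigEndian, &frameType)
	data := make([]byte, size-4)
	io.ReadFull(conn, data)
	fmt.Printf("frame type %d: %s\n", frameType, data) // expect frame type 0 and "OK"
}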

IOLoop

func (p *protocolV2) IOLoop(conn net.Conn) error {
	// register the client; every client that connects to nsqd is assigned an ID
	clientID := atomic.AddInt64(&p.ctx.nsqd.clientIDSequence, 1)
	client := newClientV2(clientID, conn, p.ctx)
	p.ctx.nsqd.AddClient(client.ID, client)

	// start the messagePump goroutine, which delivers messages to this client (explained in detail below)
	// messagePumpStartedChan ensures the messagePump goroutine has fully started before we continue
	messagePumpStartedChan := make(chan bool)
	go p.messagePump(client, messagePumpStartedChan)
	<-messagePumpStartedChan

	// loop, reading and handling the commands sent by the client
	for {
		// set up the client's heartbeat / read deadline
		...

		// read one command line
		line, err = client.Reader.ReadSlice('\n')
		...
		params := bytes.Split(line, separatorBytes)

		var response []byte
		// Exec implements the command-handling logic (covered below)
		response, err = p.Exec(client, params)
		if err != nil {
			// if handling the command failed, send the error back to the client
			...
			// if sending the error fails, or the error is a FatalClientErr, terminate the connection
			sendErr := p.Send(client, frameTypeError, []byte(err.Error()))
			if sendErr != nil {
				break
			}
			if _, ok := err.(*protocol.FatalClientErr); ok {
				break
			}
			continue
		}
		// send the result back to the client
		if response != nil {
			err = p.Send(client, frameTypeResponse, response)
			if err != nil {
				err = fmt.Errorf("failed to send response - %s", err)
				break
			}
		}
	}
	
	// close the connection, close ExitChan, and remove the client registered with nsqd
	conn.Close()
	close(client.ExitChan)
	if client.Channel != nil {
		client.Channel.RemoveClient(client.ID)
	}

	p.ctx.nsqd.RemoveClient(client.ID)
	return err
}

IOLoop simply loops: it reads a command from the client, executes it, and sends back the result. Two parts of this flow matter most (messagePump and Exec, each covered separately below); the details are in the code comments.

messagePump

Before entering its for loop, IOLoop starts a messagePump goroutine that delivers messages to the client. Let's walk through it; you can follow the code top to bottom, as there is nothing particularly convoluted about it:

func (p *protocolV2) messagePump(client *clientV2, startedChan chan bool) {
	// local state...
	var err error
	// the subscribed Channel's in-memory message chan
	var memoryMsgChan chan *Message
	// the subscribed Channel's backend (disk) message chan
	var backendMsgChan chan []byte
	// the Channel this client has subscribed to
	var subChannel *Channel
	// NOTE: `flusherChan` is used to bound message latency for
	// the pathological case of a channel on a low volume topic
	// with >1 clients having >1 RDY counts
	var flusherChan <-chan time.Time
	// message sampling rate
	var sampleRate int32

	subEventChan := client.SubEventChan
	identifyEventChan := client.IdentifyEventChan
	outputBufferTicker := time.NewTicker(client.OutputBufferTimeout)
	heartbeatTicker := time.NewTicker(client.HeartbeatInterval)
	heartbeatChan := heartbeatTicker.C
	msgTimeout := client.MsgTimeout

	// v2 opportunistically buffers data to clients to reduce write system calls
	// we force flush in two cases:
	//    1. when the client is not ready to receive messages
	//    2. we're buffered and the channel has nothing left to send us
	//       (ie. we would block in this loop anyway)
	//
	flushed := true

	// signal to the goroutine that started the messagePump
	// that we've started up
	close(startedChan)

	for {
		// the client has not subscribed to a Channel yet,
		// or is not ready to receive messages (the Channel is paused, or too many messages are already in flight)
		if subChannel == nil || !client.IsReadyForMessages() {
			// the client is not ready to receive messages...
			memoryMsgChan = nil
			backendMsgChan = nil
			flusherChan = nil
			// force flush
			client.writeLock.Lock()
			err = client.Flush()
			client.writeLock.Unlock()
			if err != nil {
				goto exit
			}
			flushed = true
		} else if flushed {
			// last iteration we flushed...
			// do not select on the flusher ticker channel
			memoryMsgChan = subChannel.memoryMsgChan
			backendMsgChan = subChannel.backend.ReadChan()
			flusherChan = nil
		} else {
			// we're buffered (if there isn't any more data we should flush)...
			// select on the flusher ticker channel, too
			memoryMsgChan = subChannel.memoryMsgChan
			backendMsgChan = subChannel.backend.ReadChan()
			flusherChan = outputBufferTicker.C
		}

		select {
		case <-flusherChan:
			// if this case wins, we're either starved
			// or we won the race between other channels...
			// in either case, force flush
			client.writeLock.Lock()
			err = client.Flush()
			client.writeLock.Unlock()
			if err != nil {
				goto exit
			}
			flushed = true
		case <-client.ReadyStateChan:
		case subChannel = <-subEventChan:
			// you can't SUB anymore
			subEventChan = nil
		case identifyData := <-identifyEventChan:
			// you can't IDENTIFY anymore
			identifyEventChan = nil

			outputBufferTicker.Stop()
			if identifyData.OutputBufferTimeout > 0 {
				outputBufferTicker = time.NewTicker(identifyData.OutputBufferTimeout)
			}

			heartbeatTicker.Stop()
			heartbeatChan = nil
			if identifyData.HeartbeatInterval > 0 {
				heartbeatTicker = time.NewTicker(identifyData.HeartbeatInterval)
				heartbeatChan = heartbeatTicker.C
			}

			if identifyData.SampleRate > 0 {
				sampleRate = identifyData.SampleRate
			}

			msgTimeout = identifyData.MsgTimeout
		case <-heartbeatChan:
			err = p.Send(client, frameTypeResponse, heartbeatBytes)
			if err != nil {
				goto exit
			}
		case b := <-backendMsgChan:
			// read a message from the backend (disk) chan and deliver it to the client
			// if a sample_rate was specified, only a sampled fraction of messages is delivered
			if sampleRate > 0 && rand.Int31n(100) > sampleRate {
				continue
			}

			msg, err := decodeMessage(b)
			if err != nil {
				continue
			}
			// record the delivery attempt
			msg.Attempts++
			// deliver the message
			subChannel.StartInFlightTimeout(msg, client.ID, msgTimeout)
			client.SendingMessage()
			err = p.SendMessage(client, msg)
			if err != nil {
				goto exit
			}
			flushed = false
		case msg := <-memoryMsgChan:
			// delivering a message from the in-memory chan is identical to the backend chan case above
			if sampleRate > 0 && rand.Int31n(100) > sampleRate {
				continue
			}
			msg.Attempts++

			subChannel.StartInFlightTimeout(msg, client.ID, msgTimeout)
			client.SendingMessage()
			err = p.SendMessage(client, msg)
			if err != nil {
				goto exit
			}
			flushed = false
		case <-client.ExitChan:
			goto exit
		}
	}

exit:
	p.ctx.nsqd.logf(LOG_INFO, "PROTOCOL(V2): [%s] exiting messagePump", client)
	heartbeatTicker.Stop()
	outputBufferTicker.Stop()
}

Note: in the code above that pulls a message from the Channel and delivers it to the Consumer, there is no priority between the in-memory chan and the backend store; when both have messages ready, one is picked at random.
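
This is simply how Go's select statement behaves: when more than one case is ready, one is chosen pseudo-randomly. The standalone demo below (not nsqd code) illustrates it; over many iterations the "memory" and "backend" cases win roughly equally often:

package main

import "fmt"

func main() {
	mem := make(chan string, 1)
	disk := make(chan string, 1)
	counts := map[string]int{}

	for i := 0; i < 1000; i++ {
		mem <- "memory"
		disk <- "backend"

		// both cases are ready, so Go picks one at random
		select {
		case src := <-mem:
			counts[src]++
		case src := <-disk:
			counts[src]++
		}

		// drain whichever channel was not chosen so the next round starts clean
		select {
		case <-mem:
		case <-disk:
		}
	}
	fmt.Println(counts) // roughly 500 each
}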

Exec

The Exec method is a thin wrapper that dispatches to the handler for each command. Let's pick a few common commands and look at how they work.

func (p *protocolV2) Exec(client *clientV2, params [][]byte) ([]byte, error) {
	if bytes.Equal(params[0], []byte("IDENTIFY")) {
		return p.IDENTIFY(client, params)
	}
	err := enforceTLSPolicy(client, p, params[0])
	if err != nil {
		return nil, err
	}
	switch {
	case bytes.Equal(params[0], []byte("FIN")):
		return p.FIN(client, params)
	case bytes.Equal(params[0], []byte("RDY")):
		return p.RDY(client, params)
	case bytes.Equal(params[0], []byte("REQ")):
		return p.REQ(client, params)
	case bytes.Equal(params[0], []byte("PUB")):
		return p.PUB(client, params)
	case bytes.Equal(params[0], []byte("MPUB")):
		return p.MPUB(client, params)
	case bytes.Equal(params[0], []byte("DPUB")):
		return p.DPUB(client, params)
	case bytes.Equal(params[0], []byte("NOP")):
		return p.NOP(client, params)
	case bytes.Equal(params[0], []byte("TOUCH")):
		return p.TOUCH(client, params)
	case bytes.Equal(params[0], []byte("SUB")):
		return p.SUB(client, params)
	case bytes.Equal(params[0], []byte("CLS")):
		return p.CLS(client, params)
	case bytes.Equal(params[0], []byte("AUTH")):
		return p.AUTH(client, params)
	}
	return nil, protocol.NewFatalClientErr(nil, "E_INVALID", fmt.Sprintf("invalid command %s", params[0]))
}

PUB

PUB is how a Producer publishes a message. Related publish commands are MPUB (batch publish) and DPUB (deferred publish), whose flows are similar, so let's walk through the whole publish flow using PUB.

func (p *protocolV2) PUB(client *clientV2, params [][]byte) ([]byte, error) {
	// validate parameters, read the message body, check authorization
	...
	messageBody := make([]byte, bodyLen)
	...
	// get the Topic the message belongs to
	// if this nsqd does not know the Topic yet, it creates it and fetches the Topic's Channel information from nsqlookupd
	topic := p.ctx.nsqd.GetTopic(topicName)
	msg := NewMessage(topic.GenerateID(), messageBody)
	// deliver the message to the Topic
	err = topic.PutMessage(msg)
	if err != nil {
		return nil, protocol.NewFatalClientErr(err, "E_PUB_FAILED", "PUB failed "+err.Error())
	}

	client.PublishedMessage(topicName, 1)

	return okBytes, nil
}

Delivering a message to the Topic means storing it either in memory or in the backend:

func (t *Topic) put(m *Message) error {
	select {
	case t.memoryMsgChan <- m:
	default:
		b := bufferPoolGet()
		err := writeMessageToBackend(b, m, t.backend)
		bufferPoolPut(b)
		t.ctx.nsqd.SetHealth(err)
		if err != nil {
			return err
		}
	}
	return nil
}

As you can see, a message is preferentially written to the in-memory chan; only when that chan is full does it get written to the backend store.
You may wonder where the message goes from here: when nsqd creates a Topic it also starts a messagePump goroutine for that Topic, which forwards messages from the Topic to each of its Channels.

// the Topic's messagePump pulls messages from the in-memory chan and the backend store and delivers them to every Channel
func (t *Topic) messagePump() {
	var msg *Message
	var buf []byte
	var err error
	var chans []*Channel
	var memoryMsgChan chan *Message
	var backendChan chan []byte

	// do not pass messages before Start(), but avoid blocking Pause() or GetChannel()
	for {
		select {
		case <-t.channelUpdateChan:
			continue
		case <-t.pauseChan:
			continue
		case <-t.exitChan:
			goto exit
		case <-t.startChan:
		}
		break
	}
	t.RLock()
	// collect all of the Topic's Channels
	for _, c := range t.channelMap {
		chans = append(chans, c)
	}
	t.RUnlock()
	if len(chans) > 0 && !t.IsPaused() {
		memoryMsgChan = t.memoryMsgChan
		backendChan = t.backend.ReadChan()
	}

	// main message loop
	for {
		select {
		case msg = <-memoryMsgChan:
		case buf = <-backendChan:
			msg, err = decodeMessage(buf)
			if err != nil {
				continue
			}
		case <-t.channelUpdateChan:
			// the set of Channels changed; refresh the local list
			chans = chans[:0]
			t.RLock()
			for _, c := range t.channelMap {
				chans = append(chans, c)
			}
			t.RUnlock()
			...
			continue
		case <-t.pauseChan:
			...
			continue
		case <-t.exitChan:
			goto exit
		}

		for i, channel := range chans {
			chanMsg := msg
			// each Channel needs its own copy of the message,
			// but the Topic already allocated one, so the first Channel reuses it
			if i > 0 {
				chanMsg = NewMessage(msg.ID, msg.Body)
				chanMsg.Timestamp = msg.Timestamp
				chanMsg.deferred = msg.deferred
			}
			// if the message is deferred, put it into the Channel's deferred queue
			if chanMsg.deferred != 0 {
				channel.PutMessageDeferred(chanMsg, chanMsg.deferred)
				continue
			}
			err := channel.PutMessage(chanMsg)
			if err != nil {
				t.ctx.nsqd.logf(LOG_ERROR,
					"TOPIC(%s) ERROR: failed to put msg(%s) to channel(%s) - %s",
					t.name, msg.ID, channel.name, err)
			}
		}
	}

exit:
	t.ctx.nsqd.logf(LOG_INFO, "TOPIC(%s): closing ... messagePump", t.name)
}
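
For the deferred branch above (PutMessageDeferred), here is a condensed sketch of what happens inside the Channel, based on my reading of the source (the real code splits this across a few helper methods): the message is wrapped in a pqueue.Item whose Priority is the absolute time at which it becomes deliverable, indexed by ID, and pushed onto the deferred heap, where queueScanLoop will later find it.

// Condensed sketch: how a deferred message enters the Channel's deferred queue.
func (c *Channel) PutMessageDeferred(msg *Message, timeout time.Duration) {
	atomic.AddUint64(&c.messageCount, 1)

	// absolute due time, used as the heap priority
	absTs := time.Now().Add(timeout).UnixNano()
	item := &pqueue.Item{Value: msg, Priority: absTs}

	c.deferredMutex.Lock()
	// index by ID so the message can be found later...
	c.deferredMessages[msg.ID] = item
	// ...and push it onto the min-heap ordered by due time
	heap.Push(&c.deferredPQ, item)
	c.deferredMutex.Unlock()
}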

Once the message has been delivered to the Channels, the PUB flow is complete; the message sits in the Channel waiting to be consumed.

SUB

SUB is the consume command a Consumer sends after establishing a connection with nsqd. Its logic boils down to registering the Consumer client with the corresponding Channel; the actual delivery of messages from the Channel to the client happens in the TCP server's messagePump described earlier.

func (p *protocolV2) SUB(client *clientV2, params [][]byte) ([]byte, error) {
	// validate parameters; resolve the Topic and Channel to subscribe to; check authorization
	...
	// This retry-loop is a work-around for a race condition, where the
	// last client can leave the channel between GetChannel() and AddClient().
	// Avoid adding a client to an ephemeral channel / topic which has started exiting.
	var channel *Channel
	for {
		topic := p.ctx.nsqd.GetTopic(topicName)
		channel = topic.GetChannel(channelName)
		if err := channel.AddClient(client.ID, client); err != nil {
			return nil, protocol.NewFatalClientErr(nil, "E_TOO_MANY_CHANNEL_CONSUMERS",
				fmt.Sprintf("channel consumers for %s:%s exceeds limit of %d",
					topicName, channelName, p.ctx.nsqd.getOpts().MaxChannelConsumers))
		}

		if (channel.ephemeral && channel.Exiting()) || (topic.ephemeral && topic.Exiting()) {
			channel.RemoveClient(client.ID)
			time.Sleep(1 * time.Millisecond)
			continue
		}
		break
	}
	atomic.StoreInt32(&client.State, stateSubscribed)
	client.Channel = channel
	// update message pump
	client.SubEventChan <- channel

	return okBytes, nil
}

queueScanLoop

queueScanLoop is responsible for two message queues per Channel: the in-flight queue and the deferred queue. It maintains a worker pool to process Channels concurrently, and the pool size is adjusted dynamically (the maximum, QueueScanWorkerPoolMax, defaults to 4).
queueScanLoop periodically picks a random batch of channels (the batch size is controlled by QueueScanSelectionCount) and processes them.

func (n *NSQD) queueScanLoop() {
	workCh := make(chan *Channel, n.getOpts().QueueScanSelectionCount)
	responseCh := make(chan bool, n.getOpts().QueueScanSelectionCount)
	closeCh := make(chan int)
	
	// timers driving the periodic channel scan and the periodic refresh of the channel list
	workTicker := time.NewTicker(n.getOpts().QueueScanInterval)
	refreshTicker := time.NewTicker(n.getOpts().QueueScanRefreshInterval)

	channels := n.channels()
	// size and start the worker pool
	// pool size: 1 <= poolSize <= min(len(channels) * 0.25, QueueScanWorkerPoolMax)
	n.resizePool(len(channels), workCh, responseCh, closeCh)

	for {
		select {
		case <-workTicker.C:
			if len(channels) == 0 {
				continue
			}
		case <-refreshTicker.C:
			channels = n.channels()
			n.resizePool(len(channels), workCh, responseCh, closeCh)
			continue
		case <-n.exitChan:
			goto exit
		}
		
		// decide how many channels to sample in this round
		num := n.getOpts().QueueScanSelectionCount
		if num > len(channels) {
			num = len(channels)
		}

	loop:
		for _, i := range util.UniqRands(num, len(channels)) {
			workCh <- channels[i]
		}

		numDirty := 0
		for i := 0; i < num; i++ {
			if <-responseCh {
				numDirty++
			}
		}
		// if too many channels were dirty, skip the wait and start another scan round immediately
		if float64(numDirty)/float64(num) > n.getOpts().QueueScanDirtyPercent {
			goto loop
		}
	}

exit:
	n.logf(LOG_INFO, "QUEUESCAN: closing")
	close(closeCh)
	workTicker.Stop()
	refreshTicker.Stop()
}
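
The sizing rule from the comment above (1 <= poolSize <= min(len(channels) * 0.25, QueueScanWorkerPoolMax)) lives in resizePool, which starts or stops queueScanWorker goroutines until the pool matches the ideal size. Roughly (condensed from the source):

func (n *NSQD) resizePool(num int, workCh chan *Channel, responseCh chan bool, closeCh chan int) {
	idealPoolSize := int(float64(num) * 0.25)
	if idealPoolSize < 1 {
		idealPoolSize = 1
	} else if idealPoolSize > n.getOpts().QueueScanWorkerPoolMax {
		idealPoolSize = n.getOpts().QueueScanWorkerPoolMax
	}
	for {
		if idealPoolSize == n.poolSize {
			break
		} else if idealPoolSize < n.poolSize {
			// contract: ask one worker to exit
			closeCh <- 1
			n.poolSize--
		} else {
			// expand: start another queueScanWorker goroutine
			n.waitGroup.Wrap(func() {
				n.queueScanWorker(workCh, responseCh, closeCh)
			})
			n.poolSize++
		}
	}
}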

Each worker processes a Channel as follows:

func (n *NSQD) queueScanWorker(workCh chan *Channel, responseCh chan bool, closeCh chan int) {
	for {
		select {
		case c := <-workCh:
			now := time.Now().UnixNano()
			dirty := false
			if c.processInFlightQueue(now) {
				dirty = true
			}
			if c.processDeferredQueue(now) {
				dirty = true
			}
			responseCh <- dirty
		case <-closeCh:
			return
		}
	}
}

When a worker receives a Channel to process, it first processes the in-flight message queue (processInFlightQueue) and then the deferred message queue (processDeferredQueue).

  • processInFlightQueue. The Channel keeps in-flight messages in a priority queue (heap) ordered by their timeout. The worker passes in the current time and pops every message whose deadline is earlier than now (meaning it timed out without being acknowledged), removes it from the queue, and puts it back into the Channel for redelivery (see the sketch after this list).
  • processDeferredQueue. nsqd likewise keeps deferred messages in a priority queue ordered by their scheduled send time, so the worker can tell which messages are now due and put them back into the Channel. Deferred publishing is one of NSQ's features; note that deferred messages are held in memory, so they can be lost if the process crashes, and a large volume of them consumes a lot of memory.
  • About dirty: a channel is marked dirty when it had timed-out messages to redeliver or deferred messages that came due. If the proportion of dirty channels exceeds the configured threshold (QueueScanDirtyPercent), another scan round starts immediately instead of waiting for the next tick, which keeps message processing from falling behind.
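
As referenced in the first bullet, here is a condensed sketch of processInFlightQueue (simplified from the source; processDeferredQueue follows the same pattern against the deferred heap). It keeps popping messages whose deadline has passed, removes them from the in-flight index, and puts them back into the Channel; the return value is the dirty flag consumed by queueScanWorker:

func (c *Channel) processInFlightQueue(t int64) bool {
	dirty := false
	for {
		c.inFlightMutex.Lock()
		// PeekAndShift pops the heap root only if its deadline (pri) is <= t,
		// i.e. the message timed out without being FIN'd
		msg, _ := c.inFlightPQ.PeekAndShift(t)
		c.inFlightMutex.Unlock()

		if msg == nil {
			// nothing (more) has expired
			return dirty
		}
		dirty = true

		// drop it from the in-flight index and count the timeout...
		c.inFlightMutex.Lock()
		delete(c.inFlightMessages, msg.ID)
		c.inFlightMutex.Unlock()
		atomic.AddUint64(&c.timeoutCount, 1)

		// ...then put it back into the Channel so it will be delivered again
		c.put(msg)
	}
}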

lookupLoop

lookupLoop handles the interaction with nsqlookupd. Its main jobs are:

  • periodically pinging every nsqlookupd
  • picking up changes to the nsqlookupd address list and establishing connections to all configured nsqlookupds
  • propagating Topic and Channel creation/removal on this nsqd to every nsqlookupd (via Register/UnRegister commands)

func (n *NSQD) lookupLoop() {
	var lookupPeers []*lookupPeer
	var lookupAddrs []string
	connect := true

	hostname, err := os.Hostname()
	if err != nil {
		n.logf(LOG_FATAL, "failed to get hostname - %s", err)
		os.Exit(1)
	}

	// for announcements, lookupd determines the host automatically
	ticker := time.Tick(15 * time.Second)
	for {
		// connect marks whether new connections need to be established;
		// it is true on startup and whenever a new nsqlookupd address is added
		if connect {
			for _, host := range n.getOpts().NSQLookupdTCPAddresses {
				if in(host, lookupAddrs) {
					continue
				}
				lookupPeer := newLookupPeer(host, n.getOpts().MaxBodySize, n.logf, connectCallback(n, hostname))
				lookupPeer.Command(nil) // start the connection
				lookupPeers = append(lookupPeers, lookupPeer)
				lookupAddrs = append(lookupAddrs, host)
			}
			n.lookupPeers.Store(lookupPeers)
			connect = false
		}

		select {
		case <-ticker:
			// periodically ping every nsqlookupd
			for _, lookupPeer := range lookupPeers {
				cmd := nsq.Ping()
				_, err := lookupPeer.Command(cmd)
			}
		case val := <-n.notifyChan:
			// a Topic or Channel on this nsqd was created or removed (signalled on notifyChan);
			// propagate the change to every nsqlookupd via a Register/UnRegister command
			var cmd *nsq.Command
			var branch string

			switch val.(type) {
			case *Channel:
				// notify all nsqlookupds that a new channel exists, or that it's removed
				branch = "channel"
				channel := val.(*Channel)
				if channel.Exiting() == true {
					cmd = nsq.UnRegister(channel.topicName, channel.name)
				} else {
					cmd = nsq.Register(channel.topicName, channel.name)
				}
			case *Topic:
				// notify all nsqlookupds that a new topic exists, or that it's removed
				branch = "topic"
				topic := val.(*Topic)
				if topic.Exiting() == true {
					cmd = nsq.UnRegister(topic.name, "")
				} else {
					cmd = nsq.Register(topic.name, "")
				}
			}

			for _, lookupPeer := range lookupPeers {
				_, err := lookupPeer.Command(cmd)
			}
		case <-n.optsNotificationChan:
			// the nsqlookupd address list may have changed (it can be updated at runtime via the config API)
			// reconcile the connected peers with the latest configuration
			var tmpPeers []*lookupPeer
			var tmpAddrs []string
			for _, lp := range lookupPeers {
				if in(lp.addr, n.getOpts().NSQLookupdTCPAddresses) {
					tmpPeers = append(tmpPeers, lp)
					tmpAddrs = append(tmpAddrs, lp.addr)
					continue
				}
				lp.Close()
			}
			lookupPeers = tmpPeers
			lookupAddrs = tmpAddrs
			connect = true
		case <-n.exitChan:
			goto exit
		}
	}

exit:
	n.logf(LOG_INFO, "LOOKUP: closing")
}

statsdLoop

Users can choose whether to enable statsdLoop. It periodically pushes nsqd's current metrics over UDP to another server, much like monitoring/metric reporting, which makes it convenient to watch an nsqd instance's running state from an admin dashboard. The main metrics include message backlog (depth), the number of client connections per Channel, the number of in-flight (unacknowledged) messages per Channel, the number of deferred messages, GC pause times, memory usage, and so on.
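
The data goes out in the plain statsd text protocol, one metric per UDP datagram ("<name>:<value>|g" for gauges, "|c" for counters). As a rough illustration only (the statsd address and metric name below are examples; the real keys are built by statsdLoop from the StatsdPrefix option), reporting a channel's depth as a gauge might look like this:

package main

import (
	"fmt"
	"net"
)

func main() {
	// address of a statsd daemon (example)
	conn, err := net.Dial("udp", "127.0.0.1:8125")
	if err != nil {
		panic(err)
	}
	defer conn.Close()

	// e.g. the depth (backlog) of channel "ch" on topic "orders"
	fmt.Fprintf(conn, "topic.orders.channel.ch.depth:%d|g", 42)
}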
