如何處理fd讀事件
客戶端fd觸發可讀事件後,對應的回調函數是readQueryFromClient。該函數的實現如下(位於networking.c文件中):
/* Readable-event handler registered for every client socket (createClient()
 * installs it with the client structure as private data).
 *
 * Reads up to PROTO_IOBUF_LEN bytes from 'fd' and appends them to
 * c->querybuf, updates the interaction time and input statistics, closes
 * clients whose query buffer grows beyond server.client_max_querybuf_len and
 * finally hands the accumulated buffer to processInputBuffer().
 *
 * el       - event loop that fired the event (unused).
 * fd       - socket that became readable.
 * privdata - the client the event was registered for.
 * mask     - fired event mask (unused).
 *
 * On read errors other than EAGAIN, on EOF, and when the buffer limit is
 * exceeded, the client is released with freeClient() before returning. */
void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
    client *c = (client*) privdata;
    int nread, readlen;
    size_t qblen;
    UNUSED(el);
    UNUSED(mask);

    readlen = PROTO_IOBUF_LEN;
    /* If this is a multi bulk request, and we are processing a bulk reply
     * that is large enough, try to maximize the probability that the query
     * buffer contains exactly the SDS string representing the object, even
     * at the risk of requiring more read(2) calls. This way the function
     * processMultiBulkBuffer() can avoid copying buffers to create the
     * Redis Object representing the argument. */
    if (c->reqtype == PROTO_REQ_MULTIBULK && c->multibulklen && c->bulklen != -1
        && c->bulklen >= PROTO_MBULK_BIG_ARG)
    {
        ssize_t remaining = (size_t)(c->bulklen+2)-sdslen(c->querybuf);

        /* 'remaining' is zero or negative when the query buffer already
         * holds at least bulklen+2 bytes. Only shrink the read size when it
         * is strictly positive: a zero-length read(2) returns 0, which the
         * code below would mistake for the peer closing the connection, and
         * a negative length would turn into a huge size_t in
         * sdsMakeRoomFor(). */
        if (remaining > 0 && remaining < readlen) readlen = remaining;
    }

    qblen = sdslen(c->querybuf);
    if (c->querybuf_peak < qblen) c->querybuf_peak = qblen;
    c->querybuf = sdsMakeRoomFor(c->querybuf, readlen);
    nread = read(fd, c->querybuf+qblen, readlen);
    if (nread == -1) {
        if (errno == EAGAIN) {
            /* Nothing to read right now on the non-blocking socket. */
            return;
        } else {
            serverLog(LL_VERBOSE, "Reading from client: %s",strerror(errno));
            freeClient(c);
            return;
        }
    } else if (nread == 0) {
        serverLog(LL_VERBOSE, "Client closed connection");
        freeClient(c);
        return;
    } else if (c->flags & CLIENT_MASTER) {
        /* Append the query buffer to the pending (not applied) buffer
         * of the master. We'll use this buffer later in order to have a
         * copy of the string applied by the last command executed. */
        c->pending_querybuf = sdscatlen(c->pending_querybuf,
                                        c->querybuf+qblen,nread);
    }

    /* Account for the bytes read(2) just stored after the old buffer end. */
    sdsIncrLen(c->querybuf,nread);
    c->lastinteraction = server.unixtime;
    if (c->flags & CLIENT_MASTER) c->read_reploff += nread;
    server.stat_net_input_bytes += nread;
    if (sdslen(c->querybuf) > server.client_max_querybuf_len) {
        sds ci = catClientInfoString(sdsempty(),c), bytes = sdsempty();

        bytes = sdscatrepr(bytes,c->querybuf,64);
        serverLog(LL_WARNING,"Closing client that reached max query buffer length: %s (qbuf initial bytes: %s)", ci, bytes);
        sdsfree(ci);
        sdsfree(bytes);
        freeClient(c);
        return;
    }

    /* Time to process the buffer. If the client is a master we need to
     * compute the difference between the applied offset before and after
     * processing the buffer, to understand how much of the replication stream
     * was actually applied to the master state: this quantity, and its
     * corresponding part of the replication stream, will be propagated to
     * the sub-slaves and to the replication backlog. */
    if (!(c->flags & CLIENT_MASTER)) {
        processInputBuffer(c);
    } else {
        size_t prev_offset = c->reploff;
        processInputBuffer(c);
        size_t applied = c->reploff - prev_offset;
        if (applied) {
            replicationFeedSlavesFromMasterStream(server.slaves,
                    c->pending_querybuf, applied);
            /* Drop the part of the pending buffer that was just applied. */
            sdsrange(c->pending_querybuf,applied,-1);
        }
    }
}
我們給這個函數加個斷點,然後重新運行下redis-server,再啓動一個客戶端,然後嘗試給服務器發送一個命令"set hello world"。但是在實際調試的時候,我們發現:只要redis-cli一連接成功,gdb就會觸發該斷點,而此時我們還沒有發送預想的命令。我們單步調試readQueryFromClient函數,將收取到的數據打印出來,得到如下字符串:
(gdb) p c->querybuf $8 = (sds) 0x7ffff09b8685 "*1\r\n$7\r\nCOMMAND\r\n"
這裏的c->querybuf是什麼呢?這裏c的類型是client結構體。它就是上文介紹的、在連接接受成功並產生新的客戶端fd之後,綁定回調函數時創建並作爲參數傳遞給readQueryFromClient函數的對象。我們可以在server.h中找到它的定義:
* With multiplexing we need to take per-client state. * Clients are taken in a linked list. */ typedef struct client { uint64_t id; /* Client incremental unique ID. */ int fd; /* Client socket. */ redisDb *db; /* Pointer to currently SELECTed DB. */ robj *name; /* As set by CLIENT SETNAME. */ sds querybuf; /* Buffer we use to accumulate client queries. */ //省略掉部分字段 } client;
client實際上是存儲每個客戶端連接信息的對象,其fd字段就是當前連接的fd,querybuf字段就是當前連接的接收緩衝區,也就是說每個新客戶端連接都會產生這樣一個對象。從fd上收取到的數據就存儲在這個querybuf字段中。
我們貼一下完整的createClient函數的代碼:
client *createClient(int fd) { client *c = zmalloc(sizeof(client)); /* passing -1 as fd it is possible to create a non connected client. * This is useful since all the commands needs to be executed * in the context of a client. When commands are executed in other * contexts (for instance a Lua script) we need a non connected client. */ if (fd != -1) { anetNonBlock(NULL,fd); anetEnableTcpNoDelay(NULL,fd); if (server.tcpkeepalive) anetKeepAlive(NULL,fd,server.tcpkeepalive); if (aeCreateFileEvent(server.el,fd,AE_READABLE, readQueryFromClient, c) == AE_ERR) { close(fd); zfree(c); return NULL; } } selectDb(c,0); uint64_t client_id; atomicGetIncr(server.next_client_id,client_id,1); c->id = client_id; c->fd = fd; c->name = NULL; c->bufpos = 0; c->querybuf = sdsempty(); c->pending_querybuf = sdsempty(); c->querybuf_peak = 0; c->reqtype = 0; c->argc = 0; c->argv = NULL; c->cmd = c->lastcmd = NULL; c->multibulklen = 0; c->bulklen = -1; c->sentlen = 0; c->flags = 0; c->ctime = c->lastinteraction = server.unixtime; c->authenticated = 0; c->replstate = REPL_STATE_NONE; c->repl_put_online_on_ack = 0; c->reploff = 0; c->read_reploff = 0; c->repl_ack_off = 0; c->repl_ack_time = 0; c->slave_listening_port = 0; c->slave_ip[0] = '\0'; c->slave_capa = SLAVE_CAPA_NONE; c->reply = listCreate(); c->reply_bytes = 0; c->obuf_soft_limit_reached_time = 0; listSetFreeMethod(c->reply,freeClientReplyValue); listSetDupMethod(c->reply,dupClientReplyValue); c->btype = BLOCKED_NONE; c->bpop.timeout = 0; c->bpop.keys = dictCreate(&objectKeyPointerValueDictType,NULL); c->bpop.target = NULL; c->bpop.numreplicas = 0; c->bpop.reploffset = 0; c->woff = 0; c->watched_keys = listCreate(); c->pubsub_channels = dictCreate(&objectKeyPointerValueDictType,NULL); c->pubsub_patterns = listCreate(); c->peerid = NULL; listSetFreeMethod(c->pubsub_patterns,decrRefCountVoid); listSetMatchMethod(c->pubsub_patterns,listMatchObjects); if (fd != -1) listAddNodeTail(server.clients,c); initClientMultiState(c); return c; }
redis-server接收到客戶端的第一條命令
redis-cli給redis-server發送的第一條數據是*1\r\n$7\r\nCOMMAND\r\n。我們來看下對於這條數據如何處理的,這個很容易做到,單步調試一下readQueryFromClient調用read函數收取完數據,接着繼續處理c->querybuf的代碼即可。經實際跟蹤調試,調用的是processInputBuffer函數,位於networking.c文件中:
/* Consume the query buffer of client 'c', parsing and executing one command
 * per iteration until the buffer is empty or processing has to stop.
 *
 * It is called whenever there is more query buffer to process: either
 * because more data was read from the socket, or because a client was
 * blocked and later reactivated, so a full command may already be pending
 * in the buffer.
 *
 * server.current_client points to 'c' while the loop runs and is reset to
 * NULL before returning. */
void processInputBuffer(client *c) {
    server.current_client = c;

    /* Keep going as long as there are unprocessed bytes. */
    while (sdslen(c->querybuf) > 0) {
        /* Stop if clients are paused (this check is skipped for slaves). */
        if (!(c->flags & CLIENT_SLAVE) && clientsArePaused()) break;

        /* Stop right away if the client is in the middle of something. */
        if (c->flags & CLIENT_BLOCKED) break;

        /* CLIENT_CLOSE_AFTER_REPLY closes the connection once the reply is
         * written to the client, so the reply must not grow after the flag
         * is set (i.e. no more commands are processed). The same holds for
         * clients that are going to be terminated ASAP. */
        if (c->flags & (CLIENT_CLOSE_AFTER_REPLY|CLIENT_CLOSE_ASAP)) break;

        /* Pick the request type from the first byte when still unknown. */
        if (!c->reqtype) {
            c->reqtype = (c->querybuf[0] == '*') ? PROTO_REQ_MULTIBULK
                                                 : PROTO_REQ_INLINE;
        }

        /* Run the parser matching the request type. */
        int parsed = C_OK;
        if (c->reqtype == PROTO_REQ_INLINE) {
            parsed = processInlineBuffer(c);
        } else if (c->reqtype == PROTO_REQ_MULTIBULK) {
            parsed = processMultibulkBuffer(c);
        } else {
            serverPanic("Unknown request type");
        }
        if (parsed != C_OK) break;

        /* Multibulk processing could see a <= 0 length: nothing to run. */
        if (c->argc == 0) {
            resetClient(c);
            continue;
        }

        /* Only reset the client when the command was executed. */
        if (processCommand(c) == C_OK) {
            if ((c->flags & CLIENT_MASTER) && !(c->flags & CLIENT_MULTI)) {
                /* Update the applied replication offset of our master. */
                c->reploff = c->read_reploff - sdslen(c->querybuf);
            }

            /* Clients blocked in a module blocking command are not reset
             * here, so that the reply callback can still access argv and
             * argc; they are reset in unblockClientFromModule(). */
            int blocked_in_module = (c->flags & CLIENT_BLOCKED) &&
                                    c->btype == BLOCKED_MODULE;
            if (!blocked_in_module) resetClient(c);
        }

        /* freeMemoryIfNeeded may flush slave output buffers, which may end
         * up freeing a slave that is the active client. */
        if (server.current_client == NULL) break;
    }

    server.current_client = NULL;
}
processInputBuffer先判斷接收到的字符串是不是以星號(*)開頭,這裏是以星號開頭,於是將client對象的reqtype字段設置爲PROTO_REQ_MULTIBULK,接着調用processMultibulkBuffer函數處理剩餘的字符串。處理後的字符串被解析成redis命令,記錄在client對象的argc和argv兩個字段中:前者記錄當前命令的參數個數(命令名本身也計算在內),後者存儲的是各個參數對應的對象的地址,其中argv[0]就是命令名。由於如何解析這些命令以及存在哪些命令結構體不是本章節的關注點,這裏就不再詳細分析了。
命令解析完成以後,從processMultibulkBuffer函數返回後,在processCommand函數中處理剛纔記錄在client對象argv字段中的命令。
//爲了與原代碼保持一致,代碼縮進未調整 if (c->argc == 0) { resetClient(c); } else { /* Only reset the client when the command was executed. */ if (processCommand(c) == C_OK) { //省略部分代碼 } }
在processCommand函數中,處理命令,命令的處理流程大致如下:
- 先判斷是不是quit命令,如果是,則往發送緩衝區中添加一條應答命令(應答redis客戶端),並給當前client對象設置CLIENT_CLOSE_AFTER_REPLY標誌,這個標誌見名知意,即應答完畢後關閉連接。
- 如果不是quit命令,則使用lookupCommand函數從全局命令字典表中查找相應的命令,如果出錯,則向發送緩衝區中添加出錯應答。這裏的出錯並不是指程序邏輯出錯,而可能是客戶端發送了非法命令(例如未知命令或參數個數不對)。如果找到相應的命令,則執行命令後添加應答。
int processCommand(client *c) { /* The QUIT command is handled separately. Normal command procs will * go through checking for replication and QUIT will cause trouble * when FORCE_REPLICATION is enabled and would be implemented in * a regular command proc. */ if (!strcasecmp(c->argv[0]->ptr,"quit")) { addReply(c,shared.ok); c->flags |= CLIENT_CLOSE_AFTER_REPLY; return C_ERR; } /* Now lookup the command and check ASAP about trivial error conditions * such as wrong arity, bad command name and so forth. */ c->cmd = c->lastcmd = lookupCommand(c->argv[0]->ptr); if (!c->cmd) { flagTransaction(c); addReplyErrorFormat(c,"unknown command '%s'", (char*)c->argv[0]->ptr); return C_OK; } else if ((c->cmd->arity > 0 && c->cmd->arity != c->argc) || (c->argc < -c->cmd->arity)) { flagTransaction(c); addReplyErrorFormat(c,"wrong number of arguments for '%s' command", c->cmd->name); return C_OK; } //...省略部分代碼 }
全局字典表是前面介紹的server全局變量(類型是redisServer)的一個字段commands。
struct redisServer { /* General */ pid_t pid; /* Main process pid. */ //無關字段省略 dict *commands; /* Command table */ //無關字段省略 }
至於這個全局字典表在哪裏初始化的以及相關的數據結構類型,由於與本節主題無關,這裏就不分析了。
下面重點探究下如何將應答命令(包括出錯的應答)添加到發送緩衝區去。我們以添加一個“ok”命令爲例:
/* Queue the string representation of 'obj' as reply data for client 'c'.
 *
 * Nothing is queued when prepareClientToWrite() does not return C_OK.
 * Otherwise the payload is copied into the client's static buffer when
 * _addReplyToBuffer() accepts it, and handed to _addReplyObjectToList()
 * when it does not. Objects that are neither sds encoded nor integer
 * encoded trigger a panic. */
void addReply(client *c, robj *obj) {
    if (prepareClientToWrite(c) != C_OK) return;

    /* This is an important place where we can avoid copy-on-write when
     * there is a saving child running, by not touching the refcount field
     * of the object unless needed: if the encoding is RAW and there is room
     * in the static buffer, the object is sent to the client without
     * messing with its page. */
    if (sdsEncodedObject(obj)) {
        if (_addReplyToBuffer(c,obj->ptr,sdslen(obj->ptr)) != C_OK) {
            _addReplyObjectToList(c,obj);
        }
        return;
    }

    if (obj->encoding != OBJ_ENCODING_INT) {
        serverPanic("Wrong obj->encoding in addReply()");
        return;
    }

    /* Optimization: if there is room in the static buffer for 32 bytes
     * (more than the max chars a 64 bit integer can take as string) skip
     * decoding the object and format the integer directly. */
    if (listLength(c->reply) == 0 && (sizeof(c->buf) - c->bufpos) >= 32) {
        char digits[32];
        int ndigits = ll2string(digits,sizeof(digits),(long)obj->ptr);

        if (_addReplyToBuffer(c,digits,ndigits) == C_OK) return;
        /* Otherwise fall through to the normal path; this should never
         * happen since the available room was verified above. */
    }

    /* Generic path: get a decoded object, queue it, drop our reference. */
    obj = getDecodedObject(obj);
    if (_addReplyToBuffer(c,obj->ptr,sdslen(obj->ptr)) != C_OK) {
        _addReplyObjectToList(c,obj);
    }
    decrRefCount(obj);
}
addReply函數中有兩個關鍵的地方,一個是prepareClientToWrite函數調用,另外一個是_addReplyToBuffer函數調用。先來看prepareClientToWrite。這個函數中有這樣一段代碼:
if (!clientHasPendingReplies(c) && !(c->flags & CLIENT_PENDING_WRITE) && (c->replstate == REPL_STATE_NONE || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack))) { /* Here instead of installing the write handler, we just flag the * client and put it into a list of clients that have something * to write to the socket. This way before re-entering the event * loop, we can try to directly write to the client sockets avoiding * a system call. We'll only really install the write handler if * we'll not be able to write the whole reply at once. */ c->flags |= CLIENT_PENDING_WRITE; listAddNodeHead(server.clients_pending_write,c); }
這段代碼先判斷發送緩衝區中是否還有未發送的應答命令——通過判斷client對象的bufpos字段(int型)是否大於0以及reply字段(這是一個鏈表)的長度是否大於0。
/* Tell whether client 'c' still has reply data waiting to be written to
 * its socket.
 *
 * Returns 1 when the static buffer holds bytes (c->bufpos is non-zero) or
 * the reply list is not empty, 0 otherwise. */
int clientHasPendingReplies(client *c) {
    if (c->bufpos) return 1;
    return listLength(c->reply) != 0;
}
如果當前client對象不是處於CLIENT_PENDING_WRITE狀態,且發送緩衝區中沒有剩餘數據(同時還要滿足上面代碼中對複製狀態replstate的判斷),則給該client對象設置CLIENT_PENDING_WRITE標誌,並將當前client對象添加到全局server對象的名叫clients_pending_write的鏈表中去。這個鏈表中存的是所有有數據要發送的client對象,注意和上面說的reply鏈表區分開來。
關於CLIENT_PENDING_WRITE標誌,redis解釋是:
Client has output to send but a write handler is yet not installed
翻譯成中文就是,一個有數據需要發送,但是還沒有註冊可寫事件的client對象。
接着討論_addReplyToBuffer函數。其實現位於networking.c文件中。
/* Try to append 'len' bytes starting at 's' to the static reply buffer of
 * client 'c'.
 *
 * Returns C_OK when the bytes were copied, and also, without copying
 * anything, when the client is flagged CLIENT_CLOSE_AFTER_REPLY. Returns
 * C_ERR when the reply list already contains entries or when the buffer
 * does not have 'len' free bytes. */
int _addReplyToBuffer(client *c, const char *s, size_t len) {
    if (c->flags & CLIENT_CLOSE_AFTER_REPLY) return C_OK;

    /* Once the reply list has entries, nothing more can be added to the
     * static buffer. */
    if (listLength(c->reply) > 0) return C_ERR;

    /* Make sure the remaining space can hold the whole string. */
    size_t room = sizeof(c->buf) - c->bufpos;
    if (room < len) return C_ERR;

    memcpy(c->buf + c->bufpos, s, len);
    c->bufpos += len;
    return C_OK;
}
在這個函數中再次確保了client對象的reply鏈表長度不能大於0(if判斷,如果不滿足條件,則退出該函數)。reply鏈表存儲的是待發送的應答命令。應答命令被存儲在client對象的buf字段中,其長度被記錄在bufpos字段中。buf字段是一個固定大小的字節數組:
typedef struct client { uint64_t id; /* Client incremental unique ID. */ int fd; /* Client socket. */ redisDb *db; /* Pointer to currently SELECTed DB. */ robj *name; /* As set by CLIENT SETNAME. */ sds querybuf; /* Buffer we use to accumulate client queries. */ sds pending_querybuf; /* If this is a master, this buffer represents the yet not applied replication stream that we are receiving from the master. */ //省略部分字段... /* Response buffer */ int bufpos; char buf[PROTO_REPLY_CHUNK_BYTES]; } client;
PROTO_REPLY_CHUNK_BYTES在redis中的定義是16*1024,也就是說buf這個固定緩衝區最多能存放16k的應答數據;放不下時_addReplyToBuffer返回C_ERR,應答會被追加到reply鏈表中(見上面addReply函數中的_addReplyObjectToList調用)。
回到我們上面提的命令:*1\r\n$7\r\nCOMMAND\r\n,通過lookupCommand解析之後得到"command"命令,在gdb中顯示如下:
2345 c->cmd = c->lastcmd = lookupCommand(c->argv[0]->ptr); (gdb) n 2346 if (!c->cmd) { (gdb) p c->cmd $23 = (struct redisCommand *) 0x742db0 <redisCommandTable+13040> (gdb) p *c->cmd $24 = {name = 0x4fda67 "command", proc = 0x42d920 <commandCommand>, arity = 0, sflags = 0x50dc3e "lt", flags = 1536, getkeys_proc = 0x0, firstkey = 0, lastkey = 0, keystep = 0, microseconds = 1088, calls = 1}
如何處理可寫事件
上面我們介紹了redis-server如何處理可讀事件,整個流程就是註冊可讀事件回調函數,在回調函數中調用操作系統API read函數收取數據,然後解析數據得到redis命令,處理命令接着將應答數據包放到client對象的buf字段中去。那麼放入buf字段的數據何時發給客戶端呢?
還記得我們前面章節說的那個while事件循環嗎?我們再來回顧一下它的代碼:
/* Run the event loop until eventLoop->stop becomes non-zero.
 *
 * The stop flag is cleared on entry. Every iteration first invokes the
 * beforesleep callback, when one is set, and then processes events with
 * AE_ALL_EVENTS|AE_CALL_AFTER_SLEEP. */
void aeMain(aeEventLoop *eventLoop) {
    eventLoop->stop = 0;
    for (;;) {
        if (eventLoop->stop) break;

        if (eventLoop->beforesleep != NULL) {
            eventLoop->beforesleep(eventLoop);
        }
        aeProcessEvents(eventLoop, AE_ALL_EVENTS|AE_CALL_AFTER_SLEEP);
    }
}
其中,先判斷eventLoop對象的beforesleep字段是否設置了,這是一個回調函數指針,在redis-server初始化時就已經設置好了。
/* Store 'beforesleep' in the event loop's beforesleep field, replacing any
 * previously stored value. aeMain() calls this callback, when it is not
 * NULL, at the start of every loop iteration before processing events. */
void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep) { eventLoop->beforesleep = beforesleep; }
我們在aeSetBeforeSleepProc這個函數上設置一個斷點,然後重啓一下redis-server來驗證一下在何處設置的這個回調。
Breakpoint 2, aeSetBeforeSleepProc (eventLoop=0x7ffff083a0a0, beforesleep=beforesleep@entry=0x4294f0 <beforeSleep>) at ae.c:507 507 eventLoop->beforesleep = beforesleep; (gdb) bt #0 aeSetBeforeSleepProc (eventLoop=0x7ffff083a0a0, beforesleep=beforesleep@entry=0x4294f0 <beforeSleep>) at ae.c:507 #1 0x00000000004238d2 in main (argc=<optimized out>, argv=0x7fffffffe588) at server.c:3892
使用f 1命令切換到堆棧#1,並輸入l顯示斷點附近的代碼:
(gdb) l 3887 /* Warning the user about suspicious maxmemory setting. */ 3888 if (server.maxmemory > 0 && server.maxmemory < 1024*1024) { 3889 serverLog(LL_WARNING,"WARNING: You specified a maxmemory value that is less than 1MB (current value is %llu bytes). Are you sure this is what you really want?", server.maxmemory); 3890 } 3891 3892 aeSetBeforeSleepProc(server.el,beforeSleep); 3893 aeSetAfterSleepProc(server.el,afterSleep); 3894 aeMain(server.el); 3895 aeDeleteEventLoop(server.el); 3896 return 0;
3892行將這個回調設置成beforeSleep函數。因此每一輪循環都會調用這個beforeSleep函數。server.el前面也介紹過就是aeEventLoop對象。在這個beforeSleep函數中有一個handleClientsWithPendingWrites調用(位於文件server.c中):
void beforeSleep(struct aeEventLoop *eventLoop) { //省略無關代碼... /* Handle writes with pending output buffers. */ handleClientsWithPendingWrites(); //省略無關代碼... }
handleClientsWithPendingWrites函數調用即把記錄在每個client中的數據發送出去。我們具體看一下發送的邏輯(位於networking.c文件中):
/* This function is called just before entering the event loop, in the hope * we can just write the replies to the client output buffer without any * need to use a syscall in order to install the writable event handler, * get it called, and so forth. */ int handleClientsWithPendingWrites(void) { listIter li; listNode *ln; int processed = listLength(server.clients_pending_write); listRewind(server.clients_pending_write,&li); while((ln = listNext(&li))) { client *c = listNodeValue(ln); c->flags &= ~CLIENT_PENDING_WRITE; listDelNode(server.clients_pending_write,ln); /* Try to write buffers to the client socket. */ if (writeToClient(c->fd,c,0) == C_ERR) continue; /* If there is nothing left, do nothing. Otherwise install * the write handler. */ if (clientHasPendingReplies(c) && aeCreateFileEvent(server.el, c->fd, AE_WRITABLE, sendReplyToClient, c) == AE_ERR) { freeClientAsync(c); } } return processed; }
上面的代碼先從全局server對象(前面已經介紹過了)的clients_pending_write字段(存儲client對象的鏈表)挨個取出有數據要發送的client對象,然後調用writeToClient函數嘗試將client中存儲的應答數據發出去。