stream
struct pstream {
const struct pstream_class *class;
char *name;
};
pstream_class是一個類似的接口類,其實現根據底層socket的不同(unix domain socket, tcp socket, ssl socket)而不同,p表示passive,其接口定義如下,
/* Operations that every passive-stream provider implements.  The implementation
 * differs with the underlying socket type (unix domain socket, TCP, SSL). */
struct pstream_class {
/* Prefix for connection names, e.g. "ptcp", "pssl", "punix". */
const char *name;
/* True if this pstream needs periodic probes to verify connectivity. For
* pstreams which need probes, it can take a long time to notice the
* connection was dropped. */
bool needs_probes;
/* Attempts to start listening for stream connections. 'name' is the full
* connection name provided by the user, e.g. "ptcp:1234". This name is
* useful for error messages but must not be modified.
*
* 'suffix' is a copy of 'name' following the colon and may be modified.
* 'dscp' is the DSCP value that the new connection should use in the IP
* packets it sends.
*
* Returns 0 if successful, otherwise a positive errno value. If
* successful, stores a pointer to the new connection in '*pstreamp'.
*
* The listen function must not block. If the connection cannot be
* completed immediately, it should return EAGAIN (not EINPROGRESS, as
* returned by the connect system call) and continue the connection in the
* background. */
int (*listen)(const char *name, char *suffix, struct pstream **pstreamp,
uint8_t dscp);
/* Closes 'pstream' and frees associated memory. */
void (*close)(struct pstream *pstream);
/* Tries to accept a new connection on 'pstream'. If successful, stores
* the new connection in '*new_streamp' and returns 0. Otherwise, returns
* a positive errno value.
*
* The accept function must not block waiting for a connection. If no
* connection is ready to be accepted, it should return EAGAIN. */
int (*accept)(struct pstream *pstream, struct stream **new_streamp);
/* Arranges for the poll loop to wake up when a connection is ready to be
* accepted on 'pstream'. */
void (*wait)(struct pstream *pstream);
};
可以看出pstream_class作爲server端的stream socket封裝,其接口包括了listen, accept, wait, close
/* An active stream: a connection over which data is sent and received. */
struct stream {
const struct stream_class *class; /* Provider operations; stream_wait() dispatches through class->wait. */
int state; /* One of SCS_CONNECTING, SCS_CONNECTED, SCS_DISCONNECTED. */
int error; /* Returned by stream_connect() when 'state' is SCS_DISCONNECTED. */
ovs_be32 remote_ip; /* Peer address.  NOTE(review): ovs_be32 suggests network byte order -- confirm. */
ovs_be16 remote_port; /* Peer port.  NOTE(review): presumably network byte order -- confirm. */
ovs_be32 local_ip; /* Local address.  NOTE(review): presumably network byte order -- confirm. */
ovs_be16 local_port; /* Local port.  NOTE(review): presumably network byte order -- confirm. */
char *name; /* Connection name. */
};
類似的stream_class作爲active stream socket的接口類,接口包括了open, close, recv, send, wait等,請參考lib/stream-provider.h
總結下來,就是pstream是被動的流,只用來listen, accept新的連接,stream是用來讀寫交換數據的主動流,無論是pstream, stream,都用到了bridge模式,通過pstream_class, stream_class來做接口定義的操作。我們來看幾個典型的操作。
stream_connect,判斷當前流的狀態,如果還是CONNECTING,那麼調用stream_class->connect判斷連接結束沒有,如果是CONNECTED,那麼返回0
int stream_connect(struct stream *stream)
{
enum stream_state last_state;
do {
last_state = stream->state;
switch (stream->state) {
case SCS_CONNECTING:
scs_connecting(stream);
break;
case SCS_CONNECTED:
return 0;
case SCS_DISCONNECTED:
return stream->error;
default:
NOT_REACHED();
}
} while (stream->state != last_state);
return EAGAIN;
}
stream_wait,用來等待stream socket available,從而開始wait指定的操作,對於STREAM_SEND/STREAM_RECV而言,如果此時stream還是CONNECTING,會等到變爲CONNECTED
/* Arranges for the poll loop to wake up when 'stream' is ready for the
 * operation named by 'wait' (STREAM_CONNECT, STREAM_RECV or STREAM_SEND).
 *
 * A stream that is still SCS_CONNECTING always waits for STREAM_CONNECT,
 * whatever the caller asked for.  A stream that is SCS_DISCONNECTED requests
 * an immediate wake-up instead of consulting the provider. */
void
stream_wait(struct stream *stream, enum stream_wait_type wait)
{
    assert(wait == STREAM_CONNECT || wait == STREAM_RECV
           || wait == STREAM_SEND);

    if (stream->state == SCS_DISCONNECTED) {
        poll_immediate_wake();
        return;
    }

    if (stream->state == SCS_CONNECTING) {
        wait = STREAM_CONNECT;
    }
    stream->class->wait(stream, wait);
}
stream_wait用到了poll函數來做event demultiplex,包括相應的struct pollfd,由於沒有大量的併發連接,所以性能還不是問題
進程的poll機制如下,會有一個全局的poll_waiters list,每次有fd需要wait某些events時,會構造一個poll_waiter並掛到全局poll_waiters list中。同樣可以通過poll_cancel來把一個poll_waiter從全局list裏面去掉
/* Copies every poll_waiter on the global 'waiters' list into a pollfd array,
 * calls time_poll() on that array with 'timeout_when' as the deadline, logs
 * what caused the wake-up, then cancels every waiter and resets the timeout
 * so that the next round starts with nothing registered. */
void poll_block(void)
{
/* Static: the array is kept between calls and only ever grown. */
static struct pollfd *pollfds;
static size_t max_pollfds;
struct poll_waiter *pw, *next;
int n_waiters, n_pollfds;
int elapsed;
int retval;
/* Register fatal signal events before actually doing any real work for
* poll_block. */
fatal_signal_wait();
n_waiters = list_size(&waiters);
if (max_pollfds < n_waiters) {
max_pollfds = n_waiters;
pollfds = xrealloc(pollfds, max_pollfds * sizeof *pollfds);
}
n_pollfds = 0;
/* Each waiter remembers which pollfd slot it was given so that its
 * 'revents' can be inspected after time_poll() returns. */
LIST_FOR_EACH (pw, node, &waiters) {
pw->pollfd = &pollfds[n_pollfds];
pollfds[n_pollfds].fd = pw->fd;
pollfds[n_pollfds].events = pw->events;
pollfds[n_pollfds].revents = 0;
n_pollfds++;
}
/* NOTE(review): LLONG_MIN presumably means an immediate wake-up was
 * requested (e.g. via poll_immediate_wake()) -- confirm. */
if (timeout_when == LLONG_MIN) {
COVERAGE_INC(poll_zero_timeout);
}
到此,把全局poll_waiter的所有poll_waiter,填到一個struct pollfd的數組pollfds中,
retval = time_poll(pollfds, n_pollfds, timeout_when, &elapsed);
/* A negative 'retval' is an error whose magnitude is handed to strerror();
 * zero is treated as a timeout wake-up. */
if (retval < 0) {
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
VLOG_ERR_RL(&rl, "poll: %s", strerror(-retval));
} else if (!retval) {
log_wakeup(timeout_where, NULL, elapsed);
}
time_poll是一個event demultiplexer,我們後面會分析
/* The _SAFE iterator is needed because poll_cancel() is called on the
 * current element.  Every waiter is cancelled, whether or not its fd fired. */
LIST_FOR_EACH_SAFE (pw, next, node, &waiters) {
if (pw->pollfd->revents) {
log_wakeup(pw->where, pw->pollfd, 0);
}
poll_cancel(pw);
}
/* Reset the timeout state for the next call. */
timeout_when = LLONG_MAX;
timeout_where = NULL;
/* Handle any pending signals before doing anything else. */
fatal_signal_run();
}
time_poll實際上在一個循環裏反覆調用poll,直到給的時間片用完爲止
下面我們以unix domain socket和tcp socket爲例,來分析stream這部分代碼
對TCP而言,ptcp_open是pstream_class的listen函數,
/* 'listen' operation for passive TCP ("ptcp") streams.
 *
 * 'name' is unused.  'suffix' and 'dscp' are handed to inet_open_passive(),
 * which creates the listening socket and reports the bound address in 'sin'.
 *
 * If inet_open_passive() returns a negative value, that value is negated and
 * returned.  Otherwise the socket is wrapped with new_fd_pstream(), using
 * ptcp_accept as the per-connection callback, and new_fd_pstream()'s result
 * is returned; on success the new pstream is stored in '*pstreamp'. */
static int
ptcp_open(const char *name OVS_UNUSED, char *suffix, struct pstream **pstreamp,
          uint8_t dscp)
{
    struct sockaddr_in sin;
    char bound_name[128];
    int fd;

    fd = inet_open_passive(SOCK_STREAM, suffix, -1, &sin, dscp);
    if (fd < 0) {
        return -fd;
    }

    /* Bounded formatting: the name can never overrun 'bound_name', even if
     * the format string or IP_FMT is changed later. */
    snprintf(bound_name, sizeof bound_name, "ptcp:%"PRIu16":"IP_FMT,
             ntohs(sin.sin_port), IP_ARGS(&sin.sin_addr.s_addr));
    return new_fd_pstream(bound_name, fd, ptcp_accept, NULL, pstreamp);
}
inet_open_passive主要調用socket, setsockopt, bind等創建並配置好socket,new_fd_pstream創建並配置一個fd_pstream結構,這個結構包括了之前提到的struct pstream結構:
/* A pstream backed by a file descriptor.  It embeds the generic
 * struct pstream and adds what is needed to accept connections on 'fd'. */
struct fd_pstream {
struct pstream pstream; /* Generic pstream part. */
int fd; /* File descriptor of the listening socket. */
/* Callback run on each accepted connection to wrap it in a stream
 * (ptcp_accept for TCP). */
int (*accept_cb)(int fd, const struct sockaddr *, size_t sa_len,
struct stream **);
char *unlink_path; /* NOTE(review): presumably a socket file to unlink on close; ptcp_open passes NULL -- confirm. */
};
new_fd_pstream裏,原來pstream裏的pstream_class被換成了fd_pstream_class,這樣原來三個爲NULL的函數指針變成了pfd_close, pfd_accept, pfd_wait
pfd_accept會調用accept接收connect過來的fd,然後調用accept_cb這個callback函數,對於TCP而言,這是ptcp_accept
ptcp_accept會基於這個accept的fd,通過new_tcp_stream把connection封裝成一個stream,裏面調用setsockopt設置了TCP_NODELAY,之後調用new_fd_stream,同樣的原來stream裏的stream_class換成了stream_fd_class,原來爲NULL的函數指針變成了fd_close, fd_connect, fd_recv, fd_send, fd_wait
fd_send, fd_recv都是調用read, write來讀寫緩衝區,我們來看fd_wait,
/* 'wait' operation for fd-backed streams: registers the stream's file
 * descriptor with the poll loop.
 *
 * STREAM_RECV waits for POLLIN; STREAM_CONNECT and STREAM_SEND both wait for
 * POLLOUT.  Any other value of 'wait' is a programming error. */
static void
fd_wait(struct stream *stream, enum stream_wait_type wait)
{
    struct stream_fd *sfd = stream_fd_cast(stream);

    if (wait == STREAM_RECV) {
        poll_fd_wait(sfd->fd, POLLIN);
    } else if (wait == STREAM_CONNECT || wait == STREAM_SEND) {
        poll_fd_wait(sfd->fd, POLLOUT);
    } else {
        NOT_REACHED();
    }
}
可以看出fd_wait只是把fd掛到了poll_waiter全局list裏,之後的事情都交給poll_block去做
對於unix domain socket而言,pstream_class爲punix_pstream_class,其listen函數爲punix_open。
punix_open首先調用make_unix_socket,傳入bind的path,該函數會創建unix domain socket,調用make_sockaddr_un配置好struct sockaddr_un,調用bind_unix_socket把unix domain socket綁定到bind path上的socket文件上;punix_open之後調用listen準備接受connection請求;最後調用new_fd_pstream;和TCP一樣,此時已經不區分pstream_class屬於TCP還是unix domain socket了,而是統一用fd_pstream_class來表示,相關的函數指針爲pfd_close, pfd_accept, pfd_wait
unixctl
對於vswitchd, ovsdb這類的daemon而言,會包含一個unixctl_server,用來接收控制流
/* Control-channel server embedded in a daemon. */
struct unixctl_server {
struct pstream *listener; /* Passive stream on which control connections are accepted. */
struct list conns; /* Accepted connections.  NOTE(review): presumably struct unixctl_conn linked by 'node' -- confirm. */
};
unixctl_server_create:傳入的path是server的unix domain socket所在的socket文件路徑,之後調用pstream_open,執行了punix_open,並把新創建的pstream(struct pstream,其操作由fd_pstream_class提供)的指針通過listener傳出來,此時daemon已經調用完了listen,可以接收client的連接請求了。
/* One client connection to a unixctl_server. */
struct unixctl_conn {
struct list node; /* List element.  NOTE(review): presumably in unixctl_server's 'conns' -- confirm. */
struct jsonrpc *rpc; /* JSON-RPC channel for this connection. */
/* Only one request can be in progress at a time. While the request is
* being processed, 'request_id' is populated, otherwise it is null. */
struct json *request_id; /* ID of the currently active request. */
};
unixctl_server_run:調用pstream_accept接收connection請求,實際上調用pfd_accept,如果有新的connection,加入unixctl_server的conns連接list中,並初始化好connection對應的jsonrpc。之後對conns的每個請求調用run_connection
unixctl_server_wait:先把listen放入poll_waiter list,之後是有數據要發送的connection,最後是接收數據的connection
run_connection:先調用jsonrpc_run把緩衝區要發送的數據發送完畢,之後調用jsonrpc_recv接收一個jsonrpc_msg,unixctl_server只處理JSON REQUEST,對請求調用process_command。對於每個command而言,其必須先調用unixctl_command_register註冊之後纔可以被執行,process_command基於command method在hash表裏查找到這個command的配置,最後調用command註冊的callback函數執行
process
process.c是進程對象的utility實現,
/* Bookkeeping for one child process. */
struct process {
struct list node; /* List element.  NOTE(review): presumably in the global all_processes list -- confirm. */
char *name; /* Name of the process. */
pid_t pid; /* Process ID of the child. */
/* Modified by signal handler. */
volatile bool exited; /* Read by other code to learn that the child has exited. */
volatile int status; /* NOTE(review): presumably the waitpid() status recorded by the handler -- confirm. */
};
process_init:設置非阻塞管道xpipe_nonblocking(fds),設置SIGCHLD的信號處理函數sigchld_handler。struct list all_processes是該進程所有子進程的list,sigchld_handler對每個子進程調用waitpid,得到子進程退出返回的返回碼和狀態,並存到struct process的exited, status成員變量中
process_start:先調用process_prestart,該函數又會調用process_init,除此之外,還會驗證進程的binary是否存在。 之後fork子進程,對於父進程,fork成功之後,把自己註冊到all_processes的list上,回覆之前被block的SIGCHLD信號就退出了;對於子進程,主要調用execvp切換成真正的binary進程
process_destroy:把struct process* p從all_processes的list中移除
process_wait:如果進程要退出(p->exited),調用poll_immediate_wake,喚醒waiters上的所有fd;否則阻塞在poll_block上
process_run:可以看作process_start >> process_wait >> poll_block >> process_wait >> poll_block的循環
daemon
daemon的啓動參數包括了是否detach, 是否chdir,是否set_pidfile,是否忽略之前的pidfile,是否啓動monitor進程監控該daemon
/* If configured with set_pidfile() or set_detach(), creates the pid file and
* detaches from the foreground session.
*
* This is simply daemonize_start() followed immediately by
* daemonize_complete(). */
void
daemonize(void)
{
daemonize_start();
daemonize_complete();
}
先來看daemonize_start
/* If daemonization is configured, then starts daemonization, by forking and
* returning in the child process. The parent process hangs around until the
* child lets it know either that it completed startup successfully (by calling
* daemonize_complete()) or that it failed to start up (by exiting with a nonzero
* exit code). */
void
daemonize_start(void)
{
/* Reset first; fork_and_wait_for_startup() below may fill it in. */
daemonize_fd = -1;
if (detach) {
if (fork_and_wait_for_startup(&daemonize_fd) > 0) {
/* Running in parent process. */
exit(0);
}
/* Running in daemon or monitor process. */
}
fork_and_wait_for_startup,調用fork_and_clean_up,該函數是fork的一個封裝,父進程在fork之後取消之前的fatal_signal註冊的callback函數(主要是進程退出時刪除pid文件用的),子進程則在fork之後取消之前父進程的lockfile, timer等。OVS的父子進程之間有個管道機制,子進程啓動成功之後向管道寫一個字符,父進程收到之後認爲子進程啓動成功從而進入waitpid等待其退出
如果是detach模式,fork_and_wait_for_startup返回之後父進程就退出
if (monitor) {
int saved_daemonize_fd = daemonize_fd;
pid_t daemon_pid;
daemon_pid = fork_and_wait_for_startup(&daemonize_fd);
if (daemon_pid > 0) {
/* Running in monitor process. */
fork_notify_startup(saved_daemonize_fd);
close_standard_fds();
monitor_daemon(daemon_pid);
}
/* Running in daemon process. */
}
如果是monitor模式,意味有一個monitor進程來監控daemon,此時monitor進程成爲daemon的父進程。fork_and_wait_for_startup在monitor進程中返回後(即daemon子進程已經通知啓動成功),monitor調用fork_notify_startup(saved_daemonize_fd),通過之前保存下來的管道fd向自己的父進程寫一個字符,通知其啓動已經完成,之後調用monitor_daemon開始監控
monitor_daemon在一個死循環裏調用waitpid daemon_pid,一旦發現daemon進程異常退出,會嘗試重啓daemon:如果退出狀態是WCOREDUMP(status),調用setrlimit(RLIMIT_CORE)關閉coredump,以節省硬盤空間; 如果重啓頻率超出了throttle,sleep一小段時間;最後調用fork_and_wait_for_startup才啓動daemon進程
判斷異常退出的方法是通過waitpid返回的status,如果WIFSIGNALED(status)爲true,說明是信號導致的進程退出,這時通過WTERMSIG(status)得到退出前收到的信號,如果屬於{SIGABRT, SIGALRM, SIGBUS, SIGFPE, SIGILL, SIGPIPE, SIGSEGV}等信號,則需要重新啓動daemon
if (pidfile) {
make_pidfile();
}
/* Make sure that the unixctl commands for vlog get registered in a
* daemon, even before the first log message. */
vlog_init();
}
最後子進程生成pidfile,初始化vlog
/* If daemonization is configured, then this function notifies the parent
 * process that the child process has completed startup successfully.  It also
 * calls daemonize_post_detach().
 *
 * Only the first call does any work; the 'detached' flag turns every later
 * call into a no-op. */
void
daemonize_complete(void)
{
    if (detached) {
        return;
    }
    detached = true;

    fork_notify_startup(daemonize_fd);
    daemonize_fd = -1;

    daemonize_post_detach();
}
fork_notify_startup通知父進程啓動成功,此時父進程進入waitpid。daemonize_post_detach則做unix daemon創建例行的事情
/* Performs the usual post-fork daemon housekeeping, but only when 'detach'
 * is set: starts a new session with setsid(), changes directory to "/" if
 * 'chdir_' is set (the result of chdir() is deliberately ignored), and closes
 * the standard file descriptors. */
void
daemonize_post_detach(void)
{
    if (!detach) {
        return;
    }

    setsid();
    if (chdir_) {
        ignore(chdir("/"));
    }
    close_standard_fds();
}
jsonrpc
/* Messages. */
/* The kinds of JSON-RPC message.  The type determines which members of
 * struct jsonrpc_msg are meaningful. */
enum jsonrpc_msg_type {
JSONRPC_REQUEST, /* Request. */
JSONRPC_NOTIFY, /* Notification. */
JSONRPC_REPLY, /* Successful reply. */
JSONRPC_ERROR /* Error reply. */
};
/* A single JSON-RPC message.  Which members are used depends on 'type', as
 * noted on each member. */
struct jsonrpc_msg {
enum jsonrpc_msg_type type;
char *method; /* Request or notification only. */
struct json *params; /* Request or notification only. */
struct json *result; /* Successful reply only. */
struct json *error; /* Error reply only. */
struct json *id; /* Request or reply only. */
};
/* A JSON-RPC connection layered on top of a stream. */
struct jsonrpc {
struct stream *stream; /* Underlying stream used to send and receive bytes. */
char *name; /* Connection name. */
int status; /* NOTE(review): presumably 0 while healthy, else an errno value -- confirm. */
/* Input. */
struct byteq input; /* Receive buffer. */
struct json_parser *parser; /* JSON parser fed from 'input'. */
struct jsonrpc_msg *received; /* NOTE(review): presumably a parsed message not yet handed to the caller -- confirm. */
/* Output. */
struct list output; /* Contains "struct ofpbuf"s. */
size_t backlog; /* NOTE(review): presumably the number of bytes queued in 'output' -- confirm. */
};
jsonrpc的行爲基於stream, pstream,再加上json parsing,如果你對前面的stream, json都比較瞭解,這部分內容還是很簡單的
stream_open_with_default_ports, pstream_open_with_default_ports都通過stream_open, pstream_open來實現
jsonrpc_run,查看jsonrpc->output,把output隊列的消息通過stream_send發送出去
jsonrpc_wait,等待可以發送
jsonrpc_send,調用jsonrpc_msg_to_json把jsonrpc_msg轉化成json,調用json_to_string把json轉成string,創建一個基於該string的ofpbuf,並掛到rpc->output的鏈表上,最後調用jsonrpc_run發送
jsonrpc_recv,jsonrpc用一個環形的buffer來收數據,每次recv首先計算環形buffer還有多少空間,然後盡最大可能收數據;如果此時buffer還有數據,那麼調用json_parser_feed解析數據,如果parse完成(parser->done),調用jsonrpc_received把json轉成jsonrpc_msg
jsonrpc_transact_block,首先調用jsonrpc_send_block把一個jsonrpc_msg request發送出去,再調用jsonrpc_recv_block等待一個reply
jsonrpc_create,創建不同類型的jsonrpc_msg
jsonrpc_msg_from_json, jsonrpc_msg_to_json,json和jsonrpc_msg之間互相轉換
jsonrpc_session代表一個RPC connection
/* One RPC session, which (re)establishes its underlying connection as
 * directed by 'reconnect'. */
struct jsonrpc_session {
struct reconnect *reconnect; /* Connection state machine; also holds the session name. */
struct jsonrpc *rpc; /* Established JSON-RPC connection, if any. */
struct stream *stream; /* Stream still being connected; set to NULL once it is handed to 'rpc' or closed. */
struct pstream *pstream; /* Listener, opened on first use for passive sessions. */
unsigned int seqno; /* Incremented by jsonrpc_session_connect(). */
uint8_t dscp; /* DSCP value passed when opening the stream or pstream. */
};
這裏出現了一個struct reconnect結構,就是一些配置,狀態,統計信息的集合
/* Configuration, state and statistics for a connection that is to be kept
 * up by reconnecting.  This object tracks state only; see
 * reconnect_transition__() for how the state and statistics are updated. */
struct reconnect {
/* Configuration. */
char *name; /* Name used in log messages. */
int min_backoff; /* NOTE(review): presumably the lower bound for 'backoff' -- confirm units. */
int max_backoff; /* NOTE(review): presumably the upper bound for 'backoff' -- confirm units. */
int probe_interval; /* NOTE(review): presumably the keepalive probe interval -- confirm units. */
bool passive; /* True for a listening (passive) connection. */
enum vlog_level info; /* Used for informational messages. */
/* State. */
enum state state; /* Current state. */
long long int state_entered; /* Time at which 'state' was entered. */
int backoff; /* NOTE(review): presumably the current delay between attempts -- confirm. */
long long int last_activity;
long long int last_connected; /* Subtracted from 'now' to accumulate 'total_connected_duration'. */
long long int last_disconnected;
unsigned int max_tries;
/* These values are simply for statistics reporting, not otherwise used
* directly by anything internal. */
long long int creation_time;
unsigned int n_attempted_connections, n_successful_connections;
unsigned int total_connected_duration;
unsigned int seqno; /* Incremented whenever connectedness changes. */
};
connection的狀態有 S_VOID, S_BACKOFF, S_CONNECTING, S_ACTIVE, S_IDLE, S_RECONNECT, S_LISTENING,其中S_ACTIVE, S_IDLE表示已經連接的狀態。
backoff表示在嘗試重新連接時,連續兩次嘗試之間間隔的時間
reconnect_transition__,是一個狀態轉換的自動機,代碼如下
/* Moves 'fsm' into 'state' at time 'now', updating statistics on the way.
 *
 * Leaving S_CONNECTING counts as one attempted connection, and as a
 * successful one if the new state is S_ACTIVE.  When the transition crosses
 * the connected/disconnected boundary, 'seqno' is incremented, and when it
 * leaves a connected state the time since 'last_connected' is added to
 * 'total_connected_duration'. */
static void
reconnect_transition__(struct reconnect *fsm, long long int now,
                       enum state state)
{
    const bool was_connected = is_connected_state(fsm->state);
    const bool now_connected = is_connected_state(state);

    if (fsm->state == S_CONNECTING) {
        fsm->n_attempted_connections++;
        if (state == S_ACTIVE) {
            fsm->n_successful_connections++;
        }
    }

    if (was_connected != now_connected) {
        if (was_connected) {
            fsm->total_connected_duration += now - fsm->last_connected;
        }
        fsm->seqno++;
    }

    VLOG_DBG("%s: entering %s", fsm->name, reconnect_state_name__(state));
    fsm->state_entered = now;
    fsm->state = state;
}
jsonrpc_session_open,初始化jsonrpc_session結構,解析name,如果是類似tcp:127.1.2.3,表示是一個active connection,如果是ptcp:127.1.2.3,表示是一個passive connection,用來listen新的connection。如果是passive connection,會調用reconnect_set_passive。最後調用reconnect_set_probe_interval,設置keepalive
jsonrpc_session_close,依次調用jsonrpc_close, reconnect_destroy, stream_close, pstream_close
jsonrpc_session_connect,如果s->reconnect是主動連接,那麼調用jsonrpc_stream_open連接connection,並把狀態變爲S_CONNECTING;否則如果s->pstream爲空,那麼調用jsonrpc_pstream_open開啓listen,同時把狀態變爲S_LISTENING
/* Starts a fresh connection attempt for session 's'.
 *
 * Any existing connection is dropped first.  A passive session opens its
 * listener only if it does not already have one, and then enters the
 * listening state.  An active session opens a new stream and enters the
 * connecting state.  A failure in either case is reported through
 * reconnect_connect_failed().  's->seqno' is incremented unconditionally. */
static void
jsonrpc_session_connect(struct jsonrpc_session *s)
{
    const char *session_name = reconnect_get_name(s->reconnect);
    int err;

    jsonrpc_session_disconnect(s);

    if (reconnect_is_passive(s->reconnect)) {
        /* An existing listener is reused as is. */
        if (s->pstream) {
            err = 0;
        } else {
            err = jsonrpc_pstream_open(session_name, &s->pstream, s->dscp);
        }
        if (err == 0) {
            reconnect_listening(s->reconnect, time_msec());
        }
    } else {
        err = jsonrpc_stream_open(session_name, &s->stream, s->dscp);
        if (err == 0) {
            reconnect_connecting(s->reconnect, time_msec());
        }
    }

    if (err != 0) {
        reconnect_connect_failed(s->reconnect, time_msec(), err);
    }
    s->seqno += 1;
}
jsonrpc_session_disconnect,close已有s->stream, s->rpc結構
jsonrpc_session_run,首先,如果s->pstream不爲空,調用pstream_accept接收新的connection,如果此時jsonrpc_session已經有了connection,那麼用新的替換老的
error = pstream_accept(s->pstream, &stream);
if (!error) {
if (s->rpc || s->stream) {
VLOG_INFO_RL(&rl,
"%s: new connection replacing active connection",
reconnect_get_name(s->reconnect));
jsonrpc_session_disconnect(s);
}
reconnect_connected(s->reconnect, time_msec());
s->rpc = jsonrpc_open(stream);
}
如果s->rpc不爲空,會把數據發送出去,否則重新嘗試連接s->stream。
stream_run(s->stream);
error = stream_connect(s->stream);
if (!error) {
reconnect_connected(s->reconnect, time_msec());
s->rpc = jsonrpc_open(s->stream);
s->stream = NULL;
} else if (error != EAGAIN) {
reconnect_connect_failed(s->reconnect, time_msec(), error);
stream_close(s->stream);
s->stream = NULL;
}
jsonrpc_session_send,調用jsonrpc_send發送數據
jsonrpc_session_recv,調用jsonrpc_recv接收一個jsonrpc_msg,如果不是echo之類的測試報文,返回這個jsonrpc_msg