open vswitch研究:utility

stream


/* A passive stream: the listening side of a stream connection.  The
 * operations it supports are supplied through 'class' (see struct
 * pstream_class). */
struct pstream {
    const struct pstream_class *class;  /* Provider operations table. */
    char *name;     /* NOTE(review): presumably the connection name, e.g.
                     * "ptcp:1234" -- confirm against the provider code. */
};

pstream_class是一個類似的接口類,其實現根據底層socket的不同(unix domain socket, tcp socket, ssl socket)而不同,p表示passive,其接口定義如下,

/* Table of operations implemented by a passive-stream provider: listen,
 * close, accept and wait. */
struct pstream_class {
    /* Prefix for connection names, e.g. "ptcp", "pssl", "punix". */
    const char *name;

    /* True if this pstream needs periodic probes to verify connectivity.  For
     * pstreams which need probes, it can take a long time to notice the
     * connection was dropped. */
    bool needs_probes;

    /* Attempts to start listening for stream connections.  'name' is the full
     * connection name provided by the user, e.g. "ptcp:1234".  This name is
     * useful for error messages but must not be modified.
     *
     * 'suffix' is a copy of 'name' following the colon and may be modified.
     * 'dscp' is the DSCP value that the new connection should use in the IP
     * packets it sends.
     *
     * Returns 0 if successful, otherwise a positive errno value.  If
     * successful, stores a pointer to the new connection in '*pstreamp'.
     *
     * The listen function must not block.  If the connection cannot be
     * completed immediately, it should return EAGAIN (not EINPROGRESS, as
     * returned by the connect system call) and continue the connection in the
     * background. */
    int (*listen)(const char *name, char *suffix, struct pstream **pstreamp,
                  uint8_t dscp);

    /* Closes 'pstream' and frees associated memory. */
    void (*close)(struct pstream *pstream);

    /* Tries to accept a new connection on 'pstream'.  If successful, stores
     * the new connection in '*new_streamp' and returns 0.  Otherwise, returns
     * a positive errno value.
     *
     * The accept function must not block waiting for a connection.  If no
     * connection is ready to be accepted, it should return EAGAIN. */
    int (*accept)(struct pstream *pstream, struct stream **new_streamp);

    /* Arranges for the poll loop to wake up when a connection is ready to be
     * accepted on 'pstream'. */
    void (*wait)(struct pstream *pstream);
};

可以看出pstream_class作爲server端的stream socket封裝,其接口包括了listen, accept, wait, close


/* An active stream: the object used to exchange data once connected.  The
 * operations it supports are supplied through 'class'. */
struct stream {
    const struct stream_class *class;   /* Provider operations table. */
    int state;                  /* One of the SCS_* connection states. */
    int error;                  /* Returned by stream_connect() once the
                                 * state is SCS_DISCONNECTED. */
    /* NOTE(review): the ovs_be* types suggest these four fields are kept in
     * network byte order -- confirm. */
    ovs_be32 remote_ip;
    ovs_be16 remote_port;
    ovs_be32 local_ip;
    ovs_be16 local_port;
    char *name;                 /* NOTE(review): presumably the connection
                                 * name -- confirm. */
};

類似的stream_class作爲active stream socket的接口類,接口包括了open, close, recv, send, wait等,請參考lib/stream-provider.h


總結下來,就是pstream是被動的流,只用來listen, accept新的連接,stream是用來讀寫交換數據的主動流,無論是pstream, stream,都用到了bridge模式,通過pstream_class, stream_class來做接口定義的操作。我們來看幾個典型的操作。


stream_connect,判斷當前流的狀態:如果還是CONNECTING,那麼調用scs_connecting(其中會調用stream_class->connect)判斷連接是否已經完成;如果是CONNECTED,那麼返回0;如果是DISCONNECTED,那麼返回stream->error;如果調用之後狀態沒有變化,則返回EAGAIN

int stream_connect(struct stream *stream)
{
    enum stream_state last_state;

    do {
        last_state = stream->state;
        switch (stream->state) {
        case SCS_CONNECTING:
            scs_connecting(stream);
            break;

        case SCS_CONNECTED:
            return 0;
            
        case SCS_DISCONNECTED:
            return stream->error;

        default:
            NOT_REACHED();
        }
    } while (stream->state != last_state);

    return EAGAIN;
}


stream_wait,用來等待stream socket available,從而開始wait指定的操作,對於STREAM_SEND/STREAM_RECV而言,如果此時stream還是CONNECTING,會等到變爲CONNECTED

/* Arranges for the poll loop to wake up when 'stream' is ready for the
 * operation named by 'wait' (STREAM_CONNECT, STREAM_RECV or STREAM_SEND).
 *
 * A disconnected stream wakes the poll loop immediately.  While the stream
 * is still connecting, any request is treated as STREAM_CONNECT. */
void
stream_wait(struct stream *stream, enum stream_wait_type wait)
{
    assert(wait == STREAM_CONNECT || wait == STREAM_RECV
           || wait == STREAM_SEND);

    if (stream->state == SCS_DISCONNECTED) {
        /* Nothing to wait for; wake up at once. */
        poll_immediate_wake();
        return;
    }

    if (stream->state == SCS_CONNECTING) {
        /* Sending or receiving is not possible before the connection
         * completes, so wait for that instead. */
        wait = STREAM_CONNECT;
    }

    (stream->class->wait)(stream, wait);
}

stream_wait用到了poll函數來做event demultiplex,包括相應的struct pollfd,由於沒有大量的併發連接,所以性能還不是問題


進程的poll機制如下,會有一個全局的poll_waiters list,每次有fd需要wait某些events時,會構造一個poll_waiter並掛到全局poll_waiters list中。同樣可以通過poll_cancel來把一個poll_waiter從全局list裏面去掉

void poll_block(void)
{   
    static struct pollfd *pollfds;
    static size_t max_pollfds;

    struct poll_waiter *pw, *next;
    int n_waiters, n_pollfds;
    int elapsed;
    int retval;

    /* Register fatal signal events before actually doing any real work for
     * poll_block. */
    fatal_signal_wait();

    n_waiters = list_size(&waiters);
    if (max_pollfds < n_waiters) {
        max_pollfds = n_waiters;
        pollfds = xrealloc(pollfds, max_pollfds * sizeof *pollfds);
    }

    n_pollfds = 0;
    LIST_FOR_EACH (pw, node, &waiters) {
        pw->pollfd = &pollfds[n_pollfds];
        pollfds[n_pollfds].fd = pw->fd;
        pollfds[n_pollfds].events = pw->events;
        pollfds[n_pollfds].revents = 0;
        n_pollfds++;
    }   

    if (timeout_when == LLONG_MIN) {
        COVERAGE_INC(poll_zero_timeout);
    }

到此,已經把全局poll_waiters list中的所有poll_waiter,填到一個struct pollfd的數組pollfds中,

    retval = time_poll(pollfds, n_pollfds, timeout_when, &elapsed);
    if (retval < 0) {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
        VLOG_ERR_RL(&rl, "poll: %s", strerror(-retval));
    } else if (!retval) {
        log_wakeup(timeout_where, NULL, elapsed);
    }
time_poll是一個event demultiplexer,我們後面會分析

    LIST_FOR_EACH_SAFE (pw, next, node, &waiters) {
        if (pw->pollfd->revents) {
            log_wakeup(pw->where, pw->pollfd, 0);
        }
        poll_cancel(pw);
    }

    timeout_when = LLONG_MAX;
    timeout_where = NULL;

    /* Handle any pending signals before doing anything else. */
    fatal_signal_run();
}

time_poll實際上在一個循環裏反覆調用poll,直到給的時間片用完爲止


下面我們以unix domain socket, tcp socket爲例來進行stream這塊代碼的分析

對TCP而言,ptcp_open是pstream_class的listen函數,

/* 'listen' implementation for passive TCP ("ptcp") streams.
 *
 * Opens a passive socket described by 'suffix' using DSCP value 'dscp'.  When
 * inet_open_passive() reports failure with a negative value, that value is
 * negated and returned.  Otherwise a name of the form "ptcp:PORT:IP" is built
 * from the address stored in 'sin' and the result of new_fd_pstream() is
 * returned, with the new pstream handed back through '*pstreamp'.
 *
 * 'name' is unused. */
static int ptcp_open(const char *name OVS_UNUSED, char *suffix, struct pstream **pstreamp, uint8_t dscp) {
    struct sockaddr_in sin;
    char bound_name[128];
    int fd;

    fd = inet_open_passive(SOCK_STREAM, suffix, -1, &sin, dscp);
    if (fd < 0) {
        return -fd;
    }

    /* Bounded formatting: never write past the end of 'bound_name', even if
     * the format or its arguments change in the future. */
    snprintf(bound_name, sizeof bound_name, "ptcp:%"PRIu16":"IP_FMT,
             ntohs(sin.sin_port), IP_ARGS(&sin.sin_addr.s_addr));
    return new_fd_pstream(bound_name, fd, ptcp_accept, NULL, pstreamp);
}

inet_open_passive主要調用socket, setsockopt, bind等創建並配置好socket,new_fd_pstream創建並配置一個fd_pstream結構,這個結構包括了之前提到的struct pstream結構:

/* A passive stream backed by a file descriptor.  It embeds the generic
 * struct pstream as its first member. */
struct fd_pstream {
    struct pstream pstream;     /* Generic pstream part. */
    int fd;                     /* The listening file descriptor. */
    /* Callback given the accepted fd and the peer address; it stores the
     * resulting stream through the last argument (for TCP this is
     * ptcp_accept). */
    int (*accept_cb)(int fd, const struct sockaddr *, size_t sa_len,
                     struct stream **);
    char *unlink_path;          /* NOTE(review): presumably a filesystem path
                                 * to remove on close; ptcp_open passes NULL
                                 * -- confirm. */
};


new_fd_pstream裏,原來pstream裏的pstream_class被換成了fd_pstream_class,這樣原來三個爲NULL的函數指針變成了pfd_close, pfd_accept, pfd_wait

pfd_accept會調用accept接收connect過來的fd,然後調用accept_cb這個callback函數,對於TCP而言,這是ptcp_accept

ptcp_accept會基於這個accept的fd,通過new_tcp_stream把connection封裝成一個stream,裏面調用setsockopt設置了TCP_NODELAY,之後調用new_fd_stream,同樣的原來stream裏的stream_class換成了stream_fd_class,原來爲NULL的函數指針變成了fd_close, fd_connect, fd_recv, fd_send, fd_wait

fd_send, fd_recv都是調用read, write來讀寫緩衝區,我們來看fd_wait,

/* 'wait' implementation for fd-based streams: registers the stream's file
 * descriptor with the poll loop.  Connecting and sending wait for the fd to
 * become writable (POLLOUT); receiving waits for it to become readable
 * (POLLIN). */
static void fd_wait(struct stream *stream, enum stream_wait_type wait)
{
    struct stream_fd *sfd = stream_fd_cast(stream);

    if (wait == STREAM_RECV) {
        poll_fd_wait(sfd->fd, POLLIN);
    } else if (wait == STREAM_CONNECT || wait == STREAM_SEND) {
        poll_fd_wait(sfd->fd, POLLOUT);
    } else {
        NOT_REACHED();
    }
}

可以看出fd_wait只是把fd掛到了poll_waiter全局list裏,之後的事情都交給poll_block去做


對於unix domain socket而言,pstream_class爲punix_pstream_class,其listen函數爲punix_open。

punix_open首先調用make_unix_socket,傳入bind的path,該函數會創建unix domain socket,調用make_sockaddr_un配置好struct sockaddr_un,調用bind_unix_socket把unix domain socket綁定到bind path上的socket文件上;punix_open之後調用listen準備接受connection請求;最後調用new_fd_pstream;和TCP一樣,此時已經不區分pstream_class屬於TCP還是unix domain socket了,而是統一用fd_pstream_class來表示,相關的函數指針爲pfd_close, pfd_accept, pfd_wait


unixctl


對於vswitchd, ovsdb這類的daemon而言,會包含一個unixctl_server,用來接收控制流

/* Control server embedded in a daemon: one listening pstream plus the list
 * of currently accepted control connections. */
struct unixctl_server {
    struct pstream *listener;   /* Passive stream accepting new connections. */
    struct list conns;          /* List of accepted connections
                                 * (NOTE(review): presumably of
                                 * struct unixctl_conn -- confirm). */
};

unixctl_server_create:傳入的path是server的unix domain socket所在的socket文件路徑,之後調用pstream_open,執行了punix_open,並把punix_pstream_class的指針通過listener傳出來,此時daemon已經調用完了listen,可以接收client的連接請求了。

/* One accepted control connection of a unixctl server. */
struct unixctl_conn {
    struct list node;           /* List membership (NOTE(review): presumably
                                 * in unixctl_server's 'conns' -- confirm). */
    struct jsonrpc *rpc;        /* JSON-RPC channel for this connection. */
 
    /* Only one request can be in progress at a time.  While the request is
     * being processed, 'request_id' is populated, otherwise it is null. */
    struct json *request_id;   /* ID of the currently active request. */
};          

unixctl_server_run:調用pstream_accept接收connection請求,實際上調用pfd_accept,如果有新的connection,加入unixctl_server的conns連接list中,並初始化好connection對應的jsonrpc。之後對conns的每個connection調用run_connection

unixctl_server_wait:先把listener放入poll_waiter list,之後是有數據要發送的connection,最後是接收數據的connection

run_connection:先調用jsonrpc_run把緩衝區要發送的數據發送完畢,之後調用jsonrpc_recv接收一個jsonrpc_msg,unixctl_server只處理JSON REQUEST,對請求調用process_command。對於每個command而言,其必須先調用unixctl_command_register註冊之後纔可以被執行,process_command基於command method在hash表裏查找到這個command的配置,最後調用command註冊的callback函數執行


process

process.c是進程對象的utility實現,

/* Bookkeeping for one child process. */
struct process {
    struct list node;           /* List membership (NOTE(review): presumably
                                 * in the all_processes list -- confirm). */
    char *name;                 /* NOTE(review): presumably the program name,
                                 * for messages -- confirm. */
    pid_t pid;                  /* Process ID of the child. */

    /* Modified by signal handler. */
    volatile bool exited;       /* Set once the child has exited. */
    volatile int status;        /* NOTE(review): presumably the waitpid()
                                 * status of the child -- confirm. */
};

process_init:設置非阻塞管道xpipe_nonblocking(fds),設置SIGCHLD的信號處理函數sigchld_handler。struct list all_processes是該進程所有子進程的list,sigchld_handler對每個子進程調用waitpid,得到子進程退出返回的返回碼和狀態,並存到struct process的exited, status成員變量中

process_start:先調用process_prestart,該函數又會調用process_init,除此之外,還會驗證進程的binary是否存在。 之後fork子進程,對於父進程,fork成功之後,把子進程註冊到all_processes的list上,恢復之前被block的SIGCHLD信號就退出了;對於子進程,主要調用execvp切換成真正的binary進程

process_destroy:把struct process* p從all_processes的list中移除

process_wait:如果進程要退出(p->exited),調用poll_immediate_wake,喚醒waiters上的所有fd;否則阻塞在poll_block上

process_run:可以看作process_start >> process_wait >> poll_block >> process_wait >> poll_block的循環


daemon

daemon的啓動參數包括了是否detach, 是否chdir,是否set_pidfile,是否忽略之前的pidfile,是否啓動monitor進程監控該daemon

/* If configured with set_pidfile() or set_detach(), creates the pid file and
 * detaches from the foreground session.
 *
 * This is a convenience wrapper that runs daemonize_start() immediately
 * followed by daemonize_complete(); the order of the two calls matters. */
void 
daemonize(void)
{
    daemonize_start();
    daemonize_complete();
}

先來看daemonize_start

/* If daemonization is configured, then starts daemonization, by forking and
 * returning in the child process.  The parent process hangs around until the
 * child lets it know either that it completed startup successfully (by calling
 * daemonize_complete()) or that it failed to start up (by exiting with a
 * nonzero exit code). */
void
daemonize_start(void)
{
    daemonize_fd = -1;

    if (detach) {
        if (fork_and_wait_for_startup(&daemonize_fd) > 0) {
            /* Running in parent process. */
            exit(0);
        }
        /* Running in daemon or monitor process. */
    }

fork_and_wait_for_startup,調用fork_and_clean_up,該函數是fork的一個封裝,父進程在fork之後取消之前的fatal_signal註冊的callback函數(主要是進程退出時刪除pid文件用的),子進程則在fork之後取消之前父進程的lockfile, timer等。OVS的父子進程之間有個管道機制,子進程啓動成功之後向管道寫一個字符,父進程收到之後認爲子進程啓動成功從而進入waitpid等待其退出

如果是detach模式,fork_and_wait_for_startup返回之後父進程就退出

    if (monitor) {
        int saved_daemonize_fd = daemonize_fd;
        pid_t daemon_pid;

        daemon_pid = fork_and_wait_for_startup(&daemonize_fd);
        if (daemon_pid > 0) {
            /* Running in monitor process. */
            fork_notify_startup(saved_daemonize_fd); 
            close_standard_fds();
            monitor_daemon(daemon_pid);
        } 
        /* Running in daemon process. */
    }   

如果是monitor模式,意味着有一個monitor進程來監控daemon,此時monitor進程成爲daemon的父進程。monitor調用fork_notify_startup,通過之前保存的管道fd(saved_daemonize_fd)寫一個字符,通知正在等待的父進程啓動已經完成,之後調用monitor_daemon開始監控

monitor_daemon在一個死循環裏對daemon_pid調用waitpid,一旦發現daemon進程異常退出,會嘗試重啓daemon:如果退出狀態是WCOREDUMP(status),調用setrlimit(RLIMIT_CORE)關閉coredump,以節省硬盤空間; 如果重啓頻率超出了throttle,sleep一小段時間;最後再調用fork_and_wait_for_startup重新啓動daemon進程

判斷異常退出的方法是通過waitpid返回的status,如果WIFSIGNALED(status)爲true,說明是信號導致的進程退出,這時通過WTERMSIG(status)得到退出前收到的信號,如果屬於SIGABRT, SIGALRM, SIGBUS, SIGFPE, SIGILL, SIGPIPE, SIGSEGV等信號,則需要重新啓動daemon

    
    if (pidfile) {
        make_pidfile();                
    }                    

    /* Make sure that the unixctl commands for vlog get registered in a
     * daemon, even before the first log message. */
    vlog_init();    
}   

最後子進程生成pidfile,初始化vlog


/* If daemonization is configured, then this function notifies the parent
 * process that the child process has completed startup successfully.  It also
 * calls daemonize_post_detach().
 *
 * Only the first call does anything: the 'detached' flag is set on that call
 * and every later call returns immediately. */
void
daemonize_complete(void)
{
    if (detached) {
        return;
    }
    detached = true;

    /* Tell the waiting parent that startup finished, then forget the fd. */
    fork_notify_startup(daemonize_fd);
    daemonize_fd = -1;

    daemonize_post_detach();
}

fork_notify_startup通知父進程啓動成功,此時父進程進入waitpid。daemonize_post_detach則做unix daemon創建例行的事情

/* Performs the usual post-detach steps, but only when detaching is
 * configured: starts a new session, optionally changes the working directory
 * to "/" (when 'chdir_' is set), and closes the standard file descriptors. */
void
daemonize_post_detach(void)
{
    if (!detach) {
        return;
    }

    setsid();
    if (chdir_) {
        /* The result of chdir() is deliberately ignored. */
        ignore(chdir("/"));
    }
    close_standard_fds();
}


jsonrpc

/* Messages. */

/* The kind of a JSON-RPC message; it determines which members of
 * struct jsonrpc_msg are meaningful. */
enum jsonrpc_msg_type {
    JSONRPC_REQUEST,           /* Request. */
    JSONRPC_NOTIFY,            /* Notification. */
    JSONRPC_REPLY,             /* Successful reply. */
    JSONRPC_ERROR              /* Error reply. */
};
/* A single JSON-RPC message.  Which members are used depends on 'type', as
 * noted on each member. */
struct jsonrpc_msg {
    enum jsonrpc_msg_type type; /* Kind of message. */
    char *method;               /* Request or notification only. */
    struct json *params;        /* Request or notification only. */
    struct json *result;        /* Successful reply only. */
    struct json *error;         /* Error reply only. */
    struct json *id;            /* Request or reply only. */
};

/* A JSON-RPC connection layered on top of a stream: input is buffered and
 * parsed into messages, output is queued until it can be sent. */
struct jsonrpc {
    struct stream *stream;      /* Underlying stream. */
    char *name;                 /* NOTE(review): presumably the connection
                                 * name -- confirm. */
    int status;                 /* NOTE(review): presumably 0 or an errno
                                 * value for the connection -- confirm. */

    /* Input. */
    struct byteq input;         /* Buffer for received bytes. */
    struct json_parser *parser; /* Parser fed from 'input'. */
    struct jsonrpc_msg *received;   /* NOTE(review): presumably the most
                                     * recently parsed message -- confirm. */

    /* Output. */
    struct list output;         /* Contains "struct ofpbuf"s. */
    size_t backlog;             /* NOTE(review): presumably the number of
                                 * queued output bytes -- confirm. */
};

jsonrpc的行爲基於stream, pstream,再加上json parsing,如果你對前面的stream, json都比較瞭解,這部分內容還是很簡單的

stream_open_with_default_ports, pstream_open_with_default_ports都通過stream_open, pstream_open來實現

jsonrpc_run,查看jsonrpc->output,把output隊列的消息通過stream_send發送出去

jsonrpc_wait,等待可以發送

jsonrpc_send,調用jsonrpc_msg_to_json把jsonrpc_msg轉化成json,調用json_to_string把json轉成string,創建一個基於該string的ofpbuf,並掛到rpc->output的鏈表上,最後調用jsonrpc_run發送

jsonrpc_recv,jsonrpc用一個環形的buffer來收數據,每次recv首先計算環形buffer還有多少空間,然後盡最大可能收數據;如果此時buffer還有數據,那麼調用json_parser_feed解析數據,如果parse完成(parser->done),調用jsonrpc_received把json轉成jsonrpc_msg

jsonrpc_transact_block,首先調用jsonrpc_send_block把一個jsonrpc_msg request發送出去,再調用jsonrpc_recv_block等待一個reply


jsonrpc_create,創建不同類型的jsonrpc_msg

jsonrpc_msg_from_json, jsonrpc_msg_to_json,json和jsonrpc_msg之間互相轉換


jsonrpc_session代表一個RPC connection

/* A JSON-RPC session: a connection that is (re)established as needed, either
 * actively through 'stream' or passively through 'pstream'. */
struct jsonrpc_session {
    struct reconnect *reconnect;    /* Connection state machine. */
    struct jsonrpc *rpc;            /* Established connection, if any. */
    struct stream *stream;          /* Active connection still being set up;
                                     * cleared once 'rpc' takes it over. */
    struct pstream *pstream;        /* Listener, for passive sessions. */
    unsigned int seqno;             /* Incremented by
                                     * jsonrpc_session_connect(). */
    uint8_t dscp;                   /* DSCP value passed when opening the
                                     * stream or pstream. */
};


這裏出現了一個struct reconnect結構,就是一些配置,狀態,統計信息的集合

/* Connection state machine: configuration, current state and statistics for
 * a connection that is retried after failures. */
struct reconnect {
    /* Configuration. */
    char *name;
    int min_backoff;
    int max_backoff;
    int probe_interval;
    bool passive;               /* True for a listening (passive) session. */
    enum vlog_level info;       /* Used for informational messages. */

    /* State. */
    enum state state;           /* Current state. */
    long long int state_entered;    /* Time at which 'state' was entered
                                     * (NOTE(review): callers appear to pass
                                     * time_msec() values -- confirm unit). */
    int backoff;                /* Delay between consecutive reconnection
                                 * attempts. */
    long long int last_activity;
    long long int last_connected;
    long long int last_disconnected;
    unsigned int max_tries;

    /* These values are simply for statistics reporting, not otherwise used
     * directly by anything internal. */
    long long int creation_time;
    unsigned int n_attempted_connections, n_successful_connections;
    unsigned int total_connected_duration;
    unsigned int seqno;         /* Incremented whenever the FSM switches
                                 * between a connected and a non-connected
                                 * state. */
};

connection的狀態有 S_VOID, S_BACKOFF, S_CONNECTING, S_ACTIVE, S_IDLE, S_RECONNECT, S_LISTENING,其中S_ACTIVE, S_IDLE表示已經連接的狀態。

backoff表示在嘗試重新連接時,連續兩次嘗試之間間隔的時間

reconnect_transition__,是一個狀態轉換的自動機,代碼如下

/* Moves 'fsm' into 'state' at time 'now' and keeps the statistics current:
 *
 *   - Leaving S_CONNECTING counts as one attempted connection, and as a
 *     successful one when the new state is S_ACTIVE.
 *
 *   - When the FSM switches between a connected and a non-connected state,
 *     'seqno' is incremented; on leaving a connected state the time spent
 *     connected is added to 'total_connected_duration'. */
static void reconnect_transition__(struct reconnect *fsm, long long int now,
                       enum state state)
{
    const bool was_connected = is_connected_state(fsm->state);
    const bool now_connected = is_connected_state(state);

    if (fsm->state == S_CONNECTING) {
        ++fsm->n_attempted_connections;
        fsm->n_successful_connections += (state == S_ACTIVE) ? 1 : 0;
    }

    if (was_connected != now_connected) {
        if (was_connected) {
            fsm->total_connected_duration += now - fsm->last_connected;
        }
        ++fsm->seqno;
    }

    VLOG_DBG("%s: entering %s", fsm->name, reconnect_state_name__(state));
    fsm->state_entered = now;
    fsm->state = state;
}


jsonrpc_session_open,初始化jsonrpc_session結構,解析name,如果是類似tcp:127.1.2.3,表示是一個active connection,如果是ptcp:127.1.2.3,表示是一個passive connection,用來listen新的connection。如果是passive connection,會調用reconnect_set_passive。最後調用reconnect_set_probe_interval,設置keepalive

jsonrpc_session_close,依次調用jsonrpc_close, reconnect_destroy, stream_close, pstream_close

jsonrpc_session_connect,如果s->reconnect是主動連接,那麼調用jsonrpc_stream_open連接connection,並把狀態變爲S_CONNECTING;否則如果s->pstream爲空,那麼調用jsonrpc_pstream_open開啓listen,同時把狀態變爲S_LISTENING

static void jsonrpc_session_connect(struct jsonrpc_session *s)
{   
    const char *name = reconnect_get_name(s->reconnect);
    int error;

    jsonrpc_session_disconnect(s);
    if (!reconnect_is_passive(s->reconnect)) {
        error = jsonrpc_stream_open(name, &s->stream, s->dscp);
        if (!error) {
            reconnect_connecting(s->reconnect, time_msec());
        }  
    } else {
        error = s->pstream ? 0 : jsonrpc_pstream_open(name, &s->pstream,
                                                      s->dscp);
        if (!error) {
            reconnect_listening(s->reconnect, time_msec());
        }
    }   

    if (error) {
        reconnect_connect_failed(s->reconnect, time_msec(), error);
    }
    s->seqno++;
}


jsonrpc_session_disconnect,close已有s->stream, s->rpc結構

jsonrpc_session_run,首先,如果s->pstream不爲空,調用pstream_accept接收新的connection,如果此時jsonrpc_session已經有了connection,那麼用新的替換老的

        error = pstream_accept(s->pstream, &stream);
        if (!error) {
            if (s->rpc || s->stream) {
                VLOG_INFO_RL(&rl,
                             "%s: new connection replacing active connection",
                             reconnect_get_name(s->reconnect));
                jsonrpc_session_disconnect(s);
            }
            reconnect_connected(s->reconnect, time_msec());
            s->rpc = jsonrpc_open(stream);
        }

如果s->rpc不爲空,會把數據發送出去,否則重新嘗試連接s->stream。

        stream_run(s->stream);
        error = stream_connect(s->stream);
        if (!error) {
            reconnect_connected(s->reconnect, time_msec());
            s->rpc = jsonrpc_open(s->stream);
            s->stream = NULL;
        } else if (error != EAGAIN) {
            reconnect_connect_failed(s->reconnect, time_msec(), error);
            stream_close(s->stream);
            s->stream = NULL;
        }


jsonrpc_session_send,調用jsonrpc_send發送數據

jsonrpc_session_recv,調用jsonrpc_recv接收一個jsonrpc_msg,如果不是echo之類的測試報文,返回這個jsonrpc_msg



發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章