先來介紹幾個地址結構.
struct sockaddr 其實相當於一個基類的地址結構,其他的結構都能夠直接轉到sockaddr.舉個例子比如當sa_family爲PF_INET時,sa_data就包含了端口號和ip地址(in_addr結構).
- struct sockaddr { /* generic socket address; sa_family selects how sa_data is interpreted */
- sa_family_t sa_family; /* address family, AF_xxx */
- char sa_data[14]; /* 14 bytes of protocol address */
- };
接下來就是sockaddr_in ,它表示了所有的ipv4的地址結構.可以看到他也就相當於sockaddr 的一個子類.
- struct sockaddr_in { /* IPv4 socket address: family, port and in_addr, padded to __SOCK_SIZE__ bytes */
- sa_family_t sin_family; /* Address family */
- __be16 sin_port; /* Port number (big-endian, per the __be16 type) */
- struct in_addr sin_addr; /* Internet address */
- /* Pad to size of `struct sockaddr'. */
- unsigned char __pad[__SOCK_SIZE__ - sizeof(short int) -
- sizeof(unsigned short int) - sizeof(struct in_addr)];
- };
這裏還有一個內核中比較新的地址結構sockaddr_storage,他可以容納所有類型的套接口地址結構,比如ipv4,ipv6..可以看到,相比於sockaddr,它是強制對齊的.
- struct __kernel_sockaddr_storage { /* container sized and aligned to hold any protocol's socket address */
- unsigned short ss_family; /* address family */
- /// Opaque payload: each protocol lays out its own address structure here.
- char __data[_K_SS_MAXSIZE - sizeof(unsigned short)];
- /* space to achieve desired size, */
- /* _SS_MAXSIZE value minus size of ss_family */
- } __attribute__ ((aligned(_K_SS_ALIGNSIZE))); /* force desired alignment */
接下來看幾個和bind相關的數據結構:
第一個是inet_hashinfo,它主要用來管理 tcp的bind hash bucket(在tcp的初始化函數中會將tcp_hashinfo初始化.然後在tcp_prot中會將tcp_hashinfo賦給結構體h,然後相應的我們就可以通過sock中的sock_common域來存取這個值).後面我們會分析這個流程.
- struct inet_hashinfo { /* per-protocol socket hash tables: established (ehash), bound ports (bhash), listening */
- /* This is for sockets with full identity only. Sockets here will
- * always be without wildcards and will have the following invariant:
- *
- * TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
- *
- * TIME_WAIT sockets use a separate chain (twchain).
- */
- /// Array of inet_ehash_bucket entries, each holding a normal chain and a TIME_WAIT chain.
- struct inet_ehash_bucket *ehash;
- rwlock_t *ehash_locks;
- unsigned int ehash_size;
- unsigned int ehash_locks_mask;
- /* Ok, let's try this, I give up, we do need a local binding
- * TCP hash as well as the others for fast bind/connect.
- */
- /// Records every port currently in use. bhash is a hash table whose chains hold inet_bind_bucket entries.
- struct inet_bind_hashbucket *bhash;
- unsigned int bhash_size;
- /* Note : 4 bytes padding on 64 bit arches */
- /* All sockets in TCP_LISTEN state will be in here. This is the only
- * table where wildcard'd TCP sockets can exist. Hash function here
- * is just local port number.
- */
- /// listening_hash holds all sockets that are in the listen state.
- struct hlist_head listening_hash[INET_LHTABLE_SIZE];
- /* All the above members are written once at bootup and
- * never written again _or_ are predominantly read-access.
- *
- * Now align to a new cache line as all the following members
- * are often dirty.
- */
- rwlock_t lhash_lock ____cacheline_aligned;
- atomic_t lhash_users;
- wait_queue_head_t lhash_wait;
- struct kmem_cache *bind_bucket_cachep; /* cache passed to inet_bind_bucket_create() when a new bucket is needed */
- };
struct inet_ehash_bucket管理所有的tcp狀態在TCP_ESTABLISHED和TCP_CLOSE之間的socket.這裏要注意,twchain表示處於TIME_WAIT的socket.
- struct inet_ehash_bucket { /* one bucket of inet_hashinfo.ehash */
- struct hlist_head chain; /* sockets with TCP_ESTABLISHED <= sk_state < TCP_CLOSE */
- struct hlist_head twchain; /* sockets in TIME_WAIT */
- };
inet_bind_bucket結構就是每個使用的端口的信息,最終會把它鏈接到bhash鏈表中.
- struct inet_bind_bucket { /* per-port bind record, linked into an inet_hashinfo.bhash chain */
- struct net *ib_net; /* compared against sock_net(sk) when searching for a port's bucket */
- /// Port number.
- unsigned short port;
- /// Whether this port may be shared (reused) by additional sockets.
- signed short fastreuse;
- /// Link to the next inet_bind_bucket in the same hash chain.
- struct hlist_node node;
- /// List of the sockets that are using this port.
- struct hlist_head owners;
- };
最後一個結構是tcp_hashinfo,他在 tcp_init中被初始化,而tcp_init是在inet_init中被調用的.然後tcp_hashinfo會被賦值給tcp_prot的h.hashinfo域,之後就可以通過sock的sk_prot域(sk->sk_prot->h.hashinfo)來存取它.
- struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { /* TCP's inet_hashinfo instance; only the listening-hash lock, user count and wait queue are set statically here */
- .lhash_lock = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
- .lhash_users = ATOMIC_INIT(0),
- .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
- }; /* NOTE(review): the remaining members are reportedly filled in by tcp_init (not shown) */
然後來看bind的實現,bind對應的系統調用是sys_bind:
- asmlinkage long sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) /* bind system call: resolve fd to a socket, copy the address in, run the security hook, then call the socket's bind op; returns 0 or a negative error */
- {
- struct socket *sock;
- struct sockaddr_storage address;
- int err, fput_needed;
- /// Look up the socket for fd; on failure sock is NULL and err already holds the error code.
- sock = sockfd_lookup_light(fd, &err, &fput_needed);
- if (sock) {
- /// Copy the address from user space into the kernel-side buffer.
- err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address);
- if (err >= 0) {
- err = security_socket_bind(sock,
- (struct sockaddr *)&address,
- addrlen);
- if (!err)
- /// Dispatch to the socket's bind operation (inet_bind for inet sockets).
- err = sock->ops->bind(sock,
- (struct sockaddr *)
- &address, addrlen);
- }
- /// Drop the reference on the socket's file structure that was taken during the lookup.
- fput_light(sock->file, fput_needed);
- }
- return err;
- }
sockfd_lookup_light主要是查找fd對應的socket
- static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed) /* map fd to its socket; returns NULL with *err set on failure */
- {
- struct file *file;
- struct socket *sock;
- *err = -EBADF; /* default error when fd has no file */
- /// Get the file structure that corresponds to fd.
- file = fget_light(fd, fput_needed);
- if (file) {
- /// Per the article, the socket was stored in the file's private data when the fd was created (sock_map_fd via sock_attach_fd), so it can be returned directly.
- sock = sock_from_file(file, err);
- if (sock)
- return sock;
- fput_light(file, *fput_needed); /* no socket behind this file: release the file reference */
- }
- return NULL;
- }
然後來看inet_bind的實現.
- int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* bind an inet socket to the address/port in uaddr; returns 0 or a negative errno */
- {
- /// Fetch the address to bind, plus the sock and inet_sock views of the socket.
- struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
- struct sock *sk = sock->sk;
- struct inet_sock *inet = inet_sk(sk);
- unsigned short snum;
- int chk_addr_ret;
- int err;
- /* If the socket has its own bind function then use it. (RAW) */
- if (sk->sk_prot->bind) {
- err = sk->sk_prot->bind(sk, uaddr, addr_len);
- goto out;
- }
- err = -EINVAL;
- if (addr_len < sizeof(struct sockaddr_in))
- goto out;
- /// Classify the address (local, multicast, broadcast, ...).
- chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
- err = -EADDRNOTAVAIL;
- /// Reject a non-local address unless nonlocal bind or freebind is enabled, or the address is INADDR_ANY, local, multicast or broadcast.
- if (!sysctl_ip_nonlocal_bind &&
- !inet->freebind &&
- addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
- chk_addr_ret != RTN_LOCAL &&
- chk_addr_ret != RTN_MULTICAST &&
- chk_addr_ret != RTN_BROADCAST)
- goto out;
- /// Requested port number, converted to host byte order.
- snum = ntohs(addr->sin_port);
- err = -EACCES;
- /// Non-zero ports below PROT_SOCK (1024 per the article) require CAP_NET_BIND_SERVICE; capable() performs the privilege check.
- if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
- goto out;
- /* We keep a pair of addresses. rcv_saddr is the one
- * used by hash lookups, and saddr is used for transmit.
- *
- * In the BSD API these are the same except where it
- * would be illegal to use them (multicast/broadcast) in
- * which case the sending device address is used.
- */
- lock_sock(sk);
- /* Check these errors (active socket, double bind). */
- err = -EINVAL;
- /// The socket must still be in TCP_CLOSE (otherwise it is active) and must not already have a local port (non-zero inet->num means it was bound before).
- if (sk->sk_state != TCP_CLOSE || inet->num)
- goto out_release_sock;
- /// Record the address: rcv_saddr is the source address used for hash lookups, saddr is the source address used for transmit.
- inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
- /// For multicast or broadcast addresses, clear saddr.
- if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
- inet->saddr = 0; /* Use device */
- /// get_port checks whether the requested port may be used; for TCP it is inet_csk_get_port. Non-zero means failure.
- if (sk->sk_prot->get_port(sk, snum)) {
- inet->saddr = inet->rcv_saddr = 0;
- err = -EADDRINUSE;
- goto out_release_sock;
- }
- /// Remember in sk_userlocks that the address and/or port were explicitly chosen by the caller. NOTE(review): how these flags are consumed is not shown here.
- if (inet->rcv_saddr)
- sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
- if (snum)
- sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
- /// Set the source port from inet->num, converted to network byte order.
- inet->sport = htons(inet->num);
- /// Destination address and port are not known yet; set them to 0 for now.
- inet->daddr = 0;
- inet->dport = 0;
- sk_dst_reset(sk);
- err = 0;
- out_release_sock:
- release_sock(sk);
- out:
- return err;
- }
這裏我先來介紹下inet_csk_get_port的流程.
當綁定的port爲0時,這時也就是說需要kernel來分配一個新的port.
1 首先得到系統的port範圍.
2 隨機分配一個port.
3 從bhash中得到當前隨機分配的端口的鏈表(也就是inet_bind_bucket鏈表).
4 遍歷這個鏈表(鏈表爲空的話,也說明這個port沒有被使用),如果這個端口已經被使用,則將端口號加一,繼續循環,直到找到當前沒有被使用的port,也就是沒有在bhash中存在的port.
5 新建一個inet_bind_bucket,並插入到bhash中.
當指定port時.
1 從bhash中根據hash值(port計算的)取得當前指定端口對應的inet_bind_bucket結構.
2 如果bhash中存在,則說明,這個端口已經在使用,因此需要判斷這個端口是否允許被reuse.
3 如果不存在,則步驟和上面的第5步一樣.
- int inet_csk_get_port(struct sock *sk, unsigned short snum) /* reserve local port snum for sk, or pick a free port when snum is 0; returns 0 on success, 1 on failure */
- {
- struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
- struct inet_bind_hashbucket *head;
- struct hlist_node *node;
- struct inet_bind_bucket *tb;
- int ret;
- struct net *net = sock_net(sk);
- local_bh_disable();
- if (!snum) {
- /// Port 0: the kernel has to pick a port.
- int remaining, rover, low, high;
- /// Get the local port range.
- inet_get_local_port_range(&low, &high);
- remaining = (high - low) + 1;
- rover = net_random() % remaining + low;
- /// Loop until a currently unused port is found or the range is exhausted.
- do {
- /// Hash the candidate port to find its bind-hash bucket.
- head = &hashinfo->bhash[inet_bhashfn(net, rover,
- hashinfo->bhash_size)];
- spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, node, &head->chain)
- if (tb->ib_net == net && tb->port == rover)
- /// This port is already in use: move on to the next candidate.
- goto next;
- break; /* no bucket matched rover: leave the loop with head->lock still held */
- next:
- spin_unlock(&head->lock);
- /// Wrap around to low after passing high; the search started at a random point, so lower ports may not have been tried yet.
- if (++rover > high)
- rover = low;
- } while (--remaining > 0);
- /* Exhausted local port range during search? It is not
- * possible for us to be holding one of the bind hash
- * locks if this test triggers, because if 'remaining'
- * drops to zero, we broke out of the do/while loop at
- * the top level, not from the 'break;' statement.
- */
- ret = 1;
- if (remaining <= 0)
- goto fail;
- /// The port that will be allocated.
- snum = rover;
- } else {
- /// Explicit port: same lookup as above, but done only once.
- head = &hashinfo->bhash[inet_bhashfn(net, snum,
- hashinfo->bhash_size)];
- spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, node, &head->chain)
- if (tb->ib_net == net && tb->port == snum)
- goto tb_found;
- }
- tb = NULL;
- goto tb_not_found;
- tb_found:
- /// The port already has a bucket; handle the case where it has owning sockets.
- if (!hlist_empty(&tb->owners)) {
- /// fastreuse > 0 means the existing owners allow sharing, and sk_reuse means this socket allows it too. The shortcut applies only when this socket is NOT in TCP_LISTEN.
- if (tb->fastreuse > 0 &&
- sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
- goto success;
- } else {
- ret = 1;
- /// Otherwise ask bind_conflict: per the article, owners bound to different IP addresses do not conflict, so the port may still be usable.
- if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb))
- goto fail_unlock;
- }
- }
- tb_not_found:
- ret = 1;
- /// If no bucket was found, create a new inet_bind_bucket linked into bhash.
- if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
- net, head, snum)) == NULL)
- goto fail_unlock;
- if (hlist_empty(&tb->owners)) {
- /// First owner decides fastreuse: it is set only when the socket has sk_reuse and is not in TCP_LISTEN.
- if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
- tb->fastreuse = 1;
- else
- tb->fastreuse = 0;
- } else if (tb->fastreuse &&
- (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
- tb->fastreuse = 0;
- success:
- /// Add this socket to the port's owners list, unless it is already attached to a bucket.
- if (!inet_csk(sk)->icsk_bind_hash)
- inet_bind_hash(sk, tb, snum);
- WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
- ret = 0;
- fail_unlock:
- spin_unlock(&head->lock);
- fail:
- local_bh_enable();
- return ret;
- }
在看listen的代碼之前.我們也先來看相關的數據結構:
其中inet_connection_sock我們先前已經介紹過了,它包含了一個icsk_accept_queue的域,這個域是request_sock_queue類型.我們就先來看這個結構:
request_sock_queue也就表示一個request_sock隊列.這裏我們知道,tcp中分爲半連接隊列(處於SYN_RECVD狀態)和已完成連接隊列(處於established狀態).這兩個一個是剛接到syn,等待三次握手完成,一個是已經完成三次握手,等待accept來讀取.
這裏每個syn分節到來都會新建一個request_sock結構,並將它加入到listen_sock的request_sock hash表中.然後3次握手完畢後,將它放入到request_sock_queue的rskq_accept_head和rskq_accept_tail隊列中.這樣當accept的時候就直接從這個隊列中讀取了.
- struct request_sock_queue { /* accept queue of a listening socket plus a pointer to its listen_sock */
- /// Head and tail pointers of the accept queue.
- struct request_sock *rskq_accept_head;
- struct request_sock *rskq_accept_tail;
- rwlock_t syn_wait_lock; /* taken with write_lock_bh() when listen_opt is assigned in reqsk_queue_alloc() */
- u8 rskq_defer_accept;
- /* 3 bytes hole, try to pack */
- /// The associated listen_sock structure.
- struct listen_sock *listen_opt;
- };
listen_sock 表示一個處於listening狀態的socket.
- struct listen_sock { /* listening-socket state: half-open request table and its counters */
- /// log_2 of maximal queued SYNs/REQUESTs.
- u8 max_qlen_log;
- /* 3 bytes hole, try to use */
- /// Current length of the half-open (SYN) queue.
- int qlen;
- /// Per the article: also a half-open queue length, but decremented when a request's SYN/ACK is retransmitted for the first time.
- int qlen_young;
- int clock_hand;
- u32 hash_rnd;
- /// Maximum size of the SYN backlog (half-open queue), i.e. the number of syn_table slots.
- u32 nr_table_entries;
- /// The half-open connection table.
- struct request_sock *syn_table[0];
- };
最後來看下request_sock,它保存了tcp雙方傳輸所必需的一些域,比如窗口大小,對端速率,對端數據包序列號等等這些值.
- struct request_sock { /* one pending connection request, linked through dl_next */
- struct request_sock *dl_next; /* Must be first member! */
- /// MSS value.
- u16 mss;
- u8 retrans;
- u8 cookie_ts; /* syncookie: encode tcpopts in timestamp */
- /* The following two fields can be easily recomputed I think -AK */
- u32 window_clamp; /* window clamp at creation time */
- /// Window size.
- u32 rcv_wnd; /* rcv_wnd offered first time */
- u32 ts_recent;
- unsigned long expires;
- /// Operations table for this request; per the article it includes the routine that sends the ACK.
- const struct request_sock_ops *rsk_ops;
- struct sock *sk;
- u32 secid;
- u32 peer_secid;
- };
listen的對應的系統調用是sys_listen,它首先通過sockfd_lookup_light查找到相應的socket,然後調用inet_listen,大體流程和bind差不多,只不過中間調用的是inet_listen罷了.
這裏還有一個概念那就是backlog,在linux中,backlog的大小指的是已完成連接隊列的大小.而不是和半連接隊列之和.而半開連接的大小一般是和backlog差不多大小.
而半開連接隊列的最大長度是根據backlog計算的,我們後面會介紹這個.
因此我們直接來看inet_listen的實現,這個函數主要是進行一些合法性判斷,然後調用inet_csk_listen_start來對相關域進行處理:
- int inet_listen(struct socket *sock, int backlog) /* validate the socket, start listening if needed, and record the backlog; returns 0 or a negative errno */
- {
- struct sock *sk = sock->sk;
- unsigned char old_state;
- int err;
- lock_sock(sk);
- err = -EINVAL;
- /// The socket must be unconnected and of type SOCK_STREAM.
- if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
- goto out;
- old_state = sk->sk_state;
- /// The TCP state must be CLOSE or LISTEN.
- if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
- goto out;
- /* Really, if the socket is already in listen state
- * we can only allow the backlog to be adjusted.
- */
- /// Not listening yet: do the actual transition to the listen state.
- if (old_state != TCP_LISTEN) {
- err = inet_csk_listen_start(sk, backlog);
- if (err)
- goto out;
- }
- /// Store backlog in sk_max_ack_backlog, the maximum length of the completed-connection queue.
- sk->sk_max_ack_backlog = backlog;
- err = 0;
- out:
- release_sock(sk);
- return err;
- }
然後來看inet_csk_listen_start的實現.
它的主要工作是新分配一個listen_sock結構,將它賦給inet_connection_sock的icsk_accept_queue域的listen_opt.然後對當前使用端口進行判斷.最終返回:
- int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) /* allocate the request queue, switch sk to TCP_LISTEN and hash it; returns 0, the allocation error, or -EADDRINUSE */
- {
- struct inet_sock *inet = inet_sk(sk);
- struct inet_connection_sock *icsk = inet_csk(sk);
- /// Allocate the listen_sock for the accept queue.
- int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
- if (rc != 0)
- return rc;
- /// Reset both ack backlog fields to 0 first.
- sk->sk_max_ack_backlog = 0;
- sk->sk_ack_backlog = 0;
- inet_csk_delack_init(sk);
- /* There is race window here: we announce ourselves listening,
- * but this transition is still not validated by get_port().
- * It is OK, because this socket enters to hash table only
- * after validation is complete.
- */
- /// Set the state.
- sk->sk_state = TCP_LISTEN;
- /// Re-validate the port with get_port; per the article this guards against another thread having changed the port's state before listen was called.
- if (!sk->sk_prot->get_port(sk, inet->num)) {
- // Port is usable: store it in sport and add the socket to the listening_hash table of inet_hashinfo.
- inet->sport = htons(inet->num);
- sk_dst_reset(sk);
- /// Per the article, this is implemented by __inet_hash.
- sk->sk_prot->hash(sk);
- return 0;
- }
- /// Port is not usable: restore TCP_CLOSE, destroy the request queue and return an error.
- sk->sk_state = TCP_CLOSE;
- __reqsk_queue_destroy(&icsk->icsk_accept_queue);
- return -EADDRINUSE;
- }
最後我們來看下reqsk_queue_alloc的實現:
- /// Upper bound on the half-open connection queue length.
- int sysctl_max_syn_backlog = 256;
- int reqsk_queue_alloc(struct request_sock_queue *queue, /* allocate and attach a zeroed listen_sock sized from nr_table_entries; returns 0 or -ENOMEM */
- unsigned int nr_table_entries)
- {
- size_t lopt_size = sizeof(struct listen_sock);
- struct listen_sock *lopt;
- /// Take the smaller of nr_table_entries (the backlog passed to listen) and sysctl_max_syn_backlog.
- nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
- /// nr_table_entries may not be smaller than 8.
- nr_table_entries = max_t(u32, nr_table_entries, 8);
- /// Round (nr_table_entries + 1) up to a power of two.
- nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
- /// Final allocation size: the listen_sock header plus one request_sock pointer per table entry.
- lopt_size += nr_table_entries * sizeof(struct request_sock *);
- if (lopt_size > PAGE_SIZE)
- lopt = __vmalloc(lopt_size,
- GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
- PAGE_KERNEL);
- else
- lopt = kzalloc(lopt_size, GFP_KERNEL);
- if (lopt == NULL)
- return -ENOMEM;
- /// Compute max_qlen_log: at least 3, otherwise the smallest value for which (1 << max_qlen_log) >= nr_table_entries.
- for (lopt->max_qlen_log = 3;
- (1 << lopt->max_qlen_log) < nr_table_entries;
- lopt->max_qlen_log++);
- get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
- rwlock_init(&queue->syn_wait_lock);
- queue->rskq_accept_head = NULL;
- /// Record the table size in the listen_sock.
- lopt->nr_table_entries = nr_table_entries;
- write_lock_bh(&queue->syn_wait_lock);
- /// Attach the listen_sock to queue->listen_opt.
- queue->listen_opt = lopt;
- write_unlock_bh(&queue->syn_wait_lock);
- return 0;
- }