Data structures
/**
* struct socket - general BSD socket
* @state: socket state (%SS_CONNECTED, etc)
* @flags: socket flags (%SOCK_ASYNC_NOSPACE, etc)
* @ops: protocol specific socket operations
* @fasync_list: Asynchronous wake up list
* @file: File back pointer for gc
* @sk: internal networking protocol agnostic socket representation
* @wait: wait queue for several uses
* @type: socket type (%SOCK_STREAM, etc)
*/
struct socket {
socket_state state; // socket state
unsigned long flags; // socket flag bits
const struct proto_ops *ops; // the socket's operation table
struct fasync_struct *fasync_list; // asynchronous wakeup list
struct file *file; // back pointer to the associated file
struct sock *sk; // pointer to the protocol-specific sock structure
wait_queue_head_t wait; // wait queue
short type; // socket type
};
As this structure shows, socket is the common, protocol-independent part of a BSD socket, while its sock member holds everything specific to the protocol in use. Think of sock as having been abstracted out of socket: a sock structure is attached to the socket according to the chosen protocol. Let's look at struct sock next.
struct sock {
/*
* Now struct inet_timewait_sock also uses sock_common, so please just
* don't add nothing before this first member (__sk_common) --acme
*/
struct sock_common __sk_common; // shared with inet_timewait_sock
#define sk_family __sk_common.skc_family // address family
#define sk_state __sk_common.skc_state // connection state
#define sk_reuse __sk_common.skc_reuse // whether address reuse is allowed
#define sk_bound_dev_if __sk_common.skc_bound_dev_if // bound device ID
#define sk_node __sk_common.skc_node // link into the main hash table
#define sk_bind_node __sk_common.skc_bind_node // link into the bind hash table
#define sk_refcnt __sk_common.skc_refcnt // reference count
#define sk_hash __sk_common.skc_hash // hash value
#define sk_prot __sk_common.skc_prot // protocol operation table
#define sk_net __sk_common.skc_net // owning network namespace
unsigned char sk_shutdown : 2, // shutdown state, mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
sk_no_check : 2, // whether to skip packet checksumming
sk_userlocks : 4; // user locks, %SO_SNDBUF and %SO_RCVBUF settings
unsigned char sk_protocol; // which protocol of the family this socket uses
unsigned short sk_type; // socket type, e.g. SOCK_STREAM
int sk_rcvbuf; // receive buffer size (bytes)
socket_lock_t sk_lock; // used for synchronization
/*
* The backlog queue is special, it is always used with
* the per-socket spinlock held and requires low latency
* access. Therefore we special case it's implementation.
*/
struct {
struct sk_buff *head; // first packet received
struct sk_buff *tail; // last packet received
} sk_backlog; // the backlog queue
wait_queue_head_t *sk_sleep; // the sock's wait queue
struct dst_entry *sk_dst_cache; // cached route
struct xfrm_policy *sk_policy[2]; // flow policies
rwlock_t sk_dst_lock; // route cache lock
atomic_t sk_rmem_alloc; // bytes committed to the receive queue
atomic_t sk_wmem_alloc; // bytes committed to the send queue
atomic_t sk_omem_alloc; // bytes committed to options/other
int sk_sndbuf; // send buffer size
struct sk_buff_head sk_receive_queue; // receive queue (packets received)
struct sk_buff_head sk_write_queue; // send queue (packets being sent)
struct sk_buff_head sk_async_wait_queue; // packets copied by DMA TODO
int sk_wmem_queued; // memory used by all queued packets
int sk_forward_alloc; // memory available for allocation
gfp_t sk_allocation; // allocation mode
int sk_route_caps; // route capability flags
int sk_gso_type; // GSO (generic segmentation offload) type TODO
unsigned int sk_gso_max_size; // maximum size when building GSO segments
int sk_rcvlowat; // SO_RCVLOWAT setting
unsigned long sk_flags; // SO_BROADCAST, SO_KEEPALIVE, SO_OOBINLINE, SO_LINGER settings
unsigned long sk_lingertime; // linger time, governs close behavior
struct sk_buff_head sk_error_queue; // queue of error packets
struct proto *sk_prot_creator; // protocol that created this sock
rwlock_t sk_callback_lock; // lock used by bottom-half handlers
int sk_err, // last error code
sk_err_soft; // errors that don't cause failure
atomic_t sk_drops; // raw socket drop counter
unsigned short sk_ack_backlog; // current number of connections waiting to be accepted
unsigned short sk_max_ack_backlog; // maximum backlog set in listen()
__u32 sk_priority; // priority
struct ucred sk_peercred; // SO_PEERCRED setting
long sk_rcvtimeo; // SO_RCVTIMEO receive timeout
long sk_sndtimeo; // SO_SNDTIMEO send timeout
struct sk_filter *sk_filter; // the sock's packet filter
void *sk_protinfo; // private area, protocol-defined when not using slab caches
struct timer_list sk_timer; // the sock's flush timer
ktime_t sk_stamp; // time the last packet was received
struct socket *sk_socket; // pointer back to the owning socket
void *sk_user_data; // data for users such as rpc
struct page *sk_sndmsg_page; // cached page for data being sent
struct sk_buff *sk_send_head; // head of the packets to send
__u32 sk_sndmsg_off; // offset of the end of data within sk_sndmsg_page
int sk_write_pending; // pending writes
void *sk_security; // security module data
__u32 sk_mark; // generic packet mark
/* XXX 4 bytes hole on 64 bit */
void (*sk_state_change)(struct sock *sk); // called when the sock's state changes
void (*sk_data_ready)(struct sock *sk, int bytes); // called when data arrives for processing
void (*sk_write_space)(struct sock *sk); // called when send space becomes available
void (*sk_error_report)(struct sock *sk); // called to report errors
int (*sk_backlog_rcv)(struct sock *sk, // processes backlog packets
struct sk_buff *skb);
void (*sk_destruct)(struct sock *sk); // the sock's destructor
};
The parts shared with the application live in the socket structure, while the protocol-specific parts live in the sock structure; the two are then hooked together — a flexible and elegant design.
Inside a sock, packets take the form of sk_buff: every protocol uses the sk_buff structure to encapsulate and carry its packets. Its layout:
struct sk_buff {
/* These two members must be first. */
struct sk_buff *next; // next packet in the queue
struct sk_buff *prev; // previous packet in the queue
struct sock *sk; // the sock this packet belongs to
ktime_t tstamp; // time the packet arrived
struct net_device *dev; // network device that received the packet
union {
struct dst_entry *dst; // route entry
struct rtable *rtable; // routing table entry
};
struct sec_path *sp; // security path, used by xfrm
/*
* This is the control buffer. It is free to use for every
* layer. Please put your private variables there. If you
* want to keep them across layers you have to do a skb_clone()
* first. This is owned by whoever has the skb queued ATM.
*/
char cb[48]; // the control buffer
unsigned int len, // total length of all the data
data_len; // length of the fragmented/scattered data
__u16 mac_len, // length of the link-layer header
hdr_len; // writable header length for cloned packets
union {
__wsum csum; // checksum
struct {
__u16 csum_start; // start of the checksummed region, relative to skb->head
__u16 csum_offset; // where the checksum is stored, relative to csum_start
};
};
__u32 priority; // queueing priority of the packet
__u8 local_df:1, // whether local fragmentation is allowed
cloned:1, // whether the packet has been cloned
ip_summed:2, // IP checksum status flags
nohdr:1, // while in transit, the header must not be modified
nfctinfo:3; // the packet's conntrack relationship
__u8 pkt_type:3, // packet type
fclone:2, // the packet's clone relationship
ipvs_property:1, // the packet belongs to ipvs
peeked:1, // the packet has already been peeked at
nf_trace:1; // netfilter trace flag
__be16 protocol; // packet protocol reported by the driver
void (*destructor)(struct sk_buff *skb); // function that destroys the packet
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct nf_conntrack *nfct;
struct sk_buff *nfct_reasm;
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
struct nf_bridge_info *nf_bridge; // bridge netfilter data
#endif
int iif;
#ifdef CONFIG_NETDEVICES_MULTIQUEUE
__u16 queue_mapping;
#endif
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
__u16 tc_verd; /* traffic control verdict */
#endif
#endif
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2;
#endif
/* 14 bit hole */
#ifdef CONFIG_NET_DMA
dma_cookie_t dma_cookie;
#endif
#ifdef CONFIG_NETWORK_SECMARK
__u32 secmark;
#endif
__u32 mark;
sk_buff_data_t transport_header; // transport-layer header within the data
sk_buff_data_t network_header; // network-layer header within the data
sk_buff_data_t mac_header; // link-layer header within the data
/* These elements must be at the end, see alloc_skb() for details. */
sk_buff_data_t tail; // end of the data
sk_buff_data_t end; // end of the buffer
unsigned char *head, // start of the buffer
*data; // start of the data
unsigned int truesize; // true size of the packet (structure size plus data size)
atomic_t users; // reference count
};
So far we have the shared part, struct socket; the generic part, struct sock; and the protocol-specific part, struct inet_sock.
tcp_sock is tied closely to the TCP protocol itself; let's look at its contents:
struct tcp_sock {
/* inet_connection_sock has to be the first member of tcp_sock */
struct inet_connection_sock inet_conn; // per the comment, this must be the first member of tcp_sock — TODO why?
u16 tcp_header_len; /* Bytes of tcp header to send */
u16 xmit_size_goal; /* Goal for segmenting output packets */
/*
* Header prediction flags
* 0x5?10 << 16 + snd_wnd in net byte order
*/
__be32 pred_flags;
/*
* RFC793 variables by their proper names. This means you can
* read the code and the spec side by side (and laugh ...)
* See RFC793 and RFC1122. The RFC writes these in capitals.
*/
u32 rcv_nxt; /* What we want to receive next */
u32 copied_seq; /* Head of yet unread data */
u32 rcv_wup; /* rcv_nxt on last window update sent */
u32 snd_nxt; /* Next sequence we send */
u32 snd_una; /* First byte we want an ack for */
u32 snd_sml; /* Last byte of the most recently transmitted small packet */
u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */
u32 lsndtime; /* timestamp of last sent data packet (for restart window) */
/* Data for direct copy to user */
struct {
struct sk_buff_head prequeue; // the prequeue
struct task_struct *task; // task that will process the prequeue
struct iovec *iov; // the user program's receive buffer
int memory; // memory accounted to prequeued packets
int len; // prequeue length
#ifdef CONFIG_NET_DMA
/* members for async copy */
struct dma_chan *dma_chan;
int wakeup;
struct dma_pinned_list *pinned_list;
dma_cookie_t dma_cookie;
#endif
} ucopy;
u32 snd_wl1; /* Sequence for window update */
u32 snd_wnd; /* The window we expect to receive */
u32 max_window; /* Maximal window ever seen from peer */
u32 mss_cache; /* Cached effective mss, not including SACKS — TODO mss, SACKs */
u32 window_clamp; /* Maximal window to advertise */
u32 rcv_ssthresh; /* Current window clamp */
u32 frto_highmark; /* snd_nxt when RTO occurred */
u8 reordering; /* Packet reordering metric. */
u8 frto_counter; /* Number of new acks after RTO */
u8 nonagle; /* Disable Nagle algorithm? — TODO Nagle */
u8 keepalive_probes; /* num of allowed keep alive probes */
/* RTT measurement */
u32 srtt; /* smoothed round trip time << 3 */
u32 mdev; /* medium deviation */
u32 mdev_max; /* maximal mdev for the last rtt period */
u32 rttvar; /* smoothed mdev_max */
u32 rtt_seq; /* sequence number to update rttvar */
u32 packets_out; /* Packets which are "in flight" */
u32 retrans_out; /* Retransmitted packets out */
/*
* Options received (usually on last packet, some only on SYN packets).
*/
struct tcp_options_received rx_opt;
/*
* Slow start and congestion control (see also Nagle, and Karn & Partridge) — TODO slow start and congestion control
*/
u32 snd_ssthresh; /* Slow start size threshold */
u32 snd_cwnd; /* Sending congestion window */
u32 snd_cwnd_cnt; /* Linear increase counter */
u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
u32 snd_cwnd_used;
u32 snd_cwnd_stamp;
struct sk_buff_head out_of_order_queue; /* Out of order segments go here */
u32 rcv_wnd; /* Current receiver window */
u32 write_seq; /* Tail(+1) of data held in tcp send buffer */
u32 pushed_seq; /* Last pushed seq, required to talk to windows */
/* SACKs data */
struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/
struct tcp_sack_block recv_sack_cache[4];
struct sk_buff *highest_sack; /* highest skb with SACK received
* (validity guaranteed only if
* sacked_out > 0)
*/
/* from STCP, retrans queue hinting */
struct sk_buff* lost_skb_hint;
struct sk_buff *scoreboard_skb_hint;
struct sk_buff *retransmit_skb_hint;
struct sk_buff *forward_skb_hint;
int lost_cnt_hint;
int retransmit_cnt_hint;
u32 lost_retrans_low; /* Sent seq after any rxmit (lowest) */
u16 advmss; /* Advertised MSS */
u32 prior_ssthresh; /* ssthresh saved at recovery start */
u32 lost_out; /* Lost packets */
u32 sacked_out; /* SACK'd packets */
u32 fackets_out; /* FACK'd packets */
u32 high_seq; /* snd_nxt at onset of congestion */
u32 retrans_stamp; /* Timestamp of the last retransmit,
* also used in SYN-SENT to remember stamp of
* the first SYN. */
u32 undo_marker; /* tracking retrans started here. */
int undo_retrans; /* number of undoable retransmissions. */
u32 urg_seq; /* Seq of received urgent pointer */
u16 urg_data; /* Saved octet of OOB data and control flags */
u8 urg_mode; /* In urgent mode */
u8 ecn_flags; /* ECN status bits. */
u32 snd_up; /* Urgent pointer */
u32 total_retrans; /* Total retransmits for entire connection */
u32 bytes_acked; /* Appropriate Byte Counting - RFC3465 */
unsigned int keepalive_time; /* time before keep alive takes place */
unsigned int keepalive_intvl; /* time interval between keep alive probes */
int linger2;
unsigned long last_synq_overflow;
u32 tso_deferred;
/* Receiver side RTT estimation */
struct {
u32 rtt;
u32 seq;
u32 time;
} rcv_rtt_est;
/* Receiver queue space */
struct {
int space;
u32 seq;
u32 time;
} rcvq_space;
/* TCP-specific MTU probe information. */
struct {
u32 probe_seq_start;
u32 probe_seq_end;
} mtu_probe;
#ifdef CONFIG_TCP_MD5SIG
/* TCP AF-Specific parts; only used by MD5 Signature support so far */
struct tcp_sock_af_ops *af_specific;
/* TCP MD5 Signagure Option information */
struct tcp_md5sig_info *md5sig_info;
#endif
};
demo
With some of the data structures covered, we can now start on the socket-related source proper.
First, the typical flow of a server:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
int main()
{
int server_fd, client_fd;
socklen_t server_len, client_len;
struct sockaddr_in server_address;
struct sockaddr_in client_address;
server_fd = socket(AF_INET,SOCK_STREAM,0);
server_address.sin_family = AF_INET;
server_address.sin_addr.s_addr = inet_addr("192.168.1.1");
server_address.sin_port = htons(54188);
server_len = sizeof(server_address);
bind(server_fd,(struct sockaddr*)&server_address,server_len);
/* Create a listen queue on the socket (up to 10 pending connections) and
listen for client connection requests. */
listen(server_fd,10);
while(1) {
char recv[20];
char back[20] = "ok"; /* reply buffer */
printf("server is waiting\n");
/* When execution reaches this point a client's connection request has
arrived: accept it, clone a new socket connected to the client, record
the client's "phone number" in client_address, and return the fd of
the established connection. */
client_len = sizeof(client_address);
client_fd = accept(server_fd,(struct sockaddr*)&client_address,&client_len);
/* Receive the client's bytes with read, then send a reply with write. */
read(client_fd,recv,20);
write(client_fd,back,20);
printf("received from client= %s\n",recv);
close(client_fd);
}
close(server_fd);
exit(0);
}
In short: socket() creates the server socket, bind() attaches the address structure to it, listen() waits for client connection requests, accept() yields a new fd, and from then on the socket is accessed through the VFS like any file, via read/write.
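For completeness, here is a minimal client sketch that would talk to the demo server above — the address and port simply mirror the demo, and error handling is omitted:
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
int main()
{
struct sockaddr_in server_address;
char buf[20];
int fd = socket(AF_INET,SOCK_STREAM,0);
memset(&server_address, 0, sizeof(server_address));
server_address.sin_family = AF_INET;
server_address.sin_addr.s_addr = inet_addr("192.168.1.1");
server_address.sin_port = htons(54188);
connect(fd,(struct sockaddr*)&server_address,sizeof(server_address));
write(fd,"hello",6); /* send a string to the server */
read(fd,buf,sizeof(buf)); /* read the reply */
printf("received from server= %s\n",buf);
close(fd);
return 0;
}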
Creating the socket
The server calls socket(); the library function it invokes can be found in the glibc source:
#include <errno.h>
#include <sys/socket.h>
/* Create a new socket of type TYPE in domain DOMAIN, using
protocol PROTOCOL. If PROTOCOL is zero, one is chosen automatically.
Returns a file descriptor for the new socket, or -1 for errors. */
int
__socket (domain, type, protocol)
int domain;
int type;
int protocol;
{
__set_errno (ENOSYS);
return -1;
}
weak_alias (__socket, socket)
stub_warning (socket)
#include <stub-tag.h>
Here weak_alias() declares socket() as a weak alias for __socket(). Following the socket.S assembly, the call goes through system_call(), which indexes the system call table sys_call_table and finally executes the system call sys_socketcall() — also the entry point for bind(), listen(), accept() and the rest.
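As a side note, the aliasing mechanism itself is GCC's alias attribute; a minimal sketch of how weak_alias works (my_impl and my_alias are illustrative names, not glibc internals):
#include <stdio.h>
int my_impl(int x)
{
return x * 2;
}
/* roughly what glibc's weak_alias(my_impl, my_alias) expands to */
int my_alias(int x) __attribute__((weak, alias("my_impl")));
int main()
{
printf("%d\n", my_alias(21)); /* resolves to my_impl, prints 42 */
return 0;
}
One caveat about the listing below: it shows compat_sys_socketcall(), the 32-bit compatibility entry point, but its switch over the call numbers mirrors the native sys_socketcall().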
/* Define unique numbers for the operations permitted on socket. Linux
uses a single system call for all these functions. The relevant code
file is /usr/include/linux/net.h.
We cannot use a enum here because the values are used in assembler
code. */
#define SOCKOP_socket 1
#define SOCKOP_bind 2
#define SOCKOP_connect 3
#define SOCKOP_listen 4
#define SOCKOP_accept 5
#define SOCKOP_getsockname 6
#define SOCKOP_getpeername 7
#define SOCKOP_socketpair 8
#define SOCKOP_send 9
#define SOCKOP_recv 10
#define SOCKOP_sendto 11
#define SOCKOP_recvfrom 12
#define SOCKOP_shutdown 13
#define SOCKOP_setsockopt 14
#define SOCKOP_getsockopt 15
#define SOCKOP_sendmsg 16
#define SOCKOP_recvmsg 17
asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
{
int ret;
u32 a[6];
u32 a0, a1;
if (call < SYS_SOCKET || call > SYS_RECVMSG)
return -EINVAL;
if (copy_from_user(a, args, nas[call]))
return -EFAULT;
a0 = a[0];
a1 = a[1];
switch (call) {
case SYS_SOCKET:
ret = sys_socket(a0, a1, a[2]);
break;
case SYS_BIND:
ret = sys_bind(a0, compat_ptr(a1), a[2]);
break;
case SYS_CONNECT:
ret = sys_connect(a0, compat_ptr(a1), a[2]);
break;
case SYS_LISTEN:
ret = sys_listen(a0, a1);
break;
case SYS_ACCEPT:
ret = sys_accept(a0, compat_ptr(a1), compat_ptr(a[2]));
break;
case SYS_GETSOCKNAME:
ret = sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]));
break;
case SYS_GETPEERNAME:
ret = sys_getpeername(a0, compat_ptr(a1), compat_ptr(a[2]));
break;
case SYS_SOCKETPAIR:
ret = sys_socketpair(a0, a1, a[2], compat_ptr(a[3]));
break;
case SYS_SEND:
ret = sys_send(a0, compat_ptr(a1), a[2], a[3]);
break;
case SYS_SENDTO:
ret = sys_sendto(a0, compat_ptr(a1), a[2], a[3], compat_ptr(a[4]), a[5]);
break;
case SYS_RECV:
ret = sys_recv(a0, compat_ptr(a1), a[2], a[3]);
break;
case SYS_RECVFROM:
ret = sys_recvfrom(a0, compat_ptr(a1), a[2], a[3], compat_ptr(a[4]), compat_ptr(a[5]));
break;
case SYS_SHUTDOWN:
ret = sys_shutdown(a0,a1);
break;
case SYS_SETSOCKOPT:
ret = compat_sys_setsockopt(a0, a1, a[2],
compat_ptr(a[3]), a[4]);
break;
case SYS_GETSOCKOPT:
ret = compat_sys_getsockopt(a0, a1, a[2],
compat_ptr(a[3]), compat_ptr(a[4]));
break;
case SYS_SENDMSG:
ret = compat_sys_sendmsg(a0, compat_ptr(a1), a[2]);
break;
case SYS_RECVMSG:
ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
break;
default:
ret = -EINVAL;
break;
}
return ret;
}
The dispatcher then maps the call number SOCKOP_socket (SYS_SOCKET) to the corresponding system call, sys_socket():
asmlinkage long sys_socket(int family, int type, int protocol)
{
int retval;
struct socket *sock;
retval = sock_create(family, type, protocol, &sock);
if (retval < 0)
goto out;
retval = sock_map_fd(sock);
if (retval < 0)
goto out_release;
out:
/* It may be already another descriptor 8) Not kernel problem. */
return retval;
out_release:
sock_release(sock);
return retval;
}
sys_socket() first creates the socket via sock_create(), then sock_map_fd() associates it with the VFS and returns the corresponding fd (retval) so it can be managed uniformly.
Allocating and initializing the socket structure
Let's first trace sock_create(), the function responsible for allocating and initializing the socket structure.
int sock_create(int family, int type, int protocol, struct socket **res)
{
return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
}
The first three parameters of sock_create() are exactly those passed to socket(); the final struct socket ** parameter receives the resulting socket. It simply calls on to __sock_create():
static int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
int err;
struct socket *sock;
const struct net_proto_family *pf;
/*
* Check protocol is in range
*/
if (family < 0 || family >= NPROTO)
return -EAFNOSUPPORT;
if (type < 0 || type >= SOCK_MAX)
return -EINVAL;
/* Compatibility.
This uglymoron is moved from INET layer to here to avoid
deadlock in module load.
*/
if (family == PF_INET && type == SOCK_PACKET) {
static int warned;
if (!warned) {
warned = 1;
printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
current->comm);
}
family = PF_PACKET;
}
err = security_socket_create(family, type, protocol, kern);
if (err)
return err;
/*
* Allocate the socket and allow the family to set things up. if
* the protocol is 0, the family is instructed to select an appropriate
* default.
*/
sock = sock_alloc(); // allocate the socket structure
if (!sock) {
if (net_ratelimit())
printk(KERN_WARNING "socket: no more sockets\n");
return -ENFILE; /* Not exactly a match, but its the
closest posix thing */
}
sock->type = type; // record the socket type
#if defined(CONFIG_KMOD)
/* Attempt to load a protocol module if the find failed.
*
* 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
* requested real, full-featured networking support upon configuration.
* Otherwise module support will break!
*/
if (net_families[family] == NULL) // is the family's operation table registered?
request_module("net-pf-%d", family); // if not, try to load the module that registers it
#endif
rcu_read_lock();
pf = rcu_dereference(net_families[family]); // fetch the protocol family's operation table
err = -EAFNOSUPPORT;
if (!pf)
goto out_release;
/*
* We will call the ->create function, that possibly is in a loadable
* module, so we have to bump that loadable module refcnt first.
*/
if (!try_module_get(pf->owner))
goto out_release;
/* Now protected by module ref count */
rcu_read_unlock();
err = pf->create(net, sock, protocol); // run the create function of that operation table
if (err < 0)
goto out_module_put;
/*
* Now to bump the refcnt of the [loadable] module that owns this
* socket at sock_release time we decrement its refcnt.
*/
if (!try_module_get(sock->ops->owner))
goto out_module_busy;
/*
* Now that we're done with the ->create function, the [loadable]
* module can have its refcnt decremented
*/
module_put(pf->owner);
err = security_socket_post_create(sock, family, type, protocol, kern);
if (err)
goto out_sock_release;
*res = sock; // hand back the result
return 0;
out_module_busy:
err = -EAFNOSUPPORT;
out_module_put:
sock->ops = NULL;
module_put(pf->owner);
out_sock_release:
sock_release(sock);
return err;
out_release:
rcu_read_unlock();
goto out_sock_release;
}
So this function first allocates space for the socket structure, then uses the family argument AF_INET (2) to fetch the matching protocol-family operation table, and finally runs that table's create function and returns the result. Let's first look at sock_alloc(), which allocates both the socket structure and a file inode for the server program.
static struct socket *sock_alloc(void)
{
struct inode *inode;
struct socket *sock;
inode = new_inode(sock_mnt->mnt_sb); // create an inode in sockfs; the socket is allocated along with it
if (!inode)
return NULL;
sock = SOCKET_I(inode); // obtain the socket pointer from the inode
inode->i_mode = S_IFSOCK | S_IRWXUGO; // set the inode's mode
inode->i_uid = current->fsuid; // owned by the current process's fsuid
inode->i_gid = current->fsgid; // and its fsgid
get_cpu_var(sockets_in_use)++;
put_cpu_var(sockets_in_use); // bump the per-CPU sockets_in_use counter
return sock;
}
Here sock_mnt is the root mount of the socket network filesystem (sockfs); this effectively allocates an inode in sockfs, through which the server program can later perform read/write operations. First, new_inode():
struct inode *new_inode(struct super_block *sb)
{
/*
* On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
* error if st_ino won't fit in target struct field. Use 32bit counter
* here to attempt to avoid that.
*/
static unsigned int last_ino;
struct inode * inode;
spin_lock_prefetch(&inode_lock);
inode = alloc_inode(sb); // delegate to the superblock's operation table
if (inode) { // set up the freshly allocated inode
spin_lock(&inode_lock);
inodes_stat.nr_inodes++;
list_add(&inode->i_list, &inode_in_use);
list_add(&inode->i_sb_list, &sb->s_inodes);
inode->i_ino = ++last_ino;
inode->i_state = 0;
spin_unlock(&inode_lock);
}
return inode;
}
Next, the SOCKET_I(inode) helper:
static inline struct socket *SOCKET_I(struct inode *inode)
{
return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}
#define container_of(ptr, type, member) ({ \
const typeof( ((type *)0)->member ) *__mptr = (ptr); \
(type *)( (char *)__mptr - offsetof(type,member) );})
This looks a bit tangled; substituting the arguments, it is equivalent to:
#define container_of(inode, struct socket_alloc, vfs_inode) ({ \
const typeof( ((struct socket_alloc *)0)->vfs_inode ) *__mptr = (inode); \
(struct socket_alloc *)( (char *)__mptr - offsetof(struct socket_alloc,vfs_inode) );})
struct socket_alloc {
struct socket socket;
struct inode vfs_inode;
};
#define OFFSETOF(strct, elem) ((long)&(((struct strct *)NULL)->elem))
Now it is simple: the offsetof macro yields the offset of elem within the struct — here, of vfs_inode within struct socket_alloc. Subtracting that offset from the inode (vfs_inode) address gives the start of struct socket_alloc, which is also the address of its embedded socket. The macro is pure pointer arithmetic to recover the socket's address; the socket_alloc structure itself is allocated under new_inode().
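To make the pointer arithmetic concrete, here is a small self-contained userspace sketch of the same trick (the *_demo types are illustrative stand-ins for the kernel structures):
#include <stdio.h>
#include <stddef.h>
struct socket_demo { int state; };
struct inode_demo { int mode; };
struct socket_alloc_demo {
struct socket_demo socket;
struct inode_demo vfs_inode;
};
#define container_of_demo(ptr, type, member) \
((type *)((char *)(ptr) - offsetof(type, member)))
int main()
{
struct socket_alloc_demo ei = { { 1 }, { 2 } };
struct inode_demo *inode = &ei.vfs_inode;
/* step back from the member to the enclosing structure, whose first
member is the socket — just like SOCKET_I() */
struct socket_demo *sock =
&container_of_demo(inode, struct socket_alloc_demo, vfs_inode)->socket;
printf("state = %d\n", sock->state); /* prints 1 */
return 0;
}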
Back in new_inode(): it delegates to the superblock's operation table through alloc_inode().
static struct inode *alloc_inode(struct super_block *sb)
{
static const struct address_space_operations empty_aops;
static struct inode_operations empty_iops;
static const struct file_operations empty_fops;
struct inode *inode;
if (sb->s_op->alloc_inode)
inode = sb->s_op->alloc_inode(sb);
else
inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL);
if (inode) {
...
}
return inode;
}
By this point sb->s_op has already been set to sockfs_ops (during sock_init(), via get_sb_pseudo()), so the call here is sockfs_ops->alloc_inode:
static struct super_operations sockfs_ops = {
.alloc_inode = sock_alloc_inode,
.destroy_inode = sock_destroy_inode,
.statfs = simple_statfs,
};
Looking up sockfs_ops, the function called is sock_alloc_inode(), which performs the allocation of the socket_alloc structure:
static struct inode *sock_alloc_inode(struct super_block *sb)
{
struct socket_alloc *ei;
ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL); // allocate a socket_alloc structure
if (!ei)
return NULL;
init_waitqueue_head(&ei->socket.wait); // initialize the wait queue head
// initialize the socket
ei->socket.fasync_list = NULL;
ei->socket.state = SS_UNCONNECTED; // state: not yet connected
ei->socket.flags = 0;
ei->socket.ops = NULL;
ei->socket.sk = NULL;
ei->socket.file = NULL;
return &ei->vfs_inode;
}
This performs the memory allocation and initializes the socket structure. Note that kmem_cache_alloc() allocates directly from the sock_inode_cachep slab cache, which was created in sock_init() via the init_inodecache() function.
[ TODO: the slab functions kmem_cache_alloc and kmem_cache_create ]
Initializing the socket with the protocol family's operation table
Back in __sock_create(): it first checks net_families[2] for NULL, i.e. whether the AF_INET protocol-family operation table has been registered (this happens during kernel initialization). The relevant steps:
inet_init -> fs_initcall(inet_init);
#define fs_initcall(fn) __define_initcall("5",fn,5)
static int __init inet_init(void)
{
...
/*
* Tell SOCKET that we are alive...
*/
(void)sock_register(&inet_family_ops);
...
}
int sock_register(const struct net_proto_family *ops)
{
int err;
if (ops->family >= NPROTO) {
printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family,
NPROTO);
return -ENOBUFS;
}
spin_lock(&net_family_lock);
if (net_families[ops->family])
err = -EEXIST;
else {
net_families[ops->family] = ops;
err = 0;
}
spin_unlock(&net_family_lock);
printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
return err;
}
static struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
};
Ultimately sock_register() registers inet_family_ops into net_families[PF_INET]; here PF_INET is the same as AF_INET:
#define PF_INET AF_INET
Back in __sock_create(), the create function executed from the inet_family_ops table is inet_create():
static int inet_create(struct net *net, struct socket *sock, int protocol)
{
struct sock *sk;
struct list_head *p;
struct inet_protosw *answer;
struct inet_sock *inet;
struct proto *answer_prot;
unsigned char answer_flags;
char answer_no_check;
int try_loading_module = 0;
int err;
// check the socket type and the connection-hash secret
if (sock->type != SOCK_RAW && // raw type
sock->type != SOCK_DGRAM && // datagram type (UDP)
!inet_ehash_secret)
build_ehash_secret();
The socket type passed in from socket() is SOCK_STREAM, the stream type; the check also tests whether the hash secret exists, and if not, build_ehash_secret() is called to set it:
void build_ehash_secret(void)
{
u32 rnd;
do {
get_random_bytes(&rnd, sizeof(rnd)); // obtain a non-zero random number
} while (rnd == 0);
spin_lock_bh(&inetsw_lock);
if (!inet_ehash_secret)
inet_ehash_secret = rnd; // use the random number as the hash secret
spin_unlock_bh(&inetsw_lock);
}
Back in inet_create(), note the variable struct inet_protosw *answer. The inet_protosw structure is the socket-side interface of an IP protocol — the protocol information closest to the socket layer lives here, and every IP protocol has one such interface structure:
/* This is used to register socket interfaces for IP protocols. */
struct inet_protosw {
struct list_head list;
/* These two fields form the lookup key. */
unsigned short type; /* This is the 2nd argument to socket(2) — the socket type */
unsigned short protocol; /* This is the L4 protocol number. */
struct proto *prot; /* pointer to the matching proto structure */
const struct proto_ops *ops; /* pointer to the protocol's operation table */
int capability; /* Which (if any) capability do
* we need to use this socket
* interface?
*/
char no_check; /* checksum on rcv/xmit/none? */
unsigned char flags; /* See INET_PROTOSW_* below. */
};
Continuing with inet_create():
sock->state = SS_UNCONNECTED; // mark the socket as not yet connected
/* Look for the requested type/protocol pair. */
answer = NULL;
lookup_protocol:
err = -ESOCKTNOSUPPORT;
rcu_read_lock(); // RCU locking, suited to read-mostly data
list_for_each_rcu(p, &inetsw[sock->type]) {
answer = list_entry(p, struct inet_protosw, list);
/* Check the non-wild match — does the protocol number match one registered with the kernel? */
if (protocol == answer->protocol) {
if (protocol != IPPROTO_IP)
break;
} else {
/* Check for the two wild cases — the wildcard (dummy) IP protocol. */
if (IPPROTO_IP == protocol) {
protocol = answer->protocol;
break;
}
if (IPPROTO_IP == answer->protocol)
break;
}
err = -EPROTONOSUPPORT;
answer = NULL;
}
if (unlikely(answer == NULL)) {
if (try_loading_module < 2) {
rcu_read_unlock();
/*
* Be more specific, e.g. net-pf-2-proto-132-type-1
* (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) — first try the fully qualified name
*/
if (++try_loading_module == 1)
request_module("net-pf-%d-proto-%d-type-%d",
PF_INET, protocol, sock->type);
/*
* Fall back to generic, e.g. net-pf-2-proto-132
* (net-pf-PF_INET-proto-IPPROTO_SCTP) — otherwise fall back to the generic name
*/
else
request_module("net-pf-%d-proto-%d",
PF_INET, protocol);
goto lookup_protocol;
} else
goto out_rcu_unlock;
}
err = -EPERM;
if (answer->capability > 0 && !capable(answer->capability))
goto out_rcu_unlock;
err = -EAFNOSUPPORT;
if (!inet_netns_ok(net, protocol))
goto out_rcu_unlock;
Everything between rcu_read_lock and rcu_read_unlock is a read-side critical section.
#define list_for_each_rcu(pos, head) \
for (pos = rcu_dereference((head)->next); \
prefetch(pos->next), pos != (head); \
pos = rcu_dereference(pos->next))
#define rcu_dereference(p) ({ \
typeof(p) _________p1 = ACCESS_ONCE(p); \
smp_read_barrier_depends(); \
(_________p1); \
})
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
The list_for_each_rcu macro walks the inetsw[] queue under RCU protection until it finds the entry matching the socket's type; each entry is an inet_protosw structure. The inetsw queue array is also populated in inet_init():
static int __init inet_init(void)
{
struct sk_buff *dummy_skb;
struct inet_protosw *q;
struct list_head *r;
int rc = -EINVAL;
...
(void)sock_register(&inet_family_ops);
...
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);
...
}
static struct inet_protosw inetsw_array[] =
{
{
.type = SOCK_STREAM, // TCP stream protocol
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.capability = -1,
.no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
{
.type = SOCK_DGRAM, // UDP datagram protocol
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.capability = -1,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
{
.type = SOCK_RAW, // raw socket
.protocol = IPPROTO_IP, /* wild card — the dummy/wildcard IP protocol */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.capability = CAP_NET_RAW,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
};
As inet_init() shows, inet_register_protosw() is used to register this array:
static struct list_head inetsw[SOCK_MAX];
void inet_register_protosw(struct inet_protosw *p)
{
struct list_head *lh;
struct inet_protosw *answer;
int protocol = p->protocol;
struct list_head *last_perm;
spin_lock_bh(&inetsw_lock);
if (p->type >= SOCK_MAX)
goto out_illegal;
/* If we are trying to override a permanent protocol, bail. */
answer = NULL;
last_perm = &inetsw[p->type];
list_for_each(lh, &inetsw[p->type]) {
answer = list_entry(lh, struct inet_protosw, list);
/* Check only the non-wild match. */
if (INET_PROTOSW_PERMANENT & answer->flags) {
if (protocol == answer->protocol)
break;
last_perm = lh;
}
answer = NULL;
}
if (answer)
goto out_permanent;
/* Add the new entry after the last permanent entry if any, so that
* the new entry does not override a permanent entry when matched with
* a wild-card protocol. But it is allowed to override any existing
* non-permanent entry. This means that when we remove this entry, the
* system automatically returns to the old behavior.
*/
list_add_rcu(&p->list, last_perm);
out:
spin_unlock_bh(&inetsw_lock);
synchronize_net();
return;
out_permanent:
printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
protocol);
goto out;
out_illegal:
printk(KERN_ERR
"Ignoring attempt to register invalid socket type %d.\n",
p->type);
goto out;
}
The function loops over inetsw[p->type] with the list_for_each macro: if an existing entry carries the INET_PROTOSW_PERMANENT flag and registers the same protocol as p, the insertion is refused; otherwise the position after the last permanent entry is remembered and p is linked in there, so a new entry may override non-permanent entries but never a permanent one.
So inet_init() links the elements of inetsw_array one by one into the queues of the inetsw array.
That wraps up registration; back to inet_create().
Recall server_fd = socket(AF_INET,SOCK_STREAM,0): protocol is 0 and type is SOCK_STREAM, the TCP type, so answer ends up pointing at TCP's inet_protosw entry. protocol, being IPPROTO_IP (0), does not equal that entry's protocol:
IPPROTO_IP = 0, /* Dummy protocol for TCP. */
#define IPPROTO_IP IPPROTO_IP
So the wildcard branch assigns protocol = answer->protocol, i.e. protocol becomes TCP's protocol number 6. capability is -1, and inet_netns_ok() then checks that the protocol may be used in this network namespace.
We continue with inet_create():
sock->ops = answer->ops; //inet_stream_ops
answer_prot = answer->prot; //tcp_prot
answer_no_check = answer->no_check;
answer_flags = answer->flags;
rcu_read_unlock();
BUG_TRAP(answer_prot->slab != NULL);
err = -ENOBUFS;
sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot); // allocate the sock structure
if (sk == NULL)
goto out;
err = 0;
sk->sk_no_check = answer_no_check;
if (INET_PROTOSW_REUSE & answer_flags)
sk->sk_reuse = 1;
inet = inet_sk(sk);
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
if (SOCK_RAW == sock->type) {
inet->num = protocol;
if (IPPROTO_RAW == protocol)
inet->hdrincl = 1;
}
if (ipv4_config.no_pmtu_disc)
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
inet->id = 0;
sock_init_data(sock, sk);
sk->sk_destruct = inet_sock_destruct;
sk->sk_family = PF_INET;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; // set the backlog packet handler
inet->uc_ttl = -1;
inet->mc_loop = 1;
inet->mc_ttl = 1;
inet->mc_index = 0;
inet->mc_list = NULL;
sk_refcnt_debug_inc(sk);
if (inet->num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
* creation time automatically
* shares.
*/
inet->sport = htons(inet->num);
/* Add to protocol hash chains. */
sk->sk_prot->hash(sk);
}
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk); // call the transport layer's init hook (tcp_prot->init)
if (err)
sk_common_release(sk);
}
out:
return err;
out_rcu_unlock:
rcu_read_unlock();
goto out;
}
You can see that TCP's operation table inet_stream_ops is hooked into the socket's ops, and answer->prot is saved in answer_prot and passed as a parameter to sk_alloc(). [ TODO socket — transport layer proto — network layer inet_proto ]
Allocating and initializing the sock structure
Let's look at sk_alloc(); its prot parameter is answer->prot, i.e. tcp_prot:
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
struct proto *prot)
{
struct sock *sk;
sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
if (sk) {
sk->sk_family = family;
/*
* See comment in struct sock definition to understand
* why we need sk_prot_creator -acme
*/
sk->sk_prot = sk->sk_prot_creator = prot;
sock_lock_init(sk);
sock_net_set(sk, get_net(net));
}
return sk;
}
sk_prot_alloc() allocates the generic sock structure:
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
int family)
{
struct sock *sk;
struct kmem_cache *slab;
slab = prot->slab;
if (slab != NULL)
sk = kmem_cache_alloc(slab, priority); // slab allocation from the protocol's dedicated sock cache
else
sk = kmalloc(prot->obj_size, priority); // otherwise allocate from the general-purpose caches
if (sk != NULL) {
if (security_sk_alloc(sk, family, priority))
goto out_free;
if (!try_module_get(prot->owner))
goto out_free_sec;
}
return sk;
out_free_sec:
security_sk_free(sk);
out_free:
if (slab != NULL)
kmem_cache_free(slab, sk);
else
kfree(sk);
return NULL;
}
Whether the sock comes from the protocol's own slab cache or the general-purpose caches depends on whether prot provides a slab.
After a successful allocation, family is assigned, tcp_prot is stored in both sk_prot and sk_prot_creator, and sock_lock_init() initializes sk_lock, the lock in the sock used for synchronization. sk_lock has type socket_lock_t — a lock dedicated to sockets:
typedef struct {
spinlock_t slock;
int owned;
wait_queue_head_t wq;
/*
* We express the mutex-alike socket_lock semantics
* to the lock validator by explicitly managing
* the slock as a lock variant (in addition to
* the slock itself):
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
} socket_lock_t;
It holds a spinlock slock and a wait queue head wq. sock_lock_init_class_and_name initializes its contents:
static inline void sock_lock_init(struct sock *sk)
{
sock_lock_init_class_and_name(sk,
af_family_slock_key_strings[sk->sk_family],
af_family_slock_keys + sk->sk_family,
af_family_key_strings[sk->sk_family],
af_family_keys + sk->sk_family);
}
#define sock_lock_init_class_and_name(sk, sname, skey, name, key) \
do { \
sk->sk_lock.owned = 0; \
init_waitqueue_head(&sk->sk_lock.wq); \
spin_lock_init(&(sk)->sk_lock.slock); \
debug_check_no_locks_freed((void *)&(sk)->sk_lock, \
sizeof((sk)->sk_lock)); \
lockdep_set_class_and_name(&(sk)->sk_lock.slock, \
(skey), (sname)); \
lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \
} while (0)
Back in sk_alloc(): the net parameter passed in is current->nsproxy->net_ns, the network namespace recorded in the current process. sock_net_set(sk, get_net(net)) records the owning net namespace, and get_net(net) bumps the namespace's reference count:
static inline
void sock_net_set(struct sock *sk, struct net *net)
{
#ifdef CONFIG_NET_NS
sk->sk_net = net;
#endif
}
static inline struct net *get_net(struct net *net)
{
atomic_inc(&net->count);
return net;
}
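get_net() is ordinary reference counting; for illustration, a minimal userspace analogue with C11 atomics (net_demo is a stand-in, not the kernel's struct net):
#include <stdio.h>
#include <stdatomic.h>
struct net_demo { atomic_int count; };
static struct net_demo *get_net_demo(struct net_demo *net)
{
atomic_fetch_add(&net->count, 1); /* like atomic_inc(&net->count) */
return net;
}
int main()
{
struct net_demo net = { 1 };
get_net_demo(&net);
printf("count = %d\n", atomic_load(&net.count)); /* prints 2 */
return 0;
}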
Back in inet_create(): sk_alloc() has allocated and initialized the sock (if it failed, sk == NULL and we simply bail out).
Next comes inet = inet_sk(sk), obtaining a struct inet_sock * pointer from the sock pointer:
static inline struct inet_sock *inet_sk(const struct sock *sk)
{
return (struct inet_sock *)sk;
}
struct inet_sock {
/* sk and pinet6 has to be the first two members of inet_sock */
struct sock sk;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
struct ipv6_pinfo *pinet6;
#endif
/* Socket demultiplex comparisons on incoming packets. */
__be32 daddr; // destination address
__be32 rcv_saddr;
__be16 dport; // destination port
__u16 num; // local port
__be32 saddr;
__s16 uc_ttl;
__u16 cmsg_flags;
struct ip_options *opt;
__be16 sport;
__u16 id;
__u8 tos;
__u8 mc_ttl;
__u8 pmtudisc;
__u8 recverr:1,
is_icsk:1,
freebind:1,
hdrincl:1,
mc_loop:1;
int mc_index;
__be32 mc_addr;
struct ip_mc_socklist *mc_list;
struct {
unsigned int flags;
unsigned int fragsize;
struct ip_options *opt;
struct dst_entry *dst;
int length; /* Total length of all frames */
__be32 addr;
struct flowi fl;
} cork;
};
This, then, is the INET-specific socket structure.
Next, sock_init_data(sock, sk) initializes the freshly allocated sock further and ties the socket and the sock together:
void sock_init_data(struct socket *sock, struct sock *sk)
{ /* the queues are not maintained with the generic list_head but with sk_buff queues: */
skb_queue_head_init(&sk->sk_receive_queue); // initialize the receive queue
skb_queue_head_init(&sk->sk_write_queue); // initialize the send queue
skb_queue_head_init(&sk->sk_error_queue); // initialize the error-packet queue
#ifdef CONFIG_NET_DMA
skb_queue_head_init(&sk->sk_async_wait_queue); // queue of packets copied by DMA
#endif
sk->sk_send_head = NULL; // head of the packets to send
init_timer(&sk->sk_timer); // initialize the sock's flush timer
sk->sk_allocation = GFP_KERNEL; // allocation mode; may sleep when memory is short
sk->sk_rcvbuf = sysctl_rmem_default; // receive buffer size, default 32767
sk->sk_sndbuf = sysctl_wmem_default; // send buffer size, default 32767
sk->sk_state = TCP_CLOSE;
sk->sk_socket = sock; // point back to the corresponding socket
sock_set_flag(sk, SOCK_ZAPPED);
if (sock) {
sk->sk_type = sock->type;
sk->sk_sleep = &sock->wait;
sock->sk = sk; // and the socket points back to this sock
} else
sk->sk_sleep = NULL;
rwlock_init(&sk->sk_dst_lock);
rwlock_init(&sk->sk_callback_lock);
lockdep_set_class_and_name(&sk->sk_callback_lock,
af_callback_keys + sk->sk_family,
af_family_clock_key_strings[sk->sk_family]);
sk->sk_state_change = sock_def_wakeup;
sk->sk_data_ready = sock_def_readable;
sk->sk_write_space = sock_def_write_space;
sk->sk_error_report = sock_def_error_report;
sk->sk_destruct = sock_def_destruct;
sk->sk_sndmsg_page = NULL;
sk->sk_sndmsg_off = 0;
sk->sk_peercred.pid = 0;
sk->sk_peercred.uid = -1;
sk->sk_peercred.gid = -1;
sk->sk_write_pending = 0;
sk->sk_rcvlowat = 1;
sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
sk->sk_stamp = ktime_set(-1L, 0);
atomic_set(&sk->sk_refcnt, 1);
atomic_set(&sk->sk_drops, 0);
}
Note the initialization of the three important packet queue heads here; they are sk_buff_head structures:
struct sk_buff_head {
/* These two members must be first. */
struct sk_buff *next;
struct sk_buff *prev;
__u32 qlen;
spinlock_t lock;
};
As you can see, this is a doubly linked queue; qlen is the queue length and lock is the lock used for concurrency control.
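To see how little a queue head needs, here is a tiny userspace analogue of such a circular doubly linked queue (pkt and pkt_head are illustrative stand-ins; the kernel version additionally takes the lock):
#include <stdio.h>
struct pkt {
struct pkt *next, *prev;
int id;
};
struct pkt_head {
struct pkt *next, *prev; /* mirrors struct pkt's first two members */
unsigned int qlen;
};
static void queue_init(struct pkt_head *h)
{
h->next = h->prev = (struct pkt *)h; /* empty: the head points at itself */
h->qlen = 0;
}
static void queue_tail(struct pkt_head *h, struct pkt *p)
{
p->prev = h->prev; /* link p between the old tail and the head */
p->next = (struct pkt *)h;
h->prev->next = p;
h->prev = p;
h->qlen++;
}
int main()
{
struct pkt_head q;
struct pkt a = { .id = 1 }, b = { .id = 2 };
queue_init(&q);
queue_tail(&q, &a);
queue_tail(&q, &b);
for (struct pkt *p = q.next; p != (struct pkt *)&q; p = p->next)
printf("pkt %d\n", p->id); /* prints 1 then 2 */
printf("qlen = %u\n", q.qlen);
return 0;
}
Back in inet_create(): recall that sk->sk_prot was set to answer->prot, i.e. tcp_prot: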
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.recvmsg = tcp_recvmsg,
.backlog_rcv = tcp_v4_do_rcv,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
};
Then sk->sk_prot->init(sk) is called — tcp_prot's .init hook, i.e. the tcp_v4_init_sock() function:
static int tcp_v4_init_sock(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
skb_queue_head_init(&tp->out_of_order_queue);
tcp_init_xmit_timers(sk);
tcp_prequeue_init(tp);
icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev = TCP_TIMEOUT_INIT;
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
* algorithms that we must have the following bandaid to talk
* efficiently to them. -DaveM
*/
tp->snd_cwnd = 2;
/* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
*/
tp->snd_ssthresh = 0x7fffffff; /* Infinity */
tp->snd_cwnd_clamp = ~0;
tp->mss_cache = 536;
tp->reordering = sysctl_tcp_reordering;
icsk->icsk_ca_ops = &tcp_init_congestion_ops;
sk->sk_state = TCP_CLOSE;
sk->sk_write_space = sk_stream_write_space;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
icsk->icsk_af_ops = &ipv4_specific;
icsk->icsk_sync_mss = tcp_sync_mss;
#ifdef CONFIG_TCP_MD5SIG
tp->af_specific = &tcp_sock_ipv4_specific;
#endif
sk->sk_sndbuf = sysctl_tcp_wmem[1];
sk->sk_rcvbuf = sysctl_tcp_rmem[1];
atomic_inc(&tcp_sockets_allocated);
return 0;
}
Does inet_connection_sock look familiar? It is the first member of tcp_sock. The body here is again mostly initialization; at the end tcp_sockets_allocated is incremented.
struct inet_connection_sock {
/* inet_sock has to be the first member! */
struct inet_sock icsk_inet; // the INET protocol family's sock structure
struct request_sock_queue icsk_accept_queue; // queue of connections waiting to be accepted
struct inet_bind_bucket *icsk_bind_hash; // the bind bucket this sock is bound to
unsigned long icsk_timeout; // timeout
struct timer_list icsk_retransmit_timer; // retransmit timer, fires when no ACK arrives
struct timer_list icsk_delack_timer; // delayed-ACK timer
__u32 icsk_rto; // retransmission timeout
__u32 icsk_pmtu_cookie; // most recent pmtu
const struct tcp_congestion_ops *icsk_ca_ops; // congestion control operations
const struct inet_connection_sock_af_ops *icsk_af_ops; // AF_INET-specific operation table
unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); // mss-synchronizing hook
__u8 icsk_ca_state; // congestion control state
__u8 icsk_retransmits; // number of retransmissions
__u8 icsk_pending; // pending timer events
__u8 icsk_backoff; // exponential backoff counter
__u8 icsk_syn_retries; // number of allowed SYN retries
__u8 icsk_probes_out; // unanswered window probes sent
__u16 icsk_ext_hdr_len; // network protocol header length
struct {
__u8 pending; /* ACK is pending */
__u8 quick; /* Scheduled number of quick acks */
__u8 pingpong; /* The session is interactive */
__u8 blocked; /* Delayed ACK was blocked by socket lock */
__u32 ato; /* Predicted tick of soft clock */
unsigned long timeout; /* Currently scheduled timeout */
__u32 lrcvtime; /* timestamp of last received data packet */
__u16 last_seg_size; /* Size of last incoming segment */
__u16 rcv_mss; /* MSS used for delayed ACK decisions */
} icsk_ack;
struct {
int enabled;
/* Range of MTUs to search */
int search_high;
int search_low;
/* Information on the current probe. */
int probe_size;
} icsk_mtup;
u32 icsk_ca_priv[16];
#define ICSK_CA_PRIV_SIZE (16 * sizeof(u32))
};
With that, the socket has been created and initialized: the socket's state is SS_UNCONNECTED and the sock's sk_state is TCP_CLOSE. Returning to sys_socket(), the next step is retval = sock_map_fd(sock).
The socket and the file system
int sock_map_fd(struct socket *sock)
{
struct file *newfile;
int fd = sock_alloc_fd(&newfile); // allocate a file descriptor and file structure for the socket
if (likely(fd >= 0)) {
int err = sock_attach_fd(sock, newfile); // attach the socket to the file structure
if (unlikely(err < 0)) { // on error release the file and the descriptor
put_filp(newfile);
put_unused_fd(fd);
return err;
}
fd_install(fd, newfile); // associate the file with the descriptor
}
return fd;
}
sock_alloc_fd() first requests the file structure and file descriptor:
static int sock_alloc_fd(struct file **filep)
{
int fd;
fd = get_unused_fd(); // obtain a free file descriptor
if (likely(fd >= 0)) {
struct file *file = get_empty_filp(); // allocate a file structure
*filep = file;
if (unlikely(!file)) {
put_unused_fd(fd);
return -ENFILE;
}
} else
*filep = NULL;
return fd;
}
This involves the file-system layer: get a free fd from the current process, then allocate a free file structure from the file system; if that allocation fails, the fd is released again.
With both allocated successfully, sock_attach_fd() runs:
static int sock_attach_fd(struct socket *sock, struct file *file)
{
struct dentry *dentry;
struct qstr name = { .name = "" };
dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name); // allocate a dentry in sockfs (sock_mnt is a vfsmount)
if (unlikely(!dentry))
return -ENOMEM;
dentry->d_op = &sockfs_dentry_operations; // hook sockfs's dentry operation table into the dentry
/*
* We dont want to push this dentry into global dentry hash table.
* We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
* This permits a working /proc/$pid/fd/XXX on sockets
*/
dentry->d_flags &= ~DCACHE_UNHASHED;
d_instantiate(dentry, SOCK_INODE(sock));
sock->file = file;
init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE,
&socket_file_ops); // initialize the socket's file structure with the socket_file_ops table
SOCK_INODE(sock)->i_fop = &socket_file_ops;
file->f_flags = O_RDWR;
file->f_pos = 0;
file->private_data = sock; // the file system can reach the socket through private_data
return 0;
}
A look at sockfs_dentry_operations, the dentry operation table of the socket filesystem:
static struct dentry_operations sockfs_dentry_operations = {
.d_delete = sockfs_delete_dentry,
.d_dname = sockfs_dname,
};
And at the socket_file_ops file operation table:
static const struct file_operations socket_file_ops = {
.owner = THIS_MODULE,
.llseek = no_llseek,
.aio_read = sock_aio_read,
.aio_write = sock_aio_write,
.poll = sock_poll,
.unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = compat_sock_ioctl,
#endif
.mmap = sock_mmap,
.open = sock_no_open, /* special open code to disallow open via /proc */
.release = sock_close,
.fasync = sock_fasync,
.sendpage = sock_sendpage,
.splice_write = generic_splice_sendpage,
.splice_read = sock_splice_read,
};
So although we read/write sockets like files, internally this table maps those calls onto the socket-specific operations — giving user space the convenience and uniformity of plain file access.
bind()
Back to the demo server code: once socket() is done, bind(server_fd,(struct sockaddr*)&server_address,server_len) binds the address to the socket.
Tracing the implementation again: bind is dispatched directly in sys_socketcall(); matching the SYS_BIND argument leads to the system call sys_bind():
asmlinkage long sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
{
struct socket *sock;
char address[MAX_SOCK_ADDR];
int err, fput_needed;
sock = sockfd_lookup_light(fd, &err, &fput_needed); // find the socket from the fd
if (sock) {
err = move_addr_to_kernel(umyaddr, addrlen, address); // copy the address from user space into kernel space
if (err >= 0) {
err = security_socket_bind(sock,
(struct sockaddr *)address,
addrlen);
if (!err)
err = sock->ops->bind(sock,
(struct sockaddr *)
address, addrlen); // call the protocol's bind, inet_stream_ops->bind()
}
fput_light(sock->file, fput_needed);
}
return err;
}
Next, the sockfd_lookup_light() function:
static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
{
struct file *file;
struct socket *sock;
*err = -EBADF;
file = fget_light(fd, fput_needed); // find the file pointer from the fd
if (file) {
sock = sock_from_file(file, err); // extract the socket pointer from the file
if (sock)
return sock;
fput_light(file, *fput_needed);
}
return NULL;
}
fget_light/fput_light are file helpers: fget_light finds the struct file pointer in the current process's files_struct and takes a reference, and fput_light drops it; once the sock is obtained it is returned directly. The key piece here is sock_from_file():
static struct socket *sock_from_file(struct file *file, int *err)
{
if (file->f_op == &socket_file_ops)
return file->private_data; /* set in sock_map_fd */
*err = -ENOTSOCK;
return NULL;
}
As mentioned earlier, file->private_data stores the socket pointer. So sockfd_lookup_light() hands us back the socket we created and initialized before, and move_addr_to_kernel() then copies the address into kernel space:
int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr)
{
if (ulen < 0 || ulen > MAX_SOCK_ADDR)
return -EINVAL;
if (ulen == 0)
return 0;
if (copy_from_user(kaddr, uaddr, ulen))
return -EFAULT;
return audit_sockaddr(ulen, kaddr);
}
Further down, security_socket_bind() belongs to the security framework and simply returns 0 when none is configured. Then sock->ops->bind() is invoked; since socket->ops was set to answer->ops earlier (see inet_create() above if you've forgotten), what actually runs is inet_stream_ops->bind:
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_stream_connect,
.socketpair = sock_no_socketpair,
.accept = inet_accept,
.getname = inet_getname,
.poll = tcp_poll,
.ioctl = inet_ioctl,
.listen = inet_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = tcp_sendmsg,
.recvmsg = sock_common_recvmsg,
.mmap = sock_no_mmap,
.sendpage = tcp_sendpage,
.splice_read = tcp_splice_read,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
#endif
};
Matching .bind in this table, the function executed is inet_bind():
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
unsigned short snum;
int chk_addr_ret;
int err;
/* If the socket has its own bind function then use it. (RAW) */
if (sk->sk_prot->bind) {
err = sk->sk_prot->bind(sk, uaddr, addr_len); // if the protocol supplies its own bind, use it; here sk->sk_prot is tcp_prot
goto out;
}
err = -EINVAL;
if (addr_len < sizeof(struct sockaddr_in))
goto out;
chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); // check the address type against the routing tables
/* Not specified by any standard per-se, however it breaks too
* many applications when removed. It is unfortunate since
* allowing applications to make a non-local bind solves
* several problems with systems using dynamic addressing.
* (ie. your servers still start up even if your ISDN link
* is temporarily down)
*/
err = -EADDRNOTAVAIL;
if (!sysctl_ip_nonlocal_bind &&
!inet->freebind &&
addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
chk_addr_ret != RTN_LOCAL && // unicast?
chk_addr_ret != RTN_MULTICAST && // multicast?
chk_addr_ret != RTN_BROADCAST) // broadcast?
goto out;
snum = ntohs(addr->sin_port); // extract the port number
err = -EACCES;
if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
goto out;
/* We keep a pair of addresses. rcv_saddr is the one
* used by hash lookups, and saddr is used for transmit.
*
* In the BSD API these are the same except where it
* would be illegal to use them (multicast/broadcast) in
* which case the sending device address is used.
*/
lock_sock(sk); // take the sock lock
/* Check these errors (active socket, double bind). */
err = -EINVAL;
if (sk->sk_state != TCP_CLOSE || inet->num) // check the state and whether a port is already assigned
goto out_release_sock;
inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; // rcv_saddr is used for hash lookups, saddr for transmit (both set to the IP address)
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
inet->saddr = 0; /* Use device */
/* Make sure we are allowed to bind here. */
if (sk->sk_prot->get_port(sk, snum)) { // inet_csk_get_port()
inet->saddr = inet->rcv_saddr = 0; // on failure, clear the addresses we just set
err = -EADDRINUSE;
goto out_release_sock;
}
if (inet->rcv_saddr) // an address has been set: add the lock flag marking the address bound
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
if (snum) // a port has been set: add the lock flag marking the port bound
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
inet->sport = htons(inet->num); // record the port
inet->daddr = 0; // initialize the destination address
inet->dport = 0; // initialize the destination port
sk_dst_reset(sk); // reset the cached route
err = 0;
out_release_sock:
release_sock(sk); // unlock
out:
return err;
}
sk->sk_prot here is tcp_prot, and searching that structure we find no .bind entry, so execution continues below. Two data structures appear here, sockaddr_in and sockaddr:
struct sockaddr_in {
sa_family_t sin_family; /* Address family */
__be16 sin_port; /* Port number */
struct in_addr sin_addr; /* Internet address */
/* Pad to size of `struct sockaddr'. */
unsigned char __pad[__SOCK_SIZE__ - sizeof(short int) -
sizeof(unsigned short int) - sizeof(struct in_addr)];
};
struct sockaddr {
sa_family_t sa_family; /* Address family */
char sa_data[14]; /* protocol-specific address */
};
Because the two structures have the same length and compatible layouts, they can be cast to each other — presumably a compatibility consideration; inet_bind() casts the sockaddr passed in back to sockaddr_in.
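You can verify the size claim from user space:
#include <stdio.h>
#include <netinet/in.h>
#include <sys/socket.h>
int main()
{
printf("sizeof(struct sockaddr) = %zu\n", sizeof(struct sockaddr));
printf("sizeof(struct sockaddr_in) = %zu\n", sizeof(struct sockaddr_in));
/* both print 16 on Linux */
return 0;
}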
sock_net(sk) returns the sk->sk_net pointer — the system default init_net if no separate network namespace was set up — and then inet_addr_type() is called to check the address type:
unsigned int inet_addr_type(struct net *net, __be32 addr)
{
return __inet_dev_addr_type(net, NULL, addr);
}
static inline unsigned __inet_dev_addr_type(struct net *net,
const struct net_device *dev,
__be32 addr)
{
struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
struct fib_result res;
unsigned ret = RTN_BROADCAST;
struct fib_table *local_table;
if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) // zeronet or limited broadcast address?
return RTN_BROADCAST;
if (ipv4_is_multicast(addr)) // multicast address?
return RTN_MULTICAST;
#ifdef CONFIG_IP_MULTIPLE_TABLES
res.r = NULL;
#endif
local_table = fib_get_table(net, RT_TABLE_LOCAL); // fetch the local routing table
if (local_table) {
ret = RTN_UNICAST;
if (!local_table->tb_lookup(local_table, &fl, &res)) {
if (!dev || dev == res.fi->fib_dev)
ret = res.type;
fib_res_put(&res);
}
}
return ret;
}
The code introduces struct flowi, the routing lookup key. flowi.nl_u is a union containing the three structures ip4_u, ip6_u and dn_u, so struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } } assigns the IP address to the key's destination address. A look at the flowi structure:
struct flowi { // routing key structure
int oif; // outgoing network device
int iif; // incoming network device
__u32 mark; // packet (netfilter) mark
union {
struct {
__be32 daddr; // destination address
__be32 saddr; // source (sender) address
__u8 tos; // type of service (TOS)
__u8 scope; // scope
} ip4_u;
struct {
struct in6_addr daddr;
struct in6_addr saddr;
__be32 flowlabel;
} ip6_u;
struct {
__le16 daddr;
__le16 saddr;
__u8 scope;
} dn_u;
} nl_u; // this union serves the network layer
#define fld_dst nl_u.dn_u.daddr
#define fld_src nl_u.dn_u.saddr
#define fld_scope nl_u.dn_u.scope
#define fl6_dst nl_u.ip6_u.daddr
#define fl6_src nl_u.ip6_u.saddr
#define fl6_flowlabel nl_u.ip6_u.flowlabel
#define fl4_dst nl_u.ip4_u.daddr
#define fl4_src nl_u.ip4_u.saddr
#define fl4_tos nl_u.ip4_u.tos
#define fl4_scope nl_u.ip4_u.scope
__u8 proto; // transport-layer protocol
__u8 flags; // flag bits
union {
struct {
__be16 sport; // source (sender) port
__be16 dport; // destination (receiver) port
} ports;
struct {
__u8 type;
__u8 code;
} icmpt; // ICMP type/code
struct {
__le16 sport;
__le16 dport;
} dnports;
__be32 spi;
struct {
__u8 type;
} mht;
} uli_u; // this union serves the transport layer
#define fl_ip_sport uli_u.ports.sport
#define fl_ip_dport uli_u.ports.dport
#define fl_icmp_type uli_u.icmpt.type
#define fl_icmp_code uli_u.icmpt.code
#define fl_ipsec_spi uli_u.spi
#define fl_mh_type uli_u.mht.type
__u32 secid; /* used by xfrm; see secid.txt */
} __attribute__((__aligned__(BITS_PER_LONG/8)));
struct fib_result holds a routing lookup result, and struct fib_table is the routing table structure. The function first checks whether the IP address addr is a zero-network address, the local broadcast address, or a multicast address:
static inline bool ipv4_is_zeronet(__be32 addr)
{
return (addr & htonl(0xff000000)) == htonl(0x00000000);
}
This checks whether the high 8 bits of addr are zero, i.e. whether it is a zero-network address.
static inline bool ipv4_is_lbcast(__be32 addr)
{
/* limited broadcast */
return addr == htonl(INADDR_BROADCAST);
}
#define INADDR_BROADCAST ((unsigned long int) 0xffffffff)
An all-ones address is the limited broadcast address.
static inline bool ipv4_is_multicast(__be32 addr)
{
return (addr & htonl(0xf0000000)) == htonl(0xe0000000);
}
If the top 4 bits of addr are 1110, it is a multicast address.
Zero-network and broadcast addresses return RTN_BROADCAST directly, and multicast addresses return RTN_MULTICAST; anything else is looked up in the actual routing table and the result returned.
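The three checks are easy to reproduce in user space; a small sketch applying them to sample addresses:
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <arpa/inet.h>
static bool is_zeronet(uint32_t addr) { return (addr & htonl(0xff000000)) == htonl(0x00000000); }
static bool is_lbcast(uint32_t addr) { return addr == htonl(0xffffffff); }
static bool is_multicast(uint32_t addr) { return (addr & htonl(0xf0000000)) == htonl(0xe0000000); }
int main()
{
const char *samples[] = { "0.0.0.1", "255.255.255.255", "224.0.0.1", "192.168.1.1" };
for (int i = 0; i < 4; i++) {
uint32_t a = inet_addr(samples[i]); /* already in network byte order */
printf("%-16s zeronet=%d lbcast=%d multicast=%d\n",
samples[i], is_zeronet(a), is_lbcast(a), is_multicast(a));
}
return 0;
}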
fib_get_table() has two implementations in the kernel, selected by CONFIG_IP_MULTIPLE_TABLES; we analyze the simpler single-routing-table variant:
static inline struct fib_table *fib_get_table(struct net *net, u32 id)
{
struct hlist_head *ptr;
ptr = id == RT_TABLE_LOCAL ?
&net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX] :
&net->ipv4.fib_table_hash[TABLE_MAIN_INDEX];
return hlist_entry(ptr->first, struct fib_table, tb_hlist);
}
The net passed in here is sock_net(sk), i.e. the system default init_net, and id is RT_TABLE_LOCAL. net->ipv4 is a netns_ipv4 structure holding IPv4's state within the network namespace:
struct netns_ipv4 {
#ifdef CONFIG_SYSCTL
struct ctl_table_header *forw_hdr;
struct ctl_table_header *frags_hdr;
struct ctl_table_header *ipv4_hdr;
#endif
struct ipv4_devconf *devconf_all;
struct ipv4_devconf *devconf_dflt;
#ifdef CONFIG_IP_MULTIPLE_TABLES
struct fib_rules_ops *rules_ops;
#endif
struct hlist_head *fib_table_hash;
struct sock *fibnl;
struct sock **icmp_sk;
struct sock *tcp_sock;
struct netns_frags frags;
#ifdef CONFIG_NETFILTER
struct xt_table *iptable_filter;
struct xt_table *iptable_mangle;
struct xt_table *iptable_raw;
struct xt_table *arptable_filter;
#endif
int sysctl_icmp_echo_ignore_all;
int sysctl_icmp_echo_ignore_broadcasts;
int sysctl_icmp_ignore_bogus_error_responses;
int sysctl_icmp_ratelimit;
int sysctl_icmp_ratemask;
int sysctl_icmp_errors_use_inbound_ifaddr;
};
All of IPv4's routing tables are linked into the fib_table_hash array; each element is an hlist_head, i.e. a queue, and each routing table links into the matching queue through its embedded tb_hlist node.
[TODO]
If the local routing table is found, its local_table->tb_lookup(local_table, &fl, &res) is called with the key fl to fill in a struct fib_result; the dev argument passed down here is NULL, so ret is set to res.type and returned.
Back in inet_bind(): snum = ntohs(addr->sin_port) extracts the port number; the code checks whether it is below 1024 (ports 0–1023 are reserved) and whether we have the privilege to bind it, then verifies the state and that no port is already assigned, and stores the IP address into inet's receive and source addresses. If the address type is multicast, broadcast or zeronet, the source address is cleared to 0 while the receive address is kept.
Then tcp_prot->get_port, i.e. inet_csk_get_port(), checks whether the port may be bound:
/* Obtain a reference to a local port for the given sock,
* if snum is zero it means select any available local port.
*/
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; // tcp_prot->h.hashinfo, i.e. tcp_hashinfo
struct inet_bind_hashbucket *head;
struct hlist_node *node;
struct inet_bind_bucket *tb;
int ret;
struct net *net = sock_net(sk); // obtain the net structure from the sock
The code is long, so let's take it piece by piece. First hashinfo is fetched — an inet_hashinfo pointer obtained through tcp_prot.h.hashinfo, i.e. tcp_hashinfo. inet_hashinfo packages the bind hash tables of the various protocols:
struct inet_hashinfo {
/* This is for sockets with full identity only. Sockets here will
* always be without wildcards and will have the following invariant:
*
* TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
*
* TIME_WAIT sockets use a separate chain (twchain).
*/ // connected sock structures all link into this hash table; it has two chains, one for established socks and one (twchain) for TIME_WAIT socks
struct inet_ehash_bucket *ehash; // hash table of established connections
rwlock_t *ehash_locks; // bucket locks
unsigned int ehash_size; // table size
unsigned int ehash_locks_mask; // lock mask
/* Ok, let's try this, I give up, we do need a local binding
* TCP hash as well as the others for fast bind/connect.
*/
struct inet_bind_hashbucket *bhash; // hash table managing bound port numbers
unsigned int bhash_size; // table size
/* Note : 4 bytes padding on 64 bit arches */
/* All sockets in TCP_LISTEN state will be in here. This is the only
* table where wildcard'd TCP sockets can exist. Hash function here
* is just local port number.
*/
struct hlist_head listening_hash[INET_LHTABLE_SIZE]; // hash queues of listening sockets
/* All the above members are written once at bootup and
* never written again _or_ are predominantly read-access.
*
* Now align to a new cache line as all the following members
* are often dirty.
*/
rwlock_t lhash_lock ____cacheline_aligned;
atomic_t lhash_users;
wait_queue_head_t lhash_wait; // wait queue head
struct kmem_cache *bind_bucket_cachep; // slab cache for bind buckets
};
This structure exists to maintain the hash tables of the INET protocol family.
We also meet another data structure:
struct inet_bind_hashbucket { // hash bucket structure
spinlock_t lock; // spinlock
struct hlist_head chain; // the bucket's chain
};
This is a hash bucket guarded by its own spinlock; chain heads the bucket's hash chain.
Further down, struct hlist_node *node is a hash chain node, linked into an hlist_head.
One line later there is struct inet_bind_bucket *tb:
struct inet_bind_bucket { // bind bucket structure
struct net *ib_net; // owning network namespace
unsigned short port; // port number
signed short fastreuse; // may be reused quickly
struct hlist_node node; // node linking the bucket into the hash bucket's chain
struct hlist_head owners; // queue of sock structures bound to this port
};
This bucket structure is what gets linked into the inet_bind_hashbucket chains.
Continuing inet_csk_get_port():
local_bh_disable(); // disable bottom halves (softirqs) on this CPU
if (!snum) { // no port number was specified
int remaining, rover, low, high;
inet_get_local_port_range(&low, &high);
remaining = (high - low) + 1;
rover = net_random() % remaining + low;
do { // search for a free port in the kernel
head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, node, &head->chain)
if (tb->ib_net == net && tb->port == rover)
goto next;
break;
next:
spin_unlock(&head->lock);
if (++rover > high)
rover = low;
} while (--remaining > 0);
/* Exhausted local port range during search? It is not
* possible for us to be holding one of the bind hash
* locks if this test triggers, because if 'remaining'
* drops to zero, we broke out of the do/while loop at
* the top level, not from the 'break;' statement.
*/
ret = 1;
if (remaining <= 0)
goto fail;
/* OK, here is the one we will use. HEAD is
* non-NULL and we hold it's mutex.
*/
snum = rover;
In our case snum was specified; had it been 0, this branch would run, meaning the kernel picks a port itself.
inet_get_local_port_range() first fetches the valid range of port numbers:
void inet_get_local_port_range(int *low, int *high)
{
unsigned seq;
do {
seq = read_seqbegin(&sysctl_port_range_lock);
*low = sysctl_local_port_range[0];
*high = sysctl_local_port_range[1];
} while (read_seqretry(&sysctl_port_range_lock, seq));
}
A seqlock read loop (read_seqbegin/read_seqretry) fetches the two bounds from the kernel's port range array sysctl_local_port_range, by default {32768, 61000}.
A candidate port is then computed randomly, rover = net_random() % remaining + low, and the loop ensures the candidate is not already in use. If the entire range is exhausted, ret stays 1 and we jump to fail; otherwise the first suitable candidate becomes the port.
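The starting-point calculation is easy to mirror in user space, with rand() standing in for net_random():
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int main()
{
int low = 32768, high = 61000; /* the default sysctl_local_port_range */
int remaining = (high - low) + 1;
srand(time(NULL));
int rover = rand() % remaining + low; /* random candidate in [low, high] */
printf("candidate port: %d\n", rover);
return 0;
}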
head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, node, &head->chain)
if (tb->ib_net == net && tb->port == rover)
goto next;
This block behaves like hashmap.get(rover): hash the port (modulo the table size) into an index, take the corresponding chain from tcp_hashinfo's bhash hash bucket, and walk it; finding an entry with the same port and namespace means the port is already bound.
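Conceptually it is a chained hash table keyed by the port number; a compact userspace sketch of the same lookup (bucket, bhash and lookup are illustrative, not the kernel's types):
#include <stdio.h>
#define BHASH_SIZE 64
struct bucket {
unsigned short port;
struct bucket *next; /* chain within one hash slot */
};
static struct bucket *bhash[BHASH_SIZE];
static struct bucket *lookup(unsigned short port)
{
/* like inet_bhashfn(): hash the port to pick a chain, then walk it
as inet_bind_bucket_for_each does */
for (struct bucket *tb = bhash[port % BHASH_SIZE]; tb; tb = tb->next)
if (tb->port == port)
return tb;
return NULL;
}
int main()
{
struct bucket tb = { .port = 54188 };
bhash[tb.port % BHASH_SIZE] = &tb;
printf("port 54188 %s\n", lookup(54188) ? "bound" : "free");
printf("port 54189 %s\n", lookup(54189) ? "bound" : "free");
return 0;
}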
#define inet_bind_bucket_for_each(tb, node, head) \
hlist_for_each_entry(tb, node, head, node)
#define hlist_for_each_entry(tpos, pos, head, member) \
for (pos = (head)->first; \
pos && ({ prefetch(pos->next); 1;}) && \
({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
pos = pos->next)
Here we can see the detail: an inet_bind_bucket links into the chain of the inet_bind_hashbucket through its node member.
Our server program specified a port, so the kernel does not need to pick one; continuing inet_csk_get_port():
} else { // search the hash bucket's chain for a bucket with the same port
head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, node, &head->chain)
if (tb->ib_net == net && tb->port == snum)
goto tb_found;
}
tb = NULL;
goto tb_not_found;
tb_found:
if (!hlist_empty(&tb->owners)) { // is the bucket's sock queue non-empty?
if (tb->fastreuse > 0 &&
sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
goto success; // reuse the port
} else {
ret = 1; // check the bucket's sock queue for conflicts
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb))
goto fail_unlock;
}
}
tb_not_found: // no bucket found: create one
ret = 1;
if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
net, head, snum)) == NULL)
goto fail_unlock;
if (hlist_empty(&tb->owners)) {
if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
tb->fastreuse = 1; // mark the bucket as reusable
else
tb->fastreuse = 0;
} else if (tb->fastreuse &&
(!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
tb->fastreuse = 0;
success:
if (!inet_csk(sk)->icsk_bind_hash) // not yet bound to a bucket
inet_bind_hash(sk, tb, snum); // bind it
BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
ret = 0;
fail_unlock:
spin_unlock(&head->lock);
fail:
local_bh_enable();
return ret;
}
Now the hash bucket is searched for the bucket tb matching the given port; if found, we land at tb_found. tb->owners heads a queue of socks: if it is non-empty, the code checks whether the bucket supports fast reuse (fastreuse > 0) and whether our sock also allows reuse and is not in the listening state — if so, jump to success. Otherwise inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb) runs, i.e. ipv4_specific->bind_conflict, which is inet_csk_bind_conflict():
int inet_csk_bind_conflict(const struct sock *sk,
const struct inet_bind_bucket *tb)
{
const __be32 sk_rcv_saddr = inet_rcv_saddr(sk);
struct sock *sk2;
struct hlist_node *node;
int reuse = sk->sk_reuse;
/*
* Unlike other sk lookup places we do not check
* for sk_net here, since _all_ the socks listed
* in tb->owners list belong to the same net - the
* one this bucket belongs to.
*/
sk_for_each_bound(sk2, node, &tb->owners) {
if (sk != sk2 &&
!inet_v6_ipv6only(sk2) &&
(!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if ||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { // same device?
if (!reuse || !sk2->sk_reuse ||
sk2->sk_state == TCP_LISTEN) {
const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
if (!sk2_rcv_saddr || !sk_rcv_saddr ||
sk2_rcv_saddr == sk_rcv_saddr) // same bound address?
break;
}
}
}
return node != NULL;
}
The code is simple: the sk_for_each_bound macro walks the tb->owners queue, each sock in it being sk2, and compares sk with sk2; if they are bound to the same device and to the same address, there is a "conflict".
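The conflict is directly observable from user space: binding the same address and port twice (without SO_REUSEADDR) makes the second bind fail, which is this code path reporting EADDRINUSE:
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>
int main()
{
struct sockaddr_in addr;
int fd1 = socket(AF_INET,SOCK_STREAM,0);
int fd2 = socket(AF_INET,SOCK_STREAM,0);
memset(&addr, 0, sizeof(addr));
addr.sin_family = AF_INET;
addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
addr.sin_port = htons(54188);
printf("first bind: %d\n", bind(fd1,(struct sockaddr*)&addr,sizeof(addr)));
if (bind(fd2,(struct sockaddr*)&addr,sizeof(addr)) < 0)
perror("second bind"); /* EADDRINUSE: inet_csk_bind_conflict fired */
close(fd1);
close(fd2);
return 0;
}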
Back in inet_csk_get_port(): if no bucket was found we jump to tb_not_found, where inet_bind_bucket_create() builds one, records the port number and related fields in it, and links it into the hash bucket:
/*
* Allocate and initialize a new local port bind bucket.
* The bindhash mutex for snum's hash chain must be held here.
*/
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
struct net *net,
struct inet_bind_hashbucket *head,
const unsigned short snum)
{
struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); // allocate the bucket structure
if (tb != NULL) {
tb->ib_net = hold_net(net); // record the network namespace
tb->port = snum; // record the port number
tb->fastreuse = 0; // fast reuse starts at 0, adjusted per sock
INIT_HLIST_HEAD(&tb->owners); // initialize the sock queue
hlist_add_head(&tb->node, &head->chain); // link the bucket into the hash bucket
}
return tb;
}
At success, inet_csk(sk)->icsk_bind_hash tells whether the sock is already bound to a bucket. inet_csk() simply casts the sock pointer to inet_connection_sock — the INET connection-oriented sock seen earlier — and its icsk_bind_hash is examined. If no bucket is bound yet, inet_bind_hash() links the sock into the bucket's sock queue:
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
const unsigned short snum)
{
inet_sk(sk)->num = snum;
sk_add_bind_node(sk, &tb->owners);
inet_csk(sk)->icsk_bind_hash = tb;
}
It records the port number in the inet_sock, adds the current sock to the bucket's owners queue, and stores tb in inet_connection_sock's icsk_bind_hash. With that, the binding work is complete.
Back in inet_bind():
/* Make sure we are allowed to bind here. */
if (sk->sk_prot->get_port(sk, snum)) { // inet_csk_get_port()
inet->saddr = inet->rcv_saddr = 0; // on failure, clear the addresses we just set
err = -EADDRINUSE;
goto out_release_sock;
}
if (inet->rcv_saddr) // an address has been set: add the lock flag marking the address bound
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
if (snum) // a port has been set: add the lock flag marking the port bound
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
inet->sport = htons(inet->num); // record the port
inet->daddr = 0; // initialize the destination address
inet->dport = 0; // initialize the destination port
sk_dst_reset(sk); // reset the cached route
err = 0;
out_release_sock:
release_sock(sk); // unlock
out:
return err;
}
That concludes bind() for now; we have left the local routing table and the local_table->tb_lookup(local_table, &fl, &res) lookup for a later discussion.