socket source code analysis: creation

Data structures

/**
 *  struct socket - general BSD socket
 *  @state: socket state (%SS_CONNECTED, etc)
 *  @flags: socket flags (%SOCK_ASYNC_NOSPACE, etc)
 *  @ops: protocol specific socket operations
 *  @fasync_list: Asynchronous wake up list
 *  @file: File back pointer for gc
 *  @sk: internal networking protocol agnostic socket representation
 *  @wait: wait queue for several uses
 *  @type: socket type (%SOCK_STREAM, etc)
 */
struct socket {
	socket_state		state;	//socket 的狀態
	unsigned long		flags;	//socket 的標誌位
	const struct proto_ops	*ops; //socket 的函數操作表
	struct fasync_struct	*fasync_list;	//socket 的異步喚醒隊列
	struct file		*file;	// 與socket關聯的文件指針
	struct sock		*sk;	// 代表具體協議內容的 sock 結構指針
	wait_queue_head_t	wait;	// 等待隊列
	short			type;	//socket 的類型
};

From struct socket we can see that socket is the common, protocol-independent part of a socket, while the sock structure it points to carries the parts that depend on the protocol actually in use. You can think of sock as the piece abstracted out of socket: a sock structure is attached to the socket according to the protocol being used. Let's look at struct sock next.

struct sock {
	/*
	 * Now struct inet_timewait_sock also uses sock_common, so please just
	 * don't add nothing before this first member (__sk_common) --acme
	 */
	struct sock_common	__sk_common;	// 與 inet_timewait_sock 共享使用
#define sk_family		__sk_common.skc_family	// 地址族
#define sk_state		__sk_common.skc_state	// 連接狀態
#define sk_reuse		__sk_common.skc_reuse	// 確定複用地址
#define sk_bound_dev_if		__sk_common.skc_bound_dev_if	//綁定設備 ID
#define sk_node			__sk_common.skc_node	// 鏈入主哈希表
#define sk_bind_node		__sk_common.skc_bind_node	// 鏈入綁定哈希表
#define sk_refcnt		__sk_common.skc_refcnt	// 使用計數
#define sk_hash			__sk_common.skc_hash	// 哈希值
#define sk_prot			__sk_common.skc_prot	// 協議函數表
#define sk_net			__sk_common.skc_net	// 所屬的網絡空間
	unsigned char		sk_shutdown : 2,	// 是否關閉,mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
				sk_no_check : 2,	// 是否檢查數據包
				sk_userlocks : 4;	// 用戶鎖,%SO_SNDBUF and %SO_RCVBUF settings
	unsigned char		sk_protocol; // 使用協議族的哪一種協議
	unsigned short		sk_type;	// socket 的類型,例如 SOCK_STREAM 等
	int			sk_rcvbuf;	// 接受緩衝區的長度(字節數)
	socket_lock_t		sk_lock;	// 用於同步
	/*
	 * The backlog queue is special, it is always used with
	 * the per-socket spinlock held and requires low latency
	 * access. Therefore we special case it's implementation.
	 */
	struct {
		struct sk_buff *head;	// 記錄最先接收到的數據包
		struct sk_buff *tail;	// 記錄最後接收到的數據包
	} sk_backlog; // 後備隊列
	wait_queue_head_t	*sk_sleep;	//sock 的等待隊列
	struct dst_entry	*sk_dst_cache;	// 路由項緩存
	struct xfrm_policy	*sk_policy[2];	//流策略
	rwlock_t		sk_dst_lock;	// 路由項緩存鎖
	atomic_t		sk_rmem_alloc;	// 接受隊列的字節數
	atomic_t		sk_wmem_alloc;	// 發送隊列的字節數
	atomic_t		sk_omem_alloc;	// 可選擇/其他 的字節數
	int			sk_sndbuf;	// 發送緩存的總長度
	struct sk_buff_head	sk_receive_queue;	//接收隊列(接收到的數據包隊列)
	struct sk_buff_head	sk_write_queue;		//發送隊列(正在發送的數據包隊列)
	struct sk_buff_head	sk_async_wait_queue;	//DMA 複製的數據包 TODO
	int			sk_wmem_queued;	//全部數據包占用內存計數
	int			sk_forward_alloc;	//記錄可用內存長度
	gfp_t			sk_allocation;	//分配模式
	int			sk_route_caps;	//路由的兼容性標誌位
	int			sk_gso_type;	//GSO 通用分段類型 TODO
	unsigned int		sk_gso_max_size; //用於建立 GSO 通用分段的最大長度
	int			sk_rcvlowat;	//SO_RCVLOWAT 設置
	unsigned long 		sk_flags;	//SO_BROADCAST、SO_KEEPALIVE、SO_OOBINLINE、SO_LINGER 設置
	unsigned long	        sk_lingertime;	//停留時間,確定關閉時間
	struct sk_buff_head	sk_error_queue;	// 錯誤數據包隊列
	struct proto		*sk_prot_creator;	//sock 創建接口
	rwlock_t		sk_callback_lock;	// 爲後半部處理使用的鎖
	int			sk_err,			//出錯碼
				sk_err_soft;	//持續出現的錯誤
	atomic_t		sk_drops;	//原始 socket 發送的計數器
	unsigned short		sk_ack_backlog;		//當前監聽到的連接數量
	unsigned short		sk_max_ack_backlog;	//在 listen() 函數中監聽到的連接數量
	__u32			sk_priority;	//優先級
	struct ucred		sk_peercred;	// SO_PEERCRED 設置
	long			sk_rcvtimeo;	// SO_RCVTIMEO 設置接受超時時間
	long			sk_sndtimeo;	// SO_SNDTIMEO 設置發送超時時間
	struct sk_filter      	*sk_filter;	//sock 的過濾器
	void			*sk_protinfo;	//私有區域,當不使用slab高速緩存時由協議族定義
	struct timer_list	sk_timer;	//sock 的沖刷定時器
	ktime_t			sk_stamp;		//最後接收數據包的時間
	struct socket		*sk_socket;	//對應的 socket 指針
	void			*sk_user_data;	//rpc 提供的數據
	struct page		*sk_sndmsg_page;	// 發送數據塊所在的緩衝頁
	struct sk_buff		*sk_send_head;	// 發送數據包的隊列頭
	__u32			sk_sndmsg_off;	//發送數據塊在緩衝頁的結尾
	int			sk_write_pending;	//等待發送的數量
	void			*sk_security;	//用於安全模式
	__u32			sk_mark;	//通用的數據包掩碼
	/* XXX 4 bytes hole on 64 bit */
	void			(*sk_state_change)(struct sock *sk);			//sock 狀態改變後調用的函數
	void			(*sk_data_ready)(struct sock *sk, int bytes);	//在數據被處理完成後調用的函數
	void			(*sk_write_space)(struct sock *sk);				//發送空間可以使用後調用的函數
	void			(*sk_error_report)(struct sock *sk);			//處理錯誤的函數
  	int			(*sk_backlog_rcv)(struct sock *sk,					//處理庫存數據包函數
						  struct sk_buff *skb);  
	void                    (*sk_destruct)(struct sock *sk);		//sock 的銷燬函數
};

The parts shared with the application live in struct socket, while the protocol-related content lives in struct sock; the two are then hooked together, a flexible and elegant design.

We also see that the packets held by a sock are represented by sk_buff: every protocol uses struct sk_buff to wrap and carry its packets, so let's look at that structure as well.

struct sk_buff {
	/* These two members must be first. */
	struct sk_buff		*next;	//隊列中的下一個數據包
	struct sk_buff		*prev;	//隊列中的前一個數據包

	struct sock		*sk;	//指向所屬的 sock 數據包
	ktime_t			tstamp;	//數據包到達的時間
	struct net_device	*dev;	//接收數據包的網絡設備

	union {
		struct  dst_entry	*dst;	//路由項
		struct  rtable		*rtable;	//路由表
	};
	struct	sec_path	*sp;	//用於 xfrm 的安全路徑

	/*
	 * This is the control buffer. It is free to use for every
	 * layer. Please put your private variables there. If you
	 * want to keep them across layers you have to do a skb_clone()
	 * first. This is owned by whoever has the skb queued ATM.
	 */
	char			cb[48];	// cb 控制塊

	unsigned int		len,	//全部數據塊的總長度
				data_len;		//分段、分散數據塊的總長度
	__u16			mac_len,	//鏈路層頭部的長度
				hdr_len;		//在克隆數據包時可寫的頭部長度
	union {
		__wsum		csum;		//校驗和
		struct {
			__u16	csum_start;	//校驗和在數據包頭部 skb->head 中的起始位置
			__u16	csum_offset;//校驗和保存到 csum_start 中的位置
		};
	};
	__u32			priority;	//數據包在隊列中的優先級
	__u8			local_df:1,	//是否允許本地數據分段
				cloned:1,		//是否允許被克隆
				ip_summed:2,	//IP校驗和標誌
				nohdr:1,		//運載時使用,表示不能被修改頭部
				nfctinfo:3;		//數據包連接關係
	__u8			pkt_type:3,	//數據包的類型
				fclone:2,		//數據包克隆關係
				ipvs_property:1,//數據包所屬的 ipvs
				peeked:1,		//數據包是否屬於操作狀態
				nf_trace:1;		//netfilter 對數據包的跟蹤標誌
	__be16			protocol;	//底層驅動使用的數據包協議

	void			(*destructor)(struct sk_buff *skb);					//銷燬數據包的函數
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
	struct nf_conntrack	*nfct;
	struct sk_buff		*nfct_reasm;
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
	struct nf_bridge_info	*nf_bridge;	//關於網橋的數據
#endif

	int			iif;
#ifdef CONFIG_NETDEVICES_MULTIQUEUE
	__u16			queue_mapping;
#endif
#ifdef CONFIG_NET_SCHED
	__u16			tc_index;	/* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
	__u16			tc_verd;	/* traffic control verdict */
#endif
#endif
#ifdef CONFIG_IPV6_NDISC_NODETYPE
	__u8			ndisc_nodetype:2;
#endif
	/* 14 bit hole */

#ifdef CONFIG_NET_DMA
	dma_cookie_t		dma_cookie;
#endif
#ifdef CONFIG_NETWORK_SECMARK
	__u32			secmark;
#endif

	__u32			mark;

	sk_buff_data_t		transport_header;	//指向數據塊中傳輸層頭部
	sk_buff_data_t		network_header;		//指向數據塊中網絡層頭部
	sk_buff_data_t		mac_header;			//指向數據塊中鏈路層頭部
	/* These elements must be at the end, see alloc_skb() for details.  */
	sk_buff_data_t		tail;	//指向數據塊的結束地址
	sk_buff_data_t		end;	//指向緩衝塊的結束地址
	unsigned char		*head,	//指向緩衝塊的開始地址
				*data;			//指向數據塊的開始地址
	unsigned int		truesize;	//數據包的實際長度(結構長度與數據塊長度之和)
	atomic_t		users;	//數據包的使用計數器
};

So far we have the shared part, struct socket; the generic part, struct sock; and the protocol-specific part, struct inet_sock.

tcp_sock is tied closely to the TCP protocol; let's look at its contents.

struct tcp_sock {
	/* inet_connection_sock has to be the first member of tcp_sock */
	struct inet_connection_sock	inet_conn;	//由註釋看到該結構體必須在 tcp_sock 頭部 TODO why?
	u16	tcp_header_len;	/* Bytes of tcp header to send	發送的 tcp 頭部字節數	*/
	u16	xmit_size_goal;	/* Goal for segmenting output packets 分段傳送的數據包數量 	*/

/*
 *	Header prediction flags 頭部的預置位
 *	0x5?10 << 16 + snd_wnd in net byte order
 */
	__be32	pred_flags;

/*
 *	RFC793 variables by their proper names. This means you can
 *	read the code and the spec side by side (and laugh ...)
 *	See RFC793 and RFC1122. The RFC writes these in capitals.
 */
 	u32	rcv_nxt;	/* What we want to receive next 下一個要接收的目標	*/
	u32	copied_seq;	/* Head of yet unread data	代表還沒有讀取的數據	*/
	u32	rcv_wup;	/* rcv_nxt on last window update sent rcv_nxt 在最後一次窗口更新時內容	*/
 	u32	snd_nxt;	/* Next sequence we send	下一個要發送的目標	*/

 	u32	snd_una;	/* First byte we want an ack for 第一個要 ack 的字節	*/
 	u32	snd_sml;	/* Last byte of the most recently transmitted small packet 最近發送數據包中的尾字節 */
	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) 最後一次接收到 ack 的時間 */
	u32	lsndtime;	/* timestamp of last sent data packet (for restart window) 最後一次發送數據包的時間 */

	/* Data for direct copy to user 直接複製給用戶的數據 */
	struct {
		struct sk_buff_head	prequeue;			//預處理隊列
		struct task_struct	*task;				//預處理進程
		struct iovec		*iov;				//用戶程序(應用程序)接收數據的緩衝區
		int			memory;						//預處理數據包計數器
		int			len;						//預處理長度
#ifdef CONFIG_NET_DMA
		/* members for async copy 異步複製的內容 */
		struct dma_chan		*dma_chan;
		int			wakeup;
		struct dma_pinned_list	*pinned_list;
		dma_cookie_t		dma_cookie;
#endif
	} ucopy;

	u32	snd_wl1;	/* Sequence for window update  窗口更新的順序		*/
	u32	snd_wnd;	/* The window we expect to receive 期望接收的窗口	*/
	u32	max_window;	/* Maximal window ever seen from peer 從對方獲得的最大窗口	*/
	u32	mss_cache;	/* Cached effective mss, not including SACKS 有效的 mss,不包括 SACKS TODO mss、SACKS */

	u32	window_clamp;	/* Maximal window to advertise	對外公佈的最大窗口	*/
	u32	rcv_ssthresh;	/* Current window clamp		當前窗口	*/

	u32	frto_highmark;	/* snd_nxt when RTO occurred 在 rto 時的 snd_nxt */
	u8	reordering;	/* Packet reordering metric.	預設的數據包數量	*/
	u8	frto_counter;	/* Number of new acks after RTO  rto 後的 ack 次數 */
	u8	nonagle;	/* Disable Nagle algorithm?      是否使用 Nagle 算法 TODO Nagle    */
	u8	keepalive_probes; /* num of allowed keep alive probes 允許持有的數量	*/

/* RTT measurement */
	u32	srtt;		/* smoothed round trip time << 3 	*/
	u32	mdev;		/* medium deviation			*/
	u32	mdev_max;	/* maximal mdev for the last rtt period	*/
	u32	rttvar;		/* smoothed mdev_max			*/
	u32	rtt_seq;	/* sequence number to update rttvar	*/

	u32	packets_out;	/* Packets which are "in flight" 處於飛行中的數據包數量	*/
	u32	retrans_out;	/* Retransmitted packets out	轉發的數據包數量	*/
/*
 *      Options received (usually on last packet, some only on SYN packets).
 */
	struct tcp_options_received rx_opt;

/*
 *	Slow start and congestion control (see also Nagle, and Karn & Partridge) TODO 慢啓動與阻塞控制
 */
 	u32	snd_ssthresh;	/* Slow start size threshold	慢啓動的起點值	*/
 	u32	snd_cwnd;	/* Sending congestion window	發送的阻塞窗口	*/
	u32	snd_cwnd_cnt;	/* Linear increase counter	線性計數器	*/
	u32	snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this 不允許 snd_cwnd 超過的值 */
	u32	snd_cwnd_used;
	u32	snd_cwnd_stamp;

	struct sk_buff_head	out_of_order_queue; /* Out of order segments go here 超出分段規則的隊列 */

 	u32	rcv_wnd;	/* Current receiver window	當前接收窗口	*/
	u32	write_seq;	/* Tail(+1) of data held in tcp send buffer tcp 發送數據的順序號 */
	u32	pushed_seq;	/* Last pushed seq, required to talk to windows 最後送出的順序號,需要通知窗口 */

/*	SACKs data	*/
	struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
	struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/

	struct tcp_sack_block recv_sack_cache[4];

	struct sk_buff *highest_sack;   /* highest skb with SACK received
					 * (validity guaranteed only if
					 * sacked_out > 0)
					 */

	/* from STCP, retrans queue hinting */
	struct sk_buff* lost_skb_hint;

	struct sk_buff *scoreboard_skb_hint;
	struct sk_buff *retransmit_skb_hint;
	struct sk_buff *forward_skb_hint;

	int     lost_cnt_hint;
	int     retransmit_cnt_hint;

	u32	lost_retrans_low;	/* Sent seq after any rxmit (lowest) */

	u16	advmss;		/* Advertised MSS			*/
	u32	prior_ssthresh; /* ssthresh saved at recovery start	*/
	u32	lost_out;	/* Lost packets			*/
	u32	sacked_out;	/* SACK'd packets			*/
	u32	fackets_out;	/* FACK'd packets			*/
	u32	high_seq;	/* snd_nxt at onset of congestion	*/

	u32	retrans_stamp;	/* Timestamp of the last retransmit,
				 * also used in SYN-SENT to remember stamp of
				 * the first SYN. */
	u32	undo_marker;	/* tracking retrans started here. */
	int	undo_retrans;	/* number of undoable retransmissions. */
	u32	urg_seq;	/* Seq of received urgent pointer */
	u16	urg_data;	/* Saved octet of OOB data and control flags */
	u8	urg_mode;	/* In urgent mode		*/
	u8	ecn_flags;	/* ECN status bits.			*/
	u32	snd_up;		/* Urgent pointer		*/

	u32	total_retrans;	/* Total retransmits for entire connection */
	u32	bytes_acked;	/* Appropriate Byte Counting - RFC3465 */

	unsigned int		keepalive_time;	  /* time before keep alive takes place */
	unsigned int		keepalive_intvl;  /* time interval between keep alive probes */
	int			linger2;

	unsigned long last_synq_overflow; 

	u32	tso_deferred;

/* Receiver side RTT estimation */
	struct {
		u32	rtt;
		u32	seq;
		u32	time;
	} rcv_rtt_est;

/* Receiver queue space 接受隊列空間 */
	struct {
		int	space;
		u32	seq;
		u32	time;
	} rcvq_space;

/* TCP-specific MTU probe information. TCP 指定的 MTU 檢驗內容 */
	struct {
		u32		  probe_seq_start;
		u32		  probe_seq_end;
	} mtu_probe;

#ifdef CONFIG_TCP_MD5SIG
/* TCP AF-Specific parts; only used by MD5 Signature support so far */
	struct tcp_sock_af_ops	*af_specific;

/* TCP MD5 Signagure Option information */
	struct tcp_md5sig_info	*md5sig_info;
#endif
};

Demo

Now that we know these data structures, we can start walking through the socket-related source code.

First, the normal flow of a TCP server:


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
    struct sockaddr_in server_address;
    struct sockaddr_in client_address;
    int server_fd, client_fd;
    socklen_t server_len, client_len;

    server_fd = socket(AF_INET, SOCK_STREAM, 0);

    server_address.sin_family = AF_INET;
    server_address.sin_addr.s_addr = inet_addr("192.168.1.1");
    server_address.sin_port = htons(54188);
    server_len = sizeof(server_address);

    bind(server_fd, (struct sockaddr *)&server_address, server_len);

    /* Create the socket's listen queue (backlog of 10) and wait for client connection requests */
    listen(server_fd, 10);

    while (1) {
        char recv_buf[20];
        char back[20] = "hello from server";
        printf("server is waiting\n");
        /* When execution reaches this point a client connection request has arrived;
         * accept it, clone a new socket connected to the client, record the client's
         * address in client_address, and return the fd of the established connection */
        client_len = sizeof(client_address);
        client_fd = accept(server_fd, (struct sockaddr *)&client_address, &client_len);
        /* Use read and write to receive the client's bytes and send a reply back */
        read(client_fd, recv_buf, sizeof(recv_buf));
        write(client_fd, back, sizeof(back));
        printf("received from client = %s\n", recv_buf);

        close(client_fd);
    }
    close(server_fd);
    exit(0);
}

In short: socket() creates the server socket, bind() attaches the address structure to it, listen() waits for client connection requests, accept() returns a new fd for each connection, and because the socket is exposed through the VFS it can then be accessed like a file with read/write.

Creating the socket

The server calls socket(); the library side of the call can be found in the glibc source:

#include <errno.h>
#include <sys/socket.h>

/* Create a new socket of type TYPE in domain DOMAIN, using
   protocol PROTOCOL.  If PROTOCOL is zero, one is chosen automatically.
   Returns a file descriptor for the new socket, or -1 for errors.  */
int
__socket (domain, type, protocol)
     int domain;
     int type;
     int protocol;
{
  __set_errno (ENOSYS);
  return -1;
}


weak_alias (__socket, socket)
stub_warning (socket)
#include <stub-tag.h>

Here weak_alias(__socket, socket) makes socket() a weak alias of __socket(). The stub above just returns -1 with ENOSYS; the real implementation is the architecture-specific socket.S assembly, which goes through system_call(), looks up the system call table sys_call_table, and finally lands in sys_socketcall() (the listing below shows compat_sys_socketcall(), the 32-bit compatibility variant of the same dispatcher). This is also the system call entry for bind(), listen(), accept() and friends.
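
As a small user-space illustration (my own sketch, not from the kernel or glibc source; the file name and the direct use of syscall() are assumptions): on 32-bit x86 every socket API call is funneled through the single socketcall system call, with the first argument selecting the operation number shown below. Architectures without __NR_socketcall (such as x86-64) expose separate system calls instead.

/* socketcall_demo.c */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/syscall.h>

int main(void)
{
#ifdef __NR_socketcall
	/* SYS_SOCKET == SOCKOP_socket == 1 in <linux/net.h> */
	long args[3] = { AF_INET, SOCK_STREAM, 0 };
	long fd = syscall(__NR_socketcall, 1 /* SYS_SOCKET */, args);
	printf("socketcall(SYS_SOCKET, ...) returned fd %ld\n", fd);
	close((int)fd);
#else
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	printf("this arch has no socketcall; socket() returned fd %d\n", fd);
	close(fd);
#endif
	return 0;
}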

/* Define unique numbers for the operations permitted on socket.  Linux
   uses a single system call for all these functions.  The relevant code
   file is /usr/include/linux/net.h.
   We cannot use a enum here because the values are used in assembler
   code.  */

#define SOCKOP_socket		1
#define SOCKOP_bind		2
#define SOCKOP_connect		3
#define SOCKOP_listen		4
#define SOCKOP_accept		5
#define SOCKOP_getsockname	6
#define SOCKOP_getpeername	7
#define SOCKOP_socketpair	8
#define SOCKOP_send		9
#define SOCKOP_recv		10
#define SOCKOP_sendto		11
#define SOCKOP_recvfrom		12
#define SOCKOP_shutdown		13
#define SOCKOP_setsockopt	14
#define SOCKOP_getsockopt	15
#define SOCKOP_sendmsg		16
#define SOCKOP_recvmsg		17

asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
{
	int ret;
	u32 a[6];
	u32 a0, a1;

	if (call < SYS_SOCKET || call > SYS_RECVMSG)
		return -EINVAL;
	if (copy_from_user(a, args, nas[call]))
		return -EFAULT;
	a0 = a[0];
	a1 = a[1];

	switch (call) {
	case SYS_SOCKET:
		ret = sys_socket(a0, a1, a[2]);
		break;
	case SYS_BIND:
		ret = sys_bind(a0, compat_ptr(a1), a[2]);
		break;
	case SYS_CONNECT:
		ret = sys_connect(a0, compat_ptr(a1), a[2]);
		break;
	case SYS_LISTEN:
		ret = sys_listen(a0, a1);
		break;
	case SYS_ACCEPT:
		ret = sys_accept(a0, compat_ptr(a1), compat_ptr(a[2]));
		break;
	case SYS_GETSOCKNAME:
		ret = sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]));
		break;
	case SYS_GETPEERNAME:
		ret = sys_getpeername(a0, compat_ptr(a1), compat_ptr(a[2]));
		break;
	case SYS_SOCKETPAIR:
		ret = sys_socketpair(a0, a1, a[2], compat_ptr(a[3]));
		break;
	case SYS_SEND:
		ret = sys_send(a0, compat_ptr(a1), a[2], a[3]);
		break;
	case SYS_SENDTO:
		ret = sys_sendto(a0, compat_ptr(a1), a[2], a[3], compat_ptr(a[4]), a[5]);
		break;
	case SYS_RECV:
		ret = sys_recv(a0, compat_ptr(a1), a[2], a[3]);
		break;
	case SYS_RECVFROM:
		ret = sys_recvfrom(a0, compat_ptr(a1), a[2], a[3], compat_ptr(a[4]), compat_ptr(a[5]));
		break;
	case SYS_SHUTDOWN:
		ret = sys_shutdown(a0,a1);
		break;
	case SYS_SETSOCKOPT:
		ret = compat_sys_setsockopt(a0, a1, a[2],
				compat_ptr(a[3]), a[4]);
		break;
	case SYS_GETSOCKOPT:
		ret = compat_sys_getsockopt(a0, a1, a[2],
				compat_ptr(a[3]), compat_ptr(a[4]));
		break;
	case SYS_SENDMSG:
		ret = compat_sys_sendmsg(a0, compat_ptr(a1), a[2]);
		break;
	case SYS_RECVMSG:
		ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret;
}

The dispatcher then uses the call number SOCKOP_socket (SYS_SOCKET) to reach the corresponding system call, sys_socket():

asmlinkage long sys_socket(int family, int type, int protocol)
{
	int retval;
	struct socket *sock;

	retval = sock_create(family, type, protocol, &sock);
	if (retval < 0)
		goto out;

	retval = sock_map_fd(sock);
	if (retval < 0)
		goto out_release;

out:
	/* It may be already another descriptor 8) Not kernel problem. */
	return retval;

out_release:
	sock_release(sock);
	return retval;
}

It first creates the socket with sock_create(), then hooks it into the VFS with sock_map_fd(), which returns the file descriptor (retval) used to manage the socket from then on.

Allocating and initializing the socket structure

Let's follow sock_create() first; this function is responsible for allocating and initializing the socket structure.

int sock_create(int family, int type, int protocol, struct socket **res)
{
	return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
}

The first three parameters of sock_create() are exactly those passed to socket(); the last struct socket ** parameter receives the result. It simply forwards to __sock_create():

static int __sock_create(struct net *net, int family, int type, int protocol,
			 struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;

	/*
	 *      Check protocol is in range
	 */
	if (family < 0 || family >= NPROTO)
		return -EAFNOSUPPORT;
	if (type < 0 || type >= SOCK_MAX)
		return -EINVAL;

	/* Compatibility.

	   This uglymoron is moved from INET layer to here to avoid
	   deadlock in module load.
	 */
	if (family == PF_INET && type == SOCK_PACKET) {
		static int warned;
		if (!warned) {
			warned = 1;
			printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
			       current->comm);
		}
		family = PF_PACKET;
	}

	err = security_socket_create(family, type, protocol, kern);
	if (err)
		return err;

	/*
	 *	Allocate the socket and allow the family to set things up. if
	 *	the protocol is 0, the family is instructed to select an appropriate
	 *	default.
	 */
	sock = sock_alloc();	//分配 socket 結構空間
	if (!sock) {
		if (net_ratelimit())
			printk(KERN_WARNING "socket: no more sockets\n");
		return -ENFILE;	/* Not exactly a match, but its the
				   closest posix thing */
	}

	sock->type = type;		//記錄socket 的類型

#if defined(CONFIG_KMOD)
	/* Attempt to load a protocol module if the find failed.
	 *
	 * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
	 * requested real, full-featured networking support upon configuration.
	 * Otherwise module support will break!
	 */
	if (net_families[family] == NULL)			//檢查協議族操作表
		request_module("net-pf-%d", family);	//安裝協議族操作表
#endif

	rcu_read_lock();
	pf = rcu_dereference(net_families[family]);	//得到相應的協議族操作表
	err = -EAFNOSUPPORT;
	if (!pf)
		goto out_release;

	/*
	 * We will call the ->create function, that possibly is in a loadable
	 * module, so we have to bump that loadable module refcnt first.
	 */
	if (!try_module_get(pf->owner))
		goto out_release;

	/* Now protected by module ref count */
	rcu_read_unlock();

	err = pf->create(net, sock, protocol);		//執行取得的協議族操作表的 create 函數
	if (err < 0)
		goto out_module_put;

	/*
	 * Now to bump the refcnt of the [loadable] module that owns this
	 * socket at sock_release time we decrement its refcnt.
	 */
	if (!try_module_get(sock->ops->owner))
		goto out_module_busy;

	/*
	 * Now that we're done with the ->create function, the [loadable]
	 * module can have its refcnt decremented
	 */
	module_put(pf->owner);
	err = security_socket_post_create(sock, family, type, protocol, kern);
	if (err)
		goto out_sock_release;
	*res = sock;		// 返回創建結果

	return 0;

out_module_busy:
	err = -EAFNOSUPPORT;
out_module_put:
	sock->ops = NULL;
	module_put(pf->owner);
out_sock_release:
	sock_release(sock);
	return err;

out_release:
	rcu_read_unlock();
	goto out_sock_release;
}

So this function first allocates space for the socket structure, then uses the family parameter AF_INET (2) to look up the corresponding protocol family operation table, and finally calls that table's create function and returns the result. Let's look at sock_alloc() first; it allocates the socket structure together with a file-system inode for the server program.

static struct socket *sock_alloc(void)
{
	struct inode *inode;
	struct socket *sock;

	inode = new_inode(sock_mnt->mnt_sb);	//在文件系統中創建文件節點同時分配 socket 結構
	if (!inode)
		return NULL;

	sock = SOCKET_I(inode);					//取得 socket 結構指針

	inode->i_mode = S_IFSOCK | S_IRWXUGO;	//設置文件節點的模式
	inode->i_uid = current->fsuid;			//設置爲當前進程的uid
	inode->i_gid = current->fsgid;			//設置爲當前進程的gid

	get_cpu_var(sockets_in_use)++;			
	put_cpu_var(sockets_in_use);			//設置當前的 sockets_in_use++
	return sock;
}

Here sock_mnt is the root mount of the socket pseudo file system, so this effectively allocates an inode in that file system; the server program can later read and write the socket through that inode. Let's look at new_inode() first.

struct inode *new_inode(struct super_block *sb)
{
	/*
	 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
	 * error if st_ino won't fit in target struct field. Use 32bit counter
	 * here to attempt to avoid that.
	 */
	static unsigned int last_ino;
	struct inode * inode;

	spin_lock_prefetch(&inode_lock);
	
	inode = alloc_inode(sb);	//調用超級塊函數操作表
	if (inode) {				//對分配得到inode處理
		spin_lock(&inode_lock);
		inodes_stat.nr_inodes++;
		list_add(&inode->i_list, &inode_in_use);
		list_add(&inode->i_sb_list, &sb->s_inodes);
		inode->i_ino = ++last_ino;
		inode->i_state = 0;
		spin_unlock(&inode_lock);
	}
	return inode;
}

Next, the SOCKET_I(inode) helper:

static inline struct socket *SOCKET_I(struct inode *inode)
{
	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}

#define container_of(ptr, type, member) ({			\
	const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
	(type *)( (char *)__mptr - offsetof(type,member) );})

This looks a bit tangled; expanding the macro it is roughly equivalent to:

#define container_of(inode, struct socket_alloc, vfs_inode) ({			\
	const typeof( ((struct socket_alloc *)0)->vfs_inode ) *__mptr = (inode);	\
	(struct socket_alloc *)( (char *)__mptr - offsetof(struct socket_alloc,vfs_inode) );})

struct socket_alloc {
	struct socket socket;
	struct inode vfs_inode;
};

#define OFFSETOF(strct, elem)	((long)&(((struct strct *)NULL)->elem))

Now it is simple: the offsetof macro yields the offset of a member within a struct, here the offset of vfs_inode inside struct socket_alloc. Subtracting that offset from the inode pointer (the address of vfs_inode) gives the start address of struct socket_alloc, which is also the address of its socket member. This macro only does pointer arithmetic to locate the socket; the actual allocation of socket_alloc happens in new_inode().
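
A user-space sketch of the same pointer arithmetic (my own example with assumed names, not kernel code): given the address of the embedded vfs_inode member, subtracting its offset inside the container yields the container's address, which is also the address of the socket member placed at offset 0, exactly what SOCKET_I() relies on.

/* container_of_demo.c */
#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_inode  { int i_ino; };
struct fake_socket { int state; };

struct socket_alloc_demo {
	struct fake_socket socket;     /* at offset 0, like struct socket */
	struct fake_inode  vfs_inode;  /* embedded inode, like in socket_alloc */
};

int main(void)
{
	struct socket_alloc_demo ei = { { 1 }, { 42 } };
	struct fake_inode *inode = &ei.vfs_inode;

	/* the same computation SOCKET_I(inode) performs */
	struct socket_alloc_demo *back =
		container_of(inode, struct socket_alloc_demo, vfs_inode);

	printf("container %p == original %p, socket.state = %d\n",
	       (void *)back, (void *)&ei, back->socket.state);
	return 0;
}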

Back in new_inode(), we see it calls alloc_inode(), which dispatches through the superblock's operation table.

static struct inode *alloc_inode(struct super_block *sb)
{
	static const struct address_space_operations empty_aops;
	static struct inode_operations empty_iops;
	static const struct file_operations empty_fops;
	struct inode *inode;

	if (sb->s_op->alloc_inode)
		inode = sb->s_op->alloc_inode(sb);
	else
		inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL);

	if (inode) {
		...
	}
	return inode;
}

By this point sb->s_op has already been set to sockfs_ops during sock_init(), via get_sb_pseudo(), so the call here is sockfs_ops->alloc_inode.

static struct super_operations sockfs_ops = {
	.alloc_inode =	sock_alloc_inode,
	.destroy_inode =sock_destroy_inode,
	.statfs =	simple_statfs,
};

Looking up the sockfs_ops structure, the function called here is sock_alloc_inode(), which performs the allocation of the socket_alloc structure.

static struct inode *sock_alloc_inode(struct super_block *sb)
{
	struct socket_alloc *ei;

	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);	//分配 socket_alloc 結構
	if (!ei)
		return NULL;
	init_waitqueue_head(&ei->socket.wait);	// 初始化等待隊列的頭
	// 初始化socket
	ei->socket.fasync_list = NULL;
	ei->socket.state = SS_UNCONNECTED;    //狀態設置爲未連接
	ei->socket.flags = 0;
	ei->socket.ops = NULL;
	ei->socket.sk = NULL;
	ei->socket.file = NULL;

	return &ei->vfs_inode;
}

This is where the memory is allocated and the socket structure is initialized. Note that kmem_cache_alloc() allocates directly from the slab cache sock_inode_cachep, which was created during sock_init() by init_inodecache().

[ TODO: the two slab functions kmem_cache_alloc and kmem_cache_create ]
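
A hedged kernel-module sketch of the slab pattern used above (my own example mirroring what init_inodecache() does for sock_inode_cachep; names such as demo_cache and struct demo_obj are assumptions): a dedicated cache is created once with kmem_cache_create(), and objects are then taken from it with kmem_cache_alloc().

/* slab_demo.c, built against kernel headers as an out-of-tree module */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/slab.h>

struct demo_obj {
	int value;
};

static struct kmem_cache *demo_cache;

static int __init slab_demo_init(void)
{
	struct demo_obj *obj;

	/* create the slab cache, as init_inodecache() does for socket_alloc */
	demo_cache = kmem_cache_create("demo_cache", sizeof(struct demo_obj),
				       0, SLAB_HWCACHE_ALIGN, NULL);
	if (!demo_cache)
		return -ENOMEM;

	/* allocate one object from the cache, as sock_alloc_inode() does */
	obj = kmem_cache_alloc(demo_cache, GFP_KERNEL);
	if (!obj) {
		kmem_cache_destroy(demo_cache);
		return -ENOMEM;
	}
	obj->value = 1;

	kmem_cache_free(demo_cache, obj);
	return 0;
}

static void __exit slab_demo_exit(void)
{
	kmem_cache_destroy(demo_cache);
}

module_init(slab_demo_init);
module_exit(slab_demo_exit);
MODULE_LICENSE("GPL");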

Initializing the socket with the protocol family's operation table

Back in __sock_create(), net_families[2] is first checked against NULL, i.e. whether the AF_INET protocol family operation table has been installed (this happens during kernel initialization). The relevant steps are listed here:

inet_init -> fs_initcall(inet_init);

#define fs_initcall(fn)			__define_initcall("5",fn,5)

static int __init inet_init(void)
{
    ...

	/*
	 *	Tell SOCKET that we are alive...
	 */

	(void)sock_register(&inet_family_ops);

    ...
}

int sock_register(const struct net_proto_family *ops)
{
	int err;

	if (ops->family >= NPROTO) {
		printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family,
		       NPROTO);
		return -ENOBUFS;
	}

	spin_lock(&net_family_lock);
	if (net_families[ops->family])
		err = -EEXIST;
	else {
		net_families[ops->family] = ops;
		err = 0;
	}
	spin_unlock(&net_family_lock);

	printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
	return err;
}

static struct net_proto_family inet_family_ops = {
	.family = PF_INET,
	.create = inet_create,
	.owner	= THIS_MODULE,
};

So sock_register() registers inet_family_ops into net_families[PF_INET]; here PF_INET is simply AF_INET:

#define PF_INET		AF_INET

Returning to __sock_create(), the create function of the protocol family operation table inet_family_ops is executed, i.e. inet_create():

static int inet_create(struct net *net, struct socket *sock, int protocol)
{
	struct sock *sk;
	struct list_head *p;
	struct inet_protosw *answer;
	struct inet_sock *inet;
	struct proto *answer_prot;
	unsigned char answer_flags;
	char answer_no_check;
	int try_loading_module = 0;
	int err;
	// 檢查 socket 類型及加密字符
	if (sock->type != SOCK_RAW &&	//原始類型
	    sock->type != SOCK_DGRAM &&	//數據報類型,UDP協議
	    !inet_ehash_secret)
		build_ehash_secret();

The socket type passed in from socket() is SOCK_STREAM, the stream type, and the code checks whether the hash secret has been set up yet; if not, build_ehash_secret() is called to set it:

void build_ehash_secret(void)
{
	u32 rnd;
	do {
		get_random_bytes(&rnd, sizeof(rnd)); //得到非 0 隨機數
	} while (rnd == 0);
	spin_lock_bh(&inetsw_lock);
	if (!inet_ehash_secret)
		inet_ehash_secret = rnd;	//使用隨機數作爲加密字符
	spin_unlock_bh(&inetsw_lock);
}

Back in inet_create(), note the variable struct inet_protosw *answer. The inet_protosw structure is the socket-side interface of an IP protocol: the information closest to the socket layer is kept in this structure, and every IP protocol has one such interface entry.

/* This is used to register socket interfaces for IP protocols.  */
struct inet_protosw {
	struct list_head list;

        /* These two fields form the lookup key. 下面兩個變量用於校對使用 */
	unsigned short	 type;	   /* This is the 2nd argument to socket(2). 對應於socket的類型 */
	unsigned short	 protocol; /* This is the L4 protocol number. IP協議編碼 */

	struct proto	 *prot;	/* 對應的協議結構體指針 */
	const struct proto_ops *ops; /* 對應協議的函數操作表指針 */
  
	int              capability; /* Which (if any) capability do
				      * we need to use this socket
				      * interface?
                                      */
	char             no_check;   /* checksum on rcv/xmit/none? 是否在接收/發送的過程中使用校驗和 */
	unsigned char	 flags;      /* See INET_PROTOSW_* below. 標誌位 */
};

Continuing with inet_create():

	sock->state = SS_UNCONNECTED; //設置socket的狀態爲'未連接狀態'

	/* Look for the requested type/protocol pair. */
	answer = NULL;
lookup_protocol:
	err = -ESOCKTNOSUPPORT;
	rcu_read_lock();	// rcu 鎖的操作,適合讀多寫少情況
	list_for_each_rcu(p, &inetsw[sock->type]) {
		answer = list_entry(p, struct inet_protosw, list);

		/* Check the non-wild match. 檢查協議編碼是否與內核已經註冊的協議相同 */
		if (protocol == answer->protocol) {
			if (protocol != IPPROTO_IP)
				break;
		} else {
			/* Check for the two wild cases. 檢查是否屬於虛擬IP協議 */
			if (IPPROTO_IP == protocol) {
				protocol = answer->protocol;
				break;
			}
			if (IPPROTO_IP == answer->protocol)
				break;
		}
		err = -EPROTONOSUPPORT;
		answer = NULL;
	}

	if (unlikely(answer == NULL)) {
		if (try_loading_module < 2) {
			rcu_read_unlock();
			/*
			 * Be more specific, e.g. net-pf-2-proto-132-type-1
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) 是否指定了名稱
			 */
			if (++try_loading_module == 1)
				request_module("net-pf-%d-proto-%d-type-%d",
					       PF_INET, protocol, sock->type);
			/*
			 * Fall back to generic, e.g. net-pf-2-proto-132
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP) 否則就是通用的名稱
			 */
			else
				request_module("net-pf-%d-proto-%d",
					       PF_INET, protocol);
			goto lookup_protocol;
		} else
			goto out_rcu_unlock;
	}

	err = -EPERM;
	if (answer->capability > 0 && !capable(answer->capability))
		goto out_rcu_unlock;

	err = -EAFNOSUPPORT;
	if (!inet_netns_ok(net, protocol))
		goto out_rcu_unlock;

The region between rcu_read_lock() and rcu_read_unlock() above is an RCU read-side critical section.

#define list_for_each_rcu(pos, head) \
	for (pos = rcu_dereference((head)->next); \
		prefetch(pos->next), pos != (head); \
		pos = rcu_dereference(pos->next))

#define rcu_dereference(p)     ({ \
				typeof(p) _________p1 = ACCESS_ONCE(p); \
				smp_read_barrier_depends(); \
				(_________p1); \
				})

#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

The list_for_each_rcu macro walks, under RCU protection, the inetsw queue for this socket type until it finds the matching entry; the queue elements are inet_protosw structures. The inetsw queue array is also populated during inet_init():

static int __init inet_init(void)
{
	struct sk_buff *dummy_skb;
	struct inet_protosw *q;
	struct list_head *r;
	int rc = -EINVAL;

    ...

    (void)sock_register(&inet_family_ops);
    
    ...

    for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
		inet_register_protosw(q);

    ...
}

static struct inet_protosw inetsw_array[] =
{
	{
		.type =       SOCK_STREAM,				//TCP數據流協議
		.protocol =   IPPROTO_TCP,
		.prot =       &tcp_prot,
		.ops =        &inet_stream_ops,
		.capability = -1,
		.no_check =   0,
		.flags =      INET_PROTOSW_PERMANENT |
			      INET_PROTOSW_ICSK,
	},

	{
		.type =       SOCK_DGRAM,				//UDP數據報協議
		.protocol =   IPPROTO_UDP,
		.prot =       &udp_prot,
		.ops =        &inet_dgram_ops,
		.capability = -1,
		.no_check =   UDP_CSUM_DEFAULT,
		.flags =      INET_PROTOSW_PERMANENT,
       },


       {
	       .type =       SOCK_RAW,				//RAW原始套接字
	       .protocol =   IPPROTO_IP,	/* wild card  虛擬IP類型*/
	       .prot =       &raw_prot,
	       .ops =        &inet_sockraw_ops,
	       .capability = CAP_NET_RAW,
	       .no_check =   UDP_CSUM_DEFAULT,
	       .flags =      INET_PROTOSW_REUSE,
       }
};

As inet_init() shows, this array is registered with inet_register_protosw().

static struct list_head inetsw[SOCK_MAX];

void inet_register_protosw(struct inet_protosw *p)
{
	struct list_head *lh;
	struct inet_protosw *answer;
	int protocol = p->protocol;
	struct list_head *last_perm;

	spin_lock_bh(&inetsw_lock);

	if (p->type >= SOCK_MAX)
		goto out_illegal;

	/* If we are trying to override a permanent protocol, bail. 檢查參數P的類型是否超越了內核範圍 */
	answer = NULL;
	last_perm = &inetsw[p->type];
	list_for_each(lh, &inetsw[p->type]) {
		answer = list_entry(lh, struct inet_protosw, list);

		/* Check only the non-wild match. */
		if (INET_PROTOSW_PERMANENT & answer->flags) {
			if (protocol == answer->protocol)
				break;
			last_perm = lh;
		}

		answer = NULL;
	}
	if (answer)
		goto out_permanent;

	/* Add the new entry after the last permanent entry if any, so that
	 * the new entry does not override a permanent entry when matched with
	 * a wild-card protocol. But it is allowed to override any existing
	 * non-permanent entry.  This means that when we remove this entry, the
	 * system automatically returns to the old behavior.
	 */
	list_add_rcu(&p->list, last_perm);
out:
	spin_unlock_bh(&inetsw_lock);

	synchronize_net();

	return;

out_permanent:
	printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
	       protocol);
	goto out;

out_illegal:
	printk(KERN_ERR
	       "Ignoring attempt to register invalid socket type %d.\n",
	       p->type);
	goto out;
}

This function uses the list_for_each macro to walk the inetsw[p->type] queue, checking whether an existing entry carries the INET_PROTOSW_PERMANENT flag and uses the same protocol as the entry p being inserted; if such a permanent entry exists the insertion is refused, otherwise p->list is linked in after the last permanent entry.

So inet_init() links each element of inetsw_array into the queues of the inetsw array, one by one.

That concludes the registration part; back to inet_create().

Recall server_fd = socket(AF_INET, SOCK_STREAM, 0): protocol is 0 and type is SOCK_STREAM, so answer ends up pointing at the TCP entry of inet_protosw. Since the protocol passed in is IPPROTO_IP (0), it does not equal the protocol of the TCP entry, and the wild-card branch is taken:

    IPPROTO_IP = 0,	   /* Dummy protocol for TCP.  */
#define IPPROTO_IP		IPPROTO_IP

Thus protocol = answer->protocol, i.e. protocol is set to TCP's protocol number 6. The entry's capability is -1, so the capable() check is skipped, and inet_netns_ok() then checks that the protocol may be used in this network namespace.
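
A small user-space check of this point (my own sketch): passing protocol 0 (IPPROTO_IP, the wild card) and passing IPPROTO_TCP explicitly both resolve to the same SOCK_STREAM entry of inetsw_array, so both calls succeed and behave identically.

/* protocol_wildcard_demo.c */
#include <stdio.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd0 = socket(AF_INET, SOCK_STREAM, 0);            /* protocol left for the kernel to pick */
	int fd6 = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);  /* protocol named explicitly            */

	printf("protocol 0  -> fd %d\n", fd0);
	printf("IPPROTO_TCP -> fd %d\n", fd6);

	close(fd0);
	close(fd6);
	return 0;
}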

Let's continue with inet_create():

	sock->ops = answer->ops;	//inet_stream_ops
	answer_prot = answer->prot;	//tcp_prot
	answer_no_check = answer->no_check;
	answer_flags = answer->flags;
	rcu_read_unlock();

	BUG_TRAP(answer_prot->slab != NULL);

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);	//分配 sock 結構
	if (sk == NULL)
		goto out;

	err = 0;
	sk->sk_no_check = answer_no_check;
	if (INET_PROTOSW_REUSE & answer_flags)
		sk->sk_reuse = 1;

	inet = inet_sk(sk);		
	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

	if (SOCK_RAW == sock->type) {
		inet->num = protocol;
		if (IPPROTO_RAW == protocol)
			inet->hdrincl = 1;
	}

	if (ipv4_config.no_pmtu_disc)
		inet->pmtudisc = IP_PMTUDISC_DONT;
	else
		inet->pmtudisc = IP_PMTUDISC_WANT;

	inet->id = 0;

	sock_init_data(sock, sk);

	sk->sk_destruct	   = inet_sock_destruct;
	sk->sk_family	   = PF_INET;
	sk->sk_protocol	   = protocol;
	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;	//設置處理庫存函數

	inet->uc_ttl	= -1;
	inet->mc_loop	= 1;
	inet->mc_ttl	= 1;
	inet->mc_index	= 0;
	inet->mc_list	= NULL;

	sk_refcnt_debug_inc(sk);

	if (inet->num) {
		/* It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically
		 * shares. 這裏允許用戶指定 socket 的編號,創建時自動共享
		 */
		inet->sport = htons(inet->num);
		/* Add to protocol hash chains. */
		sk->sk_prot->hash(sk);
	}

	if (sk->sk_prot->init) {
		err = sk->sk_prot->init(sk);	//調用運輸層鉤子函數init tcp_prot 
		if (err)
			sk_common_release(sk);
	}
out:
	return err;
out_rcu_unlock:
	rcu_read_unlock();
	goto out;
}

So the TCP entry's operation table inet_stream_ops is hooked into the socket's protocol operation table (sock->ops), and answer->prot is saved into answer_prot and passed as a parameter to sk_alloc(). [ TODO: socket layer ops -- transport layer proto -- network layer inet_proto ]

Allocating and initializing the sock structure

Let's look at sk_alloc(); the prot parameter here is answer->prot, i.e. tcp_prot.

struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sock_lock_init(sk);
		sock_net_set(sk, get_net(net));
	}

	return sk;
}

sk_prot_alloc() allocates a generic sock structure:

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL)
		sk = kmem_cache_alloc(slab, priority);	//內存管理的slab分配函數,從sock高速緩衝池中分配
	else
		sk = kmalloc(prot->obj_size, priority);	//否則從通用的高速緩衝池中分配空間

	if (sk != NULL) {
		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

Depending on whether the proto structure provides its own slab cache, the sock is allocated either from that cache or from the generic allocator.

After a successful allocation, the family is filled in, tcp_prot is stored in both sk_prot and sk_prot_creator, and sock_lock_init() initializes sk_lock, the lock used to synchronize access to the sock. sk_lock is of type socket_lock_t, a lock dedicated to sockets:

typedef struct {
	spinlock_t		slock;
	int			owned;
	wait_queue_head_t	wq;
	/*
	 * We express the mutex-alike socket_lock semantics
	 * to the lock validator by explicitly managing
	 * the slock as a lock variant (in addition to
	 * the slock itself):
	 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map dep_map;
#endif
} socket_lock_t;

It contains a spinlock slock and a wait queue head wq. sock_lock_init_class_and_name initializes these fields:

static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

#define sock_lock_init_class_and_name(sk, sname, skey, name, key) 	\
do {									\
	sk->sk_lock.owned = 0;					\
	init_waitqueue_head(&sk->sk_lock.wq);				\
	spin_lock_init(&(sk)->sk_lock.slock);				\
	debug_check_no_locks_freed((void *)&(sk)->sk_lock,		\
			sizeof((sk)->sk_lock));				\
	lockdep_set_class_and_name(&(sk)->sk_lock.slock,		\
		       	(skey), (sname));				\
	lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0);	\
} while (0)

Back in sk_alloc(), the net parameter passed in is current->nsproxy->net_ns, the network namespace recorded in the current process. sock_net_set(sk, get_net(net)) records which net namespace the sock belongs to, and get_net(net) increments that namespace's reference count.

static inline
void sock_net_set(struct sock *sk, struct net *net)
{
#ifdef CONFIG_NET_NS
	sk->sk_net = net;
#endif
}

static inline struct net *get_net(struct net *net)
{
	atomic_inc(&net->count);
	return net;
}

Back in inet_create(): sk_alloc() has allocated and initialized the sock structure; if allocation failed, sk == NULL and we bail out directly.

Next, inet = inet_sk(sk) turns the sock pointer into a struct inet_sock * pointer:

static inline struct inet_sock *inet_sk(const struct sock *sk)
{
	return (struct inet_sock *)sk;
}

struct inet_sock {
	/* sk and pinet6 has to be the first two members of inet_sock */
	struct sock		sk;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
	struct ipv6_pinfo	*pinet6;
#endif
	/* Socket demultiplex comparisons on incoming packets. */
	__be32			daddr;        //目標地址
	__be32			rcv_saddr;    
	__be16			dport;        //目標端口
	__u16			num;          //端口
	__be32			saddr;
	__s16			uc_ttl;
	__u16			cmsg_flags;
	struct ip_options	*opt;
	__be16			sport;
	__u16			id;
	__u8			tos;
	__u8			mc_ttl;
	__u8			pmtudisc;
	__u8			recverr:1,
				is_icsk:1,
				freebind:1,
				hdrincl:1,
				mc_loop:1;
	int			mc_index;
	__be32			mc_addr;
	struct ip_mc_socklist	*mc_list;
	struct {
		unsigned int		flags;
		unsigned int		fragsize;
		struct ip_options	*opt;
		struct dst_entry	*dst;
		int			length; /* Total length of all frames */
		__be32			addr;
		struct flowi		fl;
	} cork;
};

So inet_sock is the protocol-specific (INET) socket data structure.

Further on, sock_init_data(sock, sk) performs additional initialization of the freshly allocated sock and hooks the socket and the sock together.

void sock_init_data(struct socket *sock, struct sock *sk)
{	/* 隊列並非採用通用的 list_head 來維護,而是使用 skb_buffer 隊列: */
	skb_queue_head_init(&sk->sk_receive_queue);	//初始化接收隊列
	skb_queue_head_init(&sk->sk_write_queue);	//初始化發送隊列
	skb_queue_head_init(&sk->sk_error_queue);	//初始化錯誤數據包隊列
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue); //DMA 複製的數據包隊列
#endif

	sk->sk_send_head	=	NULL;	//發送數據包的隊列頭

	init_timer(&sk->sk_timer);		//初始化 sock 的沖刷定時器

	sk->sk_allocation	=	GFP_KERNEL;				//分配模式,無內存可用時可引起休眠
	sk->sk_rcvbuf		=	sysctl_rmem_default;	//接受緩衝區的長度 32767
	sk->sk_sndbuf		=	sysctl_wmem_default;	//發送緩存的總長度 32767
	sk->sk_state		=	TCP_CLOSE;
	sk->sk_socket		=	sock;					//指向對應的 socket 結構

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type	=	sock->type;
		sk->sk_sleep	=	&sock->wait;
		sock->sk	=	sk;							//回指對應的 scok 結構
	} else
		sk->sk_sleep	=	NULL;

	rwlock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_sndmsg_page	=	NULL;
	sk->sk_sndmsg_off	=	0;

	sk->sk_peercred.pid 	=	0;
	sk->sk_peercred.uid	=	-1;
	sk->sk_peercred.gid	=	-1;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = ktime_set(-1L, 0);

	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}

Note the initialization of the three important packet queue heads; they are sk_buff_head structures:

struct sk_buff_head {
	/* These two members must be first. */
	struct sk_buff	*next;
	struct sk_buff	*prev;

	__u32		qlen;
	spinlock_t	lock;
};

So it is a doubly linked queue, with qlen as the queue length and lock for concurrency control. For reference, here is the tcp_prot operation table that was attached to sk->sk_prot earlier and will be used next:

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
};

Then sk->sk_prot->init(sk) is called, i.e. tcp_prot->init(), which is tcp_v4_init_sock():

static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_af_ops = &ipv4_specific;
	icsk->icsk_sync_mss = tcp_sync_mss;
#ifdef CONFIG_TCP_MD5SIG
	tp->af_specific = &tcp_sock_ipv4_specific;
#endif

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}

Does inet_connection_sock look familiar? It is the first member of tcp_sock. The function above is mostly initialization assignments, and at the end it increments tcp_sockets_allocated.

struct inet_connection_sock {
	/* inet_sock has to be the first member! */
	struct inet_sock	  icsk_inet;				//INET 協議族的 sock 結構
	struct request_sock_queue icsk_accept_queue;	//確定接收隊列
	struct inet_bind_bucket	  *icsk_bind_hash;		//綁定的桶結構
	unsigned long		  icsk_timeout;				//超時
 	struct timer_list	  icsk_retransmit_timer;	//沒有 ACK 時的重發定時器
 	struct timer_list	  icsk_delack_timer;		//確定刪除定時器
	__u32			  icsk_rto;						//重發超時
	__u32			  icsk_pmtu_cookie;				//最近的 pmtu		
	const struct tcp_congestion_ops *icsk_ca_ops;	//擁擠情況時的處理函數
	const struct inet_connection_sock_af_ops *icsk_af_ops;	//AF_INET指定的函數操作表
	unsigned int		  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);	//同步 mss 的函數指針
	__u8			  icsk_ca_state;		//擁擠情況的處理狀態
	__u8			  icsk_retransmits;		//重發數量
	__u8			  icsk_pending;			//掛起
	__u8			  icsk_backoff;			//允許連接的數量
	__u8			  icsk_syn_retries;		//允許重新SYN的數量
	__u8			  icsk_probes_out;		//探測到未應答的窗口
	__u16			  icsk_ext_hdr_len;		//網絡協議頭部的長度
	struct {
		__u8		  pending;	 /* ACK is pending			   */
		__u8		  quick;	 /* Scheduled number of quick acks	   */
		__u8		  pingpong;	 /* The session is interactive		   */
		__u8		  blocked;	 /* Delayed ACK was blocked by socket lock */
		__u32		  ato;		 /* Predicted tick of soft clock	   */
		unsigned long	  timeout;	 /* Currently scheduled timeout		   */
		__u32		  lrcvtime;	 /* timestamp of last received data packet */
		__u16		  last_seg_size; /* Size of last incoming segment	   */
		__u16		  rcv_mss;	 /* MSS used for delayed ACK decisions	   */ 
	} icsk_ack;
	struct {
		int		  enabled;

		/* Range of MTUs to search */
		int		  search_high;
		int		  search_low;

		/* Information on the current probe. */
		int		  probe_size;
	} icsk_mtup;
	u32			  icsk_ca_priv[16];
#define ICSK_CA_PRIV_SIZE	(16 * sizeof(u32))
};

At this point the socket has been created and initialized: the socket's state is SS_UNCONNECTED (not connected) and the sock's sk_state is TCP_CLOSE. Back in sys_socket(), the next step is retval = sock_map_fd(sock).

The socket and the file system

int sock_map_fd(struct socket *sock)
{
	struct file *newfile;
	int fd = sock_alloc_fd(&newfile);			//爲 socket 分配文件號跟文件結構

	if (likely(fd >= 0)) {
		int err = sock_attach_fd(sock, newfile);	//掛載 socket 跟文件結構

		if (unlikely(err < 0)) {				//出錯則釋放文件跟文件號
			put_filp(newfile);
			put_unused_fd(fd);
			return err;
		}
		fd_install(fd, newfile);				//使文件與文件號掛鉤
	}
	return fd;
}

First, sock_alloc_fd() obtains a file structure and a file descriptor:

static int sock_alloc_fd(struct file **filep)
{
	int fd;

	fd = get_unused_fd();	//得到空閒文件號
	if (likely(fd >= 0)) {
		struct file *file = get_empty_filp();	//分配文件結構空間

		*filep = file;
		if (unlikely(!file)) {
			put_unused_fd(fd);
			return -ENFILE;
		}
	} else
		*filep = NULL;
	return fd;
}

This is file-system territory: it gets a free fd from the current process and allocates a free file structure from the file system, releasing the fd again if the allocation fails.

With both allocated successfully, sock_attach_fd() is executed:

static int sock_attach_fd(struct socket *sock, struct file *file)
{
	struct dentry *dentry;
	struct qstr name = { .name = "" };

	dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name);	//創建一個 socket 的文件系統目錄項,sock_mnt是 vfsmount 類型
	if (unlikely(!dentry))
		return -ENOMEM;

	dentry->d_op = &sockfs_dentry_operations;	//將 socket文件系統的目錄操作表掛入到目錄項的操作表中
	/*
	 * We dont want to push this dentry into global dentry hash table.
	 * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
	 * This permits a working /proc/$pid/fd/XXX on sockets
	 */
	dentry->d_flags &= ~DCACHE_UNHASHED;
	d_instantiate(dentry, SOCK_INODE(sock));

	sock->file = file;
	init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE,
		  &socket_file_ops);		// socket 文件結構進行初始化,傳入的是 socket_file_ops 操作表
	SOCK_INODE(sock)->i_fop = &socket_file_ops;
	file->f_flags = O_RDWR;
	file->f_pos = 0;
	file->private_data = sock;		//可以在文件系統中通過 private_data 找到對應的 socket

	return 0;
}

Let's look at sockfs_dentry_operations, the dentry operation table of the socket file system:

static struct dentry_operations sockfs_dentry_operations = {
	.d_delete = sockfs_delete_dentry,
	.d_dname  = sockfs_dname,
};

And at socket_file_ops, the file operation table:

static const struct file_operations socket_file_ops = {
	.owner =	THIS_MODULE,
	.llseek =	no_llseek,
	.aio_read =	sock_aio_read,
	.aio_write =	sock_aio_write,
	.poll =		sock_poll,
	.unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = compat_sock_ioctl,
#endif
	.mmap =		sock_mmap,
	.open =		sock_no_open,	/* special open code to disallow open via /proc */
	.release =	sock_close,
	.fasync =	sock_fasync,
	.sendpage =	sock_sendpage,
	.splice_write = generic_splice_sendpage,
	.splice_read =	sock_splice_read,
};

So although we read and write the socket with read/write, internally this operation table maps those calls onto the socket-specific implementations, giving the user the convenience and uniformity of ordinary file operations.
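
A runnable user-space sketch of this uniformity (my own example): because sock_map_fd() wires the socket into the VFS with socket_file_ops, a socket descriptor can be driven with plain read()/write() rather than send()/recv(). Using socketpair() keeps the example self-contained, with no peer process needed.

/* vfs_socket_demo.c */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>

int main(void)
{
	int fds[2];
	char buf[32];

	/* a connected pair of stream sockets */
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) < 0) {
		perror("socketpair");
		return 1;
	}

	/* plain file I/O on socket descriptors */
	write(fds[0], "hello", 5);
	ssize_t n = read(fds[1], buf, sizeof(buf) - 1);
	buf[n > 0 ? n : 0] = '\0';

	printf("read %zd bytes via read(): %s\n", n, buf);
	close(fds[0]);
	close(fds[1]);
	return 0;
}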

bind()

Back to the demo server code: after socket() completes, bind(server_fd, (struct sockaddr *)&server_address, server_len) binds an address to the socket.

Again we follow the implementation: bind also goes through sys_socketcall(); matching the SYS_BIND parameter we find the system call sys_bind():

asmlinkage long sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
{
	struct socket *sock;
	char address[MAX_SOCK_ADDR];
	int err, fput_needed;

	sock = sockfd_lookup_light(fd, &err, &fput_needed);     //通過 fd 找到對應的 socket
	if (sock) {
		err = move_addr_to_kernel(umyaddr, addrlen, address);   //將傳入的地址從用戶空間複製到內核空間
		if (err >= 0) {
			err = security_socket_bind(sock,
						   (struct sockaddr *)address,
						   addrlen);
			if (!err)
				err = sock->ops->bind(sock,
						      (struct sockaddr *)
						      address, addrlen);                  //調用具體協議的綁定函數,inet_stream_ops->bind()
		}
		fput_light(sock->file, fput_needed);
	}
	return err;
}

Let's continue with sockfd_lookup_light():

static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
{
	struct file *file;
	struct socket *sock;

	*err = -EBADF;
	file = fget_light(fd, fput_needed);		//根據 fd 找到文件指針
	if (file) {
		sock = sock_from_file(file, err);	//在文件指針中獲得 socket 指針
		if (sock)
			return sock;
		fput_light(file, *fput_needed);
	}
	return NULL;
}

fget_light/fput_light are file operations: fget_light finds the file pointer in the current process's files_struct and takes a reference, fput_light drops it again; if the sock structure is obtained it is returned directly. The interesting part here is sock_from_file():

static struct socket *sock_from_file(struct file *file, int *err)
{
	if (file->f_op == &socket_file_ops)
		return file->private_data;	/* set in sock_map_fd */

	*err = -ENOTSOCK;
	return NULL;
}

As mentioned earlier, file->private_data stores the socket pointer. Via sockfd_lookup_light() we obtain the socket created and initialized earlier, and move_addr_to_kernel() then copies the address from user space into kernel space.

int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr)
{
	if (ulen < 0 || ulen > MAX_SOCK_ADDR)
		return -EINVAL;
	if (ulen == 0)
		return 0;
	if (copy_from_user(kaddr, uaddr, ulen))
		return -EFAULT;
	return audit_sockaddr(ulen, kaddr);
}

Further down, security_socket_bind() involves the security framework and simply returns 0 when none is configured. Then sock->ops->bind() is called; since socket->ops was earlier set to answer->ops (see inet_create() above), this actually calls inet_stream_ops->bind:

const struct proto_ops inet_stream_ops = {
	.family		   = PF_INET,
	.owner		   = THIS_MODULE,
	.release	   = inet_release,
	.bind		   = inet_bind,
	.connect	   = inet_stream_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = inet_accept,
	.getname	   = inet_getname,
	.poll		   = tcp_poll,
	.ioctl		   = inet_ioctl,
	.listen		   = inet_listen,
	.shutdown	   = inet_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = tcp_sendmsg,
	.recvmsg	   = sock_common_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = tcp_sendpage,
	.splice_read	   = tcp_splice_read,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_sock_common_setsockopt,
	.compat_getsockopt = compat_sock_common_getsockopt,
#endif
};

Matching against this structure, .bind is inet_bind():

int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct inet_sock *inet = inet_sk(sk);
	unsigned short snum;
	int chk_addr_ret;
	int err;

	/* If the socket has its own bind function then use it. (RAW) */
	if (sk->sk_prot->bind) {
		err = sk->sk_prot->bind(sk, uaddr, addr_len);   //如果 socket 提供了自己的綁定函數就使用它,這裏的sk->sk_prot爲tcp_prot
		goto out;
	}
	err = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); //在路由中檢查地址類型

	/* Not specified by any standard per-se, however it breaks too
	 * many applications when removed.  It is unfortunate since
	 * allowing applications to make a non-local bind solves
	 * several problems with systems using dynamic addressing.
	 * (ie. your servers still start up even if your ISDN link
	 *  is temporarily down)
	 */
	err = -EADDRNOTAVAIL;
	if (!sysctl_ip_nonlocal_bind &&
	    !inet->freebind &&
	    addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
	    chk_addr_ret != RTN_LOCAL &&                //是否單播類型
	    chk_addr_ret != RTN_MULTICAST &&            //是否組播類型
	    chk_addr_ret != RTN_BROADCAST)              //是否廣播類型
		goto out;

	snum = ntohs(addr->sin_port);                   //取得端口號
	err = -EACCES;
	if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
		goto out;

	/*      We keep a pair of addresses. rcv_saddr is the one
	 *      used by hash lookups, and saddr is used for transmit.
	 *
	 *      In the BSD API these are the same except where it
	 *      would be illegal to use them (multicast/broadcast) in
	 *      which case the sending device address is used.
	 */
	lock_sock(sk);  //加鎖

	/* Check these errors (active socket, double bind). */
	err = -EINVAL;
	if (sk->sk_state != TCP_CLOSE || inet->num) //檢查狀態、端口是否已經指定
		goto out_release_sock;

	inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;          //rcv_saddr用於哈希查找、saddr用於發送(賦值爲ip地址)
	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
		inet->saddr = 0;  /* Use device */

	/* Make sure we are allowed to bind here. 檢查是否允許綁定 */
	if (sk->sk_prot->get_port(sk, snum)) {              // inet_csk_get_port()
		inet->saddr = inet->rcv_saddr = 0;              // 檢查失敗就清空設置的地址
		err = -EADDRINUSE;
		goto out_release_sock;
	}

	if (inet->rcv_saddr)                        //如果已經設置了地址就增加鎖標誌,表示已經綁定了地址
		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
	if (snum)                                   //如果已經設置了端口就增加鎖標誌,表示已經綁定了端口
		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
	inet->sport = htons(inet->num);             //記錄端口
	inet->daddr = 0;                            //初始化目標地址
	inet->dport = 0;                            //初始化目標端口
	sk_dst_reset(sk);                           //初始化緩存的路由內容
	err = 0;
out_release_sock:
	release_sock(sk);   //解鎖
out:
	return err;
}

We can see that sk->sk_prot is tcp_prot, and that structure has no .bind entry, so execution falls through to the generic path. Two data structures are involved here, sockaddr_in and sockaddr:

struct sockaddr_in {
  sa_family_t		sin_family;	/* Address family		*/
  __be16		sin_port;	/* Port number			*/
  struct in_addr	sin_addr;	/* Internet address		*/

  /* Pad to size of `struct sockaddr'. */
  unsigned char		__pad[__SOCK_SIZE__ - sizeof(short int) -
			sizeof(unsigned short int) - sizeof(struct in_addr)];
};

struct sockaddr {
  sa_family_t		sa_family;	/* Address family		*/
  char			sa_data[14];	/* 14 bytes of protocol address	*/
};

Because the two structures have the same length and a compatible layout they can be cast to one another; presumably for compatibility reasons, inet_bind() casts the sockaddr passed in back to a sockaddr_in.
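
A small user-space check of this claim (my own sketch): struct sockaddr_in is padded to exactly sizeof(struct sockaddr), 16 bytes, which is what makes the (struct sockaddr *)&server_address cast in the demo safe, and the address family field lines up at the start of both structures.

/* sockaddr_size_demo.c */
#include <stdio.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
	printf("sizeof(struct sockaddr)    = %zu\n", sizeof(struct sockaddr));
	printf("sizeof(struct sockaddr_in) = %zu\n", sizeof(struct sockaddr_in));

	struct sockaddr_in in = { .sin_family = AF_INET };
	struct sockaddr *generic = (struct sockaddr *)&in;

	/* the family field is at offset 0 in both structures */
	printf("generic->sa_family = %d (AF_INET = %d)\n",
	       generic->sa_family, AF_INET);
	return 0;
}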

sock_net(sk) returns the sk->sk_net pointer; if the user did not set up a separate network namespace this is the system default init_net. Then inet_addr_type() is called to classify the address:

unsigned int inet_addr_type(struct net *net, __be32 addr)
{
	return __inet_dev_addr_type(net, NULL, addr);
}

static inline unsigned __inet_dev_addr_type(struct net *net,
					    const struct net_device *dev,
					    __be32 addr)
{
	struct flowi		fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
	struct fib_result	res;
	unsigned ret = RTN_BROADCAST;
	struct fib_table *local_table;

	if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))		//檢查地址是否是零地址或廣播地址
		return RTN_BROADCAST;
	if (ipv4_is_multicast(addr))							//檢查地址是否是組播地址
		return RTN_MULTICAST;

#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	local_table = fib_get_table(net, RT_TABLE_LOCAL);		//查找本地路由函數表
	if (local_table) {
		ret = RTN_UNICAST;
		if (!local_table->tb_lookup(local_table, &fl, &res)) {
			if (!dev || dev == res.fi->fib_dev)
				ret = res.type;
			fib_res_put(&res);
		}
	}
	return ret;
}

The struct flowi that appears in the code is the routing key. flowi.nl_u is a union containing the ip4_u, ip6_u and dn_u structures, so struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } } stores the IP address as the destination address of the routing key fl. Here is the flowi structure:

struct flowi {		//路由鍵值結構
	int	oif;		//負責發送的網絡設備
	int	iif;		//負責接收的網絡設備
	__u32	mark;	//子網掩碼

	union {
		struct {
			__be32			daddr;		//目標地址
			__be32			saddr;		//源地址,即發送方地址
			__u8			tos;		//服務類型TOS
			__u8			scope;		//範圍
		} ip4_u;
		
		struct {
			struct in6_addr		daddr;
			struct in6_addr		saddr;
			__be32			flowlabel;
		} ip6_u;

		struct {
			__le16			daddr;
			__le16			saddr;
			__u8			scope;
		} dn_u;
	} nl_u;								//該聯合體主要用於網絡層
#define fld_dst		nl_u.dn_u.daddr		
#define fld_src		nl_u.dn_u.saddr
#define fld_scope	nl_u.dn_u.scope
#define fl6_dst		nl_u.ip6_u.daddr
#define fl6_src		nl_u.ip6_u.saddr
#define fl6_flowlabel	nl_u.ip6_u.flowlabel
#define fl4_dst		nl_u.ip4_u.daddr
#define fl4_src		nl_u.ip4_u.saddr
#define fl4_tos		nl_u.ip4_u.tos
#define fl4_scope	nl_u.ip4_u.scope

	__u8	proto;		//傳輸層協議
	__u8	flags;		//標誌位
	union {
		struct {
			__be16	sport;	//源端口,發送方端口
			__be16	dport;	//目標端口,接收方端口
		} ports;

		struct {
			__u8	type;
			__u8	code;
		} icmpt;			//ICMP 類型

		struct {
			__le16	sport;
			__le16	dport;
		} dnports;

		__be32		spi;

		struct {
			__u8	type;
		} mht;
	} uli_u;							//該聯合體主要用於傳輸層
#define fl_ip_sport	uli_u.ports.sport
#define fl_ip_dport	uli_u.ports.dport
#define fl_icmp_type	uli_u.icmpt.type
#define fl_icmp_code	uli_u.icmpt.code
#define fl_ipsec_spi	uli_u.spi
#define fl_mh_type	uli_u.mht.type
	__u32           secid;	/* used by xfrm; see secid.txt */
} __attribute__((__aligned__(BITS_PER_LONG/8)));

struct fib_result is the routing lookup result, and struct fib_table is the routing table structure. The function first checks whether the IP address addr is a zero-network address, the local broadcast address, or a multicast address:

static inline bool ipv4_is_zeronet(__be32 addr)
{
	return (addr & htonl(0xff000000)) == htonl(0x00000000);
}

So this function checks whether the high 8 bits of addr are zero to decide whether it is a zero-network address.

static inline bool ipv4_is_lbcast(__be32 addr)
{
	/* limited broadcast */
	return addr == htonl(INADDR_BROADCAST);
}

#define	INADDR_BROADCAST	((unsigned long int) 0xffffffff)

An address of all ones (255.255.255.255) is the limited broadcast address.

static inline bool ipv4_is_multicast(__be32 addr)
{
	return (addr & htonl(0xf0000000)) == htonl(0xe0000000);
}

If the top 4 bits of addr are 1110 (the 224.0.0.0/4 block), the address is a multicast address.

Zero-net and broadcast addresses return RTN_BROADCAST directly, and multicast addresses return RTN_MULTICAST; for anything else the routing table is consulted and the type found there is returned.
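
To make the three bit tests concrete, here is a small user-space re-implementation (an illustrative sketch, not kernel code) that applies the same masks to a few sample addresses:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

/* user-space copies of the three checks above, for illustration only */
static int is_zeronet(uint32_t a)   { return (a & htonl(0xff000000)) == htonl(0x00000000); }
static int is_lbcast(uint32_t a)    { return a == htonl(0xffffffff); }
static int is_multicast(uint32_t a) { return (a & htonl(0xf0000000)) == htonl(0xe0000000); }

int main(void)
{
	const char *samples[] = { "0.0.0.1", "255.255.255.255", "224.0.0.5", "192.168.1.10" };

	for (int i = 0; i < 4; i++) {
		struct in_addr a;
		inet_pton(AF_INET, samples[i], &a);
		printf("%-16s zeronet=%d lbcast=%d multicast=%d\n", samples[i],
		       is_zeronet(a.s_addr), is_lbcast(a.s_addr), is_multicast(a.s_addr));
	}
	return 0;
}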

fib_get_table() has two definitions in the kernel, selected by whether CONFIG_IP_MULTIPLE_TABLES is configured. We analyse the simpler single-routing-table variant.

static inline struct fib_table *fib_get_table(struct net *net, u32 id)
{
	struct hlist_head *ptr;

	ptr = id == RT_TABLE_LOCAL ?
		&net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX] :
		&net->ipv4.fib_table_hash[TABLE_MAIN_INDEX];
	return hlist_entry(ptr->first, struct fib_table, tb_hlist);
}

The net passed in here is sock_net(sk), i.e. the system default init_net, and id is RT_TABLE_LOCAL. net->ipv4 is a struct netns_ipv4, which holds the IPv4 protocol's per-network-namespace state.

struct netns_ipv4 {
#ifdef CONFIG_SYSCTL
	struct ctl_table_header	*forw_hdr;
	struct ctl_table_header	*frags_hdr;
	struct ctl_table_header	*ipv4_hdr;
#endif
	struct ipv4_devconf	*devconf_all;
	struct ipv4_devconf	*devconf_dflt;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	struct fib_rules_ops	*rules_ops;
#endif
	struct hlist_head	*fib_table_hash;
	struct sock		*fibnl;

	struct sock		**icmp_sk;
	struct sock		*tcp_sock;

	struct netns_frags	frags;
#ifdef CONFIG_NETFILTER
	struct xt_table		*iptable_filter;
	struct xt_table		*iptable_mangle;
	struct xt_table		*iptable_raw;
	struct xt_table		*arptable_filter;
#endif

	int sysctl_icmp_echo_ignore_all;
	int sysctl_icmp_echo_ignore_broadcasts;
	int sysctl_icmp_ignore_bogus_error_responses;
	int sysctl_icmp_ratelimit;
	int sysctl_icmp_ratemask;
	int sysctl_icmp_errors_use_inbound_ifaddr;
};

All IPv4 routing tables are linked into the fib_table_hash array. Each array element is an hlist_head, i.e. a hash chain, and every routing table is linked into the appropriate chain through the tb_hlist node embedded in it; hlist_entry() then recovers the fib_table from the first node on the chain, as the sketch below illustrates.
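
To illustrate the chaining, here is a simplified user-space sketch (names such as fake_fib_table are made up for the example) of the hlist_entry()/container_of() step that fib_get_table() performs on net->ipv4.fib_table_hash:

#include <stddef.h>
#include <stdio.h>

struct hlist_node { struct hlist_node *next, **pprev; };
struct hlist_head { struct hlist_node *first; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define hlist_entry(ptr, type, member) container_of(ptr, type, member)

struct fake_fib_table {			/* stand-in for struct fib_table */
	unsigned int      tb_id;
	struct hlist_node tb_hlist;	/* this node is what hangs on the chain */
};

int main(void)
{
	struct fake_fib_table local = { .tb_id = 255 /* RT_TABLE_LOCAL */ };
	struct hlist_head head = { .first = &local.tb_hlist };

	/* the same step fib_get_table() performs on the chosen hash chain */
	struct fake_fib_table *tb = hlist_entry(head.first, struct fake_fib_table, tb_hlist);
	printf("recovered table id = %u\n", tb->tb_id);
	return 0;
}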

[TODO]

If the local routing table is found, its lookup function local_table->tb_lookup(local_table, &fl, &res) is called with the key fl to fill in a struct fib_result. The dev parameter passed down here is NULL, so ret is set to res.type and the function returns.

Back in inet_bind(), snum = ntohs(addr->sin_port) extracts the port number, and the code checks whether it is below 1024 (ports 0-1023 are reserved by the system) and, if so, whether the caller has the privilege to bind to it. The socket state and whether a port has already been assigned are then checked, and the IP address is copied into inet's receive address and source address. If the address type is multicast, broadcast or zero-net, the source address is reset to 0 while the receive address keeps its value.
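
As a quick user-space illustration of that privilege check (an assumption here: the program runs as an ordinary user without CAP_NET_BIND_SERVICE), binding to a port below 1024 should fail with EACCES:

#include <arpa/inet.h>
#include <errno.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port   = htons(80),		/* privileged port */
		.sin_addr   = { .s_addr = htonl(INADDR_ANY) },
	};

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		printf("bind(80) failed: %s (expected EACCES without privilege)\n",
		       strerror(errno));
	close(fd);
	return 0;
}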

Then tcp_prot->get_port, i.e. inet_csk_get_port(), is called to check whether the port may be bound.

/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;	/* tcp_prot.h.hashinfo, i.e. tcp_hashinfo */
	struct inet_bind_hashbucket *head;
	struct hlist_node *node;
	struct inet_bind_bucket *tb;
	int ret;
	struct net *net = sock_net(sk);		/* the net namespace this sock belongs to */

The function is long, so we will go through it piece by piece. First the hashinfo pointer is obtained; it points to an inet_hashinfo structure, namely tcp_hashinfo reached through tcp_prot.h.hashinfo. Let's look at inet_hashinfo, which bundles the hash tables a protocol in the INET family uses for binding and lookup.

struct inet_hashinfo {
	/* This is for sockets with full identity only.  Sockets here will
	 * always be without wildcards and will have the following invariant:
	 *
	 *          TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
	 *
	 * TIME_WAIT sockets use a separate chain (twchain).
	 */ /* established socks are linked into this hash; each bucket has two chains, one for established socks and one (twchain) for TIME_WAIT socks */
	struct inet_ehash_bucket	*ehash;	/* hash table of established connections */
	rwlock_t			*ehash_locks;	/* per-bucket locks */
	unsigned int			ehash_size;	/* number of hash buckets */
	unsigned int			ehash_locks_mask;	/* mask for the lock array */

	/* Ok, let's try this, I give up, we do need a local binding
	 * TCP hash as well as the others for fast bind/connect.
	 */
	struct inet_bind_hashbucket	*bhash;	/* bind hash table, keyed by local port */

	unsigned int			bhash_size;	/* number of bind buckets */
	/* Note : 4 bytes padding on 64 bit arches */

	/* All sockets in TCP_LISTEN state will be in here.  This is the only
	 * table where wildcard'd TCP sockets can exist.  Hash function here
	 * is just local port number.
	 */
	struct hlist_head		listening_hash[INET_LHTABLE_SIZE];	/* chains of listening sockets */

	/* All the above members are written once at bootup and
	 * never written again _or_ are predominantly read-access.
	 *
	 * Now align to a new cache line as all the following members
	 * are often dirty.
	 */
	rwlock_t			lhash_lock ____cacheline_aligned;
	atomic_t			lhash_users;
	wait_queue_head_t		lhash_wait;	/* wait queue head */
	struct kmem_cache			*bind_bucket_cachep;	/* slab cache for inet_bind_bucket */
};

As we can see, this structure exists to maintain the hash tables used by the INET protocol family.

Another data structure also appears here:

struct inet_bind_hashbucket {	/* hash bucket */
	spinlock_t		lock;		/* spinlock protecting the chain */
	struct hlist_head	chain;	/* chain of inet_bind_bucket entries */
};

This is a hash bucket guarded by a spinlock; chain is that bucket's hash chain.

Next comes struct hlist_node *node, a hash-chain node; such nodes are linked into chains whose heads are hlist_head structures.

The next line declares struct inet_bind_bucket *tb:

struct inet_bind_bucket {			/* bind bucket */
	struct net		*ib_net;		/* owning network namespace */
	unsigned short		port;		/* local port number */
	signed short		fastreuse;	/* all current owners allow address reuse */
	struct hlist_node	node;		/* node linked into the hashbucket's chain */
	struct hlist_head	owners;		/* list of socks bound to this port */
};

This structure is linked into the inet_bind_hashbucket hash bucket.

Continuing with inet_csk_get_port():

	local_bh_disable();					/* disable bottom halves while the bind hash is manipulated */
	if (!snum) {						/* no port specified: let the kernel pick one */
		int remaining, rover, low, high;

		inet_get_local_port_range(&low, &high);
		remaining = (high - low) + 1;
		rover = net_random() % remaining + low;

		do {	/* search for a free port */
			head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
			spin_lock(&head->lock);
			inet_bind_bucket_for_each(tb, node, &head->chain)
				if (tb->ib_net == net && tb->port == rover)
					goto next;
			break;
		next:
			spin_unlock(&head->lock);
			if (++rover > high)
				rover = low;
		} while (--remaining > 0);

		/* Exhausted local port range during search?  It is not
		 * possible for us to be holding one of the bind hash
		 * locks if this test triggers, because if 'remaining'
		 * drops to zero, we broke out of the do/while loop at
		 * the top level, not from the 'break;' statement.
		 */
		ret = 1;
		if (remaining <= 0)
			goto fail;

		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold it's mutex.
		 */
		snum = rover;

In our case snum has been specified, but if it had not been (port number 0) this branch is taken, meaning the kernel allocates a port number itself.

inet_get_local_port_range() is called first to obtain the range from which local ports may be chosen.

void inet_get_local_port_range(int *low, int *high)
{
	unsigned seq;
	do {
		seq = read_seqbegin(&sysctl_port_range_lock);

		*low = sysctl_local_port_range[0];
		*high = sysctl_local_port_range[1];
	} while (read_seqretry(&sysctl_port_range_lock, seq));
}

Using a seqlock (an optimistic read lock) it reads the two bounds from the kernel's sysctl_local_port_range array, by default {32768, 61000}.
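
The same range is exposed to user space through procfs; a small illustrative snippet (assuming the standard /proc/sys/net/ipv4/ip_local_port_range path) to read it:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/ip_local_port_range", "r");
	int low, high;

	if (f && fscanf(f, "%d %d", &low, &high) == 2)
		printf("local port range: %d - %d\n", low, high);
	if (f)
		fclose(f);
	return 0;
}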

A candidate port is then computed from a random number: rover = net_random() % remaining + low. The loop makes sure the candidate is not already in use; if the whole range has been exhausted, ret is 1 and we goto fail, otherwise the first suitable candidate becomes the port number (snum = rover).

			head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
			spin_lock(&head->lock);
			inet_bind_bucket_for_each(tb, node, &head->chain)
				if (tb->ib_net == net && tb->port == rover)
					goto next;

The logic here is much like hashmap.get(rover): rover is hashed (reduced modulo the table size) into an index, the corresponding chain is taken from the bhash hash table in tcp_hashinfo, and the chain is walked; if a bucket with the same net and port is found, that port has already been bound.

#define inet_bind_bucket_for_each(tb, node, head) \
	hlist_for_each_entry(tb, node, head, node)

#define hlist_for_each_entry(tpos, pos, head, member)			 \
	for (pos = (head)->first;					 \
	     pos && ({ prefetch(pos->next); 1;}) &&			 \
		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
	     pos = pos->next)

Here we can see the detail: an inet_bind_bucket is linked into the chain of its inet_bind_hashbucket through its node member.
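
The bucket index itself comes from inet_bhashfn(), whose definition is not reproduced above. To the best of my knowledge, in kernels of this era it is simply the local port masked by the table size (a power of two); a sketch under that assumption:

/* sketch only: assumes inet_bhashfn() of this kernel era is a plain mask */
static inline int inet_bhashfn_sketch(unsigned short lport, int bhash_size)
{
	return lport & (bhash_size - 1);	/* bhash_size is a power of two */
}

/* usage, mirroring the lookup above:
 *   head = &hashinfo->bhash[inet_bhashfn_sketch(rover, hashinfo->bhash_size)];
 */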

Our server program specified a port number, so the kernel does not need to allocate one. We continue with inet_csk_get_port():

	} else {	/* look on the hash chain for a bucket with the same port */
		head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
		spin_lock(&head->lock);
		inet_bind_bucket_for_each(tb, node, &head->chain)
			if (tb->ib_net == net && tb->port == snum)
				goto tb_found;
	}
	tb = NULL;
	goto tb_not_found;
tb_found:
	if (!hlist_empty(&tb->owners)) {	/* the bucket already has owners */
		if (tb->fastreuse > 0 &&
		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
			goto success;	/* fast reuse allowed */
		} else {
			ret = 1;	/* check the socks in this bucket for a conflict */
			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb))
				goto fail_unlock;
		}
	}
tb_not_found:	/* no bucket found: create one */
	ret = 1;
	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
					net, head, snum)) == NULL)
		goto fail_unlock;
	if (hlist_empty(&tb->owners)) {
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
			tb->fastreuse = 1;	/* mark the bucket as reusable */
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
		tb->fastreuse = 0;
success:
	if (!inet_csk(sk)->icsk_bind_hash) /* the sock is not yet attached to a bucket */
		inet_bind_hash(sk, tb, snum);	/* attach it */
	BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}

Now the hash chain is searched for the bucket tb matching the specified port. If one is found we jump to tb_found. tb->owners is the head of a list of socks; if that list is not empty, we check whether the bucket supports fast reuse (fastreuse > 0), whether our sock also allows reuse, and whether it is not in the listening state; if all of these hold we jump to success. Otherwise inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb) is executed, i.e. ipv4_specific->bind_conflict, which is inet_csk_bind_conflict():

int inet_csk_bind_conflict(const struct sock *sk,
			   const struct inet_bind_bucket *tb)
{
	const __be32 sk_rcv_saddr = inet_rcv_saddr(sk);
	struct sock *sk2;
	struct hlist_node *node;
	int reuse = sk->sk_reuse;

	/*
	 * Unlike other sk lookup places we do not check
	 * for sk_net here, since _all_ the socks listed
	 * in tb->owners list belong to the same net - the
	 * one this bucket belongs to.
	 */

	sk_for_each_bound(sk2, node, &tb->owners) {
		if (sk != sk2 &&
		    !inet_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {	/* same device, or no device bound? */
			if (!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) {
				const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
				    sk2_rcv_saddr == sk_rcv_saddr)				/* same bound address, or a wildcard? */
					break;
			}
		}
	}
	return node != NULL;
}

The code is straightforward: the sk_for_each_bound macro walks the tb->owners list, taking each sock in turn as sk2, and compares sk with sk2. If they are bound to the same device (or neither is bound to one) and to the same address (or either uses the wildcard), and reuse is not permitted by both of them (or sk2 is listening), there is a "conflict".
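
To see the reuse rule from user space, here is a sketch of my own (not from the kernel source; port 12345 is an arbitrary test port): two TCP sockets that both set SO_REUSEADDR and are not yet listening are expected to bind the same address and port without a conflict.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int make_bound_socket(unsigned short port)
{
	int on = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port   = htons(port),
		.sin_addr   = { .s_addr = htonl(INADDR_ANY) },
	};

	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		perror("bind");
	return fd;
}

int main(void)
{
	int a = make_bound_socket(12345);	/* arbitrary test port */
	int b = make_bound_socket(12345);	/* expected to succeed too: both allow reuse */
	close(a);
	close(b);
	return 0;
}

Once one of the sockets enters the LISTEN state, the conflict check above is expected to make a second listen on the same port fail with EADDRINUSE.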

Back in inet_csk_get_port(), if no bucket was found we jump to tb_not_found, where inet_bind_bucket_create() allocates a new bucket, records the port number and other fields in it, and links it into the hash bucket.

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);	/* allocate the bucket from the slab cache */

	if (tb != NULL) {
		tb->ib_net       = hold_net(net);		/* record the owning namespace */
		tb->port      = snum;					/* record the port number */
		tb->fastreuse = 0;						/* fastreuse starts at 0 and is adjusted per sock */
		INIT_HLIST_HEAD(&tb->owners);			/* initialise the owners list */
		hlist_add_head(&tb->node, &head->chain);/* link the bucket into the hash chain */
	}
	return tb;
}

At success, inet_csk(sk)->icsk_bind_hash tells us whether the sock has not yet been attached to a bucket. inet_csk() simply casts the sock pointer to inet_connection_sock, and we check whether its icsk_bind_hash is set. inet_connection_sock is the connection-oriented sock used by the INET family and appeared earlier. If the sock has no bucket yet, inet_bind_hash() links it into the bucket's sock list.

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
	inet_sk(sk)->num = snum;			/* record the local port in inet_sock */
	sk_add_bind_node(sk, &tb->owners);	/* link the sock into the bucket's owners list */
	inet_csk(sk)->icsk_bind_hash = tb;	/* remember which bucket this sock is bound to */
}

First the port number is stored in inet_sock's num field, then the current sock is added to the bucket's owners list, and finally inet_connection_sock's icsk_bind_hash records the bucket tb. The binding work is now complete.

Back in inet_bind():

	/* Make sure we are allowed to bind here. */
	if (sk->sk_prot->get_port(sk, snum)) {              /* inet_csk_get_port() */
		inet->saddr = inet->rcv_saddr = 0;              /* on failure clear the addresses set above */
		err = -EADDRINUSE;
		goto out_release_sock;
	}

	if (inet->rcv_saddr)                        /* an address was supplied: set the lock flag, the address is bound */
		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
	if (snum)                                   /* a port was supplied: set the lock flag, the port is bound */
		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
	inet->sport = htons(inet->num);             /* record the source port */
	inet->daddr = 0;                            /* clear the destination address */
	inet->dport = 0;                            /* clear the destination port */
	sk_dst_reset(sk);                           /* reset the cached route */
	err = 0;
out_release_sock:
	release_sock(sk);   /* unlock the sock */
out:
	return err;
}

bind() comes to a close here; along the way we have left the local routing table and local_table->tb_lookup(local_table, &fl, &res) to be covered later.

 
