Linux Kernel 2.6.9源碼分析 – send/recieve 報文
可用戶socket報文讀寫的函數有以下幾對:
-
ssize_t read(int fd, void *buf, size_t count);
ssize_t write(int fd, const void *buf, size_t count); -
ssize_t send(int sockfd, const void *buf, size_t len, int flags);
ssize_t recv(int sockfd, void *buf, size_t len, int flags); -
ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags);
ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags); -
ssize_t sendto(int sockfd, const void *buf, size_t len, int flags,const struct sockaddr *dest_addr, socklen_t addrlen);
ssize_t recvfrom(int sockfd, void *buf, size_t len, int flags,struct sockaddr *src_addr, socklen_t *addrlen);
區別:
通過“有鏈接”,“無連接”來區分:
上述1,2,3是面向“有鏈接”的,因爲其參數中沒有帶上目標地址
上述4 是面向“無連接”的,其參數中有帶上目標地址
共同點:
1,2,3,4對雖然對應的系統調用有所不同,但最終在內核中調用的函數是相同的:
發送:static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size)
—>sock->ops->sendmsg
接收:static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size, int flags)
–> sock->ops->recvmsg
不同類型的socket,在創建的時候會註冊不同的ops,對於TCP而言:
net/ipv4/tcp.c -->
int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,size_t size)
int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,size_t len, int nonblock, int flags, int *addr_len)
首先我們來看下報文的接收(待深入分析…):
- 首先不是很明白TCP中的prequeue,receive queue,backlog queue三個的作用有什麼不同,所以下面的這個代碼難以理解.
- 當前能夠理解的是:TCP的數據包是一個個struct sk_buff結構的list.而面向傳入到用戶層的數據結構是struct msghdr
- 將skb中的數據copy到struct msghdr.iovec
int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len, int nonblock, int flags, int *addr_len)
{
struct tcp_opt *tp = tcp_sk(sk);
int copied = 0;
u32 peek_seq;
u32 *seq;
unsigned long used;
int err;
int target; /* Read at least this many bytes */
long timeo;
struct task_struct *user_recv = NULL;
lock_sock(sk);
TCP_CHECK_TIMER(sk);
err = -ENOTCONN;
if (sk->sk_state == TCP_LISTEN)
goto out;
timeo = sock_rcvtimeo(sk, nonblock);
/* Urgent data needs to be handled specially. */
if (flags & MSG_OOB)
goto recv_urg;
seq = &tp->copied_seq;
if (flags & MSG_PEEK) {
peek_seq = tp->copied_seq;
seq = &peek_seq;
}
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
do {
struct sk_buff *skb;
u32 offset;
/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
if (tp->urg_data && tp->urg_seq == *seq) {
if (copied)
break;
if (signal_pending(current)) {
copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
break;
}
}
/* Next get a buffer. */
skb = skb_peek(&sk->sk_receive_queue);
do {
if (!skb)
break;
/* Now that we have two receive queues this
* shouldn't happen.
*/
if (before(*seq, TCP_SKB_CB(skb)->seq)) {
printk(KERN_INFO "recvmsg bug: copied %X "
"seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
break;
}
offset = *seq - TCP_SKB_CB(skb)->seq;
if (skb->h.th->syn)
offset--;
if (offset < skb->len)
goto found_ok_skb;
if (skb->h.th->fin)
goto found_fin_ok;
BUG_TRAP(flags & MSG_PEEK);
skb = skb->next;
} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
/* Well, if we have backlog, try to process it now yet. */
if (copied >= target && !sk->sk_backlog.tail)
break;
if (copied) {
if (sk->sk_err ||
sk->sk_state == TCP_CLOSE ||
(sk->sk_shutdown & RCV_SHUTDOWN) ||
!timeo ||
signal_pending(current) ||
(flags & MSG_PEEK))
break;
} else {
if (sock_flag(sk, SOCK_DONE))
break;
if (sk->sk_err) {
copied = sock_error(sk);
break;
}
if (sk->sk_shutdown & RCV_SHUTDOWN)
break;
if (sk->sk_state == TCP_CLOSE) {
if (!sock_flag(sk, SOCK_DONE)) {
/* This occurs when user tries to read
* from never connected socket.
*/
copied = -ENOTCONN;
break;
}
break;
}
if (!timeo) {
copied = -EAGAIN;
break;
}
if (signal_pending(current)) {
copied = sock_intr_errno(timeo);
break;
}
}
cleanup_rbuf(sk, copied);
if (tp->ucopy.task == user_recv) {
/* Install new reader */
if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
user_recv = current;
tp->ucopy.task = user_recv;
tp->ucopy.iov = msg->msg_iov;
}
tp->ucopy.len = len;
BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
(flags & (MSG_PEEK | MSG_TRUNC)));
/* Ugly... If prequeue is not empty, we have to
* process it before releasing socket, otherwise
* order will be broken at second iteration.
* More elegant solution is required!!!
*
* Look: we have the following (pseudo)queues:
*
* 1. packets in flight
* 2. backlog
* 3. prequeue
* 4. receive_queue
*
* Each queue can be processed only if the next ones
* are empty. At this point we have empty receive_queue.
* But prequeue _can_ be not empty after 2nd iteration,
* when we jumped to start of loop because backlog
* processing added something to receive_queue.
* We cannot release_sock(), because backlog contains
* packets arrived _after_ prequeued ones.
*
* Shortly, algorithm is clear --- to process all
* the queues in order. We could make it more directly,
* requeueing packets from backlog to prequeue, if
* is not empty. It is more elegant, but eats cycles,
* unfortunately.
*/
if (skb_queue_len(&tp->ucopy.prequeue))
goto do_prequeue;
/* __ Set realtime policy in scheduler __ */
}
if (copied >= target) {
/* Do not sleep, just process backlog. */
release_sock(sk);
lock_sock(sk);
} else
sk_wait_data(sk, &timeo);
if (user_recv) {
int chunk;
/* __ Restore normal policy in scheduler __ */
if ((chunk = len - tp->ucopy.len) != 0) {
NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
len -= chunk;
copied += chunk;
}
if (tp->rcv_nxt == tp->copied_seq &&
skb_queue_len(&tp->ucopy.prequeue)) {
do_prequeue:
tcp_prequeue_process(sk);
if ((chunk = len - tp->ucopy.len) != 0) {
NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
len -= chunk;
copied += chunk;
}
}
}
if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
if (net_ratelimit())
printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
current->comm, current->pid);
peek_seq = tp->copied_seq;
}
continue;
found_ok_skb:
/* Ok so how much can we use? */
used = skb->len - offset;
if (len < used)
used = len;
/* Do we have urgent data here? */
if (tp->urg_data) {
u32 urg_offset = tp->urg_seq - *seq;
if (urg_offset < used) {
if (!urg_offset) {
if (!sock_flag(sk, SOCK_URGINLINE)) {
++*seq;
offset++;
used--;
if (!used)
goto skip_copy;
}
} else
used = urg_offset;
}
}
if (!(flags & MSG_TRUNC)) {
err = skb_copy_datagram_iovec(skb, offset,
msg->msg_iov, used);
if (err) {
/* Exception. Bailout! */
if (!copied)
copied = -EFAULT;
break;
}
}
*seq += used;
copied += used;
len -= used;
tcp_rcv_space_adjust(sk);
skip_copy:
if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
tp->urg_data = 0;
tcp_fast_path_check(sk, tp);
}
if (used + offset < skb->len)
continue;
if (skb->h.th->fin)
goto found_fin_ok;
if (!(flags & MSG_PEEK))
sk_eat_skb(sk, skb);
continue;
found_fin_ok:
/* Process the FIN. */
++*seq;
if (!(flags & MSG_PEEK))
sk_eat_skb(sk, skb);
break;
} while (len > 0);
if (user_recv) {
if (skb_queue_len(&tp->ucopy.prequeue)) {
int chunk;
tp->ucopy.len = copied > 0 ? len : 0;
tcp_prequeue_process(sk);
if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
len -= chunk;
copied += chunk;
}
}
tp->ucopy.task = NULL;
tp->ucopy.len = 0;
}
/* According to UNIX98, msg_name/msg_namelen are ignored
* on connected socket. I was just happy when found this 8) --ANK
*/
/* Clean up data we have read: This will do ACK frames. */
cleanup_rbuf(sk, copied);
TCP_CHECK_TIMER(sk);
release_sock(sk);
return copied;
out:
TCP_CHECK_TIMER(sk);
release_sock(sk);
return err;
recv_urg:
err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
goto out;
}
TCP 數據發送:(後續分析…)
int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t size)
{
struct iovec *iov;
struct tcp_opt *tp = tcp_sk(sk);
struct sk_buff *skb;
int iovlen, flags;
int mss_now;
int err, copied;
long timeo;
lock_sock(sk);
TCP_CHECK_TIMER(sk);
flags = msg->msg_flags;
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
/* Wait for a connection to finish. */
if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
goto out_err;
/* This should be in poll */
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
/* Ok commence sending. */
iovlen = msg->msg_iovlen;
iov = msg->msg_iov;
copied = 0;
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto do_error;
while (--iovlen >= 0) {
int seglen = iov->iov_len;
unsigned char __user *from = iov->iov_base;
iov++;
while (seglen > 0) {
int copy;
skb = sk->sk_write_queue.prev;
if (!sk->sk_send_head ||
(copy = mss_now - skb->len) <= 0) {
new_segment:
/* Allocate new segment. If the interface is SG,
* allocate skb fitting to single page.
*/
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
0, sk->sk_allocation);
if (!skb)
goto wait_for_memory;
/*
* Check whether we can use HW checksum.
*/
if (sk->sk_route_caps &
(NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
NETIF_F_HW_CSUM))
skb->ip_summed = CHECKSUM_HW;
skb_entail(sk, tp, skb);
copy = mss_now;
}
/* Try to append data to the end of skb. */
if (copy > seglen)
copy = seglen;
/* Where to copy to? */
if (skb_tailroom(skb) > 0) {
/* We have some space in skb head. Superb! */
if (copy > skb_tailroom(skb))
copy = skb_tailroom(skb);
if ((err = skb_add_data(skb, from, copy)) != 0)
goto do_fault;
} else {
int merge = 0;
int i = skb_shinfo(skb)->nr_frags;
struct page *page = TCP_PAGE(sk);
int off = TCP_OFF(sk);
if (skb_can_coalesce(skb, i, page, off) &&
off != PAGE_SIZE) {
/* We can extend the last page
* fragment. */
merge = 1;
} else if (i == MAX_SKB_FRAGS ||
(!i &&
!(sk->sk_route_caps & NETIF_F_SG))) {
/* Need to add new fragment and cannot
* do this because interface is non-SG,
* or because all the page slots are
* busy. */
tcp_mark_push(tp, skb);
goto new_segment;
} else if (page) {
/* If page is cached, align
* offset to L1 cache boundary
*/
off = (off + L1_CACHE_BYTES - 1) &
~(L1_CACHE_BYTES - 1);
if (off == PAGE_SIZE) {
put_page(page);
TCP_PAGE(sk) = page = NULL;
}
}
if (!page) {
/* Allocate new cache page. */
if (!(page = sk_stream_alloc_page(sk)))
goto wait_for_memory;
off = 0;
}
if (copy > PAGE_SIZE - off)
copy = PAGE_SIZE - off;
/* Time to copy data. We are close to
* the end! */
err = skb_copy_to_page(sk, from, skb, page,
off, copy);
if (err) {
/* If this page was new, give it to the
* socket so it does not get leaked.
*/
if (!TCP_PAGE(sk)) {
TCP_PAGE(sk) = page;
TCP_OFF(sk) = 0;
}
goto do_error;
}
/* Update the skb. */
if (merge) {
skb_shinfo(skb)->frags[i - 1].size +=
copy;
} else {
skb_fill_page_desc(skb, i, page, off, copy);
if (TCP_PAGE(sk)) {
get_page(page);
} else if (off + copy < PAGE_SIZE) {
get_page(page);
TCP_PAGE(sk) = page;
}
}
TCP_OFF(sk) = off + copy;
}
if (!copied)
TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
tp->write_seq += copy;
TCP_SKB_CB(skb)->end_seq += copy;
skb_shinfo(skb)->tso_segs = 0;
from += copy;
copied += copy;
if ((seglen -= copy) == 0 && iovlen == 0)
goto out;
if (skb->len != mss_now || (flags & MSG_OOB))
continue;
if (forced_push(tp)) {
tcp_mark_push(tp, skb);
__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
} else if (skb == sk->sk_send_head)
tcp_push_one(sk, mss_now);
continue;
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
if (copied)
tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
}
}
out:
if (copied)
tcp_push(sk, tp, flags, mss_now, tp->nonagle);
TCP_CHECK_TIMER(sk);
release_sock(sk);
return copied;
do_fault:
if (!skb->len) {
if (sk->sk_send_head == skb)
sk->sk_send_head = NULL;
__skb_unlink(skb, skb->list);
sk_stream_free_skb(sk, skb);
}
do_error:
if (copied)
goto out;
out_err:
err = sk_stream_error(sk, flags, err);
TCP_CHECK_TIMER(sk);
release_sock(sk);
return err;
}