Front/back-end interaction relies on the definitions in include/xen/interface/io/ring.h and include/xen/interface/io/netif.h:
/*
* Calculate size of a shared ring, given the total available space for the
* ring and indexes (_sz), and the name tag of the request/response structure.
* A ring contains as many entries as will fit, rounded down to the nearest
* power of two (so we can mask with (size-1) to loop around).
*/
#define __CONST_RING_SIZE(_s, _sz) \
    (__RD32(((_sz) - offsetof(struct _s##_sring, ring)) / \
            sizeof(((struct _s##_sring *)0)->ring[0])))
/*
 * The same for passing in an actual pointer instead of a name tag.
 */
#define __RING_SIZE(_s, _sz) \
    (__RD32(((_sz) - (long)&(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t) \
\
/* Shared ring entry */ \
union __name##_sring_entry { \
    __req_t req; \
    __rsp_t rsp; \
}; \
\
/* Shared ring page */ \
struct __name##_sring { \
    RING_IDX req_prod, req_event; \
    RING_IDX rsp_prod, rsp_event; \
    union { \
        struct { \
            uint8_t smartpoll_active; \
        } netif; \
        struct { \
            uint8_t msg; \
        } tapif_user; \
        uint8_t pvt_pad[4]; \
    } private; \
    uint8_t pad[44]; \
    union __name##_sring_entry ring[1]; /* variable-length */ \
}; \
\
/* "Front" end's private variables */ \
struct __name##_front_ring { \
    RING_IDX req_prod_pvt; \
    RING_IDX rsp_cons; \
    unsigned int nr_ents; \
    struct __name##_sring *sring; \
}; \
\
/* "Back" end's private variables */ \
struct __name##_back_ring { \
    RING_IDX rsp_prod_pvt; \
    RING_IDX req_cons; \
    unsigned int nr_ents; \
    struct __name##_sring *sring; \
};
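As a worked example of __CONST_RING_SIZE (my own back-of-the-envelope calculation, assuming 4 KiB pages and the classic netif entry layouts):

/*
 * The header preceding ring[] is 4 RING_IDX (16 bytes) + the private
 * union (4) + pad (44) = 64 bytes. A TX entry is the larger union
 * member, struct xen_netif_tx_request, at 12 bytes, so:
 *
 *   __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
 *     = __RD32((4096 - 64) / 12) = __RD32(336) = 256
 *
 * RX entries are 8 bytes, and __RD32((4096 - 64) / 8) = __RD32(504) is
 * also 256, so both one-page rings hold 256 entries and their indexes
 * wrap with a mask of 255.
 */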
Some commonly used macros for operating on the ring:
#define RING_HAS_UNCONSUMED_RESPONSES(_r) \
    ((_r)->sring->rsp_prod - (_r)->rsp_cons)

#define RING_HAS_UNCONSUMED_REQUESTS(_r) \
    ({ \
        unsigned int req = (_r)->sring->req_prod - (_r)->req_cons; \
        unsigned int rsp = RING_SIZE(_r) - \
            ((_r)->req_cons - (_r)->rsp_prod_pvt); \
        req < rsp ? req : rsp; \
    })

#define RING_GET_REQUEST(_r, _idx) \
    (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req))

#define RING_GET_RESPONSE(_r, _idx) \
    (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp))

#define RING_PUSH_REQUESTS(_r) do { \
    wmb(); /* back sees requests /before/ updated producer index */ \
    (_r)->sring->req_prod = (_r)->req_prod_pvt; \
} while (0)

#define RING_PUSH_RESPONSES(_r) do { \
    wmb(); /* front sees responses /before/ updated producer index */ \
    (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt; \
} while (0)

#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do { \
    RING_IDX __old = (_r)->sring->req_prod; \
    RING_IDX __new = (_r)->req_prod_pvt; \
    wmb(); /* back sees requests /before/ updated producer index */ \
    (_r)->sring->req_prod = __new; \
    mb(); /* back sees new requests /before/ we check req_event */ \
    (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) < \
                 (RING_IDX)(__new - __old)); \
} while (0)

#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do { \
    RING_IDX __old = (_r)->sring->rsp_prod; \
    RING_IDX __new = (_r)->rsp_prod_pvt; \
    wmb(); /* front sees responses /before/ updated producer index */ \
    (_r)->sring->rsp_prod = __new; \
    mb(); /* front sees new responses /before/ we check rsp_event */ \
    (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) < \
                 (RING_IDX)(__new - __old)); \
} while (0)

#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do { \
    (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \
    if (_work_to_do) \
        break; \
    (_r)->sring->req_event = (_r)->req_cons + 1; \
    mb(); \
    (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \
} while (0)

#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do { \
    (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \
    if (_work_to_do) \
        break; \
    (_r)->sring->rsp_event = (_r)->rsp_cons + 1; \
    mb(); \
    (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \
} while (0)
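Tying the macros together, here is a minimal sketch of the canonical front-end produce/consume pattern. The demo_* names are hypothetical, and the shared page is assumed to have already been set up with SHARED_RING_INIT/FRONT_RING_INIT from the same header:

#include <xen/interface/io/ring.h>

struct demo_request  { uint32_t id; };
struct demo_response { uint32_t id; int16_t status; };
DEFINE_RING_TYPES(demo, struct demo_request, struct demo_response);

static void demo_produce(struct demo_front_ring *front, uint32_t id)
{
    struct demo_request *req;
    int notify;

    /* Fill the next slot via the private producer index. */
    req = RING_GET_REQUEST(front, front->req_prod_pvt);
    req->id = id;
    front->req_prod_pvt++;

    /* Publish, and only kick the backend if it asked to be woken. */
    RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(front, notify);
    if (notify) {
        /* notify_remote_via_irq(irq) in a real driver */
    }
}

static void demo_consume(struct demo_front_ring *front)
{
    int work_to_do;

    do {
        while (RING_HAS_UNCONSUMED_RESPONSES(front)) {
            struct demo_response *rsp =
                RING_GET_RESPONSE(front, front->rsp_cons);
            (void)rsp; /* handle rsp->id / rsp->status here */
            front->rsp_cons++;
        }
        /* Re-arm rsp_event, then re-check to close the race with a
         * producer that published just before we went idle. */
        RING_FINAL_CHECK_FOR_RESPONSES(front, work_to_do);
    } while (work_to_do);
}

This is the same shape the netfront code below follows: xennet_start_xmit on the push side, xennet_tx_buf_gc on the final-check side.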
Let's start with RX. Reception is initiated by netfront: in xennet_open and xennet_connect, xennet_alloc_rx_buffers hands rx buffers to the backend. xennet_alloc_rx_buffers was covered in an earlier article; it is excerpted here:
static void xennet_alloc_rx_buffers(struct net_device *dev)
{
    unsigned short id;
    struct netfront_info *np = netdev_priv(dev);
    struct sk_buff *skb;
    struct page *page;
    int i, batch_target, notify;
    RING_IDX req_prod = np->rx.req_prod_pvt;
    grant_ref_t ref;
    unsigned long pfn;
    void *vaddr;
    struct xen_netif_rx_request *req;

    if (unlikely(!netif_carrier_ok(dev)))
        return;

    /*
     * Allocate skbuffs greedily, even though we batch updates to the
     * receive ring. This creates a less bursty demand on the memory
     * allocator, so should reduce the chance of failed allocation requests
     * both for ourself and for other kernel subsystems.
     */
    /*
     * Allocate a batch of skbs, each carrying a single frag page, for
     * receiving. They are appended to the netfront_info->rx_batch list.
     * If __netdev_alloc_skb or alloc_page fails, mod_timer retries after
     * 100 ms (that 100 ms delay looks like a pitfall to me; note also
     * that the retry really just re-runs napi_schedule, which in turn
     * calls xennet_poll to attempt reception again).
     */
    batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
    for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
        skb = __netdev_alloc_skb(dev, RX_COPY_THRESHOLD + NET_IP_ALIGN,
                                 GFP_ATOMIC | __GFP_NOWARN);
        if (unlikely(!skb))
            goto no_skb;

        /* Align ip header to a 16 bytes boundary */
        skb_reserve(skb, NET_IP_ALIGN);

        page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
        if (!page) {
            kfree_skb(skb);
no_skb:
            /* Any skbuffs queued for refill? Force them out. */
            if (i != 0)
                goto refill;
            /* Could not allocate any skbuffs. Try again later. */
            mod_timer(&np->rx_refill_timer,
                      jiffies + (HZ/10));
            break;
        }

        skb_shinfo(skb)->frags[0].page = page;
        skb_shinfo(skb)->nr_frags = 1;
        __skb_queue_tail(&np->rx_batch, skb);
    }

    /* Is the batch large enough to be worthwhile? */
    if (i < (np->rx_target/2)) {
        if (req_prod > np->rx.sring->req_prod)
            goto push;
        return;
    }

    /* Adjust our fill target if we risked running out of buffers. */
    if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
        ((np->rx_target *= 2) > np->rx_max_target))
        np->rx_target = np->rx_max_target;

refill:
    /*
     * For each skb in netfront_info->rx_batch: derive the ring index
     * from req_prod (via xennet_rxidx) and store the skb at that slot
     * of the netfront_info->rx_skbs array. Then
     * gnttab_claim_grant_reference takes an unused ref from the
     * grant_ref_t pool, which goes into the matching slot of
     * netfront_info->grant_rx_ref, and gnttab_grant_foreign_access_ref
     * grants the backend access to the skb's frag page.
     */
    for (i = 0; ; i++) {
        skb = __skb_dequeue(&np->rx_batch);
        if (skb == NULL)
            break;

        skb->dev = dev;

        id = xennet_rxidx(req_prod + i);

        BUG_ON(np->rx_skbs[id]);
        np->rx_skbs[id] = skb;

        ref = gnttab_claim_grant_reference(&np->gref_rx_head);
        BUG_ON((signed short)ref < 0);
        np->grant_rx_ref[id] = ref;

        pfn = page_to_pfn(skb_shinfo(skb)->frags[0].page);
        vaddr = page_address(skb_shinfo(skb)->frags[0].page);

        req = RING_GET_REQUEST(&np->rx, req_prod + i);
        gnttab_grant_foreign_access_ref(ref,
                                        np->xbdev->otherend_id,
                                        pfn_to_mfn(pfn),
                                        0);

        req->id = id;
        req->gref = ref;
    }

    wmb(); /* barrier so backend sees requests before the index update */
    np->rx.req_prod_pvt = req_prod + i;

push:
    /*
     * RING_PUSH_REQUESTS_AND_CHECK_NOTIFY checks whether the rx front
     * ring has new requests the backend must be told about; if so,
     * notify_remote_via_irq signals the backend over the event channel.
     */
    RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->rx, notify);
    if (notify)
        notify_remote_via_irq(np->netdev->irq);
}
xennet_alloc_rx_buffers implements its own flow control, which boils down to computing np->rx_target and, based on it, deciding how many skbs to add to np->rx_batch each round. Every skb in np->rx_batch has the same shape: a 256-byte (RX_COPY_THRESHOLD) linear header plus one page-sized fragment. Each batched skb is later stored in the np->rx_skbs array at the index derived from req_prod, and the page of its first fragment is paired with a grant ref stored at the same index of np->grant_rx_ref. Once gnttab_grant_foreign_access_ref has run, that page is accessible to the backend.
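The index mapping used above is just a power-of-two mask over the ring size; in the driver of this era the helper is essentially:

static int xennet_rxidx(RING_IDX idx)
{
    /* NET_RX_RING_SIZE is a power of two, so the mask wraps the index. */
    return idx & (NET_RX_RING_SIZE - 1);
}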
When the backend has a packet to hand to the frontend, it first calls xen_netbk_count_skb_slots to compute how many xen_netif_rx_request slots the skb needs; the count is accumulated in vif->rx_req_cons_peek, which is used to decide whether the ring is full.
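The counting amounts to "one slot per page touched by the skb". A simplified sketch of the idea (the helper name is made up, and the real xen_netbk_count_skb_slots additionally accounts for GSO extra slots and its copy-offset packing):

static unsigned int count_skb_slots_sketch(struct sk_buff *skb)
{
    unsigned int count;
    int i;

    /* The linear area may straddle several pages. */
    count = DIV_ROUND_UP(offset_in_page(skb->data) + skb_headlen(skb),
                         PAGE_SIZE);

    /* Each fragment needs a slot per page it covers. */
    for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
        count += DIV_ROUND_UP(skb_shinfo(skb)->frags[i].size, PAGE_SIZE);

    return count;
}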
Backend receive processing lives in xen_netbk_rx_action. Each invocation sets up a struct netrx_pending_operations, using netbk->grant_copy_op and netbk->meta to hold one gnttab_copy and one netbk_rx_meta entry per skb fragment. netbk_gop_skb builds these meta/copy entries, one per fragment page; the grant_ref_t each gnttab_copy uses is taken from a xen_netif_rx_request.
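How one such gnttab_copy entry gets filled in, as a hedged sketch (the helper and its arguments are made up for illustration; the gnttab_copy fields themselves are the real hypercall interface):

#include <xen/interface/grant_table.h>

static void fill_rx_copy_sketch(struct gnttab_copy *copy,
                                struct page *src_page, unsigned int offset,
                                unsigned int len, grant_ref_t gref,
                                domid_t frontend_domid)
{
    /* The destination is named by a grant reference, not a frame. */
    copy->flags = GNTCOPY_dest_gref;

    /* Source: a local backend page holding the received data. */
    copy->source.domid  = DOMID_SELF;
    copy->source.u.gmfn = pfn_to_mfn(page_to_pfn(src_page));
    copy->source.offset = offset;

    /* Destination: the page granted by a xen_netif_rx_request. */
    copy->dest.domid  = frontend_domid;
    copy->dest.u.ref  = gref;
    copy->dest.offset = 0;

    copy->len = len;
}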
The backend then issues a hypercall that copies the fragment contents into the pages named by those grant refs, calls make_rx_response to generate a xen_netif_rx_response, calls netbk_add_frag_responses to generate one response per remaining fragment, and notifies the frontend via RING_PUSH_RESPONSES_AND_CHECK_NOTIFY.
In short, backend receive consists of: taking some xen_netif_rx_requests, filling in gnttab_copy entries, copying the packet contents into the pages behind the rx requests, generating xen_netif_rx_responses, and finally notifying the frontend via RING_PUSH_RESPONSES_AND_CHECK_NOTIFY.
Finally, the frontend receives in xennet_poll, which calls xennet_get_responses and xennet_fill_frags to turn each xen_netif_rx_response on the ring into an skb, advances rsp_cons, and hands the packet to the protocol stack through handle_incoming_queue.
Now for TX. The core function is xennet_start_xmit:
static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
    unsigned short id;
    struct netfront_info *np = netdev_priv(dev);
    struct xen_netif_tx_request *tx;
    struct xen_netif_extra_info *extra;
    char *data = skb->data;
    RING_IDX i;
    grant_ref_t ref;
    unsigned long mfn;
    int notify;
    int frags = skb_shinfo(skb)->nr_frags;
    unsigned int offset = offset_in_page(data);
    unsigned int len = skb_headlen(skb);

    frags += DIV_ROUND_UP(offset + len, PAGE_SIZE);
    if (unlikely(frags > MAX_SKB_FRAGS + 1)) {
        printk(KERN_ALERT "xennet: skb rides the rocket: %d frags\n",
               frags);
        dump_stack();
        goto drop;
    }

    spin_lock_irq(&np->tx_lock);

    if (unlikely(!netif_carrier_ok(dev) ||
                 (frags > 1 && !xennet_can_sg(dev)) ||
                 netif_needs_gso(dev, skb))) {
        spin_unlock_irq(&np->tx_lock);
        goto drop;
    }

    i = np->tx.req_prod_pvt;

    /*
     * tx_skb_freelist is the index of the first free entry in tx_skbs;
     * skb_entry->link points to the next free entry. Free tx_skbs
     * entries are used to track the head and fragments of skbs being
     * sent.
     */
    id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
    np->tx_skbs[id].skb = skb;

    /*
     * Take the next xen_netif_tx_request at req_prod_pvt and the next
     * free tx_skbs entry from tx_skb_freelist; the skb being sent is
     * stored in that entry, and tx->id is the skb's index in tx_skbs.
     */
    tx = RING_GET_REQUEST(&np->tx, i);

    tx->id = id;
    ref = gnttab_claim_grant_reference(&np->gref_tx_head);
    BUG_ON((signed short)ref < 0);
    mfn = virt_to_mfn(data);
    gnttab_grant_foreign_access_ref(
        ref, np->xbdev->otherend_id, mfn, GNTMAP_readonly);
    tx->gref = np->grant_tx_ref[id] = ref;
    tx->offset = offset;
    tx->size = len;
    extra = NULL;

    tx->flags = 0;
    if (skb->ip_summed == CHECKSUM_PARTIAL)
        /* local packet? */
        tx->flags |= XEN_NETTXF_csum_blank | XEN_NETTXF_data_validated;
    else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
        /* remote but checksummed. */
        tx->flags |= XEN_NETTXF_data_validated;

    if (skb_shinfo(skb)->gso_size) {
        struct xen_netif_extra_info *gso;

        gso = (struct xen_netif_extra_info *)
            RING_GET_REQUEST(&np->tx, ++i);

        if (extra)
            extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
        else
            tx->flags |= XEN_NETTXF_extra_info;

        gso->u.gso.size = skb_shinfo(skb)->gso_size;
        gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
        gso->u.gso.pad = 0;
        gso->u.gso.features = 0;

        gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
        gso->flags = 0;
        extra = gso;
    }

    np->tx.req_prod_pvt = i + 1;

    /*
     * xennet_make_frags first handles an skb head that spans multiple
     * pages: for each extra page it adds another xen_netif_tx_request,
     * just as for the first head page. It then does the same for every
     * skb fragment, in these steps:
     * 1. set XEN_NETTXF_more_data on the previous xen_netif_tx_request
     *    to signal that more fragments follow (not needed before the
     *    first head request)
     * 2. get_id_from_freelist returns a free entry id, which also
     *    becomes the xen_netif_tx_request id
     * 3. take the next xen_netif_tx_request at req_prod_pvt
     * 4. grant the backend access to the head page or fragment page
     * 5. fill in the request's gref, offset, size, and flags
     * Each used tx_skbs entry's skb pointer refers to the skb being
     * sent, so every fragment (the head included) effectively holds
     * one reference on the skb.
     */
    xennet_make_frags(skb, dev, tx);
    tx->size = skb->len;

    RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify);
    if (notify)
        notify_remote_via_irq(np->netdev->irq);

    dev->stats.tx_bytes += skb->len;
    dev->stats.tx_packets++;

    /* Note: It is not safe to access skb after xennet_tx_buf_gc()! */
    /*
     * xennet_tx_buf_gc reclaims the xen_netif_tx_responses between
     * rsp_cons and rsp_prod. A response's id indexes both tx_skbs and
     * grant_tx_ref, so grant_tx_ref[id] is revoked and the id is
     * returned to the tx_skb_freelist. The head and each fragment drop
     * one skb reference; the skb is freed once the last one is gone.
     */
    xennet_tx_buf_gc(dev);

    if (!netfront_tx_slot_available(np))
        netif_stop_queue(dev);

    spin_unlock_irq(&np->tx_lock);

    return NETDEV_TX_OK;

drop:
    dev->stats.tx_dropped++;
    dev_kfree_skb(skb);
    return NETDEV_TX_OK;
}
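For reference, the reclaim loop described in the last comment looks roughly like this (a condensed sketch of xennet_tx_buf_gc under a hypothetical name; the real function also skips XEN_NETIF_RSP_NULL slots, sanity-checks the grant before revoking it, and uses a smarter rsp_event heuristic):

static void tx_buf_gc_sketch(struct net_device *dev)
{
    struct netfront_info *np = netdev_priv(dev);
    RING_IDX cons, prod;

    do {
        prod = np->tx.sring->rsp_prod;
        rmb(); /* read responses only after seeing rsp_prod */

        for (cons = np->tx.rsp_cons; cons != prod; cons++) {
            struct xen_netif_tx_response *txrsp;
            struct sk_buff *skb;
            unsigned short id;

            txrsp = RING_GET_RESPONSE(&np->tx, cons);
            id = txrsp->id;
            skb = np->tx_skbs[id].skb;

            /* Revoke the backend's access and recycle the grant ref. */
            gnttab_end_foreign_access_ref(np->grant_tx_ref[id],
                                          GNTMAP_readonly);
            gnttab_release_grant_reference(&np->gref_tx_head,
                                           np->grant_tx_ref[id]);
            np->grant_tx_ref[id] = GRANT_INVALID_REF;

            /* Return the slot, then drop one reference on the skb. */
            add_id_to_freelist(&np->tx_skb_freelist, np->tx_skbs, id);
            dev_kfree_skb_irq(skb);
        }
        np->tx.rsp_cons = prod;

        /* Re-arm rsp_event and loop if the backend raced us. */
        np->tx.sring->rsp_event = prod + 1;
        mb();
    } while (prod != np->tx.sring->rsp_prod);
}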
Once the frontend has queued its xen_netif_tx_requests, it notifies the backend over the event channel. The backend's xenvif_interrupt wakes the netback kthread to process TX/RX work; the backend's TX handler is:
static void xen_netbk_tx_action(struct xen_netbk *netbk)
{
    unsigned nr_gops;
    int ret;

    nr_gops = xen_netbk_tx_build_gops(netbk);

    if (nr_gops == 0)
        return;
    ret = HYPERVISOR_grant_table_op(GNTTABOP_copy,
                                    netbk->tx_copy_ops, nr_gops);
    BUG_ON(ret);

    xen_netbk_tx_submit(netbk);
}
xen_netbk_tx_build_gops turns a batch of frontend xen_netif_tx_requests into sendable skbs appended to xen_netbk->tx_queue, and prepares the corresponding entries of the pending_tx_info and tx_copy_ops arrays; the GNTTABOP_copy hypercall then performs the page copies from frontend to backend.
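The copy direction here is the mirror image of the RX case: the source is the frontend's granted page and the destination a local backend page. A hedged sketch of one tx_copy_ops entry (the helper and its arguments are made up; the fields are the real hypercall interface):

static void fill_tx_copy_sketch(struct gnttab_copy *gop,
                                struct xen_netif_tx_request *txreq,
                                struct page *dst_page,
                                domid_t frontend_domid)
{
    /* This time the source is named by a grant reference. */
    gop->flags = GNTCOPY_source_gref;

    /* Source: the frontend page behind the tx request's gref. */
    gop->source.u.ref  = txreq->gref;
    gop->source.domid  = frontend_domid;
    gop->source.offset = txreq->offset;

    /* Destination: a local backend page owned by netback. */
    gop->dest.domid  = DOMID_SELF;
    gop->dest.u.gmfn = pfn_to_mfn(page_to_pfn(dst_page));
    gop->dest.offset = 0;

    gop->len = txreq->size;
}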
xen_netbk_tx_submit looks up the pending_idx of the skb head and of every fragment (the xen_netif_tx_request details are kept in xen_netbk's pending_tx_info, indexed by pending_idx through pending_ring). For the skb head the data is copied out directly; for each fragment, get_page takes an extra reference. Before the skb is actually sent, all pages recorded in pending_tx_info are released back to the frontend via netbk_idx_release.