參考
- perf_event內核框架
- tracepoint events
- software events
- hardware events
- perf_event_open系統調用內核源碼分析
- perf_event_open 簡介
內核版本
Linux 6.5
作者
正文
初始化
應用調用perf_event_open系統調用讓內核創建perf_event時,內核創建了一個匿名文件,這個文件的file結構體的fops是perf_fops:
/*
 * File operations of the file created for a perf event. Each supported
 * operation is routed to the matching perf_* handler; .mmap (perf_mmap) is
 * the one that sets up the ring buffer discussed below.
 */
static const struct file_operations perf_fops = {
.llseek = no_llseek,
.release = perf_release,
.read = perf_read,
.poll = perf_poll,
.unlocked_ioctl = perf_ioctl,
.compat_ioctl = perf_compat_ioctl,
.mmap = perf_mmap,
.fasync = perf_fasync,
};
應用在得到fd後,通過調用mmap來讓內核分配ring buffer,用perf_buffer結構體表示,並且將ring buffer也映射給用戶,後續應用和內核就可以通過共享內存的方式實現數據共享。跟這段緩衝區對應的vma的vm_ops是perf_mmap_vmops.
內核對mmap的長度的規定是必須滿足 $1+2^n$ 個頁,其中第1個頁只是用來存放結構體perf_event_mmap_page,其中存放的是元信息,後面的 $2^n$ 個頁用來存放具體的採樣數據。
如果應用在mmap時設置了可寫,那麼perf_buffer的overwrite爲0,表示內核在向緩衝區寫數據時需要跟應用進行同步,防止出現內容被覆蓋的情況,當然如果應用讀取不及時,會造成緩衝區滿的情況,此時新數據將無法寫入,發生overflow。
在perf_mmap中分配ring buffer的實現如下:
/* Excerpt only: no failure check on the rb_alloc() result is shown here. */
struct perf_buffer *rb;
/*
 * Allocate the buffer for nr_pages data pages. The second argument is
 * attr.wakeup_watermark when attr.watermark is set, otherwise 0.
 */
rb = rb_alloc(nr_pages,
event->attr.watermark ? event->attr.wakeup_watermark : 0,
event->cpu, flags);
/* Attach the new buffer to the event. */
ring_buffer_attach(event, rb);
/* Refresh the event time, then initialise and update the user page. */
perf_event_update_time(event);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
調用rb_alloc分配出perf_buffer後,在ring_buffer_attach中會將rb賦值給perf_event的rb成員,分配緩衝區有兩種實現方法,在編譯時決定,一種是一頁一頁分配,這種方式會出現頁之間的虛擬地址不連續,所以需要通過data_pages[]數組來記錄每個頁的地址,頁的數量記錄在rb->nr_pages中;第二種是調用vmalloc一次分配完畢,這樣所有這些頁的虛擬地址是連續的,此時nr_pages固定設置爲1,即只需要data_pages[0],記錄首地址即可,下圖這種是連續的情況:
此外,內核調用perf_event_alloc來分配perf_event時,如果沒有指定overflow_handler處理函數,那麼內核會根據應用傳遞的參數設置默認的handler,假如應用沒有要求按backward(反向)的方向寫入,那麼handler就是perf_event_output_forward。
內核寫
/*
 * Overflow handler for forward-writing buffers: emits one sample through
 * __perf_event_output() using perf_output_begin_forward to reserve space.
 * The int result of __perf_event_output() is not propagated.
 */
void
perf_event_output_forward(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
__perf_event_output(event, data, regs, perf_output_begin_forward);
}
__perf_event_output
/*
 * Prepare one sample for @event and write it into the ring buffer.
 *
 * @output_begin reserves the space and sets up the output handle; it is the
 * only part that differs between write directions.
 *
 * Returns 0 on success, or the error returned by @output_begin, in which
 * case nothing is written.
 */
static __always_inline int
__perf_event_output(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs,
int (*output_begin)(struct perf_output_handle *,
struct perf_sample_data *,
struct perf_event *,
unsigned int))
{
struct perf_output_handle handle;
struct perf_event_header header;
int err;
/* protect the callchain buffers */
rcu_read_lock();
// Fill perf_sample_data from the event's settings and the current context;
// nothing has been written to the ring buffer yet.
perf_prepare_sample(data, event, regs);
// Fill the perf_event_header; the buffer content is a sequence of records,
// each one starting with a perf_event_header.
perf_prepare_header(&header, data, event, regs);
// Reserve header.size bytes (perf_output_begin_forward for the forward
// handler) and set up handle, which records where to write.
err = output_begin(&handle, data, event, header.size);
if (err)
goto exit;
// Using the position recorded in handle, write header, data, etc. into the buffer.
perf_output_sample(&handle, &header, data, event);
// Update data_head and handle the wakeup.
perf_output_end(&handle);
exit:
rcu_read_unlock();
return err;
}
perf_output_begin_forward
這個函數的作用是更新handle中操作緩衝區的成員,如addr表示要寫入的位置,size表示剩餘空間大小,page表示要寫入的page的數組索引號。
- perf_output_begin_forward
/*
 * Forward-direction entry point: calls __perf_output_begin() with
 * backward == false and returns its result.
 */
int perf_output_begin_forward(struct perf_output_handle *handle,
struct perf_sample_data *data,
struct perf_event *event, unsigned int size)
{
return __perf_output_begin(handle, data, event, size, false);
}
__perf_output_begin
/*
 * Reserve @size bytes in the ring buffer of @event (or of its parent, for
 * inherited events) and fill in @handle with the write position.
 *
 * On success returns 0 with handle->page, handle->addr and handle->size set
 * up. The RCU read lock taken here is NOT dropped on success;
 * perf_output_end() releases it.
 *
 * Returns -ENOSPC when there is no buffer, the buffer is paused, or (for a
 * non-overwrite buffer) there is not enough free space; in those cases the
 * RCU read lock is released before returning.
 */
static __always_inline int
__perf_output_begin(struct perf_output_handle *handle,
struct perf_sample_data *data,
struct perf_event *event, unsigned int size,
bool backward)
{
struct perf_buffer *rb;
unsigned long tail, offset, head;
int have_lost, page_shift;
struct {
struct perf_event_header header;
u64 id;
u64 lost;
} lost_event;
rcu_read_lock();
/*
* For inherited events we send all the output towards the parent.
*/
if (event->parent)
event = event->parent;
rb = rcu_dereference(event->rb);
if (unlikely(!rb))
goto out;
// A paused buffer accepts no data; if it has data pages, count the sample as lost.
if (unlikely(rb->paused)) {
if (rb->nr_pages) {
local_inc(&rb->lost);
atomic64_inc(&event->lost_samples);
}
goto out;
}
handle->rb = rb;
handle->event = event;
// If earlier records were lost, reserve extra room for a lost_event record
// (plus the id sample when sample_id_all is set).
have_lost = local_read(&rb->lost);
if (unlikely(have_lost)) {
size += sizeof(lost_event);
if (event->attr.sample_id_all)
size += event->id_header_size;
}
// Disables preemption, increments rb->nest and records rb->wakeup in handle,
// which is later used to decide whether userspace must be woken up.
perf_output_get_handle(handle);
do {
/* data_tail is only read on the kernel side; userspace is responsible for updating it. Its initial value is 0. */
tail = READ_ONCE(rb->user_page->data_tail);
// head is the offset at which to write; its initial value is 0.
// Within this loop, offset keeps the value of head BEFORE it is advanced, so that
// the cmpxchg below can detect whether rb->head was updated in the meantime.
offset = head = local_read(&rb->head);
// When overwrite is 0 the kernel must check, before writing, that userspace has
// already consumed the data, so that unread data is not overwritten.
// overwrite is 0 when userspace mapped the buffer writable; with a read-only
// mapping overwrite is 1 and the kernel overwrites buffer data without caring
// whether userspace has read it.
// ring_buffer_has_space() checks whether size bytes still fit.
// NOTE(review): upstream implements the forward case with
// CIRC_SPACE(head, tail, perf_data_size(rb)) >= size, i.e.
// ((tail - (head + 1)) & (perf_data_size(rb) - 1)) >= size
// -- confirm against kernel/events/ring_buffer.c and include/linux/circ_buf.h.
// It returns false when the data does not fit, true otherwise.
if (!rb->overwrite) {
if (unlikely(!ring_buffer_has_space(head, tail,
perf_data_size(rb),
size, backward)))
goto fail; // not enough space
}
/*
* The above forms a control dependency barrier separating the
* @tail load above from the data stores below. Since the @tail
* load is required to compute the branch to fail below.
*
* A, matches D; the full memory barrier userspace SHOULD issue
* after reading the data and before storing the new tail
* position.
*
* See perf_output_put_handle().
*/
if (!backward)
head += size;
else
head -= size;
// local_cmpxchg() stores head into rb->head only if rb->head still equals offset,
// and returns the previous rb->head; a mismatch means rb->head changed while head
// was being computed, so the loop retries.
// After the loop, rb->head and head hold the advanced position and offset holds
// the value from before the advance.
} while (local_cmpxchg(&rb->head, offset, head) != offset);
// At this point rb->head is the next position to write, while offset is the
// position of the record being written now.
if (backward) {
offset = head;
head = (u64)(-head);
}
/*
* We rely on the implied barrier() by local_cmpxchg() to ensure
* none of the data stores below can be lifted up by the compiler.
*/
// Once head has moved more than rb->watermark past rb->wakeup, advance rb->wakeup;
// perf_output_put_handle() compares it with handle->wakeup to decide on a wakeup.
if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
local_add(rb->watermark, &rb->wakeup);
// page_shift is log2 of the byte size covered by one data_pages[] entry:
// PAGE_SHIFT plus the buffer's page order.
page_shift = PAGE_SHIFT + page_order(rb);
// Compute which data_pages[] entry offset falls into. rb->head is not wrapped when
// it is advanced above, so offset may exceed the buffer size; the mask handles the wrap.
handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
// Offset within that entry; this also takes care of wrap-around.
offset &= (1UL << page_shift) - 1;
// Both buffer allocation schemes are supported here. With page-by-page allocation
// the pages are not virtually contiguous, so data_pages[] records the address of
// each page and rb->nr_pages holds the page count. With a single vmalloc allocation
// all pages are virtually contiguous, nr_pages is fixed to 1 and only data_pages[0]
// (the start address) is needed; see rb_alloc.
// In the contiguous case nr_pages is 1, so handle->page computed above is 0 and the
// address below is rb->data_pages[0] + offset.
// In the non-contiguous case handle->page is the index of the page containing
// offset, and the address below is the write position inside that page.
handle->addr = rb->data_pages[handle->page] + offset;
// Space remaining in the current data_pages[] entry. offset is the position about to
// be written; nothing has been written yet, only the write position is computed here.
handle->size = (1UL << page_shift) - offset;
// If records were lost earlier (e.g. because the buffer was full), have_lost was set
// above to the number of lost records; write a PERF_RECORD_LOST record to the ring
// buffer first.
if (unlikely(have_lost)) {
lost_event.header.size = sizeof(lost_event);
lost_event.header.type = PERF_RECORD_LOST;
lost_event.header.misc = 0;
lost_event.id = event->id; // id of the perf_event that lost records
lost_event.lost = local_xchg(&rb->lost, 0); // number of lost records; resets the counter
/* XXX mostly redundant; @data is already fully initialized */
perf_event_header__init_id(&lost_event.header, data, event);
perf_output_put(handle, lost_event);
perf_event__output_id_sample(event, handle, data);
}
return 0;
fail:
// The write failed for lack of space; count how often this happens.
local_inc(&rb->lost);
// perf_events can share one perf buffer, so the number of lost samples is also
// tracked per perf_event.
atomic64_inc(&event->lost_samples);
perf_output_put_handle(handle);
out:
rcu_read_unlock();
return -ENOSPC;
}
perf_output_sample
這個函數用來根據傳入的header、data等來填充緩衝區。這裏暫時不打算分析,只是關心其中具體寫緩衝區的函數:perf_output_put
以下面的調用爲例:
perf_output_put(handle, data->time);
這個函數的作用是將data->time成員的內容寫入到handle描述的緩衝區中。
展開後得到:
perf_output_copy(handle, &data->time, sizeof(data->time));
這樣看上去會更加清楚。
perf_output_copy的定義如下:
/*
 * Copy @len bytes from @buf into the buffer described by @handle.
 * Thin wrapper that returns the result of __output_copy().
 */
unsigned int perf_output_copy(struct perf_output_handle *handle,
const void *buf, unsigned int len)
{
return __output_copy(handle, buf, len);
}
其中__output_copy進一步展開後得到:
/*
 * Copy @n bytes from @src to @dst with memcpy(). Always returns 0; the
 * caller (__output_copy) treats the return value as the number of bytes
 * that were NOT copied.
 */
static inline unsigned long memcpy_common(void *dst, const void *src, unsigned long n)
{
memcpy(dst, src, n);
return 0;
}
/*
 * Copy @len bytes from @buf to the position described by @handle, moving on
 * to the next data_pages[] entry (wrapping to the first one) whenever the
 * current entry is full.
 *
 * Returns the number of bytes that could not be copied (0 when all of @len
 * was written).
 */
static inline unsigned long __output_copy(struct perf_output_handle *handle, const void *buf, unsigned long len)
{
unsigned long size, written;
do {
// Never copy past the end of the current data_pages[] entry.
size = min(handle->size, len);
// Copy size bytes from buf to handle->addr, which was set up by __perf_output_begin.
written = memcpy_common(handle->addr, buf, size);
// memcpy_common() returns the bytes not copied; convert to bytes written.
written = size - written;
// Bytes still left to copy.
len -= written;
// Advance the destination address.
handle->addr += written;
// Constant condition; presumably left over from expanding the macro that generates this function.
if (true)
buf += written;
// Update the free space remaining in the current entry.
handle->size -= written;
// size reaching 0 means the current entry is used up; move to the next one, wrapping
// back to the first entry after the last, which is what makes this a ring buffer.
if (!handle->size) {
struct perf_buffer *rb = handle->rb;
// Advance the data_pages[] index.
handle->page++;
// For a contiguous buffer nr_pages is 1, so the mask is 0 and handle->page becomes 0,
// since only data_pages[0] exists in that case.
handle->page &= rb->nr_pages - 1;
// Start address of the next entry to write.
handle->addr = rb->data_pages[handle->page];
// Size of one data_pages[] entry: 4096 << page_order(rb) (the literal 12 is
// presumably the expanded PAGE_SHIFT).
handle->size = ((1UL) << 12) << page_order(rb);
}
// Continue while data remains and the last chunk was copied completely.
} while (len && written == size);
return len;
}
在perf_output_sample函數的最後有下面的邏輯:
/* Event-count based wakeup; only used when attr.watermark is not set. */
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
if (wakeup_events) {
struct perf_buffer *rb = handle->rb;
/* Count this event in the buffer-wide counter. */
int events = local_inc_return(&rb->events);
/*
 * Enough events accumulated: subtract one batch and bump rb->wakeup so
 * that it differs from the value recorded in handle->wakeup.
 */
if (events >= wakeup_events) {
local_sub(wakeup_events, &rb->events);
local_inc(&rb->wakeup);
}
}
}
其中wakeup_events表示每累計多少次event就喚醒一次,上面的邏輯比較簡單,當次數累計夠了,其中會對rb->wakeup遞增,在perf_output_put_handle中會根據這個值是否有變化來判斷是否需要喚醒應用。
perf_output_put_handle
- perf_output_end
/*
 * Finish a write started by __perf_output_begin(): release the handle via
 * perf_output_put_handle() and drop the RCU read lock that
 * __perf_output_begin() left held on success.
 */
void perf_output_end(struct perf_output_handle *handle)
{
perf_output_put_handle(handle);
rcu_read_unlock();
}
- perf_output_put_handle
/*
 * Release an output handle.
 *
 * A nested writer (rb->nest > 1) only decrements rb->nest. The outermost
 * writer publishes rb->head to user_page->data_head, retrying if rb->head
 * moved in the meantime, and then calls perf_output_wakeup() when rb->wakeup
 * differs from handle->wakeup. Preemption is re-enabled on every path.
 */
static void perf_output_put_handle(struct perf_output_handle *handle)
{
struct perf_buffer *rb = handle->rb;
unsigned long head;
unsigned int nest;
/*
* If this isn't the outermost nesting, we don't have to update
* @rb->user_page->data_head.
*/
nest = READ_ONCE(rb->nest);
if (nest > 1) {
WRITE_ONCE(rb->nest, nest - 1);
goto out;
}
again:
/*
* In order to avoid publishing a head value that goes backwards,
* we must ensure the load of @rb->head happens after we've
* incremented @rb->nest.
*
* Otherwise we can observe a @rb->head value before one published
* by an IRQ/NMI happening between the load and the increment.
*/
barrier();
head = local_read(&rb->head);
/*
* IRQ/NMI can happen here and advance @rb->head, causing our
* load above to be stale.
*/
/*
* Since the mmap() consumer (userspace) can run on a different CPU:
*
* kernel user
*
* if (LOAD ->data_tail) { LOAD ->data_head
* (A) smp_rmb() (C)
* STORE $data LOAD $data
* smp_wmb() (B) smp_mb() (D)
* STORE ->data_head STORE ->data_tail
* }
*
* Where A pairs with D, and B pairs with C.
*
* In our case (A) is a control dependency that separates the load of
* the ->data_tail and the stores of $data. In case ->data_tail
* indicates there is no room in the buffer to store $data we do not.
*
* D needs to be a full barrier since it separates the data READ
* from the tail WRITE.
*
* For B a WMB is sufficient since it separates two WRITEs, and for C
* an RMB is sufficient since it separates two READs.
*
* See perf_output_begin().
*/
smp_wmb(); /* B, matches C */
// Publish head in the perf_event_mmap_page. The value is not wrapped to the buffer
// size, so userspace has to handle wrap-around itself. It is the NEXT position to
// be written, not the position of the record just written, so userspace has to
// keep track of its own read position.
WRITE_ONCE(rb->user_page->data_head, head);
/*
* We must publish the head before decrementing the nest count,
* otherwise an IRQ/NMI can publish a more recent head value and our
* write will (temporarily) publish a stale value.
*/
barrier();
WRITE_ONCE(rb->nest, 0);
/*
* Ensure we decrement @rb->nest before we validate the @rb->head.
* Otherwise we cannot be sure we caught the 'last' nested update.
*/
barrier();
if (unlikely(head != local_read(&rb->head))) {
WRITE_ONCE(rb->nest, 1);
goto again;
}
// rb->wakeup changed since it was recorded in the handle: wake the consumer.
if (handle->wakeup != local_read(&rb->wakeup))
perf_output_wakeup(handle);
out:
preempt_enable();
}
喚醒應用
在內核寫完一個事件後,最後在調用perf_output_put_handle時,如果發現需要喚醒應用,那麼會調用perf_output_wakeup。
/*
 * Request a wakeup of the buffer's consumer: mark the buffer readable
 * (EPOLLIN), flag the event as having a pending wakeup, and queue the
 * event's pending_irq irq_work, which performs the wakeup later.
 */
static void perf_output_wakeup(struct perf_output_handle *handle)
{
atomic_set(&handle->rb->poll, EPOLLIN);
handle->event->pending_wakeup = 1;
irq_work_queue(&handle->event->pending_irq);
}
在分配perf_event時,給pending_irq設置的是perf_pending_irq:
/*
 * irq_work callback for perf_event::pending_irq. Recovers the event from
 * the embedded irq_work, delivers a pending wakeup if one was requested,
 * then runs __perf_pending_irq(). The software-event recursion context is
 * taken around the work and released only if it was actually acquired.
 */
static void perf_pending_irq(struct irq_work *entry)
{
struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
int rctx;
/*
* If we 'fail' here, that's OK, it means recursion is already disabled
* and we won't recurse 'further'.
*/
rctx = perf_swevent_get_recursion_context();
/*
* The wakeup isn't bound to the context of the event -- it can happen
* irrespective of where the event is.
*/
if (event->pending_wakeup) {
event->pending_wakeup = 0;
// An application blocked in poll() on this event is woken up here.
perf_event_wakeup(event);
}
__perf_pending_irq(event);
if (rctx >= 0)
perf_swevent_put_recursion_context(rctx);
}
應用讀
參考: https://www.cnblogs.com/pengdonglin137/p/17989602
下面兩個參考鏈接中給出了data_tail如何使用。
完。