參考

內核版本

Linux 6.5

作者

正文

初始化

應用調用perf_event_open系統調用讓內核創建perf_event時，內核創建了一個匿名文件，這個文件的file結構體的fops是perf_fops：

// File operations table installed on the anonymous file that backs a
// perf_event fd. Each member points at the perf handler for that operation;
// .mmap (perf_mmap) is the entry point that sets up the ring buffer.
static const struct file_operations perf_fops = {
	.llseek			= no_llseek,
	.release		= perf_release,
	.read			= perf_read,
	.poll			= perf_poll,
	.unlocked_ioctl		= perf_ioctl,
	.compat_ioctl		= perf_compat_ioctl,
	.mmap			= perf_mmap,
	.fasync			= perf_fasync,
};

應用在得到fd後，通過調用mmap來讓內核分配ring buffer，用perf_buffer結構體表示，並且將ring buffer也映射給用戶，後續應用和內核就可以通過共享內存的方式實現數據共享。跟這段緩衝區對應的vma的vm_ops是perf_mmap_vmops.

內核對mmap的長度的規定是必須滿足1+2^n個頁，其中第1個頁只是用來存放結構體perf_event_mmap_page，其中存放的是元信息，後面的2^n個頁用來存放具體的採樣數據。

如果應用在mmap時設置了可寫，那麼perf_buffer的overwrite爲0，表示內核在向緩衝區寫數據時需要跟應用進行同步，防止出現尚未讀取的內容被覆蓋的情況；當然如果應用讀取不及時，會造成緩衝區滿的情況，此時新數據將無法寫入，發生overflow。反之，如果應用以只讀方式mmap，那麼overwrite爲1，內核可以直接覆蓋緩衝區中的舊數據，不關心應用是否已經讀走。

在perf_mmap中分配ring buffer的實現如下：

	// Allocate the ring buffer. The watermark argument is
	// attr.wakeup_watermark only when attr.watermark is set, otherwise 0.
	struct perf_buffer *rb;
	rb = rb_alloc(nr_pages,
				  event->attr.watermark ? event->attr.wakeup_watermark : 0,
				  event->cpu, flags);

	// Attach the new buffer to the event, then refresh the event time and
	// initialise/update the user page.
	ring_buffer_attach(event, rb);
	perf_event_update_time(event);
	perf_event_init_userpage(event);
	perf_event_update_userpage(event);

調用rb_alloc分配出perf_buffer後，在ring_buffer_attach中會將rb賦值給perf_event的rb成員，分配緩衝區有兩種實現方法，在編譯時決定，一種是一頁一頁分配，這種方式會出現頁之間的虛擬地址不連續，所以需要通過data_pages[]數組來記錄每個頁的地址，頁的數量記錄在rb->nr_pages中；第二種是調用vmalloc一次分配完畢，這樣所有這些頁的虛擬地址是連續的，此時nr_pages固定設置爲1，即只需要data_pages[0]，記錄首地址即可，下圖這種是連續的情況：

此外，內核調用perf_event_alloc來分配perf_event時，如果沒有指定overflow_handler處理函數，那麼內核會根據應用傳遞的參數設置默認的handler，假如應用沒有要求按backward的寫方向（即沒有設置attr.write_backward），那麼handler就是perf_event_output_forward。

內核寫

以perf_event_output_forward爲例:

// Forward-direction output handler: delegates to __perf_event_output() with
// perf_output_begin_forward as the "begin" callback. The int result of
// __perf_event_output() is discarded.
void
perf_event_output_forward(struct perf_event *event,
			 struct perf_sample_data *data,
			 struct pt_regs *regs)
{
	__perf_event_output(event, data, regs, perf_output_begin_forward);
}

`__perf_event_output`

// Common body for emitting one sample record: prepare the sample and its
// header, reserve space through @output_begin, write the record, then finish
// the write.
//
// @event:        event the sample belongs to
// @data:         sample data to fill in and emit
// @regs:         register state passed on to the prepare helpers
// @output_begin: callback that reserves header.size bytes and sets up the handle
//
// Returns 0 on success, or the non-zero value returned by @output_begin, in
// which case nothing is written.
static __always_inline int
__perf_event_output(struct perf_event *event,
		    struct perf_sample_data *data,
		    struct pt_regs *regs,
		    int (*output_begin)(struct perf_output_handle *,
					struct perf_sample_data *,
					struct perf_event *,
					unsigned int))
{
	struct perf_output_handle handle;
	struct perf_event_header header;
	int err;

	/* protect the callchain buffers */
	rcu_read_lock();

	// Fill perf_sample_data according to the event's settings and the current
	// context; nothing has been written to the ring buffer yet.
	perf_prepare_sample(data, event, regs);

	// Fill in the perf_event_header; the buffer content is a sequence of
	// records, each one starting with a perf_event_header.
	perf_prepare_header(&header, data, event, regs);

	// For the forward handler this is perf_output_begin_forward; it updates
	// handle, which records the address to write to.
	err = output_begin(&handle, data, event, header.size);
	if (err)
		goto exit;

	// Using the position recorded in handle, write header, data, etc. into
	// the buffer.
	perf_output_sample(&handle, &header, data, event);

	// Update data_head and handle the wakeup.
	perf_output_end(&handle);

exit:
	rcu_read_unlock();
	return err;
}

perf_output_begin_forward

這個函數的作用是更新handle中操作緩衝區的成員，如addr表示要寫入的位置，size表示當前data_pages[]項（頁）中從addr開始剩餘的空間大小，page表示要寫入的page在data_pages[]數組中的索引號。

perf_output_begin_forward

// Reserve @size bytes for a forward write: a thin wrapper around
// __perf_output_begin() with backward == false. Returns its result unchanged.
int perf_output_begin_forward(struct perf_output_handle *handle,
			      struct perf_sample_data *data,
			      struct perf_event *event, unsigned int size)
{
	return __perf_output_begin(handle, data, event, size, false);
}

__perf_output_begin

// Reserve @size bytes in the ring buffer of @event and describe the reserved
// area in @handle (page/addr/size) for the writes that follow.
//
// @handle:   output handle to fill in (rb, event, page, addr, size)
// @data:     sample data; used here only for the PERF_RECORD_LOST id sample
// @event:    event being written; redirected to event->parent when that is set
// @size:     number of bytes to reserve
// @backward: false advances rb->head by @size, true moves it down by @size
//
// Returns 0 on success. In that case the function returns with the RCU read
// lock taken at its start still held and without calling
// perf_output_put_handle(); perf_output_end() does both.
// Returns -ENOSPC when the event has no buffer, the buffer is paused, or (for
// a non-overwrite buffer) there is not enough free space.
static __always_inline int
__perf_output_begin(struct perf_output_handle *handle,
		    struct perf_sample_data *data,
		    struct perf_event *event, unsigned int size,
		    bool backward)
{
	struct perf_buffer *rb;
	unsigned long tail, offset, head;
	int have_lost, page_shift;
	struct {
		struct perf_event_header header;
		u64			 id;
		u64			 lost;
	} lost_event;

	rcu_read_lock();
	/*
	 * For inherited events we send all the output towards the parent.
	 */
	if (event->parent)
		event = event->parent;

	rb = rcu_dereference(event->rb);
	if (unlikely(!rb))
		goto out;

	if (unlikely(rb->paused)) {
		if (rb->nr_pages) {
			local_inc(&rb->lost);
			atomic64_inc(&event->lost_samples);
		}
		goto out;
	}

	handle->rb    = rb;
	handle->event = event;

	have_lost = local_read(&rb->lost);
	if (unlikely(have_lost)) {
		size += sizeof(lost_event);
		if (event->attr.sample_id_all)
			size += event->id_header_size;
	}

	// Disables preemption, increments rb->nest, and records rb->wakeup in
	// handle so that the end of the write can tell whether a wakeup is needed.
	perf_output_get_handle(handle);

	do {
		/* data_tail is read-only on the kernel side; userspace updates it. Initial value is 0. */
		tail = READ_ONCE(rb->user_page->data_tail);

		// head is the offset at which this write will start; initial value is 0.
		// Inside this loop offset keeps the value head had BEFORE it is advanced,
		// so the cmpxchg below can detect whether rb->head changed meanwhile.
		offset = head = local_read(&rb->head);

		// overwrite == 0 means the kernel must check that userspace has already
		// consumed the data before writing, so nothing unread is overwritten.
		// overwrite is 0 when userspace mmap'ed the buffer writable; with a
		// read-only mapping overwrite is 1 and the kernel may overwrite the
		// buffer without caring whether userspace has read it.
		// ring_buffer_has_space() (not shown here) decides whether size bytes
		// fit. For the forward direction it is
		//   CIRC_SPACE(head, tail, perf_data_size(rb)) >= size
		// i.e. ((tail - (head + 1)) & (perf_data_size(rb) - 1)) >= size.
		// It returns false when the data does not fit, true otherwise.
		if (!rb->overwrite) {
			if (unlikely(!ring_buffer_has_space(head, tail,
							    perf_data_size(rb),
							    size, backward)))
				goto fail;  // not enough space
		}

		/*
		 * The above forms a control dependency barrier separating the
		 * @tail load above from the data stores below. Since the @tail
		 * load is required to compute the branch to fail below.
		 *
		 * A, matches D; the full memory barrier userspace SHOULD issue
		 * after reading the data and before storing the new tail
		 * position.
		 *
		 * See perf_output_put_handle().
		 */

		if (!backward)
			head += size;
		else
			head -= size;

	// local_cmpxchg() compares rb->head with offset; if they are equal it
	// stores head into rb->head. It returns the previous value of rb->head,
	// which equals offset on success; any other value means rb->head was
	// updated while head was being computed, and the loop retries.
	// When the loop exits, the value stored to rb->head (head) is the next
	// writable position and offset is the value head had before advancing.
	} while (local_cmpxchg(&rb->head, offset, head) != offset);

	// Forward case: head is the next position to write, offset is where the
	// current write starts. The backward case is adjusted just below.

	if (backward) {
		offset = head;
		head = (u64)(-head);
	}

	/*
	 * We rely on the implied barrier() by local_cmpxchg() to ensure
	 * none of the data stores below can be lifted up by the compiler.
	 */

	if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
		local_add(rb->watermark, &rb->wakeup);

	// page_shift is log2 of the size in bytes covered by one data_pages[]
	// entry, i.e. PAGE_SIZE << page_order(rb).
	// NOTE(review): page_order() is not shown here; presumably it is n for the
	// contiguous layout (one entry spans all 2^n pages) and 0 for the
	// page-by-page layout - confirm against rb_alloc.
	page_shift = PAGE_SHIFT + page_order(rb);

	// Compute which data_pages[] entry offset falls into. head keeps growing
	// monotonically in the forward direction, so offset may already exceed the
	// buffer size; masking with (nr_pages - 1) handles the wrap.
	handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
	// Offset inside that entry; the mask also drops the wrapped high bits.
	offset &= (1UL << page_shift) - 1;
	// This works for both buffer allocation schemes. In the page-by-page
	// scheme the pages are not virtually contiguous, so data_pages[] records
	// the address of every page and rb->nr_pages holds the page count. In the
	// vmalloc scheme everything is allocated at once and is virtually
	// contiguous; nr_pages is fixed to 1 and only data_pages[0] (the start
	// address) is needed - see rb_alloc.
	// Contiguous case: nr_pages is 1, so handle->page computed above is 0 and
	// the address below is rb->data_pages[0] + offset.
	// Non-contiguous case: handle->page is the index of the page containing
	// offset, and the address below is the position inside that page.
	handle->addr = rb->data_pages[handle->page] + offset;
	// Space left from offset to the end of the current data_pages[] entry.
	// Nothing has been written yet; this only records where to write.
	handle->size = (1UL << page_shift) - offset;

	// have_lost holds the rb->lost count read above, i.e. how many earlier
	// writes were dropped. If it is non-zero, emit a PERF_RECORD_LOST record
	// into the ring buffer first.
	if (unlikely(have_lost)) {
		lost_event.header.size = sizeof(lost_event);
		lost_event.header.type = PERF_RECORD_LOST;
		lost_event.header.misc = 0;
		lost_event.id          = event->id;  // id of the perf_event that lost records
		lost_event.lost        = local_xchg(&rb->lost, 0); // number lost; counter reset to 0

		/* XXX mostly redundant; @data is already fully initializes */
		perf_event_header__init_id(&lost_event.header, data, event);
		perf_output_put(handle, lost_event);
		perf_event__output_id_sample(event, handle, data);
	}

	return 0;

fail:
	// The write failed for lack of space; count how often this happens.
	local_inc(&rb->lost);
	// perf_events can share one perf buffer, so the number of lost samples is
	// additionally tracked per perf_event.
	atomic64_inc(&event->lost_samples);
	perf_output_put_handle(handle);
out:
	rcu_read_unlock();

	return -ENOSPC;
}

perf_output_sample

這個函數用來根據傳入的header、data等來填充緩衝區。這裏暫時不打算分析，只是關心其中具體寫緩衝區的函數：perf_output_put

以下面的調用爲例：

perf_output_put(handle, data->time);

這個函數的作用是將data->time成員的內容寫入到handle描述的緩衝區中。

展開後得到：

perf_output_copy(handle, &data->time, sizeof(data->time));

這樣看上去會更加清楚。

perf_output_copy的定義如下：

// Copy @len bytes from @buf into the area described by @handle.
// Thin wrapper: returns whatever __output_copy() returns.
unsigned int perf_output_copy(struct perf_output_handle *handle,
		      const void *buf, unsigned int len)
{
	return __output_copy(handle, buf, len);
}

其中__output_copy進一步展開後得到：

// memcpy() wrapper that always returns 0. The caller (__output_copy) treats
// the return value as the number of bytes NOT copied.
static inline unsigned long memcpy_common(void *dst, const void *src, unsigned long n)
{
	memcpy(dst, src, n);
	return 0;
}

// Copy @len bytes from @buf into the ring buffer at the position described by
// @handle, moving on to the next data_pages[] entry whenever the current one
// fills up. Updates handle->addr, handle->size and handle->page as it goes.
//
// Returns the number of bytes that were not copied (0 when everything was
// written).
static inline unsigned long __output_copy(struct perf_output_handle *handle, const void *buf, unsigned long len)
{
	unsigned long size, written;

	do {
		// Never write past the end of the current entry: clamp the copy to
		// the space left in it (handle->size).
		size = min(handle->size, len);
		// Copy size bytes from buf to handle->addr, which was set up by
		// __perf_output_begin.
		written = memcpy_common(handle->addr, buf, size);
		// memcpy_common() returns the bytes left uncopied (always 0), so
		// written becomes the number of bytes actually copied.
		written = size - written;
		// Bytes still to be copied.
		len -= written;

		// Advance the destination address.
		handle->addr += written;

		// Constant condition left over from the expansion mentioned in the
		// text; the source pointer always advances.
		if (true)
			buf += written;

		// Update the free space left in the current entry.
		handle->size -= written;
		// size == 0 means the current data_pages[] entry is full: move to the
		// next entry, wrapping to index 0 after the last one. This is what
		// makes the buffer a ring.
		if (!handle->size) {
			struct perf_buffer *rb = handle->rb;
			// Advance the data_pages[] index.
			handle->page++;
			// For the contiguous layout nr_pages is 1, so the mask is 0 and
			// handle->page goes back to 0 (only data_pages[0] exists).
			handle->page &= rb->nr_pages - 1;
			// Next write position: start of that entry.
			handle->addr = rb->data_pages[handle->page];
			// Size of one data_pages[] entry.
			// NOTE(review): the literal 12 is presumably PAGE_SHIFT expanded
			// for 4 KiB pages - confirm.
			handle->size = ((1UL) << 12) << page_order(rb);
		}
	// Continue while data remains and the last chunk was copied completely.
	} while (len && written == size);

	return len;
}

在perf_output_sample函數的最後有下面的邏輯：

	// Event-count based wakeup; skipped when attr.watermark is set.
	if (!event->attr.watermark) {
		int wakeup_events = event->attr.wakeup_events;

		if (wakeup_events) {
			struct perf_buffer *rb = handle->rb;
			// Count this event against the buffer.
			int events = local_inc_return(&rb->events);

			// Enough events accumulated: take wakeup_events off the counter
			// and bump rb->wakeup. perf_output_put_handle() compares
			// rb->wakeup with handle->wakeup to decide whether to wake up.
			if (events >= wakeup_events) {
				local_sub(wakeup_events, &rb->events);
				local_inc(&rb->wakeup);
			}
		}
	}

其中wakeup_events表示每累計多少次event就喚醒一次，上面的邏輯比較簡單，當次數累計夠了，其中會對rb->wakeup遞增，在perf_output_put_handle中會根據這個值是否有變化來判斷是否需要喚醒應用。

perf_output_put_handle

perf_output_end

// Finish a write: release the output handle via perf_output_put_handle(), then
// drop the RCU read lock that __perf_output_begin() left held on success.
void perf_output_end(struct perf_output_handle *handle)
{
	perf_output_put_handle(handle);
	rcu_read_unlock();
}

perf_output_put_handle

// Release the output handle at the end of a write.
// A nested writer (rb->nest > 1) only decrements rb->nest. The outermost
// writer publishes rb->head to rb->user_page->data_head, resets rb->nest to 0,
// repeats if rb->head moved in the meantime, and calls perf_output_wakeup()
// when rb->wakeup differs from the value saved in handle->wakeup.
// Both paths end by re-enabling preemption.
static void perf_output_put_handle(struct perf_output_handle *handle)
{
	struct perf_buffer *rb = handle->rb;
	unsigned long head;
	unsigned int nest;

	/*
	 * If this isn't the outermost nesting, we don't have to update
	 * @rb->user_page->data_head.
	 */
	nest = READ_ONCE(rb->nest);
	if (nest > 1) {
		WRITE_ONCE(rb->nest, nest - 1);
		goto out;
	}

again:
	/*
	 * In order to avoid publishing a head value that goes backwards,
	 * we must ensure the load of @rb->head happens after we've
	 * incremented @rb->nest.
	 *
	 * Otherwise we can observe a @rb->head value before one published
	 * by an IRQ/NMI happening between the load and the increment.
	 */
	barrier();
	head = local_read(&rb->head);

	/*
	 * IRQ/NMI can happen here and advance @rb->head, causing our
	 * load above to be stale.
	 */

	/*
	 * Since the mmap() consumer (userspace) can run on a different CPU:
	 *
	 *   kernel				user
	 *
	 *   if (LOAD ->data_tail) {		LOAD ->data_head
	 *			(A)		smp_rmb()	(C)
	 *	STORE $data			LOAD $data
	 *	smp_wmb()	(B)		smp_mb()	(D)
	 *	STORE ->data_head		STORE ->data_tail
	 *   }
	 *
	 * Where A pairs with D, and B pairs with C.
	 *
	 * In our case (A) is a control dependency that separates the load of
	 * the ->data_tail and the stores of $data. In case ->data_tail
	 * indicates there is no room in the buffer to store $data we do not.
	 *
	 * D needs to be a full barrier since it separates the data READ
	 * from the tail WRITE.
	 *
	 * For B a WMB is sufficient since it separates two WRITEs, and for C
	 * an RMB is sufficient since it separates two READs.
	 *
	 * See perf_output_begin().
	 */
	smp_wmb(); /* B, matches C */
	// Publish head into the perf_event_mmap_page (rb->user_page). Note that
	// this value grows monotonically, so userspace has to handle wrap-around
	// itself. It is also the NEXT position to be written, not the position of
	// the record just written, so userspace must keep its own copy of where
	// it last read.
	WRITE_ONCE(rb->user_page->data_head, head);

	/*
	 * We must publish the head before decrementing the nest count,
	 * otherwise an IRQ/NMI can publish a more recent head value and our
	 * write will (temporarily) publish a stale value.
	 */
	barrier();
	WRITE_ONCE(rb->nest, 0);

	/*
	 * Ensure we decrement @rb->nest before we validate the @rb->head.
	 * Otherwise we cannot be sure we caught the 'last' nested update.
	 */
	barrier();
	if (unlikely(head != local_read(&rb->head))) {
		WRITE_ONCE(rb->nest, 1);
		goto again;
	}

	if (handle->wakeup != local_read(&rb->wakeup))
		perf_output_wakeup(handle);

out:
	preempt_enable();
}

喚醒應用

在內核寫完一個事件後，最後在調用perf_output_put_handle時，如果發現需要喚醒應用，那麼會調用perf_output_wakeup。

// Request a wakeup of the consumer: mark the buffer readable for poll
// (EPOLLIN), flag the event as having a pending wakeup, and queue the event's
// pending_irq irq_work, which performs the actual wakeup later.
static void perf_output_wakeup(struct perf_output_handle *handle)
{
	atomic_set(&handle->rb->poll, EPOLLIN);

	handle->event->pending_wakeup = 1;
	irq_work_queue(&handle->event->pending_irq);
}

在分配perf_event時，給pending_irq設置的是perf_pending_irq：

// irq_work callback for perf_event::pending_irq. Delivers a pending wakeup
// (if pending_wakeup is set) and then runs __perf_pending_irq() for the event,
// all inside a software-event recursion context when one could be obtained.
static void perf_pending_irq(struct irq_work *entry)
{
	struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
	int rctx;

	/*
	 * If we 'fail' here, that's OK, it means recursion is already disabled
	 * and we won't recurse 'further'.
	 */
	rctx = perf_swevent_get_recursion_context();

	/*
	 * The wakeup isn't bound to the context of the event -- it can happen
	 * irrespective of where the event is.
	 */
	if (event->pending_wakeup) {
		event->pending_wakeup = 0;
		// If the application is blocked in poll, it gets woken up here.
		perf_event_wakeup(event);
	}

	__perf_pending_irq(event);

	if (rctx >= 0)
		perf_swevent_put_recursion_context(rctx);
}

應用讀

參考： https://www.cnblogs.com/pengdonglin137/p/17989602

下面兩個參考鏈接中給出了data_tail如何使用。

完。

perf_event_open學習 —— 緩衝區管理

參考

內核版本

作者

正文

初始化

內核寫

`__perf_event_output`

perf_output_begin_forward

perf_output_sample

perf_output_put_handle

喚醒應用

應用讀

如何使用 JS 判斷用戶是否處於活躍狀態

lightdb秒級增加列和刪除列（not null帶默認值）

lightdb數據庫超時相關控制參數

通過HPA+CronHPA組合應對業務複雜彈性伸縮場景

❤️‍🔥 Solon Cloud Event 新的事務特性與應用

lightdb mysql 8.0兼容之不可見主鍵

使用 JS 實現在瀏覽器控制檯打印圖片 console.image()

基於Ubuntu-22.04安裝K8s-v1.28.2實驗（四）使用域名訪問網站應用

fedora安裝靜態c和c++庫

遠程管理HP筆記本

Android系統學習 —— 替換crash_dump文件

t620折騰 —— 給虛擬機配置大頁

ubuntu上安裝某個程序的符號表和源碼包

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結