glibc memcpy函數的一些研究

在測試內存(AEP,6*256GB interleaved dax)性能的時候,發現通過8B循環寫的帶寬大概是4GB/s,然後無意間用了一下memcpy,發現帶寬達到了10GB/s,就順便研究了一下memcpy函數,做個記錄如下:
glibc的memcpy函數實現如下

/* Copy LEN bytes from SRCPP to DSTPP, working from the lowest address
   upwards, and return DSTPP.  */
void *
memcpy (void *dstpp, const void *srcpp, size_t len)
{
  /* Integer cursors handed to the copy macros below.  */
  unsigned long int dst_addr = (long int) dstpp;
  unsigned long int src_addr = (long int) srcpp;

  /* Only take the word-copy path when there are enough bytes.  */
  if (len >= OP_T_THRES)
    {
      /* Number of leading bytes to copy one at a time so that the
	 destination cursor becomes a multiple of OPSIZ.  */
      const unsigned long int head = (-dst_addr) % OPSIZ;

      len -= head;
      BYTE_COPY_FWD (dst_addr, src_addr, head);

      /* Give the page-copy macro a chance to move whole pages; it is
	 passed LEN both as the byte count and as the place to store
	 what is left.  */
      PAGE_COPY_FWD_MAYBE (dst_addr, src_addr, len, len);

      /* Word-wise copy relying on the destination alignment set up
	 above.  The count of bytes still to copy comes back in LEN
	 (third argument).  */
      WORD_COPY_FWD (dst_addr, src_addr, len, len);
    }

  /* Whatever remains (or everything, for a short copy) goes byte by
     byte.  */
  BYTE_COPY_FWD (dst_addr, src_addr, len);

  return dstpp;
}

主要是在拷貝大塊的時候會使用PAGE_COPY_FWD_MAYBE,定義如下

/* PAGE_COPY_FWD_MAYBE (dstp, srcp, nbytes_left, nbytes)

   Attempt a page-granular forward copy when it is worthwhile.  Work is
   done only if NBYTES is at least PAGE_COPY_THRESHOLD and DSTP - SRCP has
   a zero PAGE_OFFSET; otherwise the macro does nothing.

   When the test passes, PAGE_OFFSET (-(dstp)) bytes (if non-zero) are
   first copied with WORD_COPY_FWD and subtracted from NBYTES, after which
   the rest is handed to PAGE_COPY_FWD.  NBYTES is modified, so it must be
   a modifiable lvalue, as must NBYTES_LEFT.  */
# define PAGE_COPY_FWD_MAYBE(dstp, srcp, nbytes_left, nbytes)		      \
  do									      \
    {									      \
      if ((nbytes) >= PAGE_COPY_THRESHOLD &&				      \
	  PAGE_OFFSET ((dstp) - (srcp)) == 0) 				      \
	{								      \
	  /* The amount to copy is past the threshold for copying	      \
	     pages virtually with kernel VM operations, and the		      \
	     source and destination addresses have the same alignment.  */    \
	  size_t nbytes_before = PAGE_OFFSET (-(dstp));			      \
	  if (nbytes_before != 0)					      \
	    {								      \
	      /* First copy the words before the first page boundary.  */     \
	      WORD_COPY_FWD (dstp, srcp, nbytes_left, nbytes_before);	      \
	      assert (nbytes_left == 0);				      \
	      nbytes -= nbytes_before;					      \
	    }								      \
	  PAGE_COPY_FWD (dstp, srcp, nbytes_left, nbytes);		      \
	}								      \
    } while (0)

再來看PAGE_COPY_FWD宏(注意它是宏,不是函數)的定義:

#include <mach.h>

/* Threshold at which vm_copy is more efficient than well-optimized copying
   by words.  PAGE_COPY_FWD_MAYBE compares its byte count against this
   value.  */
#define PAGE_COPY_THRESHOLD		(16384)

/* PAGE_SIZE is an alias for __vm_page_size (presumably the Mach VM page
   size made visible by <mach.h> -- confirm).  */
#define PAGE_SIZE		__vm_page_size
/* PAGE_COPY_FWD (dstp, srcp, nbytes_left, nbytes)

   Ask __vm_copy to copy trunc_page (NBYTES) bytes from SRCP to DSTP in
   the current task.

   On KERN_SUCCESS, SRCP and DSTP are advanced past the copied bytes and
   NBYTES_LEFT receives NBYTES minus the amount copied.  On any other
   result SRCP and DSTP are left alone and NBYTES_LEFT is set to NBYTES,
   so the caller's ordinary copy loops handle everything.

   Advancing the cursors matters: callers continue with WORD_COPY_FWD /
   BYTE_COPY_FWD on the same DSTP and SRCP, and without the advance the
   remaining tail would be copied from and to the start of the region
   that was just page-copied.

   SRCP, DSTP and NBYTES_LEFT must be modifiable lvalues.  NBYTES_LEFT
   and NBYTES may name the same object.  The macro remains a single
   expression whose value is the new NBYTES_LEFT.  */
#define PAGE_COPY_FWD(dstp, srcp, nbytes_left, nbytes)			      \
  ((nbytes_left) = ((nbytes) -						      \
		    (__vm_copy (__mach_task_self (),			      \
				(vm_address_t) (srcp), trunc_page (nbytes),   \
				(vm_address_t) (dstp)) == KERN_SUCCESS	      \
		     ? ((srcp) += trunc_page (nbytes),			      \
			(dstp) += trunc_page (nbytes),			      \
			trunc_page (nbytes))				      \
		     : 0)))

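可以看到使用的是__vm_copy,對它的解釋參考這裏。需要注意的是,這份定義包含了<mach.h>並調用了__mach_task_self(),是Mach(GNU/Hurd)平台專用的實現;在Linux/x86-64上memcpy很可能不會走這條路徑(x86-64有獨立的彙編優化實現),所以它未必能解釋上面測到的帶寬差異。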
在這裏插入圖片描述
在這裏插入圖片描述
movq src, dst (源地址, 目的地址)
雖然也是8B的拷貝,但可能是預取使得性能提上去了。(好像也不太對:預取的是源地址的東西,但我用的是立即數/常數。是循環(分支預測)的問題?我按照這裏的8個一組循環測試,也只有3-4GB/s的帶寬。那不然,難道是memcpy沒有真正地複製?難頂…)

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章