glibc memcpy函数的一些研究

在测试内存(AEP,6*256GB interleaved dax)性能的时候,发现以8字节为单位循环写入的带宽大约是4GB/s;后来无意间改用memcpy,带宽达到了10GB/s,于是顺便研究了一下memcpy函数,记录如下:
glibc的memcpy函数实现如下

/* Copy LEN bytes from SRCPP to DSTPP, front to back, and return DSTPP.

   Copies shorter than OP_T_THRES are done entirely by the final
   BYTE_COPY_FWD.  Longer copies run in stages: a byte copy of
   (-dstp) % OPSIZ bytes, then PAGE_COPY_FWD_MAYBE, then WORD_COPY_FWD,
   and finally a byte copy of whatever LEN still holds.

   OP_T_THRES, OPSIZ and the *_COPY_FWD macros are defined outside this
   excerpt.  NOTE(review): the code relies on those macros advancing DSTP
   and SRCP past the bytes they copied, and on the page and word macros
   storing the remaining byte count through their third argument --
   confirm against their definitions.  */
void *
memcpy (void *dstpp, const void *srcpp, size_t len)
{
  /* Keep both addresses as unsigned integers; the alignment arithmetic
     below operates on these values.  */
  unsigned long int dstp = (long int) dstpp;
  unsigned long int srcp = (long int) srcpp;

  /* Copy from the beginning to the end.  */

  /* If there are not too few bytes to copy, use word copy.  */
  if (len >= OP_T_THRES)
    {
      /* Copy just a few bytes to make DSTP aligned.  (-dstp) % OPSIZ is
	 the distance from DSTP up to the next multiple of OPSIZ, assuming
	 OPSIZ is a power of two -- TODO confirm.  LEN is reduced first,
	 while DSTP still holds its original value.  */
      len -= (-dstp) % OPSIZ;
      BYTE_COPY_FWD (dstp, srcp, (-dstp) % OPSIZ);

      /* Copy whole pages from SRCP to DSTP by virtual address manipulation,
	 as much as possible.  LEN is passed both as the "bytes left"
	 output and as the "bytes to copy" input.  */

      PAGE_COPY_FWD_MAYBE (dstp, srcp, len, len);

      /* Copy from SRCP to DSTP taking advantage of the known alignment of
	 DSTP.  Number of bytes remaining is put in the third argument,
	 i.e. in LEN.  This number may vary from machine to machine.  */

      WORD_COPY_FWD (dstp, srcp, len, len);

      /* Fall out and copy the tail.  */
    }

  /* There are just a few bytes to copy.  Use byte memory operations.  */
  BYTE_COPY_FWD (dstp, srcp, len);

  /* The original destination pointer is returned, not the advanced
     address kept in DSTP.  */
  return dstpp;
}

主要是在拷贝大块的时候会使用PAGE_COPY_FWD_MAYBE,定义如下

/* Copy as many whole pages as possible from SRCP to DSTP with
   PAGE_COPY_FWD, when that is worthwhile.

   DSTP, SRCP   lvalues holding the destination and source addresses
		as integers.
   NBYTES_LEFT  lvalue that receives the number of bytes still to be
		copied once the page copy has been attempted.
   NBYTES       lvalue holding the number of bytes to copy; it is
		reduced by the bytes copied before the first page
		boundary.

   Nothing happens (and no argument is written) unless NBYTES is at
   least PAGE_COPY_THRESHOLD and DSTP and SRCP have the same offset
   within a page.

   NBYTES_LEFT and NBYTES may name the same variable (memcpy passes LEN
   for both).  WORD_COPY_FWD leaves its remainder in NBYTES_LEFT, which
   is asserted to be 0 below, so in that case NBYTES has been
   overwritten by the time the leading bytes are subtracted.  The byte
   count is therefore saved in a local first and NBYTES is recomputed
   from the saved value.  */
# define PAGE_COPY_FWD_MAYBE(dstp, srcp, nbytes_left, nbytes) \
  do \
    { \
      if ((nbytes) >= PAGE_COPY_THRESHOLD && \
          PAGE_OFFSET ((dstp) - (srcp)) == 0) \
        { \
          /* The amount to copy is past the threshold for copying \
             pages virtually with kernel VM operations, and the \
             source and destination addresses have the same alignment.  */ \
          size_t nbytes_total = (nbytes); \
          size_t nbytes_before = PAGE_OFFSET (-(dstp)); \
          if (nbytes_before != 0) \
            { \
              /* First copy the words before the first page boundary.  */ \
              WORD_COPY_FWD (dstp, srcp, nbytes_left, nbytes_before); \
              assert ((nbytes_left) == 0); \
              /* Use the saved count: NBYTES may alias NBYTES_LEFT.  */ \
              (nbytes) = nbytes_total - nbytes_before; \
            } \
          PAGE_COPY_FWD (dstp, srcp, nbytes_left, nbytes); \
        } \
    } while (0)

再看PAGE_COPY_FWD宏的定义(注意这段代码包含了<mach.h>,是Mach平台相关的实现)

#include <mach.h>

/* Threshold at which vm_copy is more efficient than well-optimized copying
   by words.  */
#define PAGE_COPY_THRESHOLD		(16384)

/* PAGE_SIZE is an alias for __vm_page_size.  */
#define PAGE_SIZE		__vm_page_size

/* Attempt to copy trunc_page (NBYTES) bytes from SRCP to DSTP with one
   __vm_copy call, passing __mach_task_self () as the task, and store
   in NBYTES_LEFT the number of bytes that call did not cover:

     - NBYTES - trunc_page (NBYTES) if __vm_copy returned KERN_SUCCESS;
     - NBYTES otherwise (nothing is counted as copied).

   The macro is an expression whose value is the value assigned to
   NBYTES_LEFT.  NBYTES is evaluated more than once, so it must be free
   of side effects.  SRCP and DSTP are evaluated once each and are only
   read; they are parenthesized before the cast so that an expression
   argument is converted as a whole.

   NOTE(review): this macro does not advance SRCP or DSTP past the
   copied pages -- confirm that code which keeps copying afterwards
   accounts for that.  */
#define PAGE_COPY_FWD(dstp, srcp, nbytes_left, nbytes)			      \
  ((nbytes_left) = ((nbytes) -						      \
		    (__vm_copy (__mach_task_self (),			      \
				(vm_address_t) (srcp), trunc_page (nbytes),   \
				(vm_address_t) (dstp)) == KERN_SUCCESS	      \
		     ? trunc_page (nbytes)				      \
		     : 0)))

可以看到使用的是__vm_copy,对它的解释参考这里
在这里插入图片描述
在这里插入图片描述
movq src, dst (源地址, 目的地址)
虽然同样是每次8B的拷贝,但可能是预取使得性能提上去了。(不过这个解释好像也不太对:预取的是源地址的数据,而我写入的是立即数/常数。是循环(分支预测)的问题吗?我按照这里的做法以8个一组循环测试,也只有3-4GB/s的带宽。那难道是memcpy没有真正完成复制?难顶…)

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章