atom_inc

當然首先還是說一下,好像在RISC 思想中, 使用原子交換好像並不能讓 CPU在處理流程上變得更快更高效.

反而浪費了CPU的處理能力, 下面是轉載

===================================================================

但是隨着計算機硬件的快速發展，獲得這種鎖的開銷相對於CPU的速度在成倍地增加，原因很簡單，CPU的速度與訪問內存的速度差距越來越大，而這種鎖使用了原子操作指令，它需要原子地訪問內存，也就說獲得鎖的開銷與訪存速度相關，另外在大部分非x86架構上獲取鎖使用了內存柵(Memory Barrier)，這會導致處理器流水線停滯或刷新，因此它的開銷相對於CPU速度而言就越來越大。表1數據證明了這一點。

表1是在700MHz的奔騰III機器上的基本操作的開銷，在該機器上一個時鐘週期能夠執行兩條整數指令。在1.8GHz的奔騰4機器上, 原子加1指令的開銷要比700MHz的奔騰III機器慢75納秒(ns)，儘管CPU速度快兩倍多。

這種鎖機制的另一個問題在於其可擴展性，在多處理器系統上，可擴展性非常重要，否則根本無法發揮其性能。圖1表明了Linux上各種鎖的擴展性。

===================================================================

怎麼說呢，至少我現在用 pthread_spin_lock 還是比較多的，我也看了 glibc 的實現，確實是上面說的那種，

但是我同樣堅持：能不用鎖的地方就不用，鎖的粒度按照事件概率來改造 (堅決不用 pthread rw 鎖等)，或者和內核一樣把鎖退化爲 RCU

3 常用指令

3.1 數據交換指令

爲什麼要提這幾個呢？因爲它們能實現原子操作

普通的交換或者相加,比如:

movl %eax,%ecx

movl %ebx,%eax

movl %ecx,%ebx

CPU 隨時可能被討厭的總線信號中斷 ,或許你可以屏蔽,但是效率不高,於是

XCHG : 兩個寄存器之間或寄存器與內存之間

當 XCHG 有一個操作數在內存中時，處理器會自動加上 LOCK 信號，防止 SMP 上其他的處理器同時訪問該內存

BSWAP: 交換32位寄存器中的字節序(little-endian <=> big-endian)

XADD : 交換兩個值，並把兩者的總和存儲在目標操作數中

CMPXCHG :把一個值和一個外部值進行比較並且交換他和另一個值(重要)

比較目標操作數和EAX 寄存器中的值,

1 相等 : 就把源操作數的值加載到目標操作數中

2 不等 : 把目標操作數加載到EAX中

cmpxchg sour ,dest

比如你想實現下面的功能把111 弄到數組最後一個

 my @arr = qw/ 111 32 9 22/;
 my $index = 0;
 foreach(@arr){
  if($arr[$index] ge $arr[index+1]){
    ($arr[$index] , $arr[index+1]) = ($arr[$index+1] , $arr[index]) ;
  $index++;
  .....
}}

movl (%esi) , %eax  ! esi 爲數組arr首地址
cmp %eax , 4(%esi)  !相當於上面第4行
jgp leav
xchg %eax , 4(%esi)  !體會一下
movl %eax ,(%esi)     !
leav

其實 NPTL 的線程庫中就是用這條指令來實現原子交換的，看看他怎麼寫的，學習學習

宏: atomic_exchange_acq

/*
 * atomic_exchange_acq(mem, newvalue)
 *
 * Stores newvalue into *mem with a single x86 xchg instruction and
 * evaluates to the value *mem held before the exchange.
 *
 * The instruction width is chosen from sizeof (*mem): 1 -> xchgb,
 * 2 -> xchgw, 4 -> xchgl, anything else -> xchgq (newvalue is cast to
 * long in that last branch; presumably only 8-byte objects are expected
 * to reach it -- TODO confirm against callers).
 *
 * The "0" input constraint places newvalue in the same register as the
 * output operand `result`, so after the swap that register contains the
 * old memory contents.  No explicit lock prefix is emitted; xchg with a
 * memory operand is locked implicitly by the processor.
 *
 * Relies on the GNU statement-expression and __typeof extensions.
 * `mem` appears several times in the expansion, so pass an expression
 * without side effects.
 */
#define atomic_exchange_acq(mem, newvalue) \
  ({ __typeof (*mem) result;						      \
     if (sizeof (*mem) == 1)						      \
       __asm __volatile ("xchgb %b0, %1"				      \
			 : "=q" (result), "=m" (*mem)			      \
			 : "0" (newvalue), "m" (*mem));			      \
     else if (sizeof (*mem) == 2)					      \
       __asm __volatile ("xchgw %w0, %1"				      \
			 : "=r" (result), "=m" (*mem)			      \
			 : "0" (newvalue), "m" (*mem));			      \
     else if (sizeof (*mem) == 4)					      \
       __asm __volatile ("xchgl %0, %1"					      \
			 : "=r" (result), "=m" (*mem)			      \
			 : "0" (newvalue), "m" (*mem));			      \
     else								      \
       __asm __volatile ("xchgq %q0, %1"				      \
			 : "=r" (result), "=m" (*mem)			      \
			 : "0" ((long) (newvalue)), "m" (*mem));	      \
     result; })

很簡單吧先判斷類型然後 xchg? 嘿嘿至於怎麼一路走過來的,就自己用vim 跟吧

同時也發現一個更強大的指令

/*
 * __arch_compare_and_exchange_val_32_acq(mem, newval, oldval)
 *
 * 32-bit compare-and-exchange built on "cmpxchgl", prefixed with
 * LOCK_PREFIX (defined elsewhere; not visible in this excerpt).
 *
 * oldval is loaded into %eax (the "0" constraint ties it to the "=a"
 * output `ret`).  cmpxchgl then compares %eax with *mem:
 *   - equal:     newval is stored into *mem, %eax keeps oldval;
 *   - not equal: *mem is left alone and its value is loaded into %eax.
 * Either way the macro evaluates to the value *mem held before the
 * instruction, so the exchange happened iff the result equals oldval.
 *
 * Relies on the GNU statement-expression and __typeof extensions.
 */
# define __arch_compare_and_exchange_val_32_acq(mem, newval, oldval) \
  ({ __typeof (*mem) ret;            \
     __asm __volatile (LOCK_PREFIX "cmpxchgl %2, %1"         \
         : "=a" (ret), "=m" (*mem)         \
         : "r" (newval), "m" (*mem), "0" (oldval));       \
     ret; })

類似的在用 gcc -O3 優化選項後會用一個指令 cmovl %edx, %eax

代替比較跳轉賦值 ~

當然對於用這麼高的級別優化,最好了解每個優化子選項的意義. 不瞭解的話, 我個人感覺最好多用用__volatile__ 不然發生了死循環就不好了....

然後我們再來看看 nginx 裏面的原子操作

/*
 * NGX_SMP_LOCK is a string-literal fragment pasted in front of the
 * instructions in the inline asm blocks of this file: it is the "lock;"
 * prefix when NGX_SMP is enabled and expands to nothing otherwise.
 */
#if (NGX_SMP)
#define NGX_SMP_LOCK  "lock;"
#else
#define NGX_SMP_LOCK
#endif


/*
 * ngx_atomic_cmp_set(lock, old, set): compare-and-set on *lock.
 *
 * If *lock equals old, set is stored into *lock and the function returns
 * nonzero (1); otherwise *lock is left as it was and the function
 * returns 0.  The instruction is prefixed with NGX_SMP_LOCK.
 *
 * "cmpxchgl  r, [m]":
 *
 *     if (eax == [m]) {
 *         zf = 1;
 *         [m] = r;
 *     } else {
 *         zf = 0;
 *         eax = [m];
 *     }
 *
 * "sete" then copies zf into the result byte.
 *
 * The "r" means the general register.
 * The "=a" and "a" are the %eax register.
 * Although we can return result in any register, we use "a" because it is
 * used in cmpxchgl anyway.  The result is actually in %al but not in %eax,
 * however, as the code is inlined gcc can test %al as well as %eax,
 * and icc adds "movzbl %al, %eax" by itself.
 *
 * The "cc" means that flags were changed.
 * *lock is listed only as an "m" input although the instruction may write
 * it; the "memory" clobber is what tells the compiler memory was modified.
 */

static ngx_inline ngx_atomic_uint_t
ngx_atomic_cmp_set(ngx_atomic_t *lock, ngx_atomic_uint_t old,
    ngx_atomic_uint_t set)
{
    u_char  res;

    __asm__ volatile (

         NGX_SMP_LOCK
    "    cmpxchgl  %3, %1;   "
    "    sete      %0;       "

    : "=a" (res) : "m" (*lock), "a" (old), "r" (set) : "cc", "memory");

    return res;
}


/*
 * "xaddl  r, [m]":
 *
 *     temp = [m];
 *     [m] += r;
 *     r = temp;
 *
 *
 * The "+r" means the general register.
 * The "cc" means that flags were changed.
 */


#if !(( __GNUC__ == 2 && __GNUC_MINOR__ <= 7 ) || ( __INTEL_COMPILER >= 800 ))

/*
 * icc 8.1 and 9.0 compile broken code with -march=pentium4 option:
 * ngx_atomic_fetch_add() always return the input "add" value,
 * so we use the gcc 2.7 version.
 *
 * icc 8.1 and 9.0 with -march=pentiumpro option or icc 7.1 compile
 * correct code.
 */

/*
 * ngx_atomic_fetch_add(value, add): adds add to *value and returns the
 * value *value held before the addition.
 *
 * "xaddl" leaves the previous memory contents in its register operand.
 * That operand is `add` itself (read-write, "+r"), so returning `add`
 * after the instruction yields the old value.  The instruction is
 * prefixed with NGX_SMP_LOCK.  *value is listed only as an "m" input;
 * the "memory" clobber tells the compiler that memory was modified.
 */
static ngx_inline ngx_atomic_int_t
ngx_atomic_fetch_add(ngx_atomic_t *value, ngx_atomic_int_t add)
{
    __asm__ volatile (

         NGX_SMP_LOCK
    "    xaddl  %0, %1;   "

    : "+r" (add) : "m" (*value) : "cc", "memory");

    return add;
}


#else

/*
 * gcc 2.7 does not support "+r", so we have to use the fixed
 * %eax ("=a" and "a") and this adds two superfluous instructions in the end
 * of code, something like this: "mov %eax, %edx / mov %edx, %eax".
 */

/*
 * ngx_atomic_fetch_add(value, add): adds add to *value and returns the
 * value *value held before the addition.
 *
 * Variant that avoids the "+r" constraint: add is passed in %eax ("a")
 * and the result is read back from %eax ("=a") into `old`, because
 * "xaddl" leaves the previous memory contents in its register operand.
 * The instruction is prefixed with NGX_SMP_LOCK.  `old` is declared
 * ngx_atomic_uint_t and is converted to the ngx_atomic_int_t return type
 * on return.
 */
static ngx_inline ngx_atomic_int_t
ngx_atomic_fetch_add(ngx_atomic_t *value, ngx_atomic_int_t add)
{
    ngx_atomic_uint_t  old;

    __asm__ volatile (

         NGX_SMP_LOCK
    "    xaddl  %2, %1;   "

    : "=a" (old) : "m" (*value), "a" (add) : "cc", "memory");

    return old;
}

#endif


/*
 * ngx_memory_barrier(): compiler-level barrier only.
 *
 * on x86 the write operations go in a program order, so we need only
 * to disable the gcc reorder optimizations.  The macro expands to an
 * empty asm statement with a "memory" clobber; no instruction is emitted.
 */

#define ngx_memory_barrier()    __asm__ volatile ("" ::: "memory")

/* old "as" does not support "pause" opcode */
#define ngx_cpu_pause()         __asm__ (".byte 0xf3, 0x90")

如果你不想在代碼裏面嵌入 assembly，也可以用 gcc 4 提供的特性

http://gcc.gnu.org/onlinedocs/gcc-4.5.2/gcc/Atomic-Builtins.html#Atomic-Builtins 寫道

type __sync_fetch_and_add (type *ptr, type value, ...)
type __sync_fetch_and_sub (type *ptr, type value, ...)
type __sync_fetch_and_or (type *ptr, type value, ...)
type __sync_fetch_and_and (type *ptr, type value, ...)
type __sync_fetch_and_xor (type *ptr, type value, ...)
type __sync_fetch_and_nand (type *ptr, type value, ...)

These builtins perform the operation suggested by the name, and returns the value that had previously been in memory. That is,

          { tmp = *ptr; *ptr op= value; return tmp; }
          { tmp = *ptr; *ptr = ~(tmp & value); return tmp; }   // nand

「Pygors跨平臺GUI」2：安裝MinGW-w64、MSYS2還是WSL2

一鍵自動化博客發佈工具,用過的人都說好(掘金篇)

[轉帖]

python列出centos7內存使用前50的進程信息

「Pygors跨平臺GUI」1：Pygors跨平臺GUI應用研究

Java ThreadPoolShutdown

5月21日相聚上海張江！與文心大模型一起共建大模型產業應用生態圈

通義千問 2.5 “客串” ChatGPT4，你分的清嗎？

“她”來了，陪伴賽道鉅變！爲GPT-4o加上你的一個數字分身

京東秒送售後系統退款業務重構心得| 京東零售技術團隊

Mutex Subsystem

騰訊用 AI 自動創作內容，離取代人類還有多遠？

Gluster ABC

【內容算法】內容質量之標題黨

Glusterfs 3.2

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結