當然首先還是說 一下,好像在RISC 思想中, 使用 原子交換好像並不能讓 CPU在處理流程上變得更快更高效.
反而浪費了CPU的處理能力, 下面是轉載
===================================================================
但是隨着計算機硬件的快速發展,獲得這種鎖的開銷相對於CPU的速度在成倍地增加,原因很簡單,CPU的速度與訪問內存的速度差距越來越大,而這種鎖使用了原子操作指令,它需要原子地訪問內存,也就說獲得鎖的開銷與訪存速度相關,另外在大部分非x86架構上獲取鎖使用了內存柵(Memory Barrier),這會導致處理器流水線停滯或刷新,因此它的開銷相對於CPU速度而言就越來越大。表1數據證明了這一點。
表1是在700MHz的奔騰III機器上的基本操作的開銷,在該機器上一個時鐘週期能夠執行兩條整數指令。在1.8GHz的奔騰4機器上, 原子加1指令的開銷要比700MHz的奔騰III機器慢75納秒(ns),儘管CPU速度快兩倍多。
這種鎖機制的另一個問題在於其可擴展性,在多處理器系統上,可擴展性非常重要,否則根本無法發揮其性能。圖1表明了Linux上各種鎖的擴展性。
===================================================================
怎麼說呢,至少我現在用 pthread_spin_lock 還是比較多的,我也看了 glibc 的實現,確實是上面說的那種,
但是我同樣堅持能不用鎖 的地方就不用 粒度按照事件概率改造鎖 (堅決不用pthread rw鎖 等),或者和內核一樣退化鎖 爲 RCU
3 常用指令
3.1 數據交換指令
爲什麼要提這幾個呢?因爲它們能實現原子操作
普通的交換 或者相加,比如:
movl %eax,%ecx
movl %ebx,%eax
movl %ecx,%ebx
CPU 隨時可能被討厭的總線信號中斷 ,或許你可以 屏蔽,但是效率不高,於是
XCHG : 兩個寄存器之間 或寄存器與內存之間
當其中一個操作數是內存時,處理器會自動加上 LOCK 語義,防止 SMP 上其他的處理器同時訪問該內存
BSWAP: 交換32位寄存器中的字節序(little-endian <=> big-endian)
XADD : 交換兩個值,並把總和存儲在目標操作數中
CMPXCHG :把一個值和一個外部值進行比較並且交換他和另一個值(重要)
比較目標操作數和EAX 寄存器中的值,
1 相等 : 就把源操作數的值加載到目標操作數中
2 不等 : 把目標操作數加載到EAX中
cmpxchg sour ,dest
比如你想實現下面的功能 把111 弄到數組最後一個
my @arr = qw/ 111 32 9 22/;    # goal: move 111 to the last slot
my $index = 0;
foreach (@arr) {
    if ($index < $#arr && $arr[$index] >= $arr[$index+1]) {    # numeric compare, stay inside the array
        # Swap the two neighbours, then follow the larger value one slot to the right.
        ($arr[$index], $arr[$index+1]) = ($arr[$index+1], $arr[$index]);
        $index++;
        # .....
    }
}
movl (%esi) , %eax ! esi 爲數組arr首地址 cmp %eax , 4(%esi) !相當於上面第4行 jgp leav xchg %eax , 4(%esi) !體會一下 movl %eax ,(%esi) ! leav
其實 NPTL 的線程庫中就是用這類指令來實現原子交換的,看看他怎麼寫的,學習學習
宏: atomic_exchange_acq
/*
 * atomic_exchange_acq (mem, newvalue)
 *
 * Store NEWVALUE into *MEM and evaluate to the value *MEM held before the
 * store.  The instruction width is selected at compile time from
 * sizeof (*mem): 1 -> xchgb, 2 -> xchgw, 4 -> xchgl, anything else -> xchgq.
 *
 * No lock prefix is written out: on x86, xchg with a memory operand is
 * executed with locked semantics implicitly.
 *
 * Operand notes:
 *   "=q"  byte variant needs a register that has a byte sub-register;
 *   "0"   ties NEWVALUE to the same register as the result, so the
 *         exchanged-out value lands directly in `result`;
 *   *mem appears as both "=m" output and "m" input because it is read
 *         and written by the instruction.
 */
#define atomic_exchange_acq(mem, newvalue) \
  ({ __typeof (*mem) result; \
     if (sizeof (*mem) == 1) \
       __asm __volatile ("xchgb %b0, %1" \
                         : "=q" (result), "=m" (*mem) \
                         : "0" (newvalue), "m" (*mem)); \
     else if (sizeof (*mem) == 2) \
       __asm __volatile ("xchgw %w0, %1" \
                         : "=r" (result), "=m" (*mem) \
                         : "0" (newvalue), "m" (*mem)); \
     else if (sizeof (*mem) == 4) \
       __asm __volatile ("xchgl %0, %1" \
                         : "=r" (result), "=m" (*mem) \
                         : "0" (newvalue), "m" (*mem)); \
     else \
       __asm __volatile ("xchgq %q0, %1" \
                         : "=r" (result), "=m" (*mem) \
                         : "0" ((long) (newvalue)), "m" (*mem)); \
     result; })
很簡單吧 先判斷類型然後 xchg? 嘿嘿 至於怎麼一路走過來的,就自己用vim 跟吧
同時也發現一個更強大的指令
/*
 * __arch_compare_and_exchange_val_32_acq (mem, newval, oldval)
 *
 * 32-bit compare-and-swap built on cmpxchgl: if *MEM equals OLDVAL, NEWVAL
 * is stored into *MEM.  The expression evaluates to the value *MEM held
 * before the instruction, so the caller detects success by comparing the
 * result with OLDVAL.
 *
 * Operand notes:
 *   "=a" / "0"  OLDVAL is loaded into %eax and the result is read back from
 *               %eax, the register cmpxchg compares against and reloads on
 *               mismatch;
 *   "r"         NEWVAL may live in any general register.
 *
 * LOCK_PREFIX is not defined in this snippet; it has to be provided by the
 * surrounding code.
 */
# define __arch_compare_and_exchange_val_32_acq(mem, newval, oldval) \
  ({ __typeof (*mem) ret; \
     __asm __volatile (LOCK_PREFIX "cmpxchgl %2, %1" \
                       : "=a" (ret), "=m" (*mem) \
                       : "r" (newval), "m" (*mem), "0" (oldval)); \
     ret; })
類似的在用 gcc -O3 優化選項後 會用 一個指令 cmovl %edx, %eax
代替 比較 跳轉 賦值 ~
當然對於用這麼高的級別優化,最好了解每個優化子選項的意義. 不瞭解的話, 我個人感覺最好多用用__volatile__ 不然發生了死循環就不好了....
然後我們在來看看 nginx 裏面的 原子操作
#if (NGX_SMP) #define NGX_SMP_LOCK "lock;" #else #define NGX_SMP_LOCK #endif /* * "cmpxchgl r, [m]": * * if (eax == [m]) { * zf = 1; * [m] = r; * } else { * zf = 0; * eax = [m]; * } * * * The "r" means the general register. * The "=a" and "a" are the %eax register. * Although we can return result in any register, we use "a" because it is * used in cmpxchgl anyway. The result is actually in %al but not in %eax, * however, as the code is inlined gcc can test %al as well as %eax, * and icc adds "movzbl %al, %eax" by itself. * * The "cc" means that flags were changed. */ static ngx_inline ngx_atomic_uint_t ngx_atomic_cmp_set(ngx_atomic_t *lock, ngx_atomic_uint_t old, ngx_atomic_uint_t set) { u_char res; __asm__ volatile ( NGX_SMP_LOCK " cmpxchgl %3, %1; " " sete %0; " : "=a" (res) : "m" (*lock), "a" (old), "r" (set) : "cc", "memory"); return res; } /* * "xaddl r, [m]": * * temp = [m]; * [m] += r; * r = temp; * * * The "+r" means the general register. * The "cc" means that flags were changed. */ #if !(( __GNUC__ == 2 && __GNUC_MINOR__ <= 7 ) || ( __INTEL_COMPILER >= 800 )) /* * icc 8.1 and 9.0 compile broken code with -march=pentium4 option: * ngx_atomic_fetch_add() always return the input "add" value, * so we use the gcc 2.7 version. * * icc 8.1 and 9.0 with -march=pentiumpro option or icc 7.1 compile * correct code. */ static ngx_inline ngx_atomic_int_t ngx_atomic_fetch_add(ngx_atomic_t *value, ngx_atomic_int_t add) { __asm__ volatile ( NGX_SMP_LOCK " xaddl %0, %1; " : "+r" (add) : "m" (*value) : "cc", "memory"); return add; } #else /* * gcc 2.7 does not support "+r", so we have to use the fixed * %eax ("=a" and "a") and this adds two superfluous instructions in the end * of code, something like this: "mov %eax, %edx / mov %edx, %eax". 
*/ static ngx_inline ngx_atomic_int_t ngx_atomic_fetch_add(ngx_atomic_t *value, ngx_atomic_int_t add) { ngx_atomic_uint_t old; __asm__ volatile ( NGX_SMP_LOCK " xaddl %2, %1; " : "=a" (old) : "m" (*value), "a" (add) : "cc", "memory"); return old; } #endif /* * on x86 the write operations go in a program order, so we need only * to disable the gcc reorder optimizations */ #define ngx_memory_barrier() __asm__ volatile ("" ::: "memory") /* old "as" does not support "pause" opcode */ #define ngx_cpu_pause() __asm__ (".byte 0xf3, 0x90")
如果你不想在代碼裏面嵌入 assembly,也可以用 gcc 4 提供的特性
type __sync_fetch_and_add (type *ptr, type value, ...)
type __sync_fetch_and_sub (type *ptr, type value, ...)
type __sync_fetch_and_or (type *ptr, type value, ...)
type __sync_fetch_and_and (type *ptr, type value, ...)
type __sync_fetch_and_xor (type *ptr, type value, ...)
type __sync_fetch_and_nand (type *ptr, type value, ...)
{ tmp = *ptr; *ptr op= value; return tmp; }
{ tmp = *ptr; *ptr = ~(tmp & value); return tmp; }   // nand