多核編程：選擇合適的結構體大小，提高多核併發性能

作者：[email protected]
博客：blog.focus-linux.net linuxfocus.blog.chinaunix.net

本文的copyleft歸[email protected]所有，使用GPL發佈，可以自由拷貝，轉載。但轉載請保持文檔的完整性，註明原作者及原鏈接，嚴禁用於任何商業用途。

======================================================================================================

在現代的程序設計中，多核編程已經是很普遍的應用了。多核編程究竟有什麼不同？我們如何提高多核編程的性能？針對這個問題，我們需要了解多核與單核在體系架構上有什麼不同。

由於本文不是用於介紹多核架構的文章，所以不準備對其架構進行展開。感興趣的朋友可以自行搜索google。今天就說其中的一點。大家都知道現代的CPU都具有cache，用於提高CPU訪問指令或者數據的速度——一般來說，指令cache和數據cache是分開的，因爲這樣性能更好。在cache的匹配和訪問過程中，cache的最小單元是line，即cache line，有的也稱其爲cache的data block。之所以稱爲block，因爲在cache中存的不是內存傳遞的最小單元（字），而是多個字——32位機，一個字爲4個bytes。當cache miss的時候，CPU從內存中預取一個data block大小的數據，放到cache中。（這裏只是一個極其簡單的描述，準確具體請google）。

回歸正題。在多核編程下，cache line又是如何影響多核的性能的呢？比如有兩個CPU，CPU1要修改一個變量var的值。這時var是在CPU1的cache中的，var的值被更新。那麼萬一CPU2的cache中也有var怎麼辦？爲了保證數據的一致性，CPU1需要使CPU2中var變量對應的cache line失效或者將其同樣更新爲最新值。一般來說，使其失效更爲普遍。如果使其失效，那麼當CPU2要訪問var時，會產生一次cache miss。如果使其更新，同樣要涉及更新CPU2的cache line的操作。兩種方式都是要損失一定性能的。

在多核編程的時候，爲了保證併發性，往往使用空間來換取時間，讓每個CPU訪問獨立的變量或者per cpu的變量，來避免加鎖。這是一種很常見的多核編程技巧。一般的簡單實現，都是使用數組，其中數組元素的個數爲CPU的個數。那麼，在這個時候，數組元素的類型就需要選用一個適當的size，來避免多核cache失效帶來的性能下降。

下面看實例。（我的硬件平臺：雙核Intel(R) Pentium(R) 4 CPU，這個CPU的cache line爲64 bytes）

#define _GNU_SOURCE

#include <errno.h>
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * Pin the calling thread to a single CPU, so that the benchmark's threads
 * can be made to run at the same time on different CPUs.
 *
 * cpu_id: index of the CPU the calling thread should be restricted to.
 *
 * Returns 0 on success. If the affinity cannot be set, prints
 * "set affinity error" and returns -1.
 */
static int set_thread_affinity(int cpu_id)
{
    cpu_set_t mask;

    /* Build a CPU mask that contains only the requested CPU. */
    CPU_ZERO(&mask);
    CPU_SET(cpu_id, &mask);

    if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) == 0) {
        return 0;
    }

    printf("set affinity error\n");
    return -1;
}

 //檢查線程的CPU親和性

/*
 * Print the CPU affinity of the calling thread: one "cpu N" line for every
 * CPU index present in the thread's affinity mask.
 *
 * If the mask cannot be read, prints "check err!" and returns without
 * listing anything.
 */
static void check_cpu_affinity(void)
{
    cpu_set_t allowed;
    int cpu = 0;

    if (pthread_getaffinity_np(pthread_self(), sizeof(allowed), &allowed) != 0) {
        printf("check err!\n");
        return;
    }

    /* Walk every possible CPU index and report the ones that are set. */
    while (cpu < CPU_SETSIZE) {
        if (CPU_ISSET(cpu, &allowed)) {
            printf("cpu %d\n", cpu);
        }
        ++cpu;
    }
}

// Number of per-CPU slots in g_key; the demo runs one thread per slot.
#define CPU_NR          2

// Cache line size, in bytes, that the experiment is built around.
// Must match the CPU the benchmark runs on.
#define CACHE_LINE_SIZE 64

// Number of ints stored in struct key: one fewer than the number of ints
// that fit in CACHE_LINE_SIZE bytes, i.e. the struct is one int short of a
// full cache line (15 ints / 60 bytes when sizeof(int) is 4).
#define VAR_NR ((CACHE_LINE_SIZE/sizeof(int))-1)

// Per-CPU working data: this is the structure each thread accesses most
// heavily in its hot loop.

// Its size is the point of this article. With only a[VAR_NR] it is one int
// smaller than CACHE_LINE_SIZE, so neighbouring elements of an array of
// struct key can end up in the same cache line.

struct key {

    int a[VAR_NR];

    // Uncomment the next line to add one int of padding, growing the struct
    // to CACHE_LINE_SIZE bytes (assuming sizeof(int) is 4).
    //int pad;

} __attribute__((packed));

// Trade space for time: every CPU gets its own element, and each thread only
// touches the element for its own index (see real_job), so no lock is used.

static struct key g_key[CPU_NR];

  //醜陋的硬編碼——這裏僅僅爲了說明問題，我就不改了。

static void real_job(int index)

{

#define LOOP_NR 100000000

    struct key *k = g_key+index;

    int i;

    for (i = 0; i < VAR_NR; ++i) {

        k->a[i] = i;

    }

    for (i = 0; i < LOOP_NR; ++i) {

        k->a[14] = k->a[14]+k->a[3];

        k->a[3] = k->a[14]+k->a[5];

        k->a[1] = k->a[1]+k->a[7];

        k->a[7] = k->a[1]+k->a[9];

    }

}

/*
 * Start signal shared between the worker thread and main().
 *
 * The worker sets it to 1 right before entering its hot loop; main() spins
 * until it becomes non-zero so that both real_job() calls run concurrently.
 * It is written by one thread and read by another, so it is an atomic
 * object: `volatile` alone does not make such an access race-free.
 */
static atomic_int thread_ready = 0;

/*
 * Worker thread entry point.
 *
 * Pins itself to CPU 1, prints its affinity, raises the start signal and
 * then runs the benchmark kernel on g_key slot 1. The CPU number is still
 * hard-coded; `data` is not used.
 *
 * Always returns NULL.
 */
static void *thread_task(void *data)
{
    enum { WORKER_CPU = 1 };

    (void)data;

    if (set_thread_affinity(WORKER_CPU) != 0) {
        /*
         * Keep going so that main() is not left spinning on thread_ready,
         * but make it obvious that the measurement is no longer meaningful.
         */
        fprintf(stderr, "worker not pinned to CPU %d, timings are unreliable\n",
                WORKER_CPU);
    }

    check_cpu_affinity();

    atomic_store(&thread_ready, 1);

    real_job(WORKER_CPU);

    return NULL;
}

int main(int argc, char *argv[])

{

    pthread_t tid;

    int ret;
 
     //設置主線程運行在CPU 0上

    ret = set_thread_affinity(0);

    if (ret != 0) {

        printf("err1\n");

        return -1;

    }

    check_cpu_affinity();

     //提高優先級，避免進程被換出。因爲換出後，cache會失效，會影響測試效果

    ret = nice(-20);

    if (-1 == ret) {

        printf("err2\n");

        return -1;

    }

    ret = pthread_create(&tid, NULL, thread_task, NULL);

    if (ret != 0) {

        printf("err2\n");

        return -1;

    }

     //忙等待，使兩個real_job同時進行

    while (!thread_ready)

        ;

    real_job(0);

    pthread_join(tid, NULL);

    printf("Completed!\n");

    return 0;

}

感興趣的同學，可以修改這段代碼，使其運行更多的線程來測試。但是一定要注意你的平臺的cache line的大小。

第一次，關鍵結構struct key的size爲60字節。這樣主線程CPU 0 在訪問g_key[0]的時候，其對應的cache line包含了g_key[1]的開頭部分的數據。那麼當主線程更新g_key[0]的值時，會使CPU 1的cache失效，導致CPU1 訪問g_key[1]的部分數據時產生cache miss，從而影響性能。

下面編譯運行：

[root@Lnx99 cache]#gcc -g -Wall cache_line.c -lpthread -o no_padd

[root@Lnx99 cache]#time ./no_padd

cpu 0

cpu 1

Completed!

real 0m9.830s

user 0m19.427s

sys 0m0.011s

[root@Lnx99 cache]#time ./no_padd

cpu 0

cpu 1

Completed!

real 0m10.081s

user 0m20.074s

sys 0m0.010s

[root@Lnx99 cache]#time ./no_padd

cpu 0

cpu 1

Completed!

real 0m9.989s

user 0m19.877s

sys 0m0.010s

下面我們把int pad前面的//去掉，使struct key的size變爲64字節，即與cache line匹配。這時CPU 0修改g_key[0]時就不會影響CPU 1的cache，因爲g_key[1]的數據不再包含在g_key[0]所在的cache line中。也就是說g_key[0]和g_key[1]所在的cache line已經獨立，不會互相影響了。（嚴格來說，這還要求g_key數組的起始地址按cache line大小對齊，否則64字節的元素仍可能跨越兩條cache line。）

請看測試結果：

[root@Lnx99 cache]#gcc -g -Wall cache_line.c -lpthread -o padd

[root@Lnx99 cache]#time ./padd

cpu 0

cpu 1

Completed!

real 0m1.824s

user 0m3.614s

sys 0m0.012s

[root@Lnx99 cache]#time ./padd

cpu 0

cpu 1

Completed!

real 0m1.817s

user 0m3.625s

sys 0m0.011s

[root@Lnx99 cache]#time ./padd

cpu 0

cpu 1

Completed!

real 0m1.824s

user 0m3.613s

sys 0m0.011s

結果有些出人意料吧。同樣的代碼，僅僅是更改了關鍵結構體的大小，性能卻相差了5倍多（約10秒對約1.8秒）！

從這個例子中，我們應該學到

1. CPU的cache對於提高程序性能非常重要！一個良好的設計，可以保證更高的cache hit，從而得到更好的性能；

2. 多核編程中，對於cache line一定要格外關注。關鍵結構體size大小的控制和選擇，可以大幅提高多核的性能；

3. 在多核編程中，寫程序時，一定要思考，思考，再思考

qiushanjushi

發佈了15 篇原創文章 · 獲贊 9 · 訪問量 35萬+

私信關注

多核編程：選擇合適的結構體大小，提高多核併發性能

Intel 82599 ixgbe & ixgbevf CNA 卡驅動分析02——VF/PF/MailBox

Intel 82599 ixgbe & ixgbevf CNA 卡驅動分析03——部分功能代碼分析

單IP做NAT支持的最大連接數問題

Python裝飾器與面向切面編程

關於Neutron的幾篇不錯的博客

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結