cache緩存優化

1：cache的基本原理描述

2：時間局部性和空間局部性

3: 測試CPU的L2 cache 大小

注：編譯要使用g++ *.cpp -std=c++11

#include <algorithm>
#include <cstddef>
#include <ctime>
#include <iostream>
#include <random>
#include <vector>

#define KB(x) ((size_t)(x) << 10)
using namespace std;

// Cache-size probe: for working sets of 2 KiB .. 131072 KiB, performs 2^25
// reads at random offsets inside a buffer of that size and prints the CPU time
// spent. A jump in the time between two consecutive sizes indicates that the
// working set no longer fits in some cache level.
//
// NOTE: the timed loop also includes the cost of generating each random
// index, so the printed times are not pure memory-access times.
int main()
{
    // Working-set sizes to test, in KiB: 2^1, 2^2, ..., 2^17.
    std::vector<size_t> sizes_KB;
    for (int i = 1; i < 18; i++)
    {
        sizes_KB.push_back(size_t(1) << i);
    }

    // Mersenne Twister pseudo-random engine, seeded once from random_device.
    std::random_device rd;
    std::mt19937 gen(rd());

    // Number of random reads per working-set size (1 << 25 = 33554432).
    const int kAccesses = 1 << 25;

    for (size_t size : sizes_KB)
    {
        const size_t bytes = KB(size);

        // Uniform index over [0, bytes - 1]. The distribution is instantiated
        // for size_t so the upper bound is not narrowed to int.
        std::uniform_int_distribution<size_t> dis(0, bytes - 1);

        // One contiguous buffer with every byte set to 1.
        std::vector<char> memory(bytes, char(1));

        // Sum of the bytes read; printed below so the reads have a visible use.
        int dummy = 0;

        // Time a large number of random reads over the buffer.
        const clock_t begin = clock();
        for (int i = 0; i < kAccesses; i++)
        {
            dummy += memory[dis(gen)];
        }
        const clock_t end = clock();

        // Report the result; endl flushes so each line appears as soon as it is measured.
        const double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
        std::cout << size << " KB, " << elapsed_secs << "secs, dummy:" << dummy << std::endl;
    }
}

輸出：可以看到，當數組大小從 4 MB（4096 KB）增大到 8 MB（8192 KB）時，訪問耗時發生了躍遷（約 2.5 秒升至約 4.1 秒），這是 cache miss 大幅增加的結果。

$ ./a.out
2 KB, 2.29258secs, dummy:33554432
4 KB, 2.28871secs, dummy:33554432
8 KB, 2.29648secs, dummy:33554432
16 KB, 2.33673secs, dummy:33554432
32 KB, 2.27567secs, dummy:33554432
64 KB, 2.31243secs, dummy:33554432
128 KB, 2.27227secs, dummy:33554432
256 KB, 2.29501secs, dummy:33554432
512 KB, 2.35033secs, dummy:33554432
1024 KB, 2.40089secs, dummy:33554432
2048 KB, 2.37182secs, dummy:33554432
4096 KB, 2.52168secs, dummy:33554432
8192 KB, 4.14635secs, dummy:33554432
16384 KB, 5.97561secs, dummy:33554432
32768 KB, 6.4334secs, dummy:33554432
65536 KB, 6.74219secs, dummy:33554432
131072 KB, 6.97896secs, dummy:33554432

同時，在Linux裏面，也可以使用getconf -a | grep CACHE或者sudo lshw -C memory查看系統cache配置。

  *-cache:0
       description: L1 cache
       physical id: 4c
       slot: CPU Internal L1
       size: 896KiB
       capacity: 896KiB
       capabilities: internal write-back
       configuration: level=1
  *-cache:1
       description: L2 cache
       physical id: 4d
       slot: CPU Internal L2
       size: 3584KiB
       capacity: 3584KiB
       capabilities: internal write-back unified
       configuration: level=2
  *-cache:2
       description: L3 cache
       physical id: 4e
       slot: CPU Internal L3
       size: 35MiB
       capacity: 35MiB
       capabilities: internal write-back unified
       configuration: level=3
  *-cache:3
       description: L1 cache
       physical id: 50
       slot: CPU Internal L1
       size: 896KiB
       capacity: 896KiB
       capabilities: internal write-back
       configuration: level=1
  *-cache:4
       description: L2 cache
       physical id: 51
       slot: CPU Internal L2
       size: 3584KiB
       capacity: 3584KiB
       capabilities: internal write-back unified
       configuration: level=2
  *-cache:5
       description: L3 cache
       physical id: 52
       slot: CPU Internal L3
       size: 35MiB
       capacity: 35MiB
       capabilities: internal write-back unified
       configuration: level=3

3：矩陣乘法：程序分析（基本版）

基本源碼

/*
 * Baseline matrix multiply with i-j-k loop order: C += A * B, with every
 * partial sum reduced modulo 99811.
 *
 * Each matrix is read through m_data as a row-major int array: element (r, c)
 * of a matrix whose y field is Y lives at index r * Y + c.
 *
 * m_a  left operand; its y field is the length of the inner (k) dimension
 * m_b  right operand; only its y field is used for indexing
 * m_c  result; its x and y fields bound the i and j loops. The product is
 *      accumulated into the values already stored in it, so the caller must
 *      zero it first to obtain a plain product.
 *
 * The dimensions are printed but not checked for consistency.
 */
void gemm_v1(const matrix_descriptor* m_a, const matrix_descriptor* m_b, matrix_descriptor* m_c)
{
	const size_t a_dim_x = m_a->x;
	const size_t a_dim_y = m_a->y;

	const size_t b_dim_x = m_b->x;
	const size_t b_dim_y = m_b->y;

	const size_t c_dim_x = m_c->x;
	const size_t c_dim_y = m_c->y;

	/* %zu is the printf conversion for size_t. */
	printf("a:[%zu, %zu]; b:[%zu, %zu]; c:[%zu, %zu]; \n", a_dim_x, a_dim_y, b_dim_x, b_dim_y, c_dim_x, c_dim_y);

	const int* d_a = (const int*)(m_a->m_data);
	const int* d_b = (const int*)(m_b->m_data);
	int* d_c = (int*)(m_c->m_data);

	for (size_t i = 0; i < c_dim_x; ++i)
	{
		for (size_t j = 0; j < c_dim_y; ++j)
		{
			/* The innermost loop walks down column j of B, i.e. it jumps
			   b_dim_y ints between consecutive reads. */
			for (size_t k = 0; k < a_dim_y; ++k)
			{
				d_c[i * c_dim_y + j] += d_a[i * a_dim_y + k] * d_b[k * b_dim_y + j];
				d_c[i * c_dim_y + j] %= 99811;
			}
		}
	}
}

4：矩陣乘法：cache優化（行）

/*
 * Matrix multiply with i-k-j loop order: C += A * B, with every partial sum
 * reduced modulo 99811.
 *
 * Each matrix is read through m_data as a row-major int array: element (r, c)
 * of a matrix whose y field is Y lives at index r * Y + c. With this loop
 * order the innermost loop moves along one row of B and one row of C, so
 * consecutive iterations touch adjacent ints.
 *
 * m_a  left operand; its y field is the length of the inner (k) dimension
 * m_b  right operand; only its y field is used for indexing
 * m_c  result; its x and y fields bound the i and j loops. The product is
 *      accumulated into the values already stored in it, so the caller must
 *      zero it first to obtain a plain product.
 *
 * The dimensions are printed but not checked for consistency.
 */
void gemm_v2(const matrix_descriptor* m_a, const matrix_descriptor* m_b, matrix_descriptor* m_c)
{
	const size_t a_dim_x = m_a->x;
	const size_t a_dim_y = m_a->y;

	const size_t b_dim_x = m_b->x;
	const size_t b_dim_y = m_b->y;

	const size_t c_dim_x = m_c->x;
	const size_t c_dim_y = m_c->y;

	/* %zu is the printf conversion for size_t. */
	printf("a:[%zu, %zu]; b:[%zu, %zu]; c:[%zu, %zu]; \n", a_dim_x, a_dim_y, b_dim_x, b_dim_y, c_dim_x, c_dim_y);

	const int* d_a = (const int*)(m_a->m_data);
	const int* d_b = (const int*)(m_b->m_data);
	int* d_c = (int*)(m_c->m_data);

	for (size_t i = 0; i < c_dim_x; ++i)
	{
		/* Row i of A and of C: the addresses do not depend on k or j. */
		const int* a_row = d_a + i * a_dim_y;
		int* c_row = d_c + i * c_dim_y;

		for (size_t k = 0; k < a_dim_y; ++k)
		{
			/* Row k of B: the address does not depend on j. */
			const int* b_row = d_b + k * b_dim_y;

			for (size_t j = 0; j < c_dim_y; ++j)
			{
				c_row[j] += a_row[k] * b_row[j];
				c_row[j] %= 99811;
			}
		}
	}
}

5：矩陣乘法：cache優化（多線程）

6：矩陣乘法：cache優化（分塊）

/*
Blocked (tiled) matrix multiplication.
C = A * B --- C(i,j) is row i of A multiplied by column j of B.
*/
#include <cstdio>
#include <ctime>
#include <cstdlib>
#include <cmath>
#include <thread>
#include <vector>

/*
    Fills the n x n matrix m with generated values.
*/
void GenerateMatrix(float *m, int n);
// Prints the n x n matrix p to stdout, one row per line.
void PrintMatrix(float *p, int n);
// Straightforward matrix multiplication with i-j-k loop order; accumulates A * B into C.
void GeneralMul(float *A, float *B, float *C, int n);
// Sets every element of the n x n matrix m to zero.
void ClearMatrix(float *m, int n);


// Matrix multiplication with i-k-j loop order; accumulates A * B into C.
void GeneralMul_ikj(float *A, float *B, float *C, int n);
/*
    Blocked matrix multiplication: accumulates A * B into C block by block.
    m is the block size
*/
void BlockCacul(float *A, float *B, float *C, int n, int thread_num, int m);
/*
    Returns the sum of the absolute element-wise differences between the
    n x n matrices C1 and C0.
*/
float diff(float *C1, float *C0, int n);

// Description of one block product, passed (as void*) to Mul_Fun / Mul_Fun_ikj.
// The three offset pairs select where the block starts inside each n x n
// row-major matrix: the first value of a pair is added to the row index, the
// second to the column index.
struct ARG
{
    float *A; // left operand
    int ax;   // row offset of the block in A
    int ay;   // column offset of the block in A
    float *B; // right operand
    int bx;   // row offset of the block in B
    int by;   // column offset of the block in B
    float *C; // result matrix (accumulated into)
    int cx;   // row offset of the block in C
    int cy;   // column offset of the block in C
    int m;    // block size
    int n;    // matrix size (row stride of A, B and C)
};
// Benchmark driver. Usage: <prog> N thread_num M
//   N           side length of the square matrices
//   thread_num  forwarded to BlockCacul
//   M           block size forwarded to BlockCacul
// Multiplies two generated N x N matrices with the i-j-k, i-k-j and blocked
// routines, prints the CPU time of each in milliseconds (clock() based), and
// prints the summed absolute difference between the i-k-j and blocked results.
int main(int argc, char **argv)
{
    if (argc != 4)
    {
        printf("Usage: %s N thread_num M [N is the matrix size, thread_num is for maultiple threads, M is the block size]\n", argv[0]);
        return 0;
    }
    const int n = atoi(argv[1]);          // matrix side length (square matrices)
    const int thread_num = atoi(argv[2]);
    const int m = atoi(argv[3]);          // block size

    // atoi yields 0 for non-numeric text. A non-positive block size would make
    // the "+= m" tiling loops never advance, and a non-positive size or thread
    // count is meaningless, so reject them before doing any work.
    if (n <= 0 || thread_num <= 0 || m <= 0)
    {
        fprintf(stderr, "N, thread_num and M must all be positive integers\n");
        return 1;
    }

    // Compute the element count in size_t so n * n cannot overflow int.
    const size_t elems = static_cast<size_t>(n) * static_cast<size_t>(n);

    // The vectors own the storage and release it on every exit path.
    std::vector<float> A(elems);
    std::vector<float> B(elems);
    std::vector<float> C(elems);
    std::vector<float> C0(elems);

    GenerateMatrix(A.data(), n);
    GenerateMatrix(B.data(), n);

    clock_t start;
    float time_used;

    // Each routine accumulates into its output, so the output is zeroed first.
    ClearMatrix(C0.data(), n);
    start = clock();
    GeneralMul(A.data(), B.data(), C0.data(), n);
    time_used = static_cast<float>(clock() - start) / CLOCKS_PER_SEC * 1000;
    printf("General ijk:   time = %f\n", time_used);

    ClearMatrix(C0.data(), n);
    start = clock();
    GeneralMul_ikj(A.data(), B.data(), C0.data(), n);
    time_used = static_cast<float>(clock() - start) / CLOCKS_PER_SEC * 1000;
    printf("General ikj:   time = %f\n", time_used);

    ClearMatrix(C.data(), n);
    start = clock();
    BlockCacul(A.data(), B.data(), C.data(), n, thread_num, m);
    time_used = static_cast<float>(clock() - start) / CLOCKS_PER_SEC * 1000;
    printf("Block:  time = %f\n", time_used);

    // C0 holds the i-k-j result at this point; compare it with the blocked result.
    printf("Difference of two result: %f\n", diff(C0.data(), C.data(), n));

    return 0;
}


// Zeroes every element of the n x n row-major matrix m.
// Does nothing when n is not positive.
void ClearMatrix(float *m, int n)
{
    float *row = m;
    for (int r = 0; r < n; ++r)
    {
        for (int c = 0; c < n; ++c)
        {
            row[c] = 0.0f;
        }
        row += n; // step to the start of the next row
    }
}
/*
    Plain matrix multiplication with i-j-k loop order.
    A, B and C are n x n row-major matrices. The product A * B is added to the
    values already stored in C, so C must be zeroed beforehand to obtain the
    plain product.
*/
void GeneralMul(float *A, float *B, float *C, int n)
{
    for (int row = 0; row < n; ++row)
    {
        const float *a_row = A + row * n;
        float *c_row = C + row * n;
        for (int col = 0; col < n; ++col)
        {
            // Accumulate straight into the output element.
            float &acc = c_row[col];
            for (int k = 0; k < n; ++k)
            {
                acc += a_row[k] * B[k * n + col];
            }
        }
    }
}

// Matrix multiplication with i-k-j loop order.
// A, B and C are n x n row-major matrices; A * B is added to the values
// already stored in C. The innermost loop moves along row k of B and row i of
// C, so consecutive iterations touch adjacent elements.
void GeneralMul_ikj(float *A, float *B, float *C, int n)
{
    for (int row = 0; row < n; ++row)
    {
        float *c_row = C + row * n;
        for (int k = 0; k < n; ++k)
        {
            const float *a_ik = A + row * n + k; // the single A element used by this k
            const float *b_row = B + k * n;
            for (int col = 0; col < n; ++col)
            {
                c_row[col] += *a_ik * b_row[col];
            }
        }
    }
}


// Multiplies one block of A by one block of B with i-j-k loop order and adds
// the result to the matching block of C.
//
// arg points to a struct ARG: A, B, C are n x n row-major matrices, m is the
// nominal block size and (ax, ay), (bx, by), (cx, cy) are the row/column
// offsets of the block inside A, B and C.
//
// The block is clipped to the matrix bounds, so a block that overhangs the
// edge (n not a multiple of m, or m larger than n) no longer reads or writes
// outside the matrices. Blocks that lie fully inside are processed as before.
//
// Always returns 0.
int Mul_Fun(void* arg)
{
    struct ARG *p = (struct ARG *)arg;

    float *A = p->A;
    float *B = p->B;
    float *C = p->C;
    const int m = p->m;
    const int n = p->n;

    // Rows of the block: bounded by the row offsets used for A and C.
    int rows = m;
    if (rows > n - p->ax) rows = n - p->ax;
    if (rows > n - p->cx) rows = n - p->cx;

    // Inner dimension: bounded by A's column offset and B's row offset.
    int inner = m;
    if (inner > n - p->ay) inner = n - p->ay;
    if (inner > n - p->bx) inner = n - p->bx;

    // Columns of the block: bounded by the column offsets used for B and C.
    int cols = m;
    if (cols > n - p->by) cols = n - p->by;
    if (cols > n - p->cy) cols = n - p->cy;

    for (int i = 0; i < rows; i++)
    {
        for (int j = 0; j < cols; j++)
        {
            float *t = C + (i + p->cx) * n + p->cy + j;

            for (int k = 0; k < inner; k++)
            {
                *t += A[(p->ax + i) * n + p->ay + k] * B[(p->bx + k) * n + p->by + j];
            }
        }
    }

    return 0;
}

// Multiplies one block of A by one block of B with i-k-j loop order and adds
// the result to the matching block of C.
//
// arg points to a struct ARG: A, B, C are n x n row-major matrices, m is the
// nominal block size and (ax, ay), (bx, by), (cx, cy) are the row/column
// offsets of the block inside A, B and C.
//
// The block is clipped to the matrix bounds, so a block that overhangs the
// edge (n not a multiple of m, or m larger than n) no longer reads or writes
// outside the matrices. Blocks that lie fully inside are processed as before.
//
// Always returns 0.
int Mul_Fun_ikj(void* arg)
{
    struct ARG *p = (struct ARG *)arg;

    float *A = p->A;
    float *B = p->B;
    float *C = p->C;
    const int m = p->m;
    const int n = p->n;

    // Rows of the block: bounded by the row offsets used for A and C.
    int rows = m;
    if (rows > n - p->ax) rows = n - p->ax;
    if (rows > n - p->cx) rows = n - p->cx;

    // Inner dimension: bounded by A's column offset and B's row offset.
    int inner = m;
    if (inner > n - p->ay) inner = n - p->ay;
    if (inner > n - p->bx) inner = n - p->bx;

    // Columns of the block: bounded by the column offsets used for B and C.
    int cols = m;
    if (cols > n - p->by) cols = n - p->by;
    if (cols > n - p->cy) cols = n - p->cy;

    for (int i = 0; i < rows; i++)
    {
        for (int k = 0; k < inner; k++)
        {
            // The A element shared by the whole inner j loop.
            float *t = A + (p->ax + i) * n + p->ay + k;

            for (int j = 0; j < cols; j++)
            {
                C[(i + p->cx) * n + p->cy + j] += *t * B[(p->bx + k) * n + p->by + j];
            }
        }
    }

    return 0;
}


void BlockCacul(float *A, float *B, float *C, int n, int thread_num, int m)
{
    // m is the block size

    //m = static_cast<int>(sqrt(m));
    struct ARG *args = new struct ARG[thread_num];


    std::vector<std::thread*> tid;
    int t = 0;
    int i;
    for (i = 0; i < thread_num; i++)
    {
        args[i].A = A;
        args[i].B = B;
        args[i].C = C;
        args[i].m = m;
        args[i].n = n;
    }
    //divide into n/m * n/m blocks, each block is with a size [m, m]
    //the ith row and jth col for A
    for (i = 0; i < n; i += m)
    {
        for (int j = 0; j < n; j += m)
        {
            //B j行k列
            for (int k = 0; k < n; k += m)
            {
                args[t].ax = i;
                args[t].ay = j;
                args[t].bx = j;
                args[t].by = k;
                args[t].cx = i;
                args[t].cy = k;

                //Mul_Fun((void*)(&args[t]));
                Mul_Fun_ikj((void*)(&args[t]));

                /*
                if (t < thread_num)
                {
                    // handle[t] = CreateThread(NULL, 0, Mul_Fun, (void*)(&args[t]), 0, 0 );
                    tid.push_back(new std::thread(Mul_Fun, (void*)(&args[t])));
                    t++;
                }
                if (t == thread_num)
                {
                    for (int ii = 0; ii < t; ii++)
                        tid[ii]->join();
                    t = 0;
                }
                */
            }
        }
    }

}
// Fills the n * n floats starting at p with non-negative pseudo-random values,
// each the ratio of one rand() result to another rand() result plus 0.55.
// The generator is re-seeded from the current time plus a rand() value on
// every call.
void GenerateMatrix(float *p, int n)
{
    srand(time(NULL) + rand());
    for (int idx = 0; idx < n * n; ++idx)
    {
        p[idx] = static_cast<float>(rand()) / (static_cast<float>(rand()) + 0.55f);
    }
}

// Returns the sum of |C1[i][j] - C0[i][j]| over all elements of the two
// n x n row-major matrices, accumulated in row-major order.
float diff(float *C1, float *C0, int n)
{
    float total = 0.0;

    for (int r = 0; r < n; ++r)
    {
        const float *lhs = C1 + r * n;
        const float *rhs = C0 + r * n;
        for (int c = 0; c < n; ++c)
        {
            const float delta = lhs[c] - rhs[c];
            total += (delta < 0) ? -delta : delta;
        }
    }
    return total;
}

// Prints the n x n row-major matrix p to stdout: each element with two
// decimals followed by a tab, a newline after every row, and one extra blank
// line after the matrix.
void PrintMatrix(float *p, int n)
{
    const float *cell = p;
    for (int r = 0; r < n; ++r)
    {
        for (int c = 0; c < n; ++c, ++cell)
        {
            printf("%.2f\t", *cell);
        }
        printf("\n");
    }
    printf("\n");
}

7：完整代碼

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>


/* Describes a dense matrix handed to the gemm routines: a pointer to the
   element storage plus the two dimensions. */
typedef struct
{
        void* m_data; /* element storage; the gemm routines read it as an int array */
        size_t x;     /* first dimension (number of rows in the gemm routines) */
        size_t y;     /* second dimension (number of columns, i.e. the row stride) */
} matrix_descriptor;

/* Body elided in this listing: the "..." below is a placeholder, not code.
   The full i-k-j implementation of gemm_v2 is given earlier in this document. */
void gemm_v2(const matrix_descriptor* m_a, const matrix_descriptor* m_b, matrix_descriptor* m_c)
{
...
}


/* Body elided in this listing: the "..." below is a placeholder, not code.
   The full i-j-k implementation of gemm_v1 is given earlier in this document. */
void gemm_v1(const matrix_descriptor* m_a, const matrix_descriptor* m_b, matrix_descriptor* m_c)
{
...
}



/*
 * Benchmark driver for the gemm routines: builds A (dim_x x dim_y) and
 * B (dim_y x dim_z) filled with ones and a zeroed C (dim_x x dim_z), runs
 * gemm_v2 once and prints the CPU time it took, in seconds.
 *
 * Returns 0 on success and 1 when a buffer cannot be allocated.
 */
int main(int argc, char const *argv[])
{
        /* Command-line arguments are not used. */
        (void)argc;
        (void)argv;

        size_t dim_x = 2000, dim_y = 100, dim_z = 2000;

        int* m_a = (int*)malloc(sizeof(int) * dim_x * dim_y);
        int* m_b = (int*)malloc(sizeof(int) * dim_z * dim_y);
        int* m_c = (int*)malloc(sizeof(int) * dim_x * dim_z);

        /* malloc may fail; writing through a null pointer below would crash. */
        if (m_a == NULL || m_b == NULL || m_c == NULL)
        {
                fprintf(stderr, "Failed to allocate the matrix buffers\n");
                free(m_a);
                free(m_b);
                free(m_c);
                return 1;
        }

        {
                /* A: dim_x rows of dim_y ones. */
                for (size_t i = 0; i < dim_x; ++i)
                {
                        for (size_t j = 0; j < dim_y; ++j)
                        {
                                m_a[i * dim_y + j] = 1;
                        }
                }

                /* B: dim_y rows of dim_z ones. */
                for (size_t i = 0; i < dim_y; ++i)
                {
                        for (size_t j = 0; j < dim_z; ++j)
                        {
                                m_b[i * dim_z + j] = 1;
                        }
                }

                /* C starts at zero because the gemm routines accumulate into it. */
                memset(m_c, 0, sizeof(int)*dim_x * dim_z);
        }
        matrix_descriptor a = {.m_data = m_a, .x = dim_x, .y = dim_y};
        matrix_descriptor b = {.m_data = m_b, .x = dim_y, .y = dim_z};
        matrix_descriptor c = {.m_data = m_c, .x = dim_x, .y = dim_z};

        clock_t start, stop;
        double duration;

        /* Only the multiplication itself is timed. */
        start = clock();
        gemm_v2(&a, &b, &c);
        stop = clock();

        duration = (double)(stop - start) / CLOCKS_PER_SEC;

        /* %zu is the printf conversion for size_t. */
        printf("Result --> C: [%zu, %zu]\n", dim_x, dim_z);

        printf("The time was: %f\n", duration);

        {
                free(m_a);
                free(m_b);
                free(m_c);
        }
        return 0;
}

參考

張先軼-雷鋒網演講

1：cache的基本原理描述

2：時間局部性和空間局部性

3: 測試CPU的L2 cache 大小

3：矩陣乘法：程序分析（基本版）

4：矩陣乘法：cache優化（行）

5：矩陣乘法：cache優化（多線程）

6：矩陣乘法：cache優化（分塊）

7：完整代碼

參考

公司剛入職了一名 Java 中級開發，短短 4 行代碼居然湊齊了 3 個 bug！我哭了~~

公衆號5月C#/.NET熱文一覽

git 下載大陸鏡像地址

子網管理和維護：arp/nmap

cache緩存優化

redis源碼分析

C++線程與線程池

Pytorch實用技巧

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結

cache緩存優化

1：cache的基本原理描述

2：時間局部性和空間局部性

3: 測試CPU的L2 cache 大小

3：矩陣乘法：程序分析（基本版）

4：矩陣乘法：cache優化（行）

5：矩陣乘法：cache優化（多線程）

6：矩陣乘法：cache優化 （分塊）

7：完整代碼

參考

6：矩陣乘法：cache優化（分塊）