1:cache的基本原理描述
2:時間局部性和空間局部性
3: 測試CPU的L2 cache 大小
注:編譯要使用g++ *.cpp -std=c++11
#include <algorithm>
#include <cstddef>
#include <ctime>
#include <iostream>
#include <random>
#include <vector>
#define KB(x) ((size_t)(x) << 10)
using namespace std;
/**
 * Cache-size probe.
 *
 * For every working-set size from 2 KB to 128 MB (powers of two) this allocates
 * a byte buffer of that size, performs 2^25 single-byte reads at uniformly
 * random offsets, and prints the CPU time (std::clock) the reads took.
 *
 * The timed loop also includes the cost of drawing each random offset, so the
 * numbers are only meaningful relative to each other.
 *
 * @return 0 on completion.
 */
int main()
{
    // Working-set sizes to test, in KB: 2^1 .. 2^17.
    std::vector<std::size_t> sizes_KB;
    for (int shift = 1; shift < 18; ++shift)
    {
        sizes_KB.push_back(std::size_t{1} << shift);
    }

    std::random_device rd;
    // Mersenne Twister pseudo-random engine, seeded once from the random device.
    std::mt19937 gen(rd());

    // Number of random reads per working-set size.
    const int access_count = 1 << 25;

    for (const std::size_t size : sizes_KB)
    {
        const std::size_t bytes = KB(size);

        // The distribution yields std::size_t so the offset type matches the
        // buffer's index type; the default result type (int) would silently
        // narrow the size_t upper bound.
        std::uniform_int_distribution<std::size_t> dis(0, bytes - 1);

        // One contiguous buffer with every byte set to 1, so the final sum is predictable.
        std::vector<char> memory(bytes, 1);

        // Accumulating the bytes read keeps the loads from being optimised away.
        int dummy = 0;

        const std::clock_t begin = std::clock();
        for (int i = 0; i < access_count; ++i)
        {
            dummy += memory[dis(gen)];
        }
        const std::clock_t end = std::clock();

        const double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
        // std::endl is kept on purpose: each size takes seconds, so flush every result line.
        std::cout << size << " KB, " << elapsed_secs << "secs, dummy:" << dummy << std::endl;
    }
    return 0;
}
輸出:可以看到當數組大小從 4 MB 增加到 8 MB 時,訪問耗時發生了躍遷(約 2.5 秒升至 4.1 秒),這是 cache miss 增多造成的結果。
$ ./a.out
2 KB, 2.29258secs, dummy:33554432
4 KB, 2.28871secs, dummy:33554432
8 KB, 2.29648secs, dummy:33554432
16 KB, 2.33673secs, dummy:33554432
32 KB, 2.27567secs, dummy:33554432
64 KB, 2.31243secs, dummy:33554432
128 KB, 2.27227secs, dummy:33554432
256 KB, 2.29501secs, dummy:33554432
512 KB, 2.35033secs, dummy:33554432
1024 KB, 2.40089secs, dummy:33554432
2048 KB, 2.37182secs, dummy:33554432
4096 KB, 2.52168secs, dummy:33554432
8192 KB, 4.14635secs, dummy:33554432
16384 KB, 5.97561secs, dummy:33554432
32768 KB, 6.4334secs, dummy:33554432
65536 KB, 6.74219secs, dummy:33554432
131072 KB, 6.97896secs, dummy:33554432
同時,在Linux裏面,也可以使用getconf -a | grep CACHE
或者sudo lshw -C memory
查看系統cache配置。
*-cache:0
description: L1 cache
physical id: 4c
slot: CPU Internal L1
size: 896KiB
capacity: 896KiB
capabilities: internal write-back
configuration: level=1
*-cache:1
description: L2 cache
physical id: 4d
slot: CPU Internal L2
size: 3584KiB
capacity: 3584KiB
capabilities: internal write-back unified
configuration: level=2
*-cache:2
description: L3 cache
physical id: 4e
slot: CPU Internal L3
size: 35MiB
capacity: 35MiB
capabilities: internal write-back unified
configuration: level=3
*-cache:3
description: L1 cache
physical id: 50
slot: CPU Internal L1
size: 896KiB
capacity: 896KiB
capabilities: internal write-back
configuration: level=1
*-cache:4
description: L2 cache
physical id: 51
slot: CPU Internal L2
size: 3584KiB
capacity: 3584KiB
capabilities: internal write-back unified
configuration: level=2
*-cache:5
description: L3 cache
physical id: 52
slot: CPU Internal L3
size: 35MiB
capacity: 35MiB
capabilities: internal write-back unified
configuration: level=3
3:矩陣乘法:程序分析(基本版)
基本源碼
/*
 * Baseline matrix multiply using the i-j-k loop order:
 * C[i][j] = (C[i][j] + A[i][k] * B[k][j]) % 99811, applied for every k.
 *
 * m_a, m_b: input matrices; m_data is read as a row-major int array with
 *           x rows and y columns.
 * m_c:      output matrix, updated in place (it is accumulated into, so the
 *           caller decides its initial contents).
 *
 * The innermost loop walks B down a column (stride b_dim_y); this is the
 * cache-unfriendly access pattern the article uses as its baseline, so the
 * loop order is intentionally left as is.
 *
 * The dimensions are trusted: no check is made that the three shapes agree.
 */
void gemm_v1(const matrix_descriptor* m_a, const matrix_descriptor* m_b, matrix_descriptor* m_c)
{
    size_t a_dim_x = m_a->x;
    size_t a_dim_y = m_a->y;
    size_t b_dim_x = m_b->x;
    size_t b_dim_y = m_b->y;
    size_t c_dim_x = m_c->x;
    size_t c_dim_y = m_c->y;
    /* %zu is the conversion for size_t; %lu is only right where size_t is unsigned long. */
    printf("a:[%zu, %zu]; b:[%zu, %zu]; c:[%zu, %zu]; \n", a_dim_x, a_dim_y, b_dim_x, b_dim_y, c_dim_x, c_dim_y);
    int* d_a = (int*)(m_a->m_data);
    int* d_b = (int*)(m_b->m_data);
    int* d_c = (int*)(m_c->m_data);
    for (size_t i = 0; i < c_dim_x; ++i)
    {
        for (size_t j = 0; j < c_dim_y; ++j)
        {
            for (size_t k = 0; k < a_dim_y; ++k)
            {
                /* Form the sum in 64 bits: the int product of two operands can
                   exceed INT_MAX before the modulo brings it back into range. */
                long long acc = (long long)d_c[i * c_dim_y + j]
                              + (long long)d_a[i * a_dim_y + k] * (long long)d_b[k * b_dim_y + j];
                d_c[i * c_dim_y + j] = (int)(acc % 99811);
            }
        }
    }
}
4:矩陣乘法:cache優化(行)
/*
 * Cache-friendlier matrix multiply using the i-k-j loop order:
 * C[i][j] = (C[i][j] + A[i][k] * B[k][j]) % 99811, applied for every k.
 *
 * m_a, m_b: input matrices; m_data is read as a row-major int array with
 *           x rows and y columns.
 * m_c:      output matrix, updated in place (it is accumulated into, so the
 *           caller decides its initial contents).
 *
 * With j innermost, both B and C are walked along a row (stride 1) instead of
 * down a column.
 *
 * The dimensions are trusted: no check is made that the three shapes agree.
 */
void gemm_v2(const matrix_descriptor* m_a, const matrix_descriptor* m_b, matrix_descriptor* m_c)
{
    size_t a_dim_x = m_a->x;
    size_t a_dim_y = m_a->y;
    size_t b_dim_x = m_b->x;
    size_t b_dim_y = m_b->y;
    size_t c_dim_x = m_c->x;
    size_t c_dim_y = m_c->y;
    /* %zu is the conversion for size_t; %lu is only right where size_t is unsigned long. */
    printf("a:[%zu, %zu]; b:[%zu, %zu]; c:[%zu, %zu]; \n", a_dim_x, a_dim_y, b_dim_x, b_dim_y, c_dim_x, c_dim_y);
    int* d_a = (int*)(m_a->m_data);
    int* d_b = (int*)(m_b->m_data);
    int* d_c = (int*)(m_c->m_data);
    for (size_t i = 0; i < c_dim_x; ++i)
    {
        for (size_t k = 0; k < a_dim_y; ++k)
        {
            for (size_t j = 0; j < c_dim_y; ++j)
            {
                /* Form the sum in 64 bits: the int product of two operands can
                   exceed INT_MAX before the modulo brings it back into range. */
                long long acc = (long long)d_c[i * c_dim_y + j]
                              + (long long)d_a[i * a_dim_y + k] * (long long)d_b[k * b_dim_y + j];
                d_c[i * c_dim_y + j] = (int)(acc % 99811);
            }
        }
    }
}
5:矩陣乘法:cache優化(多線程)
6:矩陣乘法:cache優化 (分塊)
/*
Blocked matrix multiplication.
C = A * B --- C(i,j) is the dot product of row i of A and column j of B.
*/
#include <cstdio>
#include <ctime>
#include <cstdlib>
#include <cmath>
#include <thread>
#include <vector>
/*
Fill the n x n matrix stored at m with pseudo-random values.
*/
void GenerateMatrix(float *m, int n);
// Print the n x n matrix p to stdout, one row per line.
void PrintMatrix(float *p, int n);
// Traditional matrix multiplication (i-j-k loop order): C += A * B for n x n matrices.
void GeneralMul(float *A, float *B, float *C, int n);
// Set every element of the n x n matrix m to zero.
void ClearMatrix(float *m, int n);
// Same accumulation as GeneralMul, computed with the i-k-j loop order.
void GeneralMul_ikj(float *A, float *B, float *C, int n);
/*
Blocked matrix multiplication: C += A * B, processed in m x m blocks.
m is the block size.
Blocks are processed one after another on the calling thread; thread_num does
not create any threads.
*/
void BlockCacul(float *A, float *B, float *C, int n, int thread_num, int m);
/*
Return the sum of the absolute element-wise differences between the
n x n matrices C1 and C0.
*/
float diff(float *C1, float *C0, int n);
/*
Describes one block-multiply task: the three n x n matrices and the origin
(row, column) of the m x m block to use in each of them.
*/
struct ARG
{
    float *A;   // left operand
    int ax;     // first row of the block in A
    int ay;     // first column of the block in A
    float *B;   // right operand
    int bx;     // first row of the block in B
    int by;     // first column of the block in B
    float *C;   // result, accumulated into
    int cx;     // first row of the block in C
    int cy;     // first column of the block in C
    int m;      // block edge length
    int n;      // matrix edge length (also the row stride of A, B and C)
};
/*
Benchmark driver: multiplies two random N x N matrices with the i-j-k loop,
the i-k-j loop and the blocked routine, prints the CPU time of each in
milliseconds, and prints the accumulated absolute difference between the
i-k-j result and the blocked result.

Arguments: N (matrix edge), thread_num, M (block edge).
Returns 0 after a successful run or after printing the usage text, and 1 when
the numeric arguments are unusable.
*/
int main(int argc, char **argv)
{
    if (argc != 4)
    {
        printf("Usage: %s N thread_num M [N is the matrix size, thread_num is for maultiple threads, M is the block size]\n", argv[0]);
        return 0;
    }
    const int n = atoi(argv[1]);          // matrix edge length (matrices are square)
    const int thread_num = atoi(argv[2]);
    const int m = atoi(argv[3]);          // block edge length

    // atoi yields 0 for non-numeric text. A zero or negative M would keep the
    // block loops from terminating, a zero thread_num would index an empty task
    // array, and an M that does not divide N would make the block kernels touch
    // memory outside the matrices - so reject all of those up front.
    if (n <= 0 || thread_num <= 0 || m <= 0 || n % m != 0)
    {
        fprintf(stderr, "Invalid arguments: N, thread_num and M must be positive integers and M must divide N\n");
        return 1;
    }

    // Element count is computed in size_t so that n * n cannot overflow int.
    const size_t cells = static_cast<size_t>(n) * static_cast<size_t>(n);
    // The vectors own the buffers, so they are released on every return path.
    std::vector<float> A(cells);
    std::vector<float> B(cells);
    std::vector<float> C(cells);
    std::vector<float> C0(cells);

    GenerateMatrix(A.data(), n);
    GenerateMatrix(B.data(), n);

    clock_t start;
    float time_used;

    ClearMatrix(C0.data(), n);
    start = clock();
    GeneralMul(A.data(), B.data(), C0.data(), n);
    time_used = static_cast<float>(clock() - start) / CLOCKS_PER_SEC * 1000;
    printf("General ijk: time = %f\n", time_used);

    // The multiply routines accumulate into their output, so reset it between runs.
    ClearMatrix(C0.data(), n);
    start = clock();
    GeneralMul_ikj(A.data(), B.data(), C0.data(), n);
    time_used = static_cast<float>(clock() - start) / CLOCKS_PER_SEC * 1000;
    printf("General ikj: time = %f\n", time_used);

    ClearMatrix(C.data(), n);
    start = clock();
    BlockCacul(A.data(), B.data(), C.data(), n, thread_num, m);
    time_used = static_cast<float>(clock() - start) / CLOCKS_PER_SEC * 1000;
    printf("Block: time = %f\n", time_used);

    printf("Difference of two result: %f\n", diff(C0.data(), C.data(), n));
    return 0;
}
/*
Set every element of the row-major n x n matrix m to zero.
Does nothing when n is zero or negative.
*/
void ClearMatrix(float *m, int n)
{
    for (int row = 0; row < n; ++row)
    {
        float *cells = m + row * n;
        for (int col = 0; col < n; ++col)
        {
            cells[col] = 0.0f;
        }
    }
}
/*
the general matrix mulplity
*/
/*
Straightforward matrix multiplication in i-j-k order: C += A * B for
row-major n x n matrices. The products are added onto whatever C already
holds, so the caller clears C first when a plain product is wanted.
*/
void GeneralMul(float *A, float *B, float *C, int n)
{
    for (int row = 0; row < n; ++row)
    {
        const int row_base = row * n;
        for (int col = 0; col < n; ++col)
        {
            // Accumulate straight into the output element.
            float &cell = C[row_base + col];
            for (int inner = 0; inner < n; ++inner)
            {
                cell += A[row_base + inner] * B[inner * n + col];
            }
        }
    }
}
/*
Matrix multiplication in i-k-j order: C += A * B for row-major n x n
matrices. With j innermost, B and C are both traversed along a row.
The products are added onto whatever C already holds.
*/
void GeneralMul_ikj(float *A, float *B, float *C, int n)
{
    for (int row = 0; row < n; ++row)
    {
        float *c_row = C + row * n;
        for (int inner = 0; inner < n; ++inner)
        {
            const float *a_elem = A + row * n + inner;
            const float *b_row = B + inner * n;
            for (int col = 0; col < n; ++col)
            {
                c_row[col] += *a_elem * b_row[col];
            }
        }
    }
}
/*
Multiply one block in i-j-k order: adds the product of the A block at
(ax, ay) and the B block at (bx, by) onto the C block at (cx, cy).

arg points to an ARG describing the matrices, the block origins, the block
edge m and the matrix edge n. Always returns 0.

Each loop is clamped to the part of the block that lies inside the n x n
matrices, so a trailing partial block (n not a multiple of m) no longer reads
or writes past the end of the buffers. For blocks that fit entirely, the
behaviour is unchanged.
*/
int Mul_Fun(void* arg)
{
    struct ARG *p = (struct ARG *)arg;
    float *A = p->A;
    float *B = p->B;
    float *C = p->C;
    const int m = p->m;
    const int n = p->n;

    // i runs over rows of the A block and of the C block.
    int rows = m;
    if (rows > n - p->ax) rows = n - p->ax;
    if (rows > n - p->cx) rows = n - p->cx;
    // k runs over columns of the A block and rows of the B block.
    int depth = m;
    if (depth > n - p->ay) depth = n - p->ay;
    if (depth > n - p->bx) depth = n - p->bx;
    // j runs over columns of the B block and of the C block.
    int cols = m;
    if (cols > n - p->by) cols = n - p->by;
    if (cols > n - p->cy) cols = n - p->cy;

    for (int i = 0; i < rows; i++)
    {
        for (int j = 0; j < cols; j++)
        {
            float *t = C + (i + p->cx) * n + p->cy + j;
            for (int k = 0; k < depth; k++)
            {
                *t += A[(p->ax + i) * n + p->ay + k] * B[(p->bx + k) * n + p->by + j];
            }
        }
    }
    return 0;
}
/*
Multiply one block in i-k-j order: adds the product of the A block at
(ax, ay) and the B block at (bx, by) onto the C block at (cx, cy).

arg points to an ARG describing the matrices, the block origins, the block
edge m and the matrix edge n. Always returns 0.

Each loop is clamped to the part of the block that lies inside the n x n
matrices, so a trailing partial block (n not a multiple of m) no longer reads
or writes past the end of the buffers. For blocks that fit entirely, the
behaviour is unchanged.
*/
int Mul_Fun_ikj(void* arg)
{
    struct ARG *p = (struct ARG *)arg;
    float *A = p->A;
    float *B = p->B;
    float *C = p->C;
    const int m = p->m;
    const int n = p->n;

    // i runs over rows of the A block and of the C block.
    int rows = m;
    if (rows > n - p->ax) rows = n - p->ax;
    if (rows > n - p->cx) rows = n - p->cx;
    // k runs over columns of the A block and rows of the B block.
    int depth = m;
    if (depth > n - p->ay) depth = n - p->ay;
    if (depth > n - p->bx) depth = n - p->bx;
    // j runs over columns of the B block and of the C block.
    int cols = m;
    if (cols > n - p->by) cols = n - p->by;
    if (cols > n - p->cy) cols = n - p->cy;

    for (int i = 0; i < rows; i++)
    {
        for (int k = 0; k < depth; k++)
        {
            float *t = A + (p->ax + i) * n + p->ay + k;
            for (int j = 0; j < cols; j++)
            {
                C[(i + p->cx) * n + p->cy + j] += *t * B[(p->bx + k) * n + p->by + j];
            }
        }
    }
    return 0;
}
void BlockCacul(float *A, float *B, float *C, int n, int thread_num, int m)
{
// m is the block size
//m = static_cast<int>(sqrt(m));
struct ARG *args = new struct ARG[thread_num];
std::vector<std::thread*> tid;
int t = 0;
int i;
for (i = 0; i < thread_num; i++)
{
args[i].A = A;
args[i].B = B;
args[i].C = C;
args[i].m = m;
args[i].n = n;
}
//divide into n/m * n/m blocks, each block is with a size [m, m]
//the ith row and jth col for A
for (i = 0; i < n; i += m)
{
for (int j = 0; j < n; j += m)
{
//B j行k列
for (int k = 0; k < n; k += m)
{
args[t].ax = i;
args[t].ay = j;
args[t].bx = j;
args[t].by = k;
args[t].cx = i;
args[t].cy = k;
//Mul_Fun((void*)(&args[t]));
Mul_Fun_ikj((void*)(&args[t]));
/*
if (t < thread_num)
{
// handle[t] = CreateThread(NULL, 0, Mul_Fun, (void*)(&args[t]), 0, 0 );
tid.push_back(new std::thread(Mul_Fun, (void*)(&args[t])));
t++;
}
if (t == thread_num)
{
for (int ii = 0; ii < t; ii++)
tid[ii]->join();
t = 0;
}
*/
}
}
}
}
/*
Fill the row-major n x n matrix at p with pseudo-random non-negative values,
each the quotient of one rand() result by another rand() result plus 0.55
(the offset keeps the divisor away from zero).

The generator is reseeded on every call from the current time plus one
rand() draw. When n is zero or negative nothing is written.
*/
void GenerateMatrix(float *p, int n)
{
    srand(static_cast<unsigned int>(time(nullptr) + rand()));

    // A negative n would otherwise give a positive n * n and write into p.
    if (n <= 0)
    {
        return;
    }

    // Element count in size_t so that n * n cannot overflow int.
    const std::size_t count = static_cast<std::size_t>(n) * static_cast<std::size_t>(n);
    for (std::size_t idx = 0; idx < count; ++idx)
    {
        // Two separate statements fix the order of the two rand() calls; inside
        // a single expression the order is unspecified.
        const float numerator = static_cast<float>(rand());
        const float denominator = static_cast<float>(rand()) + 0.55f;
        p[idx] = numerator / denominator;
    }
}
/*
Return the sum of the absolute element-wise differences between the
row-major n x n matrices C1 and C0. The sum is accumulated in float, in
row-major order. Returns 0 when n is zero or negative.
*/
float diff(float *C1, float *C0, int n)
{
    float total = 0.0;
    for (int row = 0; row < n; ++row)
    {
        const int base = row * n;
        for (int col = 0; col < n; ++col)
        {
            const float delta = C1[base + col] - C0[base + col];
            total += (delta < 0) ? -delta : delta;
        }
    }
    return total;
}
/*
Print the row-major n x n matrix p to stdout: each element with two decimals
followed by a tab, a newline after every row, and one blank line at the end.
*/
void PrintMatrix(float *p, int n)
{
    for (int row = 0; row < n; ++row)
    {
        const float *line = p + row * n;
        for (int col = 0; col < n; ++col)
        {
            printf("%.2f\t", line[col]);
        }
        printf("\n");
    }
    printf("\n");
}
7:完整代碼
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* Describes a matrix: an untyped pointer to its storage plus its two dimensions. */
typedef struct
{
    void* m_data;   /* element storage; the gemm routines read it as an int array */
    size_t x;       /* first dimension (used as the row count by the gemm routines) */
    size_t y;       /* second dimension (used as the column count / row stride) */
} matrix_descriptor;
/* Placeholder: the article elides the body here ("..."); the full gemm_v2
   listing (i-k-j loop order) appears in section 4 of this document. */
void gemm_v2(const matrix_descriptor* m_a, const matrix_descriptor* m_b, matrix_descriptor* m_c)
{
...
}
/* Placeholder: the article elides the body here ("..."); the full gemm_v1
   listing (i-j-k loop order) appears in the basic-version section of this document. */
void gemm_v1(const matrix_descriptor* m_a, const matrix_descriptor* m_b, matrix_descriptor* m_c)
{
...
}
/*
 * GEMM driver: builds A (dim_x x dim_y) and B (dim_y x dim_z) filled with 1,
 * a zeroed C (dim_x x dim_z), runs gemm_v2 on them and prints the CPU time
 * taken in seconds.
 *
 * Returns 0 on success, 1 when a matrix buffer cannot be allocated.
 */
int main(int argc, char const *argv[])
{
    (void)argc;
    (void)argv;

    size_t dim_x = 2000, dim_y = 100, dim_z = 2000;

    int* m_a = (int*)malloc(sizeof(int) * dim_x * dim_y);
    int* m_b = (int*)malloc(sizeof(int) * dim_z * dim_y);
    int* m_c = (int*)malloc(sizeof(int) * dim_x * dim_z);
    /* malloc may fail; writing through a null pointer below would crash. */
    if (m_a == NULL || m_b == NULL || m_c == NULL)
    {
        fprintf(stderr, "Failed to allocate matrix buffers\n");
        free(m_a);   /* free(NULL) is a no-op, so all three can be released unconditionally */
        free(m_b);
        free(m_c);
        return 1;
    }

    /* Initialise the inputs to all ones and the output to all zeros. */
    {
        for (size_t i = 0; i < dim_x; ++i)
        {
            for (size_t j = 0; j < dim_y; ++j)
            {
                m_a[i * dim_y + j] = 1;
            }
        }
        for (size_t i = 0; i < dim_y; ++i)
        {
            for (size_t j = 0; j < dim_z; ++j)
            {
                m_b[i * dim_z + j] = 1;
            }
        }
        memset(m_c, 0, sizeof(int) * dim_x * dim_z);
    }

    matrix_descriptor a = {.m_data = m_a, .x = dim_x, .y = dim_y};
    matrix_descriptor b = {.m_data = m_b, .x = dim_y, .y = dim_z};
    matrix_descriptor c = {.m_data = m_c, .x = dim_x, .y = dim_z};

    clock_t start, stop;
    double duration;
    start = clock();
    gemm_v2(&a, &b, &c);
    stop = clock();
    duration = (double)(stop - start) / CLOCKS_PER_SEC;

    /* %zu is the conversion for size_t; %lu is only right where size_t is unsigned long. */
    printf("Result --> C: [%zu, %zu]\n", dim_x, dim_z);
    printf("The time was: %f\n", duration);

    free(m_a);
    free(m_b);
    free(m_c);
    return 0;
}