純軟件實現的CRC32通常藉助查表完成(https://pycrc.org/ 是一個可以生成CRC C語言計算代碼的工具)。當CRC32計算過於頻繁時,可通過硬件指令進行優化,以減少對CPU的佔用。目前Intel處理器支持的可用於計算CRC的指令有CRC32和PCLMULQDQ兩個。本文僅討論CRC32指令的使用。CRC32指令計算的是iSCSI CRC(即CRC32C),也就是生成多項式爲0x11EDC6F41的32位CRC。
使用CRC32指令的方式有2種:一種是直接使用(內聯)彙編代碼;另一種是藉助編譯器intrinsics。本文介紹藉助編譯器intrinsics計算CRC32的過程。
1 使用CRC32指令之前必須檢測處理器是否支持SSE4.2
可通過檢查 CPUID.01H:ECX.SSE4_2[bit 20] 是否爲 1 來判斷,例如使用內聯彙編:
/*
 * Reports whether the CPU advertises SSE4.2 (and therefore the CRC32
 * instruction) by executing CPUID leaf 1 and testing ECX bit 20.
 *
 * Returns 1 when the bit is set, 0 otherwise.
 *
 * The registers are passed to the asm as real operands instead of being
 * clobbered and written back through a write-only memory output: that way the
 * compiler always sees a fully defined result and the bit test is plain C.
 */
int check_support_sse4_2(void) {
    unsigned int eax = 1; /* CPUID leaf 1: feature information */
    unsigned int ebx;
    unsigned int ecx = 0; /* sub-leaf input, overwritten with feature flags */
    unsigned int edx;

    __asm__ __volatile__("cpuid"
                         : "+a"(eax), "=b"(ebx), "+c"(ecx), "=d"(edx));

    /* Only ECX is needed; silence unused-variable warnings for the rest. */
    (void)eax;
    (void)ebx;
    (void)edx;

    return (int)((ecx >> 20) & 1u); /* bit 20 == SSE4.2 */
}
1.2 利用gcc提供的cpuid.h
#include <cpuid.h>
#include <stdio.h>
/*
 * Prints a message when CPUID leaf 1 reports SSE4.2 support.
 *
 * __get_cpuid() returns 0 when the requested leaf is not available, in which
 * case the output registers are left unset; the feature bit is therefore only
 * examined after a successful query.
 */
int
main (void) {
    unsigned int eax, ebx, ecx, edx;

    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) && (ecx & bit_SSE4_2)) {
        printf ("SSE4.2 is supported\n");
    }
    return 0;
}
2 使用compiler intrinsics(x86intrin.h) 計算CRC32c
/* Natural machine-word size used to align the bulk CRC loop. */
#ifdef __x86_64__
#define ALIGN_SIZE 8
#else
#define ALIGN_SIZE 4
#endif
#define ALIGN_MASK (ALIGN_SIZE - 1)

/*
 * Extends a CRC-32C (polynomial 0x11EDC6F41) checksum over `n` bytes at
 * `data` using the SSE4.2 CRC32 instruction.
 *
 * init_crc  CRC of the data processed so far (0 to start a new checksum).
 * data      bytes to add; may be NULL only when n == 0.
 * n         number of bytes at `data`.
 *
 * Returns the updated CRC. The value is inverted on entry and on exit, so the
 * result of one call can be passed as `init_crc` of the next.
 *
 * The target attribute lets this one function use SSE4.2 even when the rest of
 * the translation unit is built without -msse4.2; the caller must still verify
 * at run time that the CPU supports SSE4.2 before calling it.
 */
__attribute__((target("sse4.2")))
uint32_t extend(uint32_t init_crc, const char *data, size_t n) {
    const unsigned char *bytes = (const unsigned char *)data;
    uint32_t res = init_crc ^ 0xffffffffu;
    size_t i = 0;

    /* Consume leading bytes until the read position is word aligned. */
    while (i < n && ((uintptr_t)(bytes + i) & ALIGN_MASK)) {
        res = _mm_crc32_u8(res, bytes[i]);
        ++i;
    }

    /*
     * Wider chunks are loaded with __builtin_memcpy rather than by casting the
     * byte pointer to a wider integer pointer: the cast would break the
     * strict-aliasing rule and drop const. The compiler turns each copy into
     * a single load.
     */
#ifdef __x86_64__
    {
        uint64_t acc = res;
        while (n - i >= sizeof(uint64_t)) {
            uint64_t chunk;
            __builtin_memcpy(&chunk, bytes + i, sizeof chunk);
            acc = _mm_crc32_u64(acc, chunk);
            i += sizeof chunk;
        }
        res = (uint32_t)acc;
    }
#endif
    while (n - i >= sizeof(uint32_t)) {
        uint32_t chunk;
        __builtin_memcpy(&chunk, bytes + i, sizeof chunk);
        res = _mm_crc32_u32(res, chunk);
        i += sizeof chunk;
    }
    while (n - i >= sizeof(uint16_t)) {
        uint16_t chunk;
        __builtin_memcpy(&chunk, bytes + i, sizeof chunk);
        res = _mm_crc32_u16(res, chunk);
        i += sizeof chunk;
    }
    /* At most one trailing byte remains. */
    while (i < n) {
        res = _mm_crc32_u8(res, bytes[i]);
        ++i;
    }

    return res ^ 0xffffffffu;
}
/*
 * One-shot convenience wrapper: returns the checksum of the `n` bytes at
 * `data` by calling extend() with a starting CRC of 0.
 */
static inline uint32_t crc32c(const char *data, size_t n) {
    const uint32_t initial_crc = 0;

    return extend(initial_crc, data, n);
}
4 引用
[1] Intel® 64 and IA-32 Architectures Software Developer’s Manual
[2] Choosing a CRC polynomial and associated method for Fast CRC Computation on Intel® Processors
[3] simd,http://dirlt.com/simd.html