單線程、SSE、AVX運行效率對比——加法運算

使用Intrinsics函數操作SIMD指令集——頭文件查找表
頭文件 指令集描述
intrin.h All Architectures
mmintrin.h MMX
xmmintrin.h SSE
emmintrin.h SSE2
pmmintrin.h SSE3
smmintrin.h SSE4.1
nmmintrin.h SSE4.2
immintrin.h AVX

math_function.h

#pragma once

#include <immintrin.h>
#include <stdio.h>

// Sums the first `size` floats of `input` with a plain scalar loop.
float MathSum(const float *input, int size);
// Sums the first `size` floats of `input` using 128-bit SIMD registers
// (4 floats per step) plus a scalar tail for the leftover elements.
// Prints a message and returns -1 when `input` is null.
float SSESum(const float *input, int size);
// Sums the first `size` floats of `input` using 256-bit AVX registers
// (8 floats per step) plus a scalar tail for the leftover elements.
// Prints a message and returns -1 when `input` is null.
float AVXSum(const float *input, int size);

math_function.cpp

#include "math_function.h"

// Scalar reference implementation: adds the first `size` floats of `input`
// one at a time, in index order, into a single float accumulator.
// Returns 0 when `size` is zero or negative. `input` is not checked for null.
float MathSum(const float *input, int size)
{
	float total = 0.0;
	const float *cursor = input;
	for (int remaining = size; remaining > 0; --remaining)
	{
		total += *cursor++;
	}
	return total;
}

// Sums the first `size` floats of `input`, four lanes at a time, using only
// baseline SSE instructions (no SSE3 required).
//
// input : array of at least `size` floats; no 16-byte alignment is required.
// size  : number of elements to add; zero or negative yields 0.
//
// Returns the sum, or -1 (after printing a message) when `input` is null.
// Note that -1 can also be a legitimate sum, so the return value alone does
// not distinguish the error case.
float SSESum(const float *input, int size)
{
	if (!input)
	{
		printf("input data is null\n");
		return -1;
	}
	const int nBlockWidth = 4;
	const int cntBlock = size / nBlockWidth;
	const int cntRem = size % nBlockWidth;

	// Four independent running sums: lane k accumulates p[k], p[k+4], p[k+8], ...
	__m128 sumData = _mm_setzero_ps();
	const float *p = input;
	for (int i = 0; i < cntBlock; i++)
	{
		// Unaligned load: the caller's buffer is not guaranteed to sit on a
		// 16-byte boundary (e.g. a slice starting inside a larger array), and
		// the aligned variant faults in that case.
		sumData = _mm_add_ps(sumData, _mm_loadu_ps(p));
		p += nBlockWidth;
	}

	// Horizontal reduction of the four lanes (s0, s1, s2, s3):
	//   swapped = (s1, s0, s3, s2)
	//   pairs   = (s0+s1, s1+s0, s2+s3, s3+s2)
	//   upper   = (s2+s3, ...)
	//   result  = (s0+s1) + (s2+s3)
	// This is the same addition order two horizontal adds would produce.
	const __m128 swapped = _mm_shuffle_ps(sumData, sumData, _MM_SHUFFLE(2, 3, 0, 1));
	const __m128 pairs = _mm_add_ps(sumData, swapped);
	const __m128 upper = _mm_movehl_ps(pairs, pairs);

	float output = 0;
	// _mm_cvtss_f32 reads lane 0 portably; direct member access on __m128 is
	// a compiler-specific extension.
	output += _mm_cvtss_f32(_mm_add_ss(pairs, upper));

	// Scalar tail: the size % 4 elements that did not fill a whole vector.
	for (int i = 0; i < cntRem; i++)
	{
		output += p[i];
	}

	return output;
}

float AVXSum(const float *input, int size)
{
	if (input == nullptr)
	{
		printf("input data is null\n");
		return -1;
	}
	int nBlockWidth = 8;
	int cntBlock = size / nBlockWidth;
	int cntRem = size % nBlockWidth;

	float output = 0;
	__m256 loadData;
	__m256 sumData = _mm256_setzero_ps();
	const float *p = input;
	for (int i = 0; i < cntBlock; i++)
	{
		loadData = _mm256_load_ps(p);
		sumData = _mm256_add_ps(sumData, loadData);
		p += nBlockWidth;
	}
	sumData = _mm256_hadd_ps(sumData, sumData); // p[0] + p[1] + p[4] + p[5] + p[8] + p[9] + p[12] + p[13] + ... 
	sumData = _mm256_hadd_ps(sumData, sumData); // p[2] + p[3] + p[6] + p[7] + p[10] + p[11] + p[14] + p[15] + ... 
	output += sumData.m256_f32[(0)];            // 前4組
	output += sumData.m256_f32[(4)];            // 後4組

	for (int i = 0; i < cntRem; i++)
	{
		output += p[i];
	}

	return output;
}

main.cpp

#include "math_function.h"
#include <time.h>

// Benchmark driver: fills a 27-element buffer with 0.0025, then runs each of
// MathSum, SSESum and AVXSum 300,000,000 times over it, printing the last
// result and the elapsed clock() ticks for each variant. Waits for a key
// press before exiting.
//
// Returns 0 on success, 1 if the input buffer cannot be allocated.
int main(int argc, char* argv[])
{
	(void)argc;
	(void)argv;

	const int size = 27;
	// 32-byte alignment satisfies the strictest requirement any SSE/AVX load
	// instruction can impose; plain malloc does not guarantee it.
	float *input = (float *)_mm_malloc(sizeof(float) * size, 32);
	if (!input)
	{
		printf("failed to allocate input buffer\n");
		return 1;
	}
	for (int i = 0; i < size; i++)
	{
		input[i] = 0.0025f;
	}

	const int cntLoop = 300000000;

	// clock_t is an implementation-defined arithmetic type, so the elapsed
	// tick count is cast to long and printed with %ld rather than %d.
	clock_t start_t = clock();
	float org = 0.0;
	for (int i = 0; i < cntLoop; i++)
	{
		org = MathSum(input, size);
	}
	printf("org = %f\t", org);
	printf("cost time: %ld\n", (long)(clock() - start_t));

	start_t = clock();
	float sse = 0.0;
	for (int i = 0; i < cntLoop; i++)
	{
		sse = SSESum(input, size);
	}
	printf("sse = %f\t", sse);
	printf("cost time: %ld\n", (long)(clock() - start_t));

	start_t = clock();
	float avx = 0.0;
	for (int i = 0; i < cntLoop; i++)
	{
		avx = AVXSum(input, size);
	}
	printf("avx = %f\t", avx);
	printf("cost time: %ld\n", (long)(clock() - start_t));

	// Keep the console window open until the user presses a key.
	getchar();
	// Memory obtained from _mm_malloc must be released with _mm_free.
	_mm_free(input);

	return 0;
}

運行結果

測試硬件:CPU-4790-4core

預處理器:_WINDOWS

命令行:/arch:AVX 

 

任何問題請加唯一QQ2258205918(名稱samylee)!

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章