SSE intrinsic函数_优化

    编写一个基于SSE多媒体指令集的快速矩阵加法运算函数,输入参数为两个单精度浮点型数组srcA与srcB,长度为N,输出结果保存在一个单精度浮点型数组dest中,假设srcA、srcB以及dest内存空间的首地址均按照16-byte对齐。请利用多媒体指令集获得最大的程序性能(可以使用Visual Studio中的SSE intrinsic函数)

 

推荐函数定义:void SSE_Add(float* srcA, float* srcB, float* dest, int N) {}

 

编程实现:

// SSE.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include <xmmintrin.h>
#include <iomanip>
#include <stdlib.h>
#include <time.h>
#include <windows.h>
#include<iostream>
using namespace std;
 
// Element-wise sum of two float arrays using SSE packed arithmetic:
//     dest[i] = srcA[i] + srcB[i]   for every i in [0, N)
//
// srcA, srcB : input arrays, each with at least N readable elements
// dest       : output array with at least N writable elements
//              (may be the same array as srcA or srcB)
// N          : element count; nothing is written when N <= 0
//
// Four floats are handled per iteration with one packed load per input, one
// packed add and one packed store. The unaligned load/store intrinsics are
// used so the function is also safe for buffers that are not 16-byte aligned
// (for example memory obtained from a plain new[]).
void SSE_Add(float* srcA, float* srcB, float* dest, int N)
{
     // Largest multiple of 4 not exceeding N (<= 0 when N is not positive).
     const int vecEnd = (N / 4) * 4;

     int i = 0;
     for (; i < vecEnd; i += 4)
     {
          const __m128 a = _mm_loadu_ps(srcA + i);
          const __m128 b = _mm_loadu_ps(srcB + i);
          _mm_storeu_ps(dest + i, _mm_add_ps(a, b));
     }

     // Remaining 0..3 elements: a scalar loop is simpler than padding a
     // vector with zeros, and never touches memory past index N-1.
     for (; i < N; ++i)
          dest[i] = srcA[i] + srcB[i];
}
 
// Scalar reference implementation used as the benchmark baseline:
// writes srcA[k] + srcB[k] into dest[k] for each k in [0, N).
// Nothing is written when N <= 0.
void normal_Add(float* srcA, float* srcB, float* dest, int N)
{
     int idx = 0;
     while (idx < N)
     {
          const float sum = srcA[idx] + srcB[idx];
          dest[idx] = sum;
          ++idx;
     }
}
 
int main()
{
     double len=100009;//len=100010;
     double run_time;
     double  duration,duration1;
 
     float *srcA = new float[len];
     float *srcB = new float[len];
     float *dest = new float[len];
 
     int i;
     for( i=0;i<len;i++)
     {
         srcA[i] = (float)i;
         srcB[i] = (float)i;
     }
    
     SYSTEMTIME sys;
     SYSTEMTIME sys_end;
     double calcRunTime;
 
/*
     SSE_Add(srcA,srcB,dest,len);
     for(int i=0;i<len;i++)
         cout<<setw(7)<<dest[i]<<endl;
*/
    
     for(int m =0 ;m<3;m++)
     {
         cout<<"第"<<m<<"次测试:"<<endl;
         run_time = 10;
         for(;run_time <1000000;run_time = run_time*10)
         {
              calcRunTime = len * run_time;
              cout<<"运行"<<calcRunTime<<"次加法:";
              //优化前
              GetLocalTime( &sys );
              for(i=0;i<run_time;i++)
                   normal_Add(srcA,srcB,dest,len);
              GetLocalTime( &sys_end );
              duration = sys_end.wHour*3600000+sys_end.wMinute*60000 + sys_end.wSecond*1000+sys_end.wMilliseconds
                   -(sys.wHour*3600000+sys.wMinute*60000 + sys.wSecond*1000+sys.wMilliseconds);
             
              cout<<"优化前"<<"用时"<<duration<<"ms    ";
 
 
              //优化后
              GetLocalTime( &sys );
              for(i=0;i<run_time;i++)
                   SSE_Add(srcA,srcB,dest,10000);
 
              GetLocalTime( &sys_end );
              duration1 = sys_end.wHour*3600000+sys_end.wMinute*60000 + sys_end.wSecond*1000+sys_end.wMilliseconds
                   -(sys.wHour*3600000+sys.wMinute*60000 + sys.wSecond*1000+sys.wMilliseconds);
 
              float speedup;
              if(duration1 == 0)
                   speedup = 0;
              else
                   speedup = duration/duration1;
 
              cout<<"优化后"<<"用时"<<duration1<<"ms"<<"速度提高"<<speedup<<"倍"<<endl;
         }
     }
     return 0;
}


 

 

运行环境:

Cpu T7250 ,内存:1G,XP系统

 

优化结果截图

 优化效果图

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章