SSE intrinsic函數_優化

    編寫一個基於SSE多媒體指令集的快速矩陣加法運算函數,輸入參數爲兩個單精度浮點型數組srcA與srcB,長度爲N,輸出結果保存在一個單精度浮點型數組dest中,假設srcA、srcB以及dest內存空間的首地址均按照16-byte對齊。請利用多媒體指令集獲得最大的程序性能(可以使用Visual Studio中的SSE intrinsic函數)。

 

推薦函數定義:void SSE_Add(float* srcA, float* srcB, float* dest, int N) {}

 

編程實現:

// SSE.cpp : 定義控制檯應用程序的入口點。
//
#include "stdafx.h"
#include <xmmintrin.h>
#include <iomanip>
#include <stdlib.h>
#include <time.h>
#include <windows.h>
#include<iostream>
using namespace std;
 
/**
 * Element-wise single-precision addition: dest[i] = srcA[i] + srcB[i]
 * for every i in [0, N).
 *
 * Elements are processed four at a time with SSE packed loads, one packed
 * add and one packed store; the 0-3 elements left over after the last full
 * group of four are added one at a time.
 *
 * @param srcA  first input array, at least N floats
 * @param srcB  second input array, at least N floats
 * @param dest  output array, at least N floats; may be exactly srcA or srcB
 *              (each group is fully loaded before it is stored)
 * @param N     number of elements; N <= 0 leaves dest untouched
 *
 * Alignment: when all three pointers are 16-byte aligned the aligned
 * load/store intrinsics are used. Otherwise the unaligned variants are
 * used, so the function is also safe for buffers that do not meet the
 * 16-byte requirement of _mm_load_ps/_mm_store_ps.
 */
void SSE_Add(float* srcA, float* srcB, float* dest, int N)
{
    /* Index one past the last full group of four (0 when N < 4). */
    const int vecEnd = N - N % 4;
    int i = 0;

    /* OR-ing the addresses lets one mask test cover all three pointers. */
    const int aligned =
        ((((size_t)srcA) | ((size_t)srcB) | ((size_t)dest)) & 15u) == 0;

    if (aligned)
    {
        /* i advances by 4 floats = 16 bytes, so alignment is preserved. */
        for (; i < vecEnd; i += 4)
        {
            const __m128 a = _mm_load_ps(srcA + i);
            const __m128 b = _mm_load_ps(srcB + i);
            _mm_store_ps(dest + i, _mm_add_ps(a, b));
        }
    }
    else
    {
        for (; i < vecEnd; i += 4)
        {
            const __m128 a = _mm_loadu_ps(srcA + i);
            const __m128 b = _mm_loadu_ps(srcB + i);
            _mm_storeu_ps(dest + i, _mm_add_ps(a, b));
        }
    }

    /* Scalar tail: never reads or writes past element N-1. */
    for (; i < N; ++i)
    {
        dest[i] = srcA[i] + srcB[i];
    }
}
 
/**
 * Scalar reference implementation: writes srcA[k] + srcB[k] into dest[k]
 * for every k in [0, N). Does nothing when N <= 0.
 */
void normal_Add(float* srcA, float* srcB, float* dest, int N)
{
    int k = 0;
    while (k < N)
    {
        const float sum = srcA[k] + srcB[k];
        dest[k] = sum;
        ++k;
    }
}
 
/**
 * Benchmark driver.
 *
 * Fills two input arrays with srcA[i] = srcB[i] = i, then runs three test
 * rounds. In each round the repetition count goes 10, 100, ..., 100000 and,
 * for each count, normal_Add and SSE_Add are timed over the SAME number of
 * elements and repetitions. Elapsed times in milliseconds and their ratio
 * (reported as 0 when the SSE time rounds to 0) are written to stdout.
 *
 * @return 0 always.
 */
int main()
{
    // 100009 is not a multiple of 4, so one element is left over after the
    // last full group of four.
    const int len = 100009;

    float *srcA = new float[len];
    float *srcB = new float[len];
    float *dest = new float[len];

    for (int i = 0; i < len; i++)
    {
        srcA[i] = (float)i;
        srcB[i] = (float)i;
    }

    for (int m = 0; m < 3; m++)
    {
        std::cout<<"第"<<m<<"次測試:"<<std::endl;

        for (int run_time = 10; run_time < 1000000; run_time *= 10)
        {
            // Computed in double: len * run_time reaches about 1e10, which
            // does not fit in a 32-bit int.
            const double calcRunTime = (double)len * run_time;
            std::cout<<"運行"<<calcRunTime<<"次加法:";

            // Baseline (scalar loop). clock() differences cannot go negative
            // across midnight or a wall-clock adjustment, unlike arithmetic
            // on local-time fields.
            clock_t start = clock();
            for (int i = 0; i < run_time; i++)
                normal_Add(srcA, srcB, dest, len);
            const double duration = (clock() - start) * 1000.0 / CLOCKS_PER_SEC;

            std::cout<<"優化前"<<"用時"<<duration<<"ms    ";

            // SSE version: must process the same len elements as the
            // baseline, otherwise the reported speedup is meaningless.
            start = clock();
            for (int i = 0; i < run_time; i++)
                SSE_Add(srcA, srcB, dest, len);
            const double duration1 = (clock() - start) * 1000.0 / CLOCKS_PER_SEC;

            // Guard against division by zero when the run is shorter than
            // the timer resolution.
            double speedup = 0;
            if (duration1 != 0)
                speedup = duration / duration1;

            std::cout<<"優化後"<<"用時"<<duration1<<"ms"<<"速度提高"<<speedup<<"倍"<<std::endl;
        }
    }

    delete[] srcA;
    delete[] srcB;
    delete[] dest;
    return 0;
}


 

 

運行環境:

Cpu T7250 ,內存:1G,XP系統

 

優化結果截圖

 優化效果圖

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章