DSP320C6000的指令列表彙集

TMS320C6000產品是美國TI公司於1997年推出的DSP芯片,該DSP芯片定點、浮點兼容,其中,定點系列是TMS320C62xx系列,浮點系列是TMS320C67xx系列,2000年3月,TI發佈新的C64xx內核,主頻爲1.1GHz,處理速度9000MIPS,在圖像處理和流媒體領域得到了廣泛的應用。

C6000片內有8個並行的處理單元,分爲相同的兩組。DSP的體系結構採用超長指令字(VLIW)結構,單指令字長爲32位,指令包裏有8條指令,總字長達到256位。執行指令的功能單元已經在編譯時分配好,程序運行時通過專門的指令分配模塊,可以將每個256位的指令包同時分配到8個處理單元,並由8個單元同時運行。芯片最高時鐘頻率爲300MHz(67xx系列),且內部8個處理單元並行運行時,其最大處理能力可達到1600MIPS。

DSP320C6000系列

DSP320C6000的指令列表彙集

  內聯指令 彙編指令 簡要描述

  int _abs (int src);

  __int40_t _labs (__int40_t src);ABS返回src的絕對值

  int _add2 (int src1, int src2)ADD2把src1的高、低16位和src2的高、低16位分別相加,放入結果的高、低16位

  ushort & _amem2 (void *ptr);LDHU

  STHU從內存中加載一個halfword到dst裏,必須2byte對齊(讀或存)

  const ushort & _amem2_const (const void *ptr);LDHU必須2byte對齊(讀)

  unsigned & _amem4 (void *ptr);LDW

  STW必須4byte對齊(讀或存)

  const unsigned & _amem4_const (const void *ptr);LDW必須4byte對齊(讀)

  double & _amemd8 (void *ptr);LDW/LDW

  STW/STW必須8byte對齊(讀或存)

  const double & _amemd8_const (const void *ptr);LDDW必須8byte對齊(讀)

  unsigned _clr (unsigned src2, unsigned csta,unsigned cstb);CLR指定了從需要清0的首位和末位

  unsigned _clrr (unsigned src2, int src1);CLR將src2中指定位清0,清0的首位和末位由src1的低10位指定

  __int40_t _dtol (double src); 將一個double寄存器重新解釋成一個__int40_t

  long long _dtoll (double src); 將一個double寄存器重新解釋成一個long long

  int _ext (int src2, unsigned csta, unsigned cstb);EXT從src2裏提取csta和cstb指定的區域且符號擴展到32位。提取出的區域先符號左移再右移。

  int _extr (int src2, int src1);EXT同上,區別:左右移的位數由src1的低10位指定

  unsigned _extu (unsigned src2, unsigned csta , unsigned cstb);EXTU同上上,區別最後是0擴展到32位。

  unsigned _extur (unsigned src2, int src1);EXTU同上,區別:左右移的位數由src1的低10位指定例:

  _ftoi (1.0) == 1065353216U

  unsigned _ftoi (float src); 將float的比特位解釋成unsigned

  unsigned _hi (double src); 返回double寄存器的高位(奇數位)

  unsigned _hill (long long src); 返回longlong寄存器的高位(奇數位)

  double _itod (unsigned src2, unsigned src1); 創建一個新的double寄存器爲了解釋2個unsigned的值,其中src2是高(奇數)寄存器,src1是低(偶數)寄存器

  float _itof (unsigned src); 將unsigned中的比特位解釋成float例:

  _itof (0x3f800000) = 1.0

  long long _itoll (unsigned src2, unsigned src1); 創建一個新的longlong寄存器爲了解釋2個unsigned的值,其中src2是高(奇數)寄存器,src1是低(偶數)寄存器

  unsigned _lmbd (unsigned src1, unsigned src2);LMBD搜索src2裏面的1或0,1或0是由src1的LSB決定的,返回比特位變化的位數

  unsigned _lo (double src); 返回double寄存器的低(偶數)寄存器

  unsigned _loll (long long src); 返回longlong寄存器的低(偶數)寄存器

  double _ltod (__int40_t src); 把一個__int40_t寄存器解釋成一個double寄存器

  double _lltod (long long src); 把一個longlong寄存器解釋成一個double寄存器

  int _mpy (int src1, int src2);MPYSrc1和src2相乘,操作數默認爲有符號的

  int _mpyus (unsigned src1, int src2);MPYUS無符號src1和有符號src2相乘。助記符中的S用來指明哪個操作數是有符號的(在同時使用有符號和無符號操作數時需要)

  int _mpysu (int src1, unsigned src2);MPYSU同上

  unsigned _mpyu (unsigned src1,unsigned src2);MPYU同上上上,默認爲無符號

  int _mpyh (int src1, int src2);MPYH同上,區別見圖示

  int _mpyhus (unsigned src1, int src2);MPYHUS

  int _mpyhsu (int src1, unsigned src2);MPYHSU

  unsigned _mpyhu (unsigned src1,unsigned src2);MPYHU

  int _mpyhl (int src1, int src2);MPYHL同上,區別見圖示

  int _mpyhuls (unsigned src1, int src2);MPYHULS

  int _mpyhslu (int src1, unsigned src2);MPYHSLU

  unsigned _mpyhlu (unsigned src1,unsigned src2);MPYHLU

  int _mpylh (int src1, int src2);MPYLH

  int _mpyluhs (unsigned src1, int src2);MPYLUHS

  int _mpylshu (int src1, unsigned src2);MPYLSHU

  unsigned _mpylhu (unsigned src1,unsigned src2);MPYLHU

  void _nassert (int src); 不生成代碼,告訴優化器一些事情

  unsigned _norm (int src);

  unsigned _lnorm (__int40_t src);NORM

  返回src的冗餘的符號比特位的個數,具體見圖示

  int _sadd (int src1, int src2);

  __int40_t _lsadd (int src1, __int40_t src2);SADD將src1和src2相加,且飽和其結果

  int _sat (__int40_t src2);SAT將一個40比特的long轉換爲一個32比特的有符號int,如有需要,對結果進行飽和

  unsigned _set (unsigned src2, unsigned csta , unsigned cstb);SET將src2中指定的區域置爲1,指定的區域由csta和cstb指定

  unsigned _setr (unsigned src2, int src1);SET

  int _smpy (int src1, int src2);SMPY把src1的低16位和src2的低16位相乘

  int _smpyh (int src1, int src2);SMPYH高16位

  int _smpyhl (int src1, int src2);SMPYHL

  int _smpylh (int src1, int src2);SMPYLH

  int _sshl (int src2, unsigned src1);SSHL以src1操作數將src2左移,並且將結果飽和在32位

  int _ssub (int src1, int src2);

  __int40_t _lssub (int src1, __int40_t src2);SSUB從src1中減去src2,並飽和結果(src1-src2)

  unsigned _subc (unsigned src1, unsigned src2);SUBC有條件的減和左移(常用於除法)

  int _sub2 (int src1, int src2);SUB2把src1的高低16位分別減去src2的高低16位。任何低16位的借位不會影響高16位。

  int _abs2 (int src);ABS2計算16位的絕對值

  int _add4 (int src1, int src2);ADD4把src1和src2的4對8位數相加。不會進行飽和,進位不會影響其他的8位數

  long long & _amem8 (void *ptr);LDDW

  STDW加載和存儲8bytes,指針必須8byte對齊

  const long long & _amem8_const (const void *ptr);LDDW加載8bytes,指針必須8byte對齊

  __float2_t & _amem8_f2(void * ptr);LDDW

  STDW加載和存儲8bytes,指針必須8byte對齊,必須包含c6x.h

  const __float2_t & _amem8_f2_const(void * ptr);LDDW加載8bytes,指針必須8byte對齊,必須包含c6x.h

  double & _amemd8 (void *ptr);LDDW

  STDW

  const double & _amemd8_const (const void *ptr);LDDW

  int _avg2 (int src1, int src2);AVG2計算每對有符號16位值的平均值

  unsigned _avgu4 (unsigned, unsigned);AVGU4計算每對無符號8位數的平均值

  unsigned _bitc4 (unsigned src);BITC4統計每個8位的比特位是1的個數,寫入結果對應位置

  unsigned _bitr (unsigned src);BITR翻轉比特位的順序

  int _cmpeq2 (int src1, int src2);CMPEQ2比較每16位的值是否相等,結果放入dst的最低2位

  int _cmpeq4 (int src1, int src2);CMPEQ4比較每8位的值是否相等,結果放入dst的最低4位,相等置1,否則爲0

  int _cmpgt2 (int src1, int src2);CMPGT2每16位有符號比較,src1>src2,置爲1;否則置爲0。結果放入dst的最低2位

  unsigned _cmpgtu4 (unsigned src1,unsigned src2);CMPGTU4每8位無符號比較,src1>src2,置爲1;否則置爲0。結果放入dst的最低4位

  unsigned _deal (unsigned src );DEAL將src中的比特位的奇數位和偶數位抽出來進行重組,偶數位放在低的16位,奇數位放在高的16位

  int _dotp2 (int src1, int src2);

  __int40_t _ldotp2 (int src1, int src2);DOTP2

  DOTP2將src1中的和src2中的16位有符號對進行點積,結果被寫成有符號32位int或者符號擴展爲64位

  int _dotpn2 (int src1, int src2);DOTPN2將src1和src2中的16位有符號數進行點積相減

  int _dotpnrsu2 (int src1, unsigned src2);DOTPNRSU2Src1和src2的高16位的點積減去低16位的點積。Src1中的數被當做有符號,src2中的數被當做無符號,再加上2^15,結果再符號右移16位

  int _dotprsu2 (int src1, unsigned src2);DOTPRSU2Src1和src2的高16位的點積加上低16位的點積。Src1中的數被當做有符號,src2中的數被當做無符號,再加上2^15,結果再符號右移16位

  int _dotpsu4 (int src1, unsigned src2);DOTPSU4將src1和src2的每8位進行相乘再求和,src1的每8位數被當做有符號,src2的每8位數被當做無符號

  unsigned _dotpu4 (unsigned src1,unsigned src2);DOTPU4都被當做無符號的

  int _gmpy4 (int src1, int src2);GMPY4將src1和src2的4個無符號進行伽羅瓦域的乘法

  int _max2 (int src1, int src2);MAX2將src1和src2的2個有符號16位整數比較,取較大值

  int _min2 (int src1, int src2);MIN2將src1和src2的2個有符號16位整數比較,取較小值

  unsigned _maxu4 (unsigned src1,unsigned src2);MAXU4將src1和src2的4個無符號8位整數比較,取較大值

  unsigned _minu4 (unsigned src1,unsigned src2);MINU4將src1和src2的4個無符號8位整數比較,取較小值

  ushort & _mem2 (void * ptr);LDB/LDB

  STB/STB加載和存儲2byte,不需要對齊

  const ushort & _mem2_const (const void * ptr);LDB/LDB加載2byte,不需要對齊

  unsigned & _mem4 (void * ptr);LDNW

  STNW加載和存儲4byte,不需要對齊

  const unsigned & _mem4_const (const void * ptr);LDNW加載4byte,不需要對齊

  long long & _mem8 (void * ptr);LDNDW

  STNDW加載和存儲8byte,不需要對齊

  const long long & _mem8_const (const void * ptr);LDNDW加載8byte,不需要對齊

  double & _memd8 (void * ptr);LDNDW

  STNDW加載和存儲8byte,不需要對齊

  const double & _memd8_const (const void * ptr);LDNDW加載8byte,不需要對齊

  long long _mpy2ll (int src1, int src2);MPY2將src1和src2中的2個有符號16位分別相乘,將2個32位的結果寫入longlong中

  long long _mpyhill (int src1, int src2);MPYHI將src1中高16位作爲1個有符號16位乘以src2的有符號32位,結果寫入longlong的低48位

  long long _mpylill (int src1, int src2);MPYLI將src1中低16位作爲1個有符號16位乘以src2的有符號32位,結果寫入longlong的低48位

  int _mpyhir (int src1, int src2);MPYHIR將src1的高16位作爲一個16位有符號乘以src2的有符號32位。乘積利用round模式通過加2^14轉成32位,最後再右移15位

  int _mpylir (int src1, int src2);MPYLIR將src1的低16位作爲一個16位有符號乘以src2的有符號32位。乘積利用round模式通過加2^14轉成32位,最後再右移15位

  long long _mpysu4ll (int src1, unsigned src2);MPYSU4將src1的4個8位有符號乘src2的4個8位無符號,得到4個16位有符號,組成一個64位

  long long _mpyu4ll (unsigned src1,unsigned src2);MPYU4將src1和src2的4個無符號8位相乘,得到4個無符號16位組成一個64位的數

  int _mvd (int src2 );MVD將src2的數據移入返回值中,利用了乘法流水線(延遲)

  unsigned _pack2 (unsigned src1,unsigned src2);PACK2

  unsigned _packh2 (unsigned src1,unsigned src2);PACKH2

  unsigned _packh4 (unsigned src1,unsigned src2);PACKH4

  unsigned _packl4 (unsigned src1,unsigned src2);PACKL4

  unsigned _packhl2 (unsigned src1,unsigned src2);PACKHL2

  unsigned _packlh2 (unsigned src1,unsigned src2);PACKLH2

  unsigned _rotl (unsigned src1, unsigned src2);ROTL按照src1的最低5位的數去循環左移src2的32位,src1中剩下的高的5-31位被忽略

  int _sadd2 (int src1, int src2);SADD2將src1和src2中的2個16位有符號數相加,生成2個16有符號數並且是飽和過的。

  int _saddus2 (unsigned src1, int src2);SADDUS2將src1中的2個無符號16位數和src2中的2個16位有符號數相加,得到2個無符號16位數

  unsigned _saddu4 (unsigned src1,unsigned src2);SADDU4將src1和src2中的4個無符號8位數相加

  unsigned _shfl (unsigned src2);SHFL將src2的高16和低16位進行交織

  unsigned _shlmb (unsigned src1,unsigned src2);SHLMB將src2左移1byte,然後將src1的最高位充入src2左移後多出來的位置

  unsigned _shrmb (unsigned src1,unsigned src2);SHRMB將src2右移1byte,然後將src1的最低位充入src2右移後多出來的位置

  int _shr2 (int src1, unsigned src2);SHR2將src2的2個16位有符號數分別右移,右移的位數由src1的低5位決定,多出的位置由符號位擴展

  unsigned _shru2 (unsigned src1, unsigned src2);SHRU2將src2的2個16位無符號數分別右移,右移的位數由src1的低5位決定,多出的位置由0擴展

  long long _smpy2ll (int src1, int src2);SMPY2將src1和src2中的2個有符號16位數相乘,然後左移1位,再進行飽和。

  int _spack2 (int src1, int src2);SPACK2將src1和src2中的1個有符號32位數進行飽和到有符號16位,然後把src1的飽和結果放入dst的高16位,src2的飽和結果放入dst的低16位

  unsigned _spacku4 (int src1 , int src2);SPACKU4將src1和src2中的4個有符號16位數飽和成無符號8位數,

  int _sshvl (int src2, int src1);SSHVL將src2中的有符號32位數左移或右移,移位的數量由src1指定的比特數確定。

  src1在[-31,31]之間,如果src1爲正,src2則左移;如果src1爲負,src2右移|src1|且符號位擴展

  int _sshvr (int src2, int src1);SSHVR將src2中的有符號32位數左移或右移,移位的數量由src1指定的比特數確定。

  src1在[-31,31]之間,如果src1爲正,src2則右移且是符號擴展;如果src1爲負,src2左移|src1|

  int _sub4 (int src1, int src2);SUB4將src1和src2中的4個8位數相減,不進行飽和

  int _subabs4 (int src1, int src2);SUBABS4將src1和src2中的4個無符號8位相減求絕對值

  unsigned _swap4 (unsigned src);SWAP4將src的4個8位無符號數按圖示換位置

  unsigned _unpkhu4 (unsigned src);UNPKHU4擴展0

  unsigned _unpklu4 (unsigned src);UNPKLU4擴0

  unsigned _xpnd2 (unsigned src);XPND2按src的最低2位進行擴展,bit1擴展高16位,bit0擴展低16位

  unsigned _xpnd4 (unsigned src);XPND4按src的最低4位進行擴展

  long long _addsub (int src1, int src2);ADDSUB平行做2步:

  1、src2+src1-》dst_o

  2、src1-src2-》dst_e

  long long _addsub2 (int src1, int src2);ADDSUB216位有符號

  ADD2:src2的高、低16位+src1的高、低16位-》dst_o

  SUB2: src1的高、低16位-src2的高、低16位-》dst_e

  long long _cmpy (unsigned src1,unsigned src2);CMPY有符號16位

  Src1和src2的高16位的點積-src1和src2的低16位點積-》dst_o

  飽和(src1和src2的高16位的點積+src1和src2的低16位點積)-》dst_e

  unsigned _cmpyr (unsigned src1,unsigned src2);CMPYR

  unsigned _cmpyr1 (unsigned src1,unsigned src2 );CMPYR1

  long long _ddotp4 (unsigned src1,unsigned src2);DDOTP4沒有飽和

  long long _ddotph2 (long long src1,unsigned src2);DDOTPH2

  long long _ddotpl2 (long long src1,unsigned src2);DDOTPL2

  unsigned _ddotph2r (long long src1,unsigned src2);DDOTPH2R

  unsigned _ddotpl2r (long long src1,unsigned src2);DDOTPL2R

  long long _dmv (int src1, int src2);DMV將兩個寄存器移入一個寄存器一次性的

  long long _dpack2 (unsigned src1,unsigned src2);DPACK2

  long long _dpackx2 (unsigned src1,unsigned src2);DPACKX2

  __float2_t _fdmv_f2(float src1, float src2);DMV

  unsigned _gmpy (unsigned src1,unsigned src2);GMPY伽羅瓦域上的乘法

  long long _mpy2ir (int src1, int src2);MPY2IR進行16位乘32位。

  將src1的高16位和低16位當做有符號16位;將src2的值當做有符號32位。

  乘積通過加上2^14round到32位,然後結果右移15位。

  2個結果的低32位寫入dst_o:dst_e

  int _mpy32 (int src1, int src2);MPY32進行32位乘32位。都是有符號的,64位結果中的低32位寫入dst

  long long _mpy32ll (int src1, int src2);MPY3232位有符號數×32位有符號數,有符號的64位結果被寫入dst

  long long _mpy32su (int src1, int src2);MPY32SUsrc1有符號32位×src2無符號32位=dst有符號64位

  long long _mpy32us (unsigned src1, int src2);MPY32USsrc1無符號32位×src2有符號32位=dst有符號64位

  long long _mpy32u (unsigned src1,unsigned src2);MPY32Usrc1無符號32位×src2無符號32位=dst無符號64位

  int _rpack2 (int src1, int src2);RPACK2

  long long _saddsub (unsigned src1,unsigned src2);SADDSUB並行進行:

  1、飽和(src1+src2)-》dst_o

  2、飽和(src1-src2)-》dst_e

  long long _saddsub2 (unsigned src1,unsigned src2);SADDSUB2並行進行SADD2和SSUB2指令

  long long _shfl3 (unsigned src1, unsigned src2);SHFL3如圖,生成一個longlong

  int _smpy32 (int src1, int src2);SMPY3232位有符號×32位有符號,64位的結果左移1位然後飽和,然後將之後的結果的高32位寫入dst

  int _ssub2 (unsigned src1, unsigned src2);SSUB2Src1中的2個16位有符號-src2中的2個有符號16位,結果進行飽和

  unsigned _xormpy (unsigned src1,unsigned src2);XORMPY伽羅瓦域乘法

  int _dpint (double src);DPINT將double轉成int(round)

  __int40_t _f2tol(__float2_t src); 將一個__float2_t解釋成一個__int40

  long long _f2toll(__float2_t src); 將一個__float2_t解釋成一個longlong

  double _fabs (double src);ABSDP將src的絕對值放入dst。

  float _fabsf (float src);ABSSP

  __float2_t _lltof2(long long src); 將一個longlong解釋成一個__float2_t

  __float2_t _ltof2(__int40_t src); 將一個__int40解釋成一個__float2_t

  __float2_t & _mem8_f2(void * ptr);LDNDW

  STNDW從內存里加載一個64位值

  const __float2_t & _mem8_f2_const(void * ptr);LDNDW

  STNDW

  long long _mpyidll (int src1, int src2);MPYIDSrc1×src2-》dst

  double _mpysp2dp (float src1, float src2);MPYSP2DPSrc1×src2-》dst

  double _mpyspdp (float src1, double src2);MPYSPDPSrc1×src2-》dst

  double _rcpdp (double src);RCPDP64位double倒數近似值放入dst

  float _rcpsp (float src);RCPSP32位float的倒數近似值

  double _rsqrdp (double src);RSQRDP64位double的平方根倒數近似值

  float _rsqrsp (float src);RSQRSP32位float的平方根倒數近似值

  int _spint (float);SPINTFloat轉爲int

  ADDDP2個double相加

  ADDSP2個float相加

  AND位與

  ANDN與後取反

  MPYSP2個float相乘

  OR位或

  SUBDP2個double相減

  SUBSP2個float相減

  XOR異或

  __x128_t _ccmatmpy (long long src1,__x128_t src2);CCMATMPY

  long long _ccmatmpyr1 (long long src1,__x128_t src2);CCMATMPYR1

  long long _ccmpy32r1 (long long src1,long long src2);CCMPY32R1

  __x128_t _cmatmpy (long long src1,__x128_t src2);CMATMPY

  long long _cmatmpyr1 (long long src1,__x128_t src2);CMATMPYR1

  long long _cmpy32r1 (long long src1,long long src2);CMPY32R1

  __x128_t _cmpysp (__float2_t src1,__float2_t src2);CMPYSP

  double _complex_conjugate_mpysp (double src1, double src2);CMPYSP

  DSUBSP

  double _complex_mpysp (double src1,double src2);CMPYSP

  DADDSP

  int _crot90 (int src);CROT90複數的90度旋轉

  int _crot270 (int src);CROT270複數的270度旋轉

  long long _dadd (long long src1, long long src2);DADDSrc1的2個32位有符號數+src2的2個32位有符號數

  long long _dadd2 (long long src1, long long src2);DADD24路有符號16位相加

  __float2_t _daddsp (__float2_t src1,__float2_t src2);DADDSP

  long long _dadd_c (scst5 immediate src1,long long src2);DADD2路float加法

  long long _dapys2 (long long src1, long long src2);DAPYS2

  long long _davg2 (long long src1, long long src2);DAVG2有符號16位

  long long _davgnr2 (long long src1, long long src2);DAVGNR2有符號16位,無round模式

  long long _davgnru4 (long long src1,long long src2);DAVGNRU4無符號8位,無round模式

  long long _davgu4 (long long src1, long long src2);DAVGU4無符號8位

  long long _dccmpyr1 (long long src1,long long src2);DCCMPYR1

  unsigned _dcmpeq2 (long long src1, long long src2);DCMPEQ216位比較,相等返回1,不等返回0

  unsigned _dcmpeq4 (long long src1, long long src2);DCMPEQ48位比較,相等返回1,不等返回0

  unsigned _dcmpgt2 (long long src1, long long src2);DCMPGT216位比較,src1>src2返回1,否則返回0

  unsigned _dcmpgtu4 (long long src1,long long src2);DCMPGTU48位比較,src1>src2返回1,否則返回0

  __x128_t _dccmpy (long long src1, long long src2);DCCMPY

  __x128_t _dcmpy (long long src1, long long src2);DCMPY

  long long _dcmpyr1 (long long src1, long long src2);DCMPYR1

  long long _dcrot90 (long long src);DCROT90

  long long _dcrot270 (long long src);DCROT270

  long long _ddotp4h (__x128_t src1,__x128_t src2 );DDOTP4H執行2個dotp4h,都是有符號的

  long long _ddotpsu4h (__x128_t src1,__x128_t src2 );DDOTPSU4H執行2個dotpsu4h,一個有符號,一個無符號

  __float2_t _dinthsp (int src);DINTHSPSrc中的16位有符號數轉成單精度浮點放入dst_e和dst_o中

  __float2_t _dinthspu (unsigned src);DINTHSPUSrc中的16位無符號數轉成單精度浮點放入dst_e和dst_o中

  __float2_t _dintsp(long long src);DINTSPSrc中的有符號32位轉成單精度浮點,放入dst_e和dst_o中

  __float2_t _dintspu(long long src);DINTSPUSrc中的無符號32位轉成單精度浮點,放入dst_e和dst_o中

  long long _dmax2 (long long src1, long long src2);DMAX2對src1和src2中的16位有符號數比大小,將大的放入dst中

  long long _dmaxu4 (long long src1, long long src2);DMAXU4對src1和src2中的8位無符號數比大小,將大的放入dst中

  long long _dmin2 (long long src1, long long src2);DMIN2對src1和src2中的16位有符號數比大小,將小的放入dst中

  long long _dminu4 (long long src1, long long src2);DMINU4對src1和src2中的8位無符號數比大小,將小的放入dst中

  __x128_t _dmpy2 (long long src1, long long src2);DMPY2將src1和src2中的16位有符號數相乘,得到32位有符號數放入128位寄存器中

  __float2_t _dmpysp (__float2_t src1,__float2_t src2);DMPYSP

  __x128_t _dmpysu4 (long long src1,long long src2);DMPYSU4將src1中的8位有符號數乘以src2中的無符號8位,得到有符號16位

  __x128_t _dmpyu2 (long long src1, long long src2);DMPYU216位無符號數相乘,得到32位數放入128位寄存器中

  __x128_t _dmpyu4 (long long src1, long long src2);DMPYU48位無符號數相乘,得到無符號16位結果

  long long _dmvd (long long src1,unsigned src2 );DMVD將2個寄存器移入一個寄存器中。依次進行2次移動,當處理很多的double word時很有用。減輕寄存器壓力

  int _dotp4h (long long src1, long long src2 );DOTP4H進行兩個系列的16位值的點積

  long long _dotp4hll (long long src1, long long src2 );DOTP4H返回值不同

  int _dotpsu4h (long long src1, long long src2);DOTPSU4HSrc1中被當做有符號16位,src2被當做無符號16位,得到32位結果

  long long _dotpsu4hll (long long src1,long long src2);DOTPSU4HSrc1中被當做有符號16位,src2被當做無符號16位,得到64位結果

  long long _dpackh2 (long long src1, long long src2);DPACKH2

  long long _dpackh4 (long long src1, long long src2);DPACKH4並行執行2個PACKH4

  long long _dpacklh2 (long long src1, long long src2);DPACKLH2

  long long _dpacklh4 (unsigned src1,unsigned src2);DPACKLH4並行執行PACKH4和PACKL4

  long long _dpackl2 (long long src1, long long src2);DPACKL2

  long long _dpackl4 (long long src1, long long src2);DPACKL4並行執行2個PACKL4

  long long _dsadd (long long src1, long long src2);DSADD將src1中的2個有符號32位數加上src2中的2個有符號32位數,結果進行飽和

  long long _dsadd2 (long long src1, long long src2);DSADD2結果飽和到[-2^15 (2^15)-1]

  long long _dshl (long long src1, unsigned src2);DSHL將longlong中的2個32位左移,用0補位(有符號32位)

  long long _dshl2 (long long src1,unsigned src2);DSHL2將longlong中的4個16位左移,用0補位(有符號16位)

  long long _dshr (long long src1, unsigned src2);DSHR右移,符號位補位(有符號32位)

  long long _dshr2 (long long src1,unsigned src2);DSHR2右移,符號位補位(有符號16位)

  long long _dshru (long long src1,unsigned src2);DSHRU右移,0補位(無符號32位)

  long long _dshru2 (long long src1,unsigned src2);DSHRU2右移,0補位(無符號16位)

  __x128_t _dsmpy2 (long long src1, long long src2);DSMPY2見圖示

  long long _dspacku4 (long long src1, long long src2);DSPACKU4並行進行2個SPACKU4

  long long _dspint (__float2_t src);DSPINT將src中的2個單精度數轉成2個整型

  unsigned _dspinth (__float2_t src);DSPINTH將src_e和src_o的兩個單精度浮點數轉成兩個有符號的16位整數

  long long _dssub (long long src1, long long src2);DSSUB將src1中的2個32位有符號數減src2中的2個32位有符號數,得到的結果進行飽和[-2^31 (2^31)-1]

  long long _dssub2 (long long src1, long long src2);DSSUB24個16位有符號數相減,結果進行飽和[-2^15 (2^15)-1]

  long long _dsub (long long src1, long long src2);DSUB不飽和

  long long _dsub2 (long long src1, long long src2);DSUB2不飽和

  __float2_t _dsubsp (__float2_t src1,__float2_t src2);DSUBSP32位單精度數相減

  long long _dxpnd2 (unsigned src);DXPND2

  long long _dxpnd4 (unsigned src);DXPND4

  __float2_t _fdmvd_f2(float src1, float src2);DMVD見MVD

  int _land (int src1, int src2);LAND邏輯與

  int _landn (int src1, int src2);LANDN

  int _lor (int src1, int src2);LOR邏輯或

  void _mfence();MFENCE延遲取指令流水線一直到內存系統的busy標誌降低

  double _mpysp2dp (float src1, float src2);MPYSP2DP將2個float相乘得到1個double結果

  double _mpyspdp (float src1, double src2);MPYSPDP1個float×1個double得到1個double

  long long _mpyu2 (unsigned src1,unsigned src2 );MPYU22個無符號16位數×2個無符號16位數得到2個無符號32位數

  __x128_t _qmpy32 (__x128_t src1,__x128_t src2);QMPY324路:32位有符號×32位有符號,結果的低32位放入dst

  __x128_t _qmpysp (__x128_t src1,__x128_t src2);QMPYSP

  __x128_t _qsmpy32r1 (__x128_t src1,__x128_t src2);QSMPY32R14路:有符號32位×有符號32位,得到32位。和QMPY32的區別是飽和round

  unsigned _shl2 (unsigned src1, unsigned src2);SHL22個有符號16位,左移。Src2的低4位是移動的位數。結果也是當做有符號16位

  long long _unpkbu4 (unsigned src);UNPKBU4將無符號8位擴成無符號16位

  long long _unpkh2 (unsigned src);UNPKH2有符號16位符號擴展

  long long _unpkhu2 (unsigned src);UNPKHU2無符號16位進行0擴展

  long long _xorll_c (scst5 immediate src1,long long src2);XOR邏輯異或

 

 

 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章