VEX IR

       VEX不是一種新的語言,它是從機器碼轉化而來的一種中間表達式。那麼爲什麼要用到這種中間表達式呢?就我的理解而言,不同的處理器有不同的架構,其機器碼的表現形式也不一樣,爲了屏蔽這種差異性,就產生了一種新的中間表達式。當然VEX的產生也是帶有一定導向的:它可以表示出每一條機器指令對機器狀態產生的影響、程序都走過了哪些路徑等等,這對於在測試中引導程序改變所走的路徑、達到較高的代碼覆蓋率很有幫助。


學習VEX IR應該有一些學習彙編碼的基礎,下面講幾個VEX中會用到的指令概念:

1.CAS(compare-and-swap):CAS指令是並行程序設計最基礎的基石,隨着越來越多的筆記本電腦都用上了雙核,這個世界已經快速步入並行計算時代,CAS指令發揮的作用也就越來越大。CAS指令,在Intel CPU上稱爲CMPXCHG,作用是將指定內存地址的內容與所給的某個值相比,如果相等,則將其內容替換爲所給的另一個值,這一系列操作是原子的,不可能被中斷。基本上所有的同步機制,如信號量、Java中的synchronized等,其實現最終都要用到CAS指令,即使是無鎖(lock-free)的數據結構也離不開CAS指令。

2.load-link/store-conditional(LL/SC):它們是在多線程環境下實現線程同步的一對指令。Load-link返回一個存儲器位置的當前值;跟在其後的store-conditional如果對同一存儲器地址進行操作,那麼將會做如下判定:如果從那條load-link指令開始起,該地址沒有被更新過,那麼新的值將會被寫入該地址;否則store-conditional將會失敗,該地址的內容保持不變,程序通常需要重新執行load-link再重試。它們結合起來實現了一個lock-free的原子read-modify-write操作。


一.VEX基本數據類型:


/* Basic scalar aliases.  Each group is documented as having a fixed
   width; the typedefs themselves do not enforce it. */

/* 8-bit types. */
typedef unsigned char UChar;
typedef signed char Char;
/* Plain 'char', so its signedness follows the host.  Meant only for
   printf-style format strings and similar text. */
typedef char HChar;

/* 16-bit types. */
typedef unsigned short UShort;
typedef short Short;

/* 32-bit types. */
typedef unsigned int UInt;
typedef int Int;

/* 64-bit types. */
typedef unsigned long long ULong;
typedef long long Long;

/* 128 bits, held as four UInt words. */
typedef UInt U128[4];

/* 256 bits, held as eight UInt words. */
typedef UInt U256[8];

/* V128: a union that gathers every view of a 128-bit vector in one
   place, so the same 16 bytes can be accessed at any element width. */

typedef
   union {
      UChar  w8[16];   /* as sixteen 8-bit elements */
      UShort w16[8];   /* as eight 16-bit elements */
      UInt   w32[4];   /* as four 32-bit elements */
      ULong  w64[2];   /* as two 64-bit elements */
   }
   V128;


     static inline函數toBool、toChar、toHChar、toUChar、toUShort、toShort分別把Int型變量轉換成to後面的類型,toUInt則把long型變量轉換成UInt。

      不同處理器的架構不同,host的字長(32位或64位)也不一樣,要先搞清楚字長,否則會導致編譯錯誤。這裏通過預處理條件區分了x86_64、i386、powerpc、powerpc_64、arm、AIX(64位和非64位)、s390x、mips這9種不同的架構,分別定義了其VEX_HOST_WORDSIZE的大小(4或8)和VEX_REGPARM(_n)(在i386上展開爲__attribute__((regparm(_n))),即用寄存器傳遞前_n個參數;在其他架構上定義爲空)。Ptr_to_ULong和ULong_to_Ptr函數的功能是cast pointers to and from 64-bit integers(不論host字長是多少),知道host字長後寫這些函數會很方便。


二.VEX IR結構介紹:

VEX IR是一種用來屏蔽不同架構差異的中間表達式,而不是一種編程語言,它更像是編譯器內部使用的IR。它有如下結構:

code block:

       代碼被分解成多個小的代碼塊(“superblock”,type:IRSB)。IRSB是單入口多出口的,IRSB裏包含3個內容:1.a type environment,表明IRSB中每個臨時變量的類型;2.a list of statements;3.a jump that exits from the end of the IRSB。

statement and expression:

         statement(type:IRStmt)表示有side-effects的操作,例如 guest register writes, stores, and assignments to temporaries。expression(type:IRExpr)表示沒有side-effects的操作,這些操作可以包含子表達式和表達式樹,例如 (3 + (4 * load(addr1)))。

guest state 的存儲:

        guest state包含被模擬的guest machine的guest register,默認存儲在由VEX庫的使用者提供的一塊內存中。要對它們進行操作,必須先用“Get”將guest state讀到臨時變量,再用“Put”把結果寫回guest state。

關於guest state和IR的例子可參考論文《Valgrind: A Framework for Heavyweight Dynamic Binary Instrumentation》3.6.

No need for deallocations:

           當translation完成時,VEX的機制將自動回收allocated的memory。


1.statement種類定義:

/* Tags identifying each kind of IR statement.

   A tag marked META does not stand for code; it carries extra
   information about the code.  Removing such statements does not change
   the functional behaviour of the code, but tools that instrument code
   at the IR level need them. */

typedef enum {
   Ist_NoOp = 0x19000, /* no-op; numbering of the tags starts here */
   Ist_IMark,          /* META: start of a machine instruction */
   Ist_AbiHint,        /* META: an address range became undefined */
   Ist_Put,            /* guest register write, fixed offset */
   Ist_PutI,           /* guest register write, non-fixed offset */
   Ist_WrTmp,          /* assignment to a temporary */
   Ist_Store,          /* write to memory */
   Ist_CAS,            /* atomic compare-and-swap */
   Ist_LLSC,           /* load-linked / store-conditional */
   Ist_Dirty,          /* call to a C function with side effects */
   Ist_MBE,            /* META (maybe): memory bus event */
   Ist_Exit            /* conditional exit from the IRSB */
} IRStmtTag;

/* An IR statement.  The IRStmt struct holds one IRStmtTag field, 'tag',
   plus a union, 'Ist', that lists all twelve kinds of statement; only
   one of the union members is in use at any time. */

typedef
   struct _IRStmt {
      IRStmtTag tag;
      union {

 struct {
     } NoOp;// Usually left behind by IR optimisation; can be ignored. ppIRStmt output: IR-NoOp.

/* One machine instruction may be translated into several IR statements,
   so the IR of each instruction has to be told apart: an IMark marks the
   start of the IR for one machine instruction.

ppIRStmt output: ------ IMark(<addr>, <len>, <delta>) ------,
                         eg. ------ IMark(0x4000792, 5, 0) ------,

addr and len are the address and length of the machine instruction being
translated.  delta: For x86, amd64, ppc32,ppc64 and arm, the delta value
is zero.  For Thumb instructions, the delta value is one.
*/

 struct {
            Addr64 addr;   /* instruction address */
            Int    len;    /* instruction length */
            UChar  delta;  /* addr = program counter as encoded in guest state
                                     - delta */
           } IMark;


/* ABI = application binary interface: the machine-code-level interface,
   i.e. the calling rules between pieces of binary code.  An AbiHint
   indicates that a given chunk of address space, [base .. base+len-1],
   has become undefined.

ppIRStmt output: ====== AbiHint(<base>, <len>, <nia>) ======
                         eg. ====== AbiHint(t1, 16, t2) ======

base is the start of the chunk, len is its length, and nia is the
address of the next instruction.

*/

 struct {
            IRExpr* base;     /* Start  of undefined chunk */
            Int     len;      /* Length of undefined chunk */
            IRExpr* nia;      /* Address of next (guest) insn */
           } AbiHint;

/* Put: write a guest register, at a fixed offset in the guest state.
   ppIRStmt output: PUT(<offset>) = <data>, eg. PUT(60) = t1 */

 struct {
            Int     offset;   /* Offset into the guest state */
            IRExpr* data;     /* The value to write */
           } Put;

/* PutI: also a guest register write, but at a non-fixed offset.  See
   the description of GetI for the details.
   ppIRStmt output: PUTI<descr>[<ix>,<bias>] = <data>,
                         eg. PUTI(64:8xF64)[t5,0] = t1

*/

struct {
            IRPutI* details;
          } PutI;

/* Assign a value to a temporary.
   ppIRStmt output: t<tmp> = <data>, eg. t1 = 3 */

struct {
            IRTemp  tmp;   /* Temporary  (LHS of assignment) */
            IRExpr* data;  /* Expression (RHS of assignment) */
          } WrTmp;

/* Write a value to memory.
   ppIRStmt output: ST<end>(<addr>) = <data>, eg. STle(t1) = t2 */

struct {
            IREndness end;    /* Endianness of the store */
            IRExpr*   addr;   /* store address */
            IRExpr*   data;   /* value to write */
          } Store;

/* Atomic compare-and-swap operation.  The semantics are described
   with the IRCAS type.

 ppIRStmt output:
               t<tmp> = CAS<end>(<addr> :: <expected> -> <new>)
            eg
               t1 = CASle(t2 :: t3->Add32(t3,1))
               which denotes a 32-bit atomic increment
               of a value at address t2

*/

 struct {
            IRCAS* details;
          } CAS;

/* Either Load-Linked or Store-Conditional, depending on storedata.

If storedata is NULL then this is a Load-Linked: data is loaded from
memory.

               result = Load-Linked(addr, end)

The data transfer type is the type of result (I32, I64, etc).

 ppIRStmt output:

               result = LD<end>-Linked(<addr>), eg. LDbe-Linked(t1)

If storedata is not NULL then this is a Store-Conditional.  The data
transfer type is the type of storedata, and result has type Ity_I1.
The store may succeed or fail depending on the state of a previously
lodged reservation on the address: result is written 1 if the store
succeeds and 0 if it fails.

eg ppIRStmt output:

               result = ( ST<end>-Cond(<addr>) = <storedata> )
               eg t3 = ( STbe-Cond(t1, t2) )

*/

 struct {
            IREndness end;
            IRTemp    result;
            IRExpr*   addr;
            IRExpr*   storedata; /* NULL => LL, non-NULL => SC */
          } LLSC;

/* Call a C function that has side effects (ie. is "dirty").

 ppIRStmt output:
               t<tmp> = DIRTY <guard> <effects>
                  ::: <callee>(<args>)
            eg.
               t1 = DIRTY t27 RdFX-gst(16,4) RdFX-gst(60,4)
                     ::: foo{0x380035f4}(t2)

*/
 struct {
            IRDirty* details;
           } Dirty;

/* A memory bus event: a fence, or acquisition/release of the hardware
   bus lock.

ppIRStmt output: MBusEvent-Fence,
                             MBusEvent-BusLock, MBusEvent-BusUnlock.
*/

 struct {
            IRMBusEvent event;
           } MBE;

/* Conditional exit from the IRSB.

ppIRStmt output: if (<guard>) goto {<jk>} <dst>
                         eg. if (t69) goto {Boring} 0x4000AAA:I32

*/

 struct {
            IRExpr*    guard;    /* Conditional expression */
            IRConst*   dst;      /* Jump target (constant only) */
            IRJumpKind jk;       /* Jump kind */
            Int        offsIP;   /* Guest state offset for IP */
          } Exit;
      } Ist;
   }
   IRStmt;


2.expression種類定義:

/* Forward declarations: struct _IRExpr refers to these two types only
   through pointers (the 'details' fields of its Qop and Triop members),
   so incomplete types are sufficient at this point. */
typedef struct _IRQop   IRQop;   /* forward declaration */
typedef struct _IRTriop IRTriop; /* forward declaration */

/* Tags identifying each kind of IR expression. */
typedef enum {
   Iex_Binder = 0x15000, /* pattern-matching binder, internal to Vex */
   Iex_Get,              /* guest register read, fixed offset */
   Iex_GetI,             /* guest register read, non-fixed offset */
   Iex_RdTmp,            /* value held by a temporary */
   Iex_Qop,              /* four-operand operation */
   Iex_Triop,            /* three-operand operation */
   Iex_Binop,            /* two-operand operation */
   Iex_Unop,             /* one-operand operation */
   Iex_Load,             /* normal load from memory */
   Iex_Const,            /* constant value */
   Iex_Mux0X,            /* strict if-then-else selection */
   Iex_CCall             /* call to a pure helper C function */
} IRExprTag;

/* An expression.  Stored as a tagged union.  'tag' indicates what kind
   of expression this is.  'Iex' is the union that holds the fields.  If
   an IRExpr 'e' has e.tag equal to Iex_Load, then it is a load
   expression, and its fields are accessed as 'e.Iex.Load.<fieldname>'. */

typedef
   struct _IRExpr
   IRExpr;

/* Body of the IRExpr tagged union: 'tag' says which member of 'Iex'
   holds the fields of this expression. */
struct _IRExpr {
   IRExprTag tag;
   union {
      /* Used only in pattern matching within Vex.  Should not be seen
         outside of Vex. */

      struct {
         Int binder;
      } Binder;

      /* Read a guest register, at a fixed offset in the guest state.
         ppIRExpr output: GET:<ty>(<offset>), eg. GET:I32(0)
      */

      struct {
         Int    offset;    /* Offset into the guest state */
         IRType ty;        /* Type of the value being read */
      } Get;

      /* Read a guest register at a non-fixed offset in the guest
         state.  This allows circular indexing into parts of the guest
         state, which is essential for modelling situations where the
         identity of guest registers is not known until run time.  One
         example is the x87 FP register stack.

         The part of the guest state to be treated as a circular array
         is described in the IRRegArray 'descr' field.  It holds the
         offset of the first element in the array, the type of each
         element, and the number of elements.

         The array index is indicated rather indirectly, in a way
         which makes optimisation easy: as the sum of variable part
         (the 'ix' field) and a constant offset (the 'bias' field).

         Since the indexing is circular, the actual array index to use
         is computed as (ix + bias) % num-of-elems-in-the-array.

         Here's an example.  The description

            (96:8xF64)[t39,-7]

         describes an array of 8 F64-typed values, the
         guest-state-offset of the first being 96.  This array is
         being indexed at (t39 - 7) % 8.

         It is important to get the array size/type exactly correct
         since IR optimisation looks closely at such info in order to
         establish aliasing/non-aliasing between separate GetI and
         PutI events, which is used to establish when they can be
         reordered, etc.  Putting incorrect info in will lead to
         obscure IR optimisation bugs.

            ppIRExpr output: GETI<descr>[<ix>,<bias>]
                         eg. GETI(128:8xI8)[t1,0]
      */

      struct {
         IRRegArray* descr; /* Part of guest state treated as circular */
         IRExpr*     ix;    /* Variable part of index into array */
         Int         bias;  /* Constant offset part of index into array */
      } GetI;

      /* The value held by a temporary.
         ppIRExpr output: t<tmp>, eg. t1
      */

      struct {
         IRTemp tmp;       /* The temporary number */
      } RdTmp;

      /* A quaternary operation.
         ppIRExpr output: <op>(<arg1>, <arg2>, <arg3>, <arg4>),
                      eg. MAddF64r32(t1, t2, t3, t4)
      */

      struct {
        IRQop* details;
      } Qop;

      /* A ternary operation.
         ppIRExpr output: <op>(<arg1>, <arg2>, <arg3>),
                      eg. MulF64(1, 2.0, 3.0)
      */

      struct {
        IRTriop* details;
      } Triop;

      /* A binary operation.
         ppIRExpr output: <op>(<arg1>, <arg2>), eg. Add32(t1,t2)
      */

      struct {
         IROp op;          /* op-code   */
         IRExpr* arg1;     /* operand 1 */
         IRExpr* arg2;     /* operand 2 */
      } Binop;

      /* A unary operation.
         ppIRExpr output: <op>(<arg>), eg. Neg8(t1)
      */

      struct {
         IROp    op;       /* op-code */
         IRExpr* arg;      /* operand */
      } Unop;

      /* A load from memory -- a normal load, not a load-linked.
         Load-Linkeds (and Store-Conditionals) are instead represented
         by IRStmt.LLSC since Load-Linkeds have side effects and so
         are not semantically valid IRExpr's.
         ppIRExpr output: LD<end>:<ty>(<addr>), eg. LDle:I32(t1)
      */

      struct {
         IREndness end;    /* Endian-ness of the load */
         IRType    ty;     /* Type of the loaded value */
         IRExpr*   addr;   /* Address being loaded from */
      } Load;

      /* A constant-valued expression.
         ppIRExpr output: <con>, eg. 0x4:I32
      */

      struct {
         IRConst* con;     /* The constant itself */
      } Const;

      /* A call to a pure (no side-effects) helper C function.

         With the 'cee' field, 'name' is the function's name.  It is
         only used for pretty-printing purposes.  The address to call
         (host address, of course) is stored in the 'addr' field
         inside 'cee'.

         The 'args' field is a NULL-terminated array of arguments.
         The stated return IRType, and the implied argument types,
         must match that of the function being called well enough so
         that the back end can actually generate correct code for the
         call.

         The called function **must** satisfy the following:

         * no side effects -- must be a pure function, the result of
           which depends only on the passed parameters.

         * it may not look at, nor modify, any of the guest state
           since that would hide guest state transitions from
           instrumenters

         * it may not access guest memory, since that would hide
           guest memory transactions from the instrumenters

         * it must not assume that arguments are being evaluated in a
           particular order. The order of evaluation is unspecified.

         This is restrictive, but makes the semantics clean, and does
         not interfere with IR optimisation.

         If you want to call a helper which can mess with guest state
         and/or memory, instead use Ist_Dirty.  This is a lot more
         flexible, but you have to give a bunch of details about what
         the helper does (and you better be telling the truth,
         otherwise any derived instrumentation will be wrong).  Also
         Ist_Dirty inhibits various IR optimisations and so can cause
         quite poor code to be generated.  Try to avoid it.

         ppIRExpr output: <cee>(<args>):<retty>
                      eg. foo{0x80489304}(t1, t2):I32
      */

      struct {
         IRCallee* cee;    /* Function to call. */
         IRType    retty;  /* Type of return value. */
         IRExpr**  args;   /* Vector of argument expressions. */
      }  CCall;

      /* A ternary if-then-else operator.  It returns expr0 if cond is
         zero, exprX otherwise.  Note that it is STRICT, ie. both
         expr0 and exprX are evaluated in all cases.

         ppIRExpr output: Mux0X(<cond>,<expr0>,<exprX>),
                         eg. Mux0X(t6,t7,t8)
      */

      struct {
         IRExpr* cond;     /* Condition */
         IRExpr* expr0;    /* Result when cond is zero */
         IRExpr* exprX;    /* Result when cond is non-zero */
      } Mux0X;
   } Iex;
};


未完待續~~~~

  

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章