【DynamoRIO 入门教程】四： inc2add.c

例子中的注释写到：执行动态优化，在不干扰目标应用程序行为的情况下, 只要有价值和可行, 就将 “inc” 指令转换为 “add 1”。说明在已知底层处理器时最好的做法是在运行时执行微体系结构特定优化。可见这个例子和上一个 div.c 的区别就是这次我们实实在在进行了代码替换和优化。

老规矩，先看 dr_client_main

/* Client entry point: initializes the drmgr and drreg extensions, registers
 * the exit callback and the app2app basic-block callback, and resets the
 * globals used for bookkeeping.  The id, argc and argv parameters are not
 * used in the portion shown here.
 */
DR_EXPORT void
dr_client_main(client_id_t id, int argc, const char *argv[])
{
    /* We only used drreg for liveness, not for spilling, so we need no slots. */
    drreg_options_t ops = { sizeof(ops), 0 /*no slots needed*/, false };
    /* Either extension failing to initialize is treated as fatal. */
    if (!drmgr_init() || drreg_init(&ops) != DRREG_SUCCESS)
        DR_ASSERT(false);

    /* Register for our events: process exit, and code transformation.
     * We're changing the app's code, rather than just inserting observational
     * instrumentation.
     */
    dr_register_exit_event(event_exit);
    if (!drmgr_register_bb_app2app_event(event_instruction_change, NULL))
        DR_ASSERT(false);

    /* Long ago, this optimization would target the Pentium 4 (identified via
     * "proc_get_family() == FAMILY_PENTIUM_4"), where an add of 1 is faster
     * than an inc.  For illustration purposes we leave a boolean controlling it
     * but we turn it on all the time in this sample and leave it for future
     * work to determine whether to disable it on certain microarchitectures.
     */
    enable = true;

    /* Initialize our global variables. */
    num_examined = 0;
    num_converted = 0;

注释里说，这个优化是针对于奔腾4 处理器的，不过我们是为了学习 DynamoRIO ，不用管这些。

drreg_options_t ops = { sizeof(ops), 0 /*no slots needed*/, false };
这一行代码是我们第一次遇见， drreg_options_t 用来初始化 drreg 扩展，可以看到后面的 drreg_init(&ops)。drreg 是一个辅佐管理寄存器的扩展，我们可以利用它获得一个寄存器的使用权。这里ops结构体里的第二个参数 0，用来表示需要的插槽数，因为本次我们不需要寄存器插槽，因此设为0。关于该扩展更多的细节我还没有弄明白，以后再说。

然后就是一堆初始化，把 drmgr 和 drreg 都初始化完成。

dr_register_exit_event(event_exit) 这里注册了结束回调函数。

drmgr_register_bb_app2app_event(event_instruction_change, NULL) 也是个关键，该函数的作用是为 basic block creation 细分出的4个阶段里的第一个阶段 “app2app” 注册回调函数。这样在其余阶段之前会先调用 event_instruction_change 函数。

剩下几个变量里，enable 用来表示是否需要进行优化。

event_exit 退出回调函数

/* Exit callback: formats a one-line summary (the conversion counts when
 * enable is set, otherwise a note that nothing was converted), displays it,
 * then unregisters the app2app callback and shuts down drreg and drmgr.
 */
static void
event_exit(void)
{
    char msg[256];
    int len;
    if (enable) {
        len = dr_snprintf(msg, sizeof(msg) / sizeof(msg[0]),
                          "converted %d out of %d inc/dec to add/sub\n", num_converted,
                          num_examined);
    } else {
        len = dr_snprintf(msg, sizeof(msg) / sizeof(msg[0]),
                          "decided to keep all original inc/dec\n");
    }
    DR_ASSERT(len > 0);
    /* Unconditionally terminate the buffer at its last element; presumably a
     * guard against dr_snprintf not terminating on truncation -- confirm
     * against the dr_snprintf documentation.
     */
    msg[sizeof(msg) / sizeof(msg[0]) - 1] = '\0';
    DISPLAY_STRING(msg);

    /* Tear down in the reverse order of dr_client_main's setup. */
    if (!drmgr_unregister_bb_app2app_event(event_instruction_change) ||
        drreg_exit() != DRREG_SUCCESS)
        DR_ASSERT(false);
    drmgr_exit();
}

第一部分是用来打印结果的，没有什么好说的。
第二部分则是 unregister 函数和 exit 函数。 drreg_exit() 和 drmgr_exit() 都是和 _init()函数相对应的。
关键在于那个 drmgr_unregister_bb_app2app_event(event_instruction_change) 函数。
我有了一个疑问，为什么有的时候有 basic block 的unregister函数，有的时候没有呢？答案在 dr_unregister_bb_event() 的文档里：

We do not recommend unregistering for the basic block event unless it always returned DR_EMIT_STORE_TRANSLATIONS (including when for_trace is true, or if the client has a trace creation callback that returns DR_EMIT_STORE_TRANSLATIONS). Unregistering can prevent proper state translation on a later fault or other translation event for this basic block or for a trace that includes this basic block. Instead of unregistering, turn the event callback into a nop.

文档里提到，不建议使用 basic block event 的 unregister函数。除非在bb的回调函数里返回了DR_EMIT_STORE_TRANSLATIONS，不幸的是，我没有在这个例子里找到这个返回值。

再看注册bb回调函数 event_instruction_change

/* Replaces inc with add 1, dec with sub 1.
 * If cannot replace (eflags constraints), leaves original instruction alone.
 *
 * Registered as the app2app basic-block callback.  Walks the application
 * instructions of bb and hands every inc/dec to replace_inc_with_add().
 * Does nothing unless for_trace is true and the global enable flag is set.
 * The tag parameter is unused.  Always returns DR_EMIT_DEFAULT.
 */
static dr_emit_flags_t
event_instruction_change(void *drcontext, void *tag, instrlist_t *bb, bool for_trace,
                         bool translating)
{
    int opcode;
    instr_t *instr, *next_instr;

    /* Only bother replacing for hot code, i.e., when for_trace is true, and
     * when the underlying microarchitecture calls for it.
     */
    if (!for_trace || !enable)
        return DR_EMIT_DEFAULT;

    for (instr = instrlist_first_app(bb); instr != NULL; instr = next_instr) {
        /* We're deleting some instrs, so get the next first. */
        next_instr = instr_get_next_app(instr);
        opcode = instr_get_opcode(instr);
        if (opcode == OP_inc || opcode == OP_dec) {
            /* The replacement is attempted whether or not translating is
             * set; only the statistics are skipped when it is set.
             */
            if (!translating)
                ATOMIC_INC(num_examined);
            if (replace_inc_with_add(drcontext, instr, bb)) {
                if (!translating)
                    ATOMIC_INC(num_converted);
            }
        }
    }

    return DR_EMIT_DEFAULT;
}

注释里写到，只针对 hot code 进行优化，所以只有 for_trace 为 true，也就是当前 basic block 要加入到 trace 里时，才对当前 basic block 进行优化。

instrlist_first_app(bb) 用来从bb指令序列里取出第一条指令。
instr_get_next_app(instr) 用来获取instr指令的下一条指令。
instr_get_opcode(instr) 用来获取instr指令的操作码。

然后就是用 replace_inc_with_add 函数来进行替换。等会再说这个函数。

ATOMIC_INC 是一个宏，用来保证自增是原子操作：在 Windows 下它调用 _InterlockedIncrement，在其他平台下则是一条 lock incl 内联汇编（见后面的完整代码）。

还有一处让我疑惑，只有当 translating 为 false 时才进行 num_examined 和 num_converted 的增加。
translating 为 false表示本次回调函数是为了 basic block 的创建，当为true 时，则表示本次回调函数的调用是因为地址转换。这里的地址转换我觉得可能是故障地址转换。

如果因为地址转换调用而不进行统计，那为什么不把这个 if 判断放在该回调函数的最前面呢？我的理解是：translating 为 true 时，DR 需要重新生成与 code cache 中完全一致的指令序列，因此指令替换本身必须照常执行，只有统计计数需要跳过。

replace_inc_with_add

/* Replaces inc with add 1, dec with sub 1.
 * Returns true if successful, false if not.
 *
 * instr must be an inc or dec (asserted).  On success the new instruction
 * takes instr's place in bb and instr is destroyed; on failure bb is left
 * untouched.
 */
static bool
replace_inc_with_add(void *drcontext, instr_t *instr, instrlist_t *bb)
{
    instr_t *new_instr;
    uint eflags;
    int opcode = instr_get_opcode(instr);

    DR_ASSERT(opcode == OP_inc || opcode == OP_dec);

    /* Add/sub writes CF, inc/dec does not, so we make sure that's ok.
     * We use drreg's liveness analysis, which includes the rest of this block.
     * To be more sophisticated, we could examine instructions at target of each
     * direct exit instead of assuming CF is live across any branch.
     */
    if (drreg_aflags_liveness(drcontext, instr, &eflags) != DRREG_SUCCESS ||
        (eflags & EFLAGS_READ_CF) != 0) {

        /* Liveness query failed or CF is reported as read: keep the original. */
        return false;
    }
    /* Build the replacement on the same destination operand with an
     * immediate of 1.
     */
    if (opcode == OP_inc) {

        new_instr =
            INSTR_CREATE_add(drcontext, instr_get_dst(instr, 0), OPND_CREATE_INT8(1));
    } else {

        new_instr =
            INSTR_CREATE_sub(drcontext, instr_get_dst(instr, 0), OPND_CREATE_INT8(1));
    }
    /* Carry the LOCK prefix over if the original instruction had it. */
    if (instr_get_prefix_flag(instr, PREFIX_LOCK))
        instr_set_prefix_flag(new_instr, PREFIX_LOCK);
    /* The new instruction translates back to the original's application pc. */
    instr_set_translation(new_instr, instr_get_app_pc(instr));
    instrlist_replace(bb, instr, new_instr);
    /* The old instruction is no longer in the list, so free it explicitly. */
    instr_destroy(drcontext, instr);
    return true;

该函数涉及函数较多，我们仔细说一说。
注释里说到，替换后的 add/sub 指令会对标志寄存器里的 CF标志进行操作（注意，CF标志是进位标志），而 inc/dec 则不会影响到 CF标志位。所以，如果我们要把 inc/dec 替换为 add/sub 则要保证，替换指令位置后面的一系列指令不会对 CF标志位进行读操作。所以我们可以看到代码里用 drreg 对指令进行了活跃度分析。

if (drreg_aflags_liveness(drcontext, instr, &eflags) != DRREG_SUCCESS ||
        (eflags & EFLAGS_READ_CF) != 0)

drreg_aflags_liveness 用来进行活跃度分析，如果执行成功，则返回 DRREG_SUCCESS，并且将分析结果保存在 eflags 变量里面。文档里称这个结果为 “EFLAGS_READ_6 bits”，也就是由 EFLAGS_READ_CF 这类标志位组成的位掩码，所以代码里用 eflags & EFLAGS_READ_CF 来检查 CF 是否活跃。
另外，我们要注意注释里这样一句话：We use drreg’s liveness analysis, which includes the rest of this block
这个分析涵盖了当前 basic block 的剩余部分，也就是说，这里的活跃度分析是从 instr 到 basic block 结束的。

当完成活跃度分析，判断指令替换后不会对后面的指令造成影响，我们就开始正式进行指令替换。

INSTR_CREATE_add 和 INSTR_CREATE_sub 都是用来创建指令的，分别创建了 add 指令和 sub 指令，新的指令存放在变量 new_instr 里面，还没有插入到 basic block 里。

instr_get_prefix_flag(instr, PREFIX_LOCK)
**instr_set_prefix_flag(new_instr, PREFIX_LOCK)**
从函数名来看，这两个函数用来获取/设置指令前缀，一开始我不知道指令前缀是啥？查了一下，原来是这样的：

指令前缀有4种，而且一条指令前可以有多种前缀，每一个前缀占一个字节，在32位指令里，前缀的排列顺序不作规定。它们的名称和机器码，分别是：

操作数长度前缀（66H）

地址长度前缀（67H）

段超越前缀（2eH、3eH、26H、64H、65H、36H）

锁定前缀和重复前缀

那这里，我们重点关注的就是锁定前缀，如果原指令存在锁定前缀，那么我们就要给新指令添上锁定前缀，保持一致。

**instr_set_translation(new_instr, instr_get_app_pc(instr))** 设置新指令的转换地址。
我们把 basic block 从原始可执行文件里拷贝到 code cache 时，指令的地址必然要发生变化。所以，code cache 里的每一条指令都有一个转换地址，这个地址就是原程序里对应指令的地址。
如今我们创建了一个新指令，也要为它设置一个转换地址，尽管原程序里该地址处的指令可能是 inc/dec ，而新指令是 add/sub 。这样做可能是为了以后的故障信息转换什么的？
总之，这里先用 instr_get_app_pc 函数获取了原指令的 app_pc，即应用程序里的地址，然后将该地址设置为新指令 new_instr 的转换地址。

instrlist_replace(bb, instr, new_instr); 在basic block 里将老指令 instr 替换为新指令 new_instr ，但是不会销毁老指令。因此后面跟着一个销毁函数。

instr_destroy(drcontext, instr) 文档里写到：执行instr_free（），然后为instr_create（）执行的instr释放线程局部堆存储。
但由于这里我们要销毁的指令是原始的、由 DR 自己创建的，所以应该只会执行 instr_free() 这一步。

本例子，可以分为三步：指令查找与筛选，指令创建与准备，指令替换。
其中，第一步的麻烦在于要分析指令块里标志寄存器的活跃度，以此来判断是否可以进行替换。
第二步，涉及的函数较多，创建一个新指令时要考虑到指令前缀、转换地址、操作数等关键信息。
第三步则非常简单，一两个函数就可以搞定，不要忘记了销毁旧指令。

完整代码如下：

#include "dr_api.h"
#include "drmgr.h"
#include "drreg.h"

/* Platform helpers.
 * DISPLAY_STRING(msg): shows a NUL-terminated message to the user.  It
 *   expands to a single expression with no trailing semicolon, so it is safe
 *   as the body of an unbraced if/else.
 * ATOMIC_INC(var): atomically increments the 32-bit integer lvalue var.
 *   The non-Windows version uses a "+m" (read-write) operand because
 *   "lock incl" both reads and writes memory; an output-only "=m" would tell
 *   the compiler the old value is dead.
 */
#ifdef WINDOWS
#    define DISPLAY_STRING(msg) dr_messagebox(msg)
#    define ATOMIC_INC(var) _InterlockedIncrement((volatile LONG *)(&(var)))
#else
#    define DISPLAY_STRING(msg) dr_printf("%s\n", msg)
#    define ATOMIC_INC(var) __asm__ __volatile__("lock incl %0" : "+m"(var) : : "memory")
#endif

/* Whether the inc/dec -> add/sub conversion is performed at all.  Set in
 * dr_client_main() and read by the event callbacks.
 */
static bool enable;

/* Use atomic operations to increment these to avoid the hassle of locking.
 * num_examined counts the inc/dec instructions seen; num_converted counts
 * those actually replaced.
 */
static int num_examined, num_converted;

/* Replaces inc with add 1, dec with sub 1.
 * Returns true if successful, false if not.
 * (Parameter names match the definition further down.)
 */
static bool
replace_inc_with_add(void *drcontext, instr_t *instr, instrlist_t *bb);

/* app2app basic-block callback that looks for inc/dec to replace. */
static dr_emit_flags_t
event_instruction_change(void *drcontext, void *tag, instrlist_t *bb, bool for_trace,
                         bool translating);

/* Process-exit callback: reports the statistics and cleans up. */
static void
event_exit(void);

/* Client entry point.  Brings up the drmgr and drreg extensions, hooks the
 * process-exit and app2app events, then switches the optimization on and
 * zeroes the statistics.  Any setup failure trips DR_ASSERT.
 */
DR_EXPORT void
dr_client_main(client_id_t id, int argc, const char *argv[])
{
    /* drreg is consulted only for liveness here, never for spilling, so no
     * spill slots are requested.
     */
    drreg_options_t ops = { sizeof(ops), 0 /*no slots needed*/, false };
    //dr_set_client_name("DynamoRIO Sample Client 'inc2add'",
    //                  "http://dynamorio.org/issues");
    if (!(drmgr_init() && drreg_init(&ops) == DRREG_SUCCESS)) {
        DR_ASSERT(false);
    }

    /* Two events are needed: process exit, and the app2app stage of block
     * building.  The latter rewrites the application's own code instead of
     * merely adding observational instrumentation.
     */
    dr_register_exit_event(event_exit);
    if (!drmgr_register_bb_app2app_event(event_instruction_change, NULL)) {
        DR_ASSERT(false);
    }

    /* Historically this targeted the Pentium 4 (detected with
     * "proc_get_family() == FAMILY_PENTIUM_4"), where add 1 beats inc.
     * The switch is kept for illustration but is always on in this sample;
     * deciding per-microarchitecture is left as future work.
     */
    enable = true;

    /* Start both statistics from zero. */
    num_examined = 0;
    num_converted = 0;
}

/* Process-exit callback.  Displays a one-line summary of what the client
 * did, then unregisters the app2app callback and shuts down drreg and drmgr.
 */
static void
event_exit(void)
{
    enum { MSG_CAPACITY = 256 };
    char msg[MSG_CAPACITY];
    int len;

    /* Pick the summary depending on whether the conversion was enabled. */
    len = enable ? dr_snprintf(msg, MSG_CAPACITY,
                               "converted %d out of %d inc/dec to add/sub\n",
                               num_converted, num_examined)
                 : dr_snprintf(msg, MSG_CAPACITY,
                               "decided to keep all original inc/dec\n");
    DR_ASSERT(len > 0);
    /* Always terminate at the final element before displaying. */
    msg[MSG_CAPACITY - 1] = '\0';
    DISPLAY_STRING(msg);

    /* Undo the registrations made in dr_client_main. */
    if (!(drmgr_unregister_bb_app2app_event(event_instruction_change) &&
          drreg_exit() == DRREG_SUCCESS)) {
        DR_ASSERT(false);
    }
    drmgr_exit();
}

/* app2app callback: scans the application instructions of bb and tries to
 * turn each inc into add 1 and each dec into sub 1.  An instruction that
 * cannot be replaced (eflags constraints) is left as it was.
 * Always returns DR_EMIT_DEFAULT.
 */
static dr_emit_flags_t
event_instruction_change(void *drcontext, void *tag, instrlist_t *bb, bool for_trace,
                         bool translating)
{
    instr_t *instr;
    instr_t *next_app_instr;
    int opcode;

    /* Work only on hot code (for_trace set) and only while the optimization
     * is switched on.
     */
    if (enable && for_trace) {
        for (instr = instrlist_first_app(bb); instr != NULL; instr = next_app_instr) {
            /* Fetch the successor up front: instr may be destroyed below. */
            next_app_instr = instr_get_next_app(instr);
            opcode = instr_get_opcode(instr);
            if (opcode != OP_inc && opcode != OP_dec)
                continue;

            /* Statistics are updated only when not translating; the
             * replacement itself is attempted either way.
             */
            if (!translating)
                ATOMIC_INC(num_examined);
            if (replace_inc_with_add(drcontext, instr, bb) && !translating)
                ATOMIC_INC(num_converted);
        }
    }

    return DR_EMIT_DEFAULT;
}

/* Swaps an inc for add 1, or a dec for sub 1, inside bb.
 * Returns true when the swap happened (instr has then been destroyed) and
 * false when instr was left in place.
 */
static bool
replace_inc_with_add(void *drcontext, instr_t *instr, instrlist_t *bb)
{
    instr_t *new_instr;
    uint eflags;
    const int opcode = instr_get_opcode(instr);

    DR_ASSERT(opcode == OP_inc || opcode == OP_dec);

    /* add/sub write CF while inc/dec do not, so the swap is only done when
     * that is acceptable.  drreg's liveness analysis covers the rest of this
     * block.  A more sophisticated version could inspect the targets of
     * direct exits rather than assuming CF is live across any branch.
     */
    if (drreg_aflags_liveness(drcontext, instr, &eflags) != DRREG_SUCCESS)
        return false;
    if ((eflags & EFLAGS_READ_CF) != 0)
        return false;

    /* Same destination operand, immediate 1. */
    new_instr = (opcode == OP_inc)
        ? INSTR_CREATE_add(drcontext, instr_get_dst(instr, 0), OPND_CREATE_INT8(1))
        : INSTR_CREATE_sub(drcontext, instr_get_dst(instr, 0), OPND_CREATE_INT8(1));

    /* Keep a LOCK prefix if the original carried one. */
    if (instr_get_prefix_flag(instr, PREFIX_LOCK)) {
        instr_set_prefix_flag(new_instr, PREFIX_LOCK);
    }

    /* The replacement translates back to the original's application pc. */
    instr_set_translation(new_instr, instr_get_app_pc(instr));

    /* Put the new instruction in the list, then free the old one. */
    instrlist_replace(bb, instr, new_instr);
    instr_destroy(drcontext, instr);
    return true;
}

【DynamoRIO 入门教程】四： inc2add.c

老规矩，先看 dr_client_main

event_exit 退出回调函数

再看注册bb回调函数 event_instruction_change

replace_inc_with_add

VS2017 編譯 libpeg

x6dbg配色方案的導入導出

如何在 64位的linux 上安裝32位的庫

Visual Studio “無可用源“ 問題

【DynamoRIO 入門教程】六：inline.c

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結