zend抽象語法樹AST流程解析

年底了空閒一些,開始看zend虛擬機,還有幾天過年了,寫下這篇學習筆記,簡單的介紹一下我近期對zend虛擬機的學習

我最近學習了zend虛擬機,首先瞭解到了一個東西

re2c+bison

php正是通過這個東西對php腳本進行的解析,這個我的初步瞭解是在php 胖子的tipi上

http://www.php-internals.com/book/?p=chapt07/07-00-zend-vm

然後我對裏面的demo語法進行了運行瞭解到 ,不管是re2c也好或者是bison也罷,我們都可以生成對應的c語言解析文件,只要我們封裝好良好的頭文件,再引入對應的實現文件,我們就可以實現這個解析

我們使用

re2c -F -c -o zend_language_scanner2.c zend_language_scanner.l

我們就可以實現自己的c源碼文件庫

bison語法解析也是這個樣子的 ,用一個最簡單的變量賦值爲例子:先用re2c按照zend底層的識別規則分析出對應的token,然後由語法分析器將對應的token掛到對應的ast上。這個re2c和bison我也不是很熟,只是運行了一些簡單的demo,直接貼出幾個php源碼片段吧

詞法分析:

<ST_DOUBLE_QUOTES,ST_HEREDOC,ST_BACKQUOTE>"$"{LABEL}"->"[a-zA-Z_\x80-\xff] {
    yyless(yyleng - 3);
    yy_push_state(ST_LOOKING_FOR_PROPERTY);
    RETURN_TOKEN_WITH_STR(T_VARIABLE, 1);
}

語法解析:

  case 386:

    { (yyval.ast) = zend_ast_list_add((yyvsp[-2].ast), (yyvsp[0].ast)); }

    break;

  case 387:

    { (yyval.ast) = zend_ast_create_list(1, ZEND_AST_CLOSURE_USES, (yyvsp[0].ast)); }

    break;

詞法分析是將一句話給拆成一個一個的單詞

語法解析入口是yyparse(yylex是詞法分析的入口),zend把它改名爲了

zendparse

使用的入口是在

/*
 * Run the parser and, if it succeeds, compile the resulting AST into a
 * freshly allocated op array.
 *
 * type is forwarded to init_op_array() and compared against
 * ZEND_USER_FUNCTION when the final return is emitted.
 *
 * Returns the new op array, or NULL when zendparse() returns non-zero.
 */
static zend_op_array *zend_compile(int type)
{
    zend_op_array *op_array = NULL;
    zend_bool original_in_compilation = CG(in_compilation);

    /* Mark compilation as active and start from an empty AST in a new arena. */
    CG(in_compilation) = 1;
    CG(ast) = NULL;
    CG(ast_arena) = zend_arena_create(1024 * 32);

    /* A zero result from zendparse() is the success path; the code below then works on CG(ast). */
    if (!zendparse()) {
        int last_lineno = CG(zend_lineno);
        zend_file_context original_file_context;
        zend_oparray_context original_oparray_context;
        zend_op_array *original_active_op_array = CG(active_op_array);

        /* Allocate the op array and make it the active one while compiling. */
        op_array = emalloc(sizeof(zend_op_array));
        init_op_array(op_array, type, INITIAL_OP_ARRAY_SIZE);
        CG(active_op_array) = op_array;

        /* Optional hook: when set, it is called with the AST before compilation. */
        if (zend_ast_process) {
            zend_ast_process(CG(ast));
        }

        zend_file_context_begin(&original_file_context);
        zend_oparray_context_begin(&original_oparray_context);
        zend_compile_top_stmt(CG(ast));
        /* Put back the line number that was captured before compiling. */
        CG(zend_lineno) = last_lineno;
        zend_emit_final_return(type == ZEND_USER_FUNCTION);
        op_array->line_start = 1;
        op_array->line_end = last_lineno;
        pass_two(op_array);
        zend_oparray_context_end(&original_oparray_context);
        zend_file_context_end(&original_file_context);

        /* Restore the op array that was active on entry. */
        CG(active_op_array) = original_active_op_array;
    }

    /* Executed on both the success and the failure path. */
    zend_ast_destroy(CG(ast));
    zend_arena_destroy(CG(ast_arena));

    CG(in_compilation) = original_in_compilation;

    return op_array;
}

當調用到這裏zend就展開了語法解析和詞法分析

zendparse之後,zend將php語法掛到了ast樹上

/*
 * Compiler-wide state.
 * NOTE(review): the CG(...) accessor used by zend_compile() presumably
 * resolves to fields of this struct - confirm against the macro definition.
 */
struct _zend_compiler_globals {
    zend_stack loop_var_stack;

    zend_class_entry *active_class_entry;

    zend_string *compiled_filename;

    int zend_lineno; /* saved and restored around compilation in zend_compile() */

    zend_op_array *active_op_array; /* op array currently being filled; swapped in zend_compile() */

    HashTable *function_table;  /* function symbol table */
    HashTable *class_table;     /* class table */

    HashTable filenames_table;

    HashTable *auto_globals;

    zend_bool parse_error;
    zend_bool in_compilation; /* set to 1 for the duration of zend_compile() */
    zend_bool short_tags;

    zend_bool unclean_shutdown;

    zend_bool ini_parser_unbuffered_errors;

    zend_llist open_files;

    struct _zend_ini_parser_param *ini_parser_param;

    uint32_t start_lineno;
    zend_bool increment_lineno;

    zend_string *doc_comment;
    uint32_t extra_fn_flags;

    uint32_t compiler_options; /* set of ZEND_COMPILE_* constants */

    zend_oparray_context context;
    zend_file_context file_context;

    zend_arena *arena;

    HashTable interned_strings;

    const zend_encoding **script_encoding_list;
    size_t script_encoding_list_size;
    zend_bool multibyte;
    zend_bool detect_unicode;
    zend_bool encoding_declared;

    zend_ast *ast;         /* AST handed to zend_compile_top_stmt() by zend_compile() */
    zend_arena *ast_arena; /* arena that zend_ast_alloc() allocates AST nodes from */

    zend_stack delayed_oplines_stack;

    /* The two members below exist only when ZTS is defined. */
#ifdef ZTS
    zval **static_members_table;
    int last_static_member;
#endif
};

我們在這裏簡單看幾個ast的函數

zend_ast_alloc?他做了什麼呢?

/* Allocate size bytes for an AST node from the arena referenced by CG(ast_arena). */
static inline void *zend_ast_alloc(size_t size) {
    return zend_arena_alloc(&CG(ast_arena), size);
}

這是一個arena(區域)內存池而不是環形內存池:每塊內存內部只做指針遞增分配,多塊之間通過prev指針串成鏈表,代碼展示

/*
 * Reserve size bytes (rounded up by ZEND_MM_ALIGNED_SIZE) in the arena that
 * *arena_ptr points to.
 *
 * If the current arena has enough room between ptr and end, its ptr is
 * simply advanced. Otherwise a new arena is obtained with emalloc(), linked
 * to the old one through prev, and stored back into *arena_ptr.
 *
 * Returns a pointer to the start of the reserved region.
 */
static zend_always_inline void* zend_arena_alloc(zend_arena **arena_ptr, size_t size)
{
    zend_arena *arena = *arena_ptr;
    char *ptr = arena->ptr;

    size = ZEND_MM_ALIGNED_SIZE(size);

    if (EXPECTED(size <= (size_t)(arena->end - ptr))) {
        /* Fast path: hand out the current position and bump it past the request. */
        arena->ptr = ptr + size;
    } else {
        /*
         * Slow path: the new arena is as large as the current one
         * (end - start), or request + aligned header if that is larger.
         */
        size_t arena_size =
            UNEXPECTED((size + ZEND_MM_ALIGNED_SIZE(sizeof(zend_arena))) > (size_t)(arena->end - (char*) arena)) ?
                (size + ZEND_MM_ALIGNED_SIZE(sizeof(zend_arena))) :
                (size_t)(arena->end - (char*) arena);
        zend_arena *new_arena = (zend_arena*)emalloc(arena_size);

        /* The returned region starts right after the aligned arena header. */
        ptr = (char*) new_arena + ZEND_MM_ALIGNED_SIZE(sizeof(zend_arena));
        new_arena->ptr = (char*) new_arena + ZEND_MM_ALIGNED_SIZE(sizeof(zend_arena)) + size;
        new_arena->end = (char*) new_arena + arena_size;
        /* Keep the old arena reachable and make the new one current. */
        new_arena->prev = arena;
        *arena_ptr = new_arena;
    }

    return (void*) ptr;
}

如果說剩餘的長度充足,那麼直接返回當前的ptr,並把arena->ptr向後偏移size

arena->ptr = ptr + size;

如果說當前這塊內存池剩餘的尺寸不夠(else分支),就新申請一塊:

        size_t arena_size =
            UNEXPECTED((size + ZEND_MM_ALIGNED_SIZE(sizeof(zend_arena))) > (size_t)(arena->end - (char*) arena)) ?
                (size + ZEND_MM_ALIGNED_SIZE(sizeof(zend_arena))) :
                (size_t)(arena->end - (char*) arena);
        zend_arena *new_arena = (zend_arena*)emalloc(arena_size);

        ptr = (char*) new_arena + ZEND_MM_ALIGNED_SIZE(sizeof(zend_arena));
        new_arena->ptr = (char*) new_arena + ZEND_MM_ALIGNED_SIZE(sizeof(zend_arena)) + size;
        new_arena->end = (char*) new_arena + arena_size;
        new_arena->prev = arena;
        *arena_ptr = new_arena;

再說一個值得注意的內容,用一個例子來說明

/*
 * Build a ZEND_AST_ZVAL node: allocates a zend_ast_zval, stores attr,
 * copies *zv into the node with ZVAL_COPY_VALUE and records lineno through
 * Z_LINENO on the embedded zval. The node is returned as a zend_ast *.
 */
static zend_always_inline zend_ast * zend_ast_create_zval_int(zval *zv, uint32_t attr, uint32_t lineno) {
    zend_ast_zval *ast;

    ast = zend_ast_alloc(sizeof(zend_ast_zval));
    ast->kind = ZEND_AST_ZVAL;
    ast->attr = attr;
    ZVAL_COPY_VALUE(&ast->val, zv);
    /* zend_ast_zval has no lineno member of its own; the line goes into the zval. */
    Z_LINENO(ast->val) = lineno;
    return (zend_ast *) ast;
}

注意一個內容 如果是一個變量:

那麼kind是ZEND_AST_ZVAL,存儲結構變爲了zend_ast_zval

則再申請一塊內存,並且把前向指針(prev)記錄爲之前的arena

分析完了內存的申請 還有一個重要的步驟我們需要再看一下樹節點的插入

/*
 * Append op as the last child of the list node ast.
 *
 * Returns the list as a zend_ast *. The list may be replaced by the result
 * of zend_ast_realloc(), so the returned pointer can differ from ast.
 */
ZEND_API zend_ast * ZEND_FASTCALL zend_ast_list_add(zend_ast *ast, zend_ast *op) {
    zend_ast_list *list = zend_ast_get_list(ast);
    /* At 4, 8, 16, ... children, resize to the size needed for twice as many. */
    if (list->children >= 4 && is_power_of_two(list->children)) {
            list = zend_ast_realloc(list,
            zend_ast_list_size(list->children), zend_ast_list_size(list->children * 2));
    }
    /* Store op in the next free slot and bump the count. */
    list->child[list->children++] = op;
    return (zend_ast *) list;
}

添加列表直接掛載到child上

樹的構造有前序遍歷 後序遍歷 和 中序遍歷,在這裏是中序遍歷,參考的文章是:

https://segmentfault.com/a/1190000019097615

我們還要認識幾個ast的主要結構:

/* Same as zend_ast, but with children count, which is updated dynamically */
typedef struct _zend_ast_list {
    zend_ast_kind kind;
    zend_ast_attr attr;
    uint32_t lineno;
    uint32_t children;  /* number of entries used in child[]; incremented by zend_ast_list_add() */
    zend_ast *child[1]; /* declared with one slot; zend_ast_list_add() indexes it by children */
} zend_ast_list;

/*
 * AST node that carries a zval instead of child pointers.
 * Lineno is stored in val.u2.lineno
 */
typedef struct _zend_ast_zval {
    zend_ast_kind kind;
    zend_ast_attr attr;
    zval val; /* the value carried by this node */
} zend_ast_zval;

/* Separate structure for function and class declaration, as they need extra information. */
typedef struct _zend_ast_decl {
    zend_ast_kind kind;
    zend_ast_attr attr; /* Unused - for structure compatibility */
    uint32_t start_lineno;
    uint32_t end_lineno;
    uint32_t flags;
    unsigned char *lex_pos;
    zend_string *doc_comment;
    zend_string *name;
    zend_ast *child[4]; /* fixed array of four child slots */
} zend_ast_decl;

這幾個結構是十分重要的結構,比如說函數和類的聲明節點對應的結構體是_zend_ast_decl(ZEND_AST_CALL只是普通的zend_ast節點),ZEND_AST_ZVAL是_zend_ast_zval.....在這裏我們要注意看一下zend_ast和zend_ast_zval,我們發現前面幾個kind,attr都是一樣的,zend_ast最後有一個可變長的child數組,這個東西在後面會說,這個地方是要注意的一個點,有助於理解kind爲64的節點由zend_ast*轉換爲zend_ast_zval*後數據是如何落地存儲的

後來我自己寫了一個例子,自己調試一把

測試文件:

<?php

$a = 1;
$b = 2;
$c = 3;

$b = $a + $b + $c;

var_dump($b);
var_dump($b);
var_dump($b);
var_dump($b);
var_dump($b);
var_dump($b);
var_dump($b);
var_dump($b);
var_dump($b);
var_dump($b);
var_dump($b);

開始用gdb來進行調試

gdb php

break zend_compile_top_stmt

run test.php

然後程序定到了

(gdb) run test.php 
Starting program: /usr/bin/php test.php
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".

Breakpoint 1, zend_compile_top_stmt (ast=0x7fffeea83530) at /home/zhanglei/ourc/php-7.3.11/Zend/zend_compile.c:8190
8190        if (!ast) {

我們從這個語法樹上去分析

(gdb) p *compiler_globals.ast
$2 = {kind = 132, attr = 0, lineno = 1, child = {0xf}}

我們看到zend的kind是132,132是ZEND_AST_STMT_LIST

(gdb) p *compiler_globals.ast.child@15
$17 = {0xf, 0x7fffeea83088, 0x7fffeea830e0, 0x7fffeea83138, 0x7fffeea83220, 0x7fffeea832a8, 0x7fffeea83380, 0x7fffeea83408, 0x7fffeea83490, 0x7fffeea83518, 0x7fffeea83630, 0x7fffeea836b8, 
  0x7fffeea83740, 0x7fffeea837c8, 0x7fffeea83850}
(gdb) 

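注意:根節點是列表節點,實際的結構是zend_ast_list,按zend_ast打印時child[0]位置上的0xf其實是children計數(15個子節點),真正的子節點是child[1]到child[15]。下面我會一個一個分析這個list下面的每一個ast類型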

(gdb) p *compiler_globals.ast.child[1]
$18 = {kind = 517, attr = 0, lineno = 3, child = {0x7fffeea83060}}
(gdb) p *compiler_globals.ast.child[2]
$19 = {kind = 517, attr = 0, lineno = 4, child = {0x7fffeea830b8}}
(gdb) p *compiler_globals.ast.child[3]
$20 = {kind = 517, attr = 0, lineno = 5, child = {0x7fffeea83110}}
(gdb) p *compiler_globals.ast.child[4]
$21 = {kind = 517, attr = 0, lineno = 7, child = {0x7fffeea83168}}
(gdb) p *compiler_globals.ast.child[5]

517是ZEND_AST_ASSIGN 賦值,是4個,我們在程序中正是使用了4個賦值

$a = 1;
$b = 2;
$c = 3;

$b = $a + $b + $c;

後面的幾個

(gdb) p *compiler_globals.ast.child[5]
$22 = {kind = 515, attr = 0, lineno = 9, child = {0x7fffeea83238}}
(gdb) p *compiler_globals.ast.child[6]
$23 = {kind = 515, attr = 0, lineno = 10, child = {0x7fffeea83310}}
(gdb) p *compiler_globals.ast.child[7]
$24 = {kind = 515, attr = 0, lineno = 11, child = {0x7fffeea83398}}
(gdb) p *compiler_globals.ast.child[8]
$25 = {kind = 515, attr = 0, lineno = 12, child = {0x7fffeea83420}}
(gdb) p *compiler_globals.ast.child[9]
$26 = {kind = 515, attr = 0, lineno = 13, child = {0x7fffeea834a8}}
(gdb) p *compiler_globals.ast.child[10]
$27 = {kind = 515, attr = 0, lineno = 14, child = {0x7fffeea835c0}}
(gdb) p *compiler_globals.ast.child[11]
$28 = {kind = 515, attr = 0, lineno = 15, child = {0x7fffeea83648}}
(gdb) p *compiler_globals.ast.child[12]
$29 = {kind = 515, attr = 0, lineno = 16, child = {0x7fffeea836d0}}
(gdb) p *compiler_globals.ast.child[13]
$30 = {kind = 515, attr = 0, lineno = 17, child = {0x7fffeea83758}}
(gdb) p *compiler_globals.ast.child[14]
$31 = {kind = 515, attr = 0, lineno = 18, child = {0x7fffeea837e0}}
(gdb) p *compiler_globals.ast.child[15]
$32 = {kind = 515, attr = 0, lineno = 19, child = {0x7fffeea83868}}

都是515,在代碼中是ZEND_AST_CALL代表的是函數調用 現在我們再繼續向下看

其實看到這裏我開始思考了一個問題就是以$a = 1,爲例子(ZEND_AST_ASSIGN)

$符很可能被分析器過濾掉 ,那麼剩下的就是$a 的”a”存在了哪裏,1放到了哪裏

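我發現賦值的話直接把值存儲到ast child轉化的zend_ast_zval*的zval裏(注意:下面的命令是把ZEND_AST_ASSIGN節點本身強轉成zend_ast_zval*,能讀到1、2、3只是因爲各個節點在arena裏連續分配、偏移碰巧對上了;嚴謹的寫法是後文使用的 (zend_ast_zval*)compiler_globals.ast.child[N].child[1])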

(gdb) p (*(*(zend_ast_zval*)(compiler_globals.ast.child[1])).val.value.str.val)
$44 = 1 '\001'
(gdb) p (*(*(zend_ast_zval*)(compiler_globals.ast.child[2])).val.value.str.val)
$45 = 2 '\002'
(gdb) p (*(*(zend_ast_zval*)(compiler_globals.ast.child[3])).val.value.str.val)
$46 = 3 '\003'

分別存儲了1,2,3但還是思考a的位置在哪裏,於是我全局搜索了ZEND_AST_VAR,發現了這麼一段代碼

/*
 * Try to compile the variable node ast into a compiled-variable (CV) operand.
 *
 * Succeeds only when ast->child[0] is a ZEND_AST_ZVAL node and the name is
 * not reported as an auto global by zend_is_auto_global(). On success
 * result is set to IS_CV with the slot returned by lookup_cv().
 *
 * Returns SUCCESS or FAILURE.
 */
static int zend_try_compile_cv(znode *result, zend_ast *ast) /* {{{ */
{
    zend_ast *name_ast = ast->child[0];
    if (name_ast->kind == ZEND_AST_ZVAL) {
        zval *zv = zend_ast_get_zval(name_ast);
        zend_string *name;

        /* The variable name is used as a string; non-string zvals are converted first. */
        if (EXPECTED(Z_TYPE_P(zv) == IS_STRING)) {
            name = zval_make_interned_string(zv);
        } else {
            name = zend_new_interned_string(zval_get_string_func(zv));
        }

        if (zend_is_auto_global(name)) {
            return FAILURE;
        }

        result->op_type = IS_CV;
        result->u.op.var = lookup_cv(CG(active_op_array), name);

        /* Release the string that was created for a non-string name above. */
        if (UNEXPECTED(Z_TYPE_P(zv) != IS_STRING)) {
            zend_string_release_ex(name, 0);
        }

        return SUCCESS;
    }

    return FAILURE;
}

說明了他是一個字符串!!因爲這一段代碼

zend_string *name;

if (EXPECTED(Z_TYPE_P(zv) == IS_STRING)) {
    name = zval_make_interned_string(zv);
} else {
    name = zend_new_interned_string(zval_get_string_func(zv));
}

好了知道了變量名字的位置我們使用gdb調試吧

(gdb) p *(*(zend_ast_zval*)*compiler_globals.ast.child[2].child.child).val.value.str
$96 = {gc = {refcount = 1, u = {type_info = 454}}, h = 9223372036854953479, len = 1, val = "b"}
(gdb) p *(*(zend_ast_zval*)*compiler_globals.ast.child[1].child.child).val.value.str
$97 = {gc = {refcount = 1, u = {type_info = 454}}, h = 9223372036854953478, len = 1, val = "a"}
(gdb) p *(*(zend_ast_zval*)*compiler_globals.ast.child[3].child.child).val.value.str
$98 = {gc = {refcount = 1, u = {type_info = 454}}, h = 9223372036854953480, len = 1, val = "c"}

到了這裏我們看到了abc的位置

那我們再思考這段代碼中賦值是如何存儲的?然後我們看右子葉中存儲的是type 64 ZEND_AST_ZVAL

其實這裏就說明問題了賦值的數被存放到這裏,再用gdb看

(gdb) p *(zend_ast_zval*)compiler_globals.ast.child[3].child[1]
$158 = {kind = 64, attr = 0, val = {value = {lval = 3, dval = 1.4821969375237396e-323, counted = 0x3, str = 0x3, arr = 0x3, obj = 0x3, res = 0x3, ref = 0x3, ast = 0x3, zv = 0x3, ptr = 0x3, ce = 0x3, 
      func = 0x3, ww = {w1 = 3, w2 = 0}}, u1 = {v = {type = 4 '\004', type_flags = 0 '\000', u = {call_info = 0, extra = 0}}, type_info = 4}, u2 = {next = 5, cache_slot = 5, opline_num = 5, lineno = 5, 
      num_args = 5, fe_pos = 5, fe_iter_idx = 5, access_flags = 5, property_guard = 5, constant_flags = 5, extra = 5}}}

透過這裏我們看到了zend的u1.v.type是4對應的宏定義是IS_LONG

這裏就是我們存儲的值

說實話讀代碼讀到這裏我就開始思考一個問題爲什麼zend_ast 和 zend_ast_zval的kind會一樣呢?思考了一會兒我想起了多年前學習的滴水逆向教程的彙編課程,這些c語言結構體開頭的kind都是一樣的,我們看結構體

/* Generic AST node: a small header followed by an array of child pointers. */
struct _zend_ast {
    zend_ast_kind kind; /* Type of the node (ZEND_AST_* enum constant) */
    zend_ast_attr attr; /* Additional attribute, use depending on node type */
    uint32_t lineno;    /* Line number */
    zend_ast *child[1]; /* Array of children (using struct hack) */
};

/* Same as zend_ast, but with children count, which is updated dynamically */
typedef struct _zend_ast_list {
    zend_ast_kind kind;
    zend_ast_attr attr;
    uint32_t lineno;
    uint32_t children;  /* sits between lineno and child[], so child[] starts later than in zend_ast */
    zend_ast *child[1];
} zend_ast_list;

你會發現kind都是在第一個成員變量的位置,結果很明顯了:各種節點結構開頭的kind、attr佈局是相同的,所以可以先按zend_ast*讀出kind,再強轉成對應的具體結構;zend_ast和zend_ast_list後面是child指針數組(struct hack),而zend_ast_zval後面則是直接內嵌了一個zval,不得不說這個數據結構設計的不錯

zend_ast_kind kind;
zend_ast_attr attr;
uint32_t lineno;

所以值也會對齊

我們再繼續看一下$b = $a + $b + $c;zend是如何處理的

用gdb看第4個child

(gdb) p *compiler_globals.ast.child[4]
$161 = {kind = 517, attr = 0, lineno = 7, child = {0x7fffeea83168}}

這裏的kind是517還是賦值

然後查看child內容

(gdb) p *(*(zend_ast_zval*)*compiler_globals.ast.child[4].child.child).val.value.str
$177 = {gc = {refcount = 1, u = {type_info = 454}}, h = 9223372036854953479, len = 1, val = "b"}

第一個位置(child[0])存放的是被賦值的變量$b

gdb打印第二個位置:

(gdb) p (*compiler_globals.ast.child[4].child[1])
$182 = {kind = 520, attr = 1, lineno = 7, child = {0x7fffeea831c8}}

這個520對應的kind是ZEND_AST_BINARY_OP,我們在php源碼中去找對應的代碼看看使用情況

        case ZEND_AST_BINARY_OP:
            switch (ast->attr) {
                case ZEND_ADD:                 BINARY_OP(" + ",   200, 200, 201);
                case ZEND_SUB:                 BINARY_OP(" - ",   200, 200, 201);
                case ZEND_MUL:                 BINARY_OP(" * ",   210, 210, 211);
                case ZEND_DIV:                 BINARY_OP(" / ",   210, 210, 211);
                case ZEND_MOD:                 BINARY_OP(" % ",   210, 210, 211);
                case ZEND_SL:                  BINARY_OP(" << ",  190, 190, 191);
                case ZEND_SR:                  BINARY_OP(" >> ",  190, 190, 191);
                case ZEND_CONCAT:              BINARY_OP(" . ",   200, 200, 201);
                case ZEND_BW_OR:               BINARY_OP(" | ",   140, 140, 141);
                case ZEND_BW_AND:              BINARY_OP(" & ",   160, 160, 161);
                case ZEND_BW_XOR:              BINARY_OP(" ^ ",   150, 150, 151);
                case ZEND_IS_IDENTICAL:        BINARY_OP(" === ", 170, 171, 171);
                case ZEND_IS_NOT_IDENTICAL:    BINARY_OP(" !== ", 170, 171, 171);
                case ZEND_IS_EQUAL:            BINARY_OP(" == ",  170, 171, 171);
                case ZEND_IS_NOT_EQUAL:        BINARY_OP(" != ",  170, 171, 171);
                case ZEND_IS_SMALLER:          BINARY_OP(" < ",   180, 181, 181);
                case ZEND_IS_SMALLER_OR_EQUAL: BINARY_OP(" <= ",  180, 181, 181);
                case ZEND_POW:                 BINARY_OP(" ** ",  250, 251, 250);
                case ZEND_BOOL_XOR:            BINARY_OP(" xor ",  40,  40,  41);
                case ZEND_SPACESHIP:           BINARY_OP(" <=> ", 180, 181, 181);
                EMPTY_SWITCH_DEFAULT_CASE();

我們再繼續看var_dump

(gdb) p (*(*(zend_ast_zval*)(compiler_globals.ast.child[6].child[0])).val.value.str.val@10)
$27 = "var_dump\000"

第一個位置放的是函數名字,然後我們再看第二個位置

(gdb) p *((compiler_globals.ast.child[5].child[1]))
$295 = {kind = 128, attr = 0, lineno = 9, child = {0x1}}

kind是128對應的是ZEND_AST_ARG_LIST 這是一個參數列表,我們看代碼

然後我在代碼中找到了這麼一段

zend_ast_list *list = zend_ast_get_list(args->child[1]->child[1]);

也就是說我們要看child的1的位置而不是0的位置(參數列表同樣是zend_ast_list,按zend_ast打印時child[0]位置上的0x1其實是children計數,真正的第一個參數在child[1]),好了到這裏我們再繼續用gdb做調試,這樣我們就看到了我們的傳入參數

(gdb) p *(*(zend_ast_zval*)*((compiler_globals.ast.child[5].child[1].child[1].child))).val.value.str
$305 = {gc = {refcount = 1, u = {type_info = 454}}, h = 9223372036854953479, len = 1, val = "b"}

好了到了最後我們把我們解析這段代碼用一個圖給表示出來吧!!!!

 

 

 

到這裏zend的語法樹就已經解剖完成了,哈哈哈哈哈

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章