年底了空閒一些,開始看zend虛擬機,還有幾天過年了,寫下這篇學習筆記,簡單的介紹一下我近期對zend虛擬機的學習
我最近學習了zend虛擬機,首先瞭解到了一個東西
re2c+bison
php正是通過這個東西對php腳本進行的解析,這個我的初步瞭解是在php 胖子的tipi上
http://www.php-internals.com/book/?p=chapt07/07-00-zend-vm
然後我對裏面的demo語法進行了運行瞭解到 ,不管是re2c也好或者是bison也罷,我們都可以生成對應的c語言解析文件,只要我們封裝好良好的頭文件,再引入對應的實現文件,我們就可以實現這個解析
我們使用
re2c -F -c -o zend_language_scanner2.c zend_language_scanner.l
我們就可以實現自己的c源碼文件庫
bison語法解析也是這個樣子的,以一個最簡單的變量賦值爲例子,在zend底層中的處理流程是:我們先使用re2c分析出對應的token,然後使用語法分析器將對應的token掛到對應的ast上。這個re2c和bison我也不是很熟,只是運行了一些簡單的demo,直接貼出幾個php源碼片段吧
詞法分析:
<ST_DOUBLE_QUOTES,ST_HEREDOC,ST_BACKQUOTE>"$"{LABEL}"->"[a-zA-Z_\x80-\xff] {
yyless(yyleng - 3);
yy_push_state(ST_LOOKING_FOR_PROPERTY);
RETURN_TOKEN_WITH_STR(T_VARIABLE, 1);
}
語法解析:
case 386:
{ (yyval.ast) = zend_ast_list_add((yyvsp[-2].ast), (yyvsp[0].ast)); }
break;
case 387:
{ (yyval.ast) = zend_ast_create_list(1, ZEND_AST_CLOSURE_USES, (yyvsp[0].ast)); }
break;
詞法分析是將一句話給拆成一個一個的單詞
語法解析入口是yyparse(yylex是詞法分析的入口),zend把他給改名爲了
zendparse
使用的入口是在
/*
 * Runs the parser and, if it succeeds, compiles the resulting AST into a
 * freshly allocated op_array.
 *
 * type: forwarded to init_op_array(); also compared against
 *       ZEND_USER_FUNCTION to choose the argument of zend_emit_final_return().
 *
 * Returns the new op_array, or NULL when zendparse() returns non-zero.
 * The AST and its arena are destroyed before returning in both cases.
 */
static zend_op_array *zend_compile(int type)
{
zend_op_array *op_array = NULL;
/* Remember the previous flag so it can be restored on exit. */
zend_bool original_in_compilation = CG(in_compilation);
CG(in_compilation) = 1;
CG(ast) = NULL;
/* Arena used for AST node allocations (see zend_ast_alloc). */
CG(ast_arena) = zend_arena_create(1024 * 32);
/* zendparse() returning 0 means success; only then is anything compiled. */
if (!zendparse()) {
int last_lineno = CG(zend_lineno);
zend_file_context original_file_context;
zend_oparray_context original_oparray_context;
/* Saved so the caller's active op_array can be restored afterwards. */
zend_op_array *original_active_op_array = CG(active_op_array);
op_array = emalloc(sizeof(zend_op_array));
init_op_array(op_array, type, INITIAL_OP_ARRAY_SIZE);
CG(active_op_array) = op_array;
/* Optional hook: invoked on the AST only when it has been set. */
if (zend_ast_process) {
zend_ast_process(CG(ast));
}
zend_file_context_begin(&original_file_context);
zend_oparray_context_begin(&original_oparray_context);
/* Compile the root AST node into the active op_array. */
zend_compile_top_stmt(CG(ast));
/* Put back the line number that was current right after parsing. */
CG(zend_lineno) = last_lineno;
zend_emit_final_return(type == ZEND_USER_FUNCTION);
op_array->line_start = 1;
op_array->line_end = last_lineno;
pass_two(op_array);
/* Unwind the contexts in reverse order of their begin calls. */
zend_oparray_context_end(&original_oparray_context);
zend_file_context_end(&original_file_context);
CG(active_op_array) = original_active_op_array;
}
/* Cleanup runs whether or not parsing succeeded. */
zend_ast_destroy(CG(ast));
zend_arena_destroy(CG(ast_arena));
CG(in_compilation) = original_in_compilation;
return op_array;
}
當調用到這裏zend就展開了語法解析和詞法分析
zendparse之後,zend將php語法掛到了ast樹上
/*
 * Compiler-wide state, accessed through the CG() macro elsewhere in this
 * file (e.g. CG(ast), CG(ast_arena), CG(in_compilation) in zend_compile()).
 */
struct _zend_compiler_globals {
zend_stack loop_var_stack;
zend_class_entry *active_class_entry;
zend_string *compiled_filename;
/* Read after parsing and restored before the final return is emitted in zend_compile(). */
int zend_lineno;
/* Op array currently being compiled; zend_compile() swaps it in and restores it. */
zend_op_array *active_op_array;
HashTable *function_table; /* function symbol table */
HashTable *class_table; /* class table */
HashTable filenames_table;
HashTable *auto_globals;
zend_bool parse_error;
/* Set to 1 for the duration of zend_compile(), then restored. */
zend_bool in_compilation;
zend_bool short_tags;
zend_bool unclean_shutdown;
zend_bool ini_parser_unbuffered_errors;
zend_llist open_files;
struct _zend_ini_parser_param *ini_parser_param;
uint32_t start_lineno;
zend_bool increment_lineno;
zend_string *doc_comment;
uint32_t extra_fn_flags;
uint32_t compiler_options; /* set of ZEND_COMPILE_* constants */
zend_oparray_context context;
zend_file_context file_context;
zend_arena *arena;
HashTable interned_strings;
const zend_encoding **script_encoding_list;
size_t script_encoding_list_size;
zend_bool multibyte;
zend_bool detect_unicode;
zend_bool encoding_declared;
/* Root AST node; reset to NULL before zendparse() and destroyed at the end of zend_compile(). */
zend_ast *ast;
/* Arena that zend_ast_alloc() allocates AST nodes from. */
zend_arena *ast_arena;
zend_stack delayed_oplines_stack;
#ifdef ZTS
zval **static_members_table;
int last_static_member;
#endif
};
我們在這裏簡單看幾個ast的函數
zend_ast_alloc?他做了什麼呢?
/*
 * Allocates `size` bytes for an AST node from the compiler's AST arena.
 * The address of CG(ast_arena) is passed so that zend_arena_alloc() can
 * replace the arena pointer when it has to start a new arena.
 */
static inline void *zend_ast_alloc(size_t size) {
return zend_arena_alloc(&CG(ast_arena), size);
}
我認爲這是一個鏈式的arena內存池(當前這一塊不夠用時再申請新的一塊,並通過prev指針把它們串起來),代碼展示
/*
 * Hands out `size` bytes (rounded up by ZEND_MM_ALIGNED_SIZE) from the arena
 * that *arena_ptr refers to and returns a pointer to them.
 *
 * Fast path: if the space between arena->ptr and arena->end is large enough,
 * the current arena->ptr is returned and arena->ptr is advanced by `size`.
 *
 * Slow path: a new arena is emalloc()ed, linked to the old one through
 * ->prev, and *arena_ptr is updated to point at the new arena. The old arena
 * is left untouched.
 */
static zend_always_inline void* zend_arena_alloc(zend_arena **arena_ptr, size_t size)
{
zend_arena *arena = *arena_ptr;
char *ptr = arena->ptr;
size = ZEND_MM_ALIGNED_SIZE(size);
if (EXPECTED(size <= (size_t)(arena->end - ptr))) {
arena->ptr = ptr + size;
} else {
/*
 * The new arena is as large as the current one (end - start of arena),
 * unless the aligned header plus `size` would not fit in that, in which
 * case it is sized to exactly header + size.
 */
size_t arena_size =
UNEXPECTED((size + ZEND_MM_ALIGNED_SIZE(sizeof(zend_arena))) > (size_t)(arena->end - (char*) arena)) ?
(size + ZEND_MM_ALIGNED_SIZE(sizeof(zend_arena))) :
(size_t)(arena->end - (char*) arena);
zend_arena *new_arena = (zend_arena*)emalloc(arena_size);
/* The returned block starts right after the aligned arena header. */
ptr = (char*) new_arena + ZEND_MM_ALIGNED_SIZE(sizeof(zend_arena));
/* The next free byte is just past the block being returned. */
new_arena->ptr = (char*) new_arena + ZEND_MM_ALIGNED_SIZE(sizeof(zend_arena)) + size;
new_arena->end = (char*) new_arena + arena_size;
/* Chain back to the previous arena. */
new_arena->prev = arena;
*arena_ptr = new_arena;
}
return (void*) ptr;
}
如果說剩餘的長度充足,那麼直接返回當前的ptr,並且把arena->ptr向後偏移size
arena->ptr = ptr + size;
如果說當前arena剩餘的空間不夠,就會走else分支,重新申請一塊新的arena
size_t arena_size =
UNEXPECTED((size + ZEND_MM_ALIGNED_SIZE(sizeof(zend_arena))) > (size_t)(arena->end - (char*) arena)) ?
(size + ZEND_MM_ALIGNED_SIZE(sizeof(zend_arena))) :
(size_t)(arena->end - (char*) arena);
zend_arena *new_arena = (zend_arena*)emalloc(arena_size);
ptr = (char*) new_arena + ZEND_MM_ALIGNED_SIZE(sizeof(zend_arena));
new_arena->ptr = (char*) new_arena + ZEND_MM_ALIGNED_SIZE(sizeof(zend_arena)) + size;
new_arena->end = (char*) new_arena + arena_size;
new_arena->prev = arena;
*arena_ptr = new_arena;
再說一個值得注意的內容,用一個例子來說明
/*
 * Creates a ZEND_AST_ZVAL node that carries a copy of *zv.
 *
 * zv:     value copied into the node with ZVAL_COPY_VALUE.
 * attr:   stored in the node's attr field.
 * lineno: stored inside the node's zval through Z_LINENO (zend_ast_zval has
 *         no lineno field of its own).
 *
 * Returns the node cast to the generic zend_ast pointer type.
 */
static zend_always_inline zend_ast * zend_ast_create_zval_int(zval *zv, uint32_t attr, uint32_t lineno) {
zend_ast_zval *ast;
/* Node memory comes from the AST arena via zend_ast_alloc(). */
ast = zend_ast_alloc(sizeof(zend_ast_zval));
ast->kind = ZEND_AST_ZVAL;
ast->attr = attr;
ZVAL_COPY_VALUE(&ast->val, zv);
Z_LINENO(ast->val) = lineno;
return (zend_ast *) ast;
}
注意一個內容 如果是一個變量:
那麼kind是ZEND_AST_ZVAL,存儲結構變爲了zend_ast_zval
則再申請一塊內存,並且把前向指針(prev)記錄爲之前的arena
分析完了內存的申請 還有一個重要的步驟我們需要再看一下樹節點的插入
/*
 * Appends `op` as the last child of the list node `ast`.
 *
 * When the current child count is at least 4 and is_power_of_two() holds for
 * it, the list is first reallocated from zend_ast_list_size(children) to
 * zend_ast_list_size(children * 2) -- presumably the byte sizes for that many
 * child slots.
 *
 * Returns the list node, which may differ from `ast` after a reallocation,
 * so callers must use the returned pointer.
 */
ZEND_API zend_ast * ZEND_FASTCALL zend_ast_list_add(zend_ast *ast, zend_ast *op) {
zend_ast_list *list = zend_ast_get_list(ast);
if (list->children >= 4 && is_power_of_two(list->children)) {
list = zend_ast_realloc(list,
zend_ast_list_size(list->children), zend_ast_list_size(list->children * 2));
}
/* Store the new child in the next free slot and bump the count. */
list->child[list->children++] = op;
return (zend_ast *) list;
}
添加列表直接掛載到child上
樹的構造有前序遍歷 後序遍歷 和 中序遍歷,在這裏是中序遍歷,參考的文章是:
https://segmentfault.com/a/1190000019097615
我們還要認識幾個ast的主要結構:
/* Same as zend_ast, but with children count, which is updated dynamically */
typedef struct _zend_ast_list {
zend_ast_kind kind;
zend_ast_attr attr;
uint32_t lineno;
/* Number of entries currently stored in child[]; incremented by zend_ast_list_add(). */
uint32_t children;
/* Declared with one slot; zend_ast_list_add() reallocates the node to make room for more. */
zend_ast *child[1];
} zend_ast_list;
/*
 * AST node that carries a zval payload instead of child pointers.
 * Lineno is stored in val.u2.lineno
 */
typedef struct _zend_ast_zval {
/* kind and attr are the same leading members as in struct _zend_ast. */
zend_ast_kind kind;
zend_ast_attr attr;
/* The value itself; its u2.lineno also holds the line number. */
zval val;
} zend_ast_zval;
/* Separate structure for function and class declaration, as they need extra information. */
typedef struct _zend_ast_decl {
zend_ast_kind kind;
zend_ast_attr attr; /* Unused - for structure compatibility */
/* A declaration records both a start and an end line, unlike the single lineno of zend_ast. */
uint32_t start_lineno;
uint32_t end_lineno;
uint32_t flags;
unsigned char *lex_pos;
zend_string *doc_comment;
zend_string *name;
/* Fixed array of four child slots (not a variable-length tail). */
zend_ast *child[4];
} zend_ast_decl;
這幾個結構是十分重要的結構,比如說函數、類的聲明(如ZEND_AST_FUNC_DECL、ZEND_AST_CLASS)對應的結構體是_zend_ast_decl,ZEND_AST_ZVAL是_zend_ast_zval.....在這裏我們要注意看一下zend_ast和zend_ast_zval,我們發現前面幾個成員kind、attr都是一樣的,只是最後的成員不同(zend_ast最後是可變長的child數組,zend_ast_zval最後是一個zval),這個東西在後面會說,這個地方是要注意的一個點,有助於理解kind爲64的節點由zend_ast轉換爲zend_ast_zval後數據是如何落地存儲的
後來我自己寫了一個例子,自己調試一把
測試文件:
<?php
$a = 1;
$b = 2;
$c = 3;
$b = $a + $b + $c;
var_dump($b);
var_dump($b);
var_dump($b);
var_dump($b);
var_dump($b);
var_dump($b);
var_dump($b);
var_dump($b);
var_dump($b);
var_dump($b);
var_dump($b);
開始用gdb來進行調試
gdb php
break zend_compile_top_stmt
run test.php
然後程序定到了
(gdb) run test.php
Starting program: /usr/bin/php test.php
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
Breakpoint 1, zend_compile_top_stmt (ast=0x7fffeea83530) at /home/zhanglei/ourc/php-7.3.11/Zend/zend_compile.c:8190
8190 if (!ast) {
我們從這個語法樹上去分析
(gdb) p *compiler_globals.ast
$2 = {kind = 132, attr = 0, lineno = 1, child = {0xf}}
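我們看到zend的kind是132,132是ZEND_AST_STMT_LIST。注意這個節點實際上是一個zend_ast_list,按zend_ast打印時child[0]所在的位置對應的其實是zend_ast_list的children字段,所以這裏的0xf是子節點的個數15(4個賦值加11個var_dump),真正的子節點要從下面的child[1]開始看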
(gdb) p *compiler_globals.ast.child@15
$17 = {0xf, 0x7fffeea83088, 0x7fffeea830e0, 0x7fffeea83138, 0x7fffeea83220, 0x7fffeea832a8, 0x7fffeea83380, 0x7fffeea83408, 0x7fffeea83490, 0x7fffeea83518, 0x7fffeea83630, 0x7fffeea836b8,
0x7fffeea83740, 0x7fffeea837c8, 0x7fffeea83850}
(gdb)
下面我會一個一個分析這個list下面的每一個ast類型
(gdb) p *compiler_globals.ast.child[1]
$18 = {kind = 517, attr = 0, lineno = 3, child = {0x7fffeea83060}}
(gdb) p *compiler_globals.ast.child[2]
$19 = {kind = 517, attr = 0, lineno = 4, child = {0x7fffeea830b8}}
(gdb) p *compiler_globals.ast.child[3]
$20 = {kind = 517, attr = 0, lineno = 5, child = {0x7fffeea83110}}
(gdb) p *compiler_globals.ast.child[4]
$21 = {kind = 517, attr = 0, lineno = 7, child = {0x7fffeea83168}}
(gdb) p *compiler_globals.ast.child[5]
517是ZEND_AST_ASSIGN 賦值,是4個,我們在程序中正是使用了4個賦值
$a = 1;
$b = 2;
$c = 3;
$b = $a + $b + $c;
後面的幾個
(gdb) p *compiler_globals.ast.child[5]
$22 = {kind = 515, attr = 0, lineno = 9, child = {0x7fffeea83238}}
(gdb) p *compiler_globals.ast.child[6]
$23 = {kind = 515, attr = 0, lineno = 10, child = {0x7fffeea83310}}
(gdb) p *compiler_globals.ast.child[7]
$24 = {kind = 515, attr = 0, lineno = 11, child = {0x7fffeea83398}}
(gdb) p *compiler_globals.ast.child[8]
$25 = {kind = 515, attr = 0, lineno = 12, child = {0x7fffeea83420}}
(gdb) p *compiler_globals.ast.child[9]
$26 = {kind = 515, attr = 0, lineno = 13, child = {0x7fffeea834a8}}
(gdb) p *compiler_globals.ast.child[10]
$27 = {kind = 515, attr = 0, lineno = 14, child = {0x7fffeea835c0}}
(gdb) p *compiler_globals.ast.child[11]
$28 = {kind = 515, attr = 0, lineno = 15, child = {0x7fffeea83648}}
(gdb) p *compiler_globals.ast.child[12]
$29 = {kind = 515, attr = 0, lineno = 16, child = {0x7fffeea836d0}}
(gdb) p *compiler_globals.ast.child[13]
$30 = {kind = 515, attr = 0, lineno = 17, child = {0x7fffeea83758}}
(gdb) p *compiler_globals.ast.child[14]
$31 = {kind = 515, attr = 0, lineno = 18, child = {0x7fffeea837e0}}
(gdb) p *compiler_globals.ast.child[15]
$32 = {kind = 515, attr = 0, lineno = 19, child = {0x7fffeea83868}}
都是515,在代碼中是ZEND_AST_CALL代表的是函數調用 現在我們再繼續向下看
其實看到這裏我開始思考了一個問題就是以$a = 1,爲例子(ZEND_AST_ASSIGN)
$符很可能被分析器過濾掉 ,那麼剩下的就是$a 的”a”存在了哪裏,1放到了哪裏
我發現賦值的話直接把值存儲到ast child轉化的zend_ast_zval*的zval裏
(gdb) p (*(*(zend_ast_zval*)(compiler_globals.ast.child[1])).val.value.str.val)
$44 = 1 '\001'
(gdb) p (*(*(zend_ast_zval*)(compiler_globals.ast.child[2])).val.value.str.val)
$45 = 2 '\002'
(gdb) p (*(*(zend_ast_zval*)(compiler_globals.ast.child[3])).val.value.str.val)
$46 = 3 '\003'
分別存儲了1,2,3但還是思考a的位置在哪裏,於是我全局搜索了ZEND_AST_VAR,發現了這麼一段代碼
/*
 * Tries to resolve the variable node `ast` to a compiled variable (CV).
 *
 * ast->child[0] is the variable-name node. When it is a ZEND_AST_ZVAL and the
 * name is not an auto global, `result` is filled in as an IS_CV operand whose
 * var slot comes from lookup_cv() on the active op_array, and SUCCESS is
 * returned. In every other case FAILURE is returned and `result` is not
 * written.
 */
static int zend_try_compile_cv(znode *result, zend_ast *ast) /* {{{ */
{
zend_ast *name_ast = ast->child[0];
if (name_ast->kind == ZEND_AST_ZVAL) {
zval *zv = zend_ast_get_zval(name_ast);
zend_string *name;
/* Usual case: the name zval already holds a string. */
if (EXPECTED(Z_TYPE_P(zv) == IS_STRING)) {
name = zval_make_interned_string(zv);
} else {
/* Otherwise obtain a string from the value via zval_get_string_func(). */
name = zend_new_interned_string(zval_get_string_func(zv));
}
if (zend_is_auto_global(name)) {
return FAILURE;
}
result->op_type = IS_CV;
result->u.op.var = lookup_cv(CG(active_op_array), name);
/* Release only the string that was created in the non-string branch above. */
if (UNEXPECTED(Z_TYPE_P(zv) != IS_STRING)) {
zend_string_release_ex(name, 0);
}
return SUCCESS;
}
return FAILURE;
}
說明了他是一個字符串!!因爲這一段代碼
zend_string *name;
if (EXPECTED(Z_TYPE_P(zv) == IS_STRING)) {
name = zval_make_interned_string(zv);
} else {
name = zend_new_interned_string(zval_get_string_func(zv));
}
好了知道了變量名字的位置我們使用gdb調試吧
(gdb) p *(*(zend_ast_zval*)*compiler_globals.ast.child[2].child.child).val.value.str
$96 = {gc = {refcount = 1, u = {type_info = 454}}, h = 9223372036854953479, len = 1, val = "b"}
(gdb) p *(*(zend_ast_zval*)*compiler_globals.ast.child[1].child.child).val.value.str
$97 = {gc = {refcount = 1, u = {type_info = 454}}, h = 9223372036854953478, len = 1, val = "a"}
(gdb) p *(*(zend_ast_zval*)*compiler_globals.ast.child[3].child.child).val.value.str
$98 = {gc = {refcount = 1, u = {type_info = 454}}, h = 9223372036854953480, len = 1, val = "c"}
到了這裏我們看到了abc的位置
那我們再思考這段代碼中賦值是如何存儲的?然後我們看右子葉中存儲的是type 64 ZEND_AST_ZVAL
其實這裏就說明問題了賦值的數被存放到這裏,再用gdb看
(gdb) p *(zend_ast_zval*)compiler_globals.ast.child[3].child[1]
$158 = {kind = 64, attr = 0, val = {value = {lval = 3, dval = 1.4821969375237396e-323, counted = 0x3, str = 0x3, arr = 0x3, obj = 0x3, res = 0x3, ref = 0x3, ast = 0x3, zv = 0x3, ptr = 0x3, ce = 0x3,
func = 0x3, ww = {w1 = 3, w2 = 0}}, u1 = {v = {type = 4 '\004', type_flags = 0 '\000', u = {call_info = 0, extra = 0}}, type_info = 4}, u2 = {next = 5, cache_slot = 5, opline_num = 5, lineno = 5,
num_args = 5, fe_pos = 5, fe_iter_idx = 5, access_flags = 5, property_guard = 5, constant_flags = 5, extra = 5}}}
透過這裏我們看到了zend的u1.v.type是4對應的宏定義是IS_LONG
這裏就是我們存儲的值
說實話讀代碼讀到這裏我就開始思考一個問題爲什麼zend_ast 和 zend_ast_zval的kind會一樣呢?思考了一會兒我想起了多年前學習的滴水逆向教程的彙編課程,c語言開頭kind都是一樣的,我們看結構體
/*
 * Generic AST node. zend_ast_list and zend_ast_zval start with the same
 * kind and attr members, and nodes of those types are passed around cast
 * to zend_ast* (see the return casts in zend_ast_list_add() and
 * zend_ast_create_zval_int()).
 */
struct _zend_ast {
zend_ast_kind kind; /* Type of the node (ZEND_AST_* enum constant) */
zend_ast_attr attr; /* Additional attribute, use depending on node type */
uint32_t lineno; /* Line number */
zend_ast *child[1]; /* Array of children (using struct hack) */
};
/* Same as zend_ast, but with children count, which is updated dynamically */
typedef struct _zend_ast_list {
zend_ast_kind kind;
zend_ast_attr attr;
uint32_t lineno;
/* Sits between lineno and child[], i.e. where struct _zend_ast has no member. */
uint32_t children;
/* Child pointers; declared with one slot like the child[] of struct _zend_ast. */
zend_ast *child[1];
} zend_ast_list;
你會發現內存分配的位置kind都是在第一個成員變量的位置,結果很明顯了,賦值的話內存前幾個位置是相同的,後面是柔性數組存放zval的,不得不說這個數據結構設計的不錯
zend_ast_kind kind;
zend_ast_attr attr;
uint32_t lineno;
所以值也會對齊
我們再繼續看一下$b = $a + $b + $c;zend是如何處理的
用gdb看第4個child
(gdb) p *compiler_globals.ast.child[4]
$161 = {kind = 517, attr = 0, lineno = 7, child = {0x7fffeea83168}}
這裏的kind是517還是賦值
然後查看child內容
(gdb) p *(*(zend_ast_zval*)*compiler_globals.ast.child[4].child.child).val.value.str
$177 = {gc = {refcount = 1, u = {type_info = 454}}, h = 9223372036854953479, len = 1, val = "b"}
第一個位置存放的是聲明
gdb打印第二個位置:
(gdb) p (*compiler_globals.ast.child[4].child[1])
$182 = {kind = 520, attr = 1, lineno = 7, child = {0x7fffeea831c8}}
這個520對應的kind是ZEND_AST_BINARY_OP,我們在php源碼中去找對應的代碼看看使用情況
case ZEND_AST_BINARY_OP:
switch (ast->attr) {
case ZEND_ADD: BINARY_OP(" + ", 200, 200, 201);
case ZEND_SUB: BINARY_OP(" - ", 200, 200, 201);
case ZEND_MUL: BINARY_OP(" * ", 210, 210, 211);
case ZEND_DIV: BINARY_OP(" / ", 210, 210, 211);
case ZEND_MOD: BINARY_OP(" % ", 210, 210, 211);
case ZEND_SL: BINARY_OP(" << ", 190, 190, 191);
case ZEND_SR: BINARY_OP(" >> ", 190, 190, 191);
case ZEND_CONCAT: BINARY_OP(" . ", 200, 200, 201);
case ZEND_BW_OR: BINARY_OP(" | ", 140, 140, 141);
case ZEND_BW_AND: BINARY_OP(" & ", 160, 160, 161);
case ZEND_BW_XOR: BINARY_OP(" ^ ", 150, 150, 151);
case ZEND_IS_IDENTICAL: BINARY_OP(" === ", 170, 171, 171);
case ZEND_IS_NOT_IDENTICAL: BINARY_OP(" !== ", 170, 171, 171);
case ZEND_IS_EQUAL: BINARY_OP(" == ", 170, 171, 171);
case ZEND_IS_NOT_EQUAL: BINARY_OP(" != ", 170, 171, 171);
case ZEND_IS_SMALLER: BINARY_OP(" < ", 180, 181, 181);
case ZEND_IS_SMALLER_OR_EQUAL: BINARY_OP(" <= ", 180, 181, 181);
case ZEND_POW: BINARY_OP(" ** ", 250, 251, 250);
case ZEND_BOOL_XOR: BINARY_OP(" xor ", 40, 40, 41);
case ZEND_SPACESHIP: BINARY_OP(" <=> ", 180, 181, 181);
EMPTY_SWITCH_DEFAULT_CASE();
我們再繼續看var_dump
(gdb) p (*(*(zend_ast_zval*)(compiler_globals.ast.child[6].child[0])).val.value.str.val@10)
$27 = "var_dump\000"
第一個位置放的是函數名字,然後我們再看第二個位置
(gdb) p *((compiler_globals.ast.child[5].child[1]))
$295 = {kind = 128, attr = 0, lineno = 9, child = {0x1}}
kind是128對應的是ZEND_AST_ARG_LIST 這是一個參數列表,我們看代碼
然後我在代碼中找到了這麼一段
zend_ast_list *list = zend_ast_get_list(args->child[1]->child[1]);
也就是說我們要看child的1的位置而不是0的位置,好了到這裏我們再繼續用gdb做調試,這樣我們就看到了我們的傳入參數
(gdb) p *(*(zend_ast_zval*)*((compiler_globals.ast.child[5].child[1].child[1].child))).val.value.str
$305 = {gc = {refcount = 1, u = {type_info = 454}}, h = 9223372036854953479, len = 1, val = "b"}
好了到了最後我們把我們解析這段代碼用一個圖給表示出來吧!!!!
到這裏zend的語法樹就已經解剖完成了,哈哈哈哈哈