Erlang Abstract Syntax Tree和彙編字節碼

一、抽象語法樹簡介
抽象語法樹(Abstract Syntax Tree)是源代碼的抽象語法結構的樹狀表示。
抽象語法樹是解析器(parser)的產物,解析器廣義來說輸入一般是程序的源碼,輸出一般是語法樹(syntax tree,也叫parse tree等)或抽象語法樹。進一步剝開來,廣義的解析器裏一般會有掃描器(scanner,也叫tokenizer或者lexical analyzer,詞法分析器),以及狹義的解析器(parser,也叫syntax analyzer,語法分析器)。掃描器的輸入一般是文本,經過詞法分析,輸出是將文本切割爲單詞的流。狹義的解析器輸入是單詞的流,經過語法分析,輸出是語法樹或者精簡過的AST。 

例如:將i = a + b * c作爲源代碼輸入到解析器裏,則廣義上的解析器的工作流程如下圖:

        

二、用途
Erlang的BEAM是基於寄存器的虛擬機。Erlang的源碼先經過詞法分析被切割爲單詞(token)流,再經過語法分析變爲AST,然後編譯器遍歷這棵樹,將其轉換爲彙編字節碼。


三、抽象語法樹初識

1、創建一個test1.erl的文件,輸入以下代碼:

-module(test1).
-export([start/0]).

-compile({parse_transform, test_parser}).

%% Returns the string literal "hello world". Because this module is compiled
%% with the test_parser parse transform, the literal is rewritten at compile
%% time and the compiled function returns <<"hello world">> instead.
start() ->
	"hello world".

        2、創建一個test_parser.erl的文件,輸入以下代碼:

-module(test_parser).

-export([parse_transform/2]).

%% Parse-transform entry point, called by the compiler with the module's
%% abstract forms. Prints the incoming forms, rewrites them with parse_ast/2,
%% prints the rewritten forms and returns them to the compiler.
%% The compiler options are not used.
parse_transform(AST, _Options) ->
    io:format("old:~w~n~n", [AST]),
    NewAST = parse_ast(AST, []),
    io:format("new:~w~n~n", [NewAST]),
    NewAST.


%% Walks the top-level forms of a module. Function forms are rewritten with
%% parse_fun/1; every other form (attributes, eof, ...) is passed through
%% unchanged. Acc holds already-processed forms in reverse order; the result
%% is those forms followed by the processed Forms, all in source order.
parse_ast(Forms, Acc) ->
    Rewrite = fun
        ({function, _Line, _Name, _Arity, _Clauses} = Form) -> parse_fun(Form);
        (Form) -> Form
    end,
    %% lists:reverse(Acc, Tail) is lists:reverse(Acc) ++ Tail.
    lists:reverse(Acc, lists:map(Rewrite, Forms)).

%% Rewrites one function form: line, name and arity are kept as they are and
%% the clause list is replaced by the result of parse_clause/2.
parse_fun({function, Line, Name, Arity, Clauses}) ->
    NewClauses = parse_clause(Clauses, []),
    {function, Line, Name, Arity, NewClauses}.

%% Rewrites the body of every clause of a function with parse_return/2.
%%
%% The previous version only matched a clause list with exactly one element,
%% so compiling any function with two or more clauses crashed the transform
%% with function_clause. All clauses are now processed, in source order.
%%
%% Clauses - list of {clause, Line, Patterns, Guards, Body} tuples.
%% Acc     - forms appended after the rewritten clauses (callers pass []).
%% Returns the rewritten clauses followed by Acc; for a single clause this is
%% the same [NewClause | Acc] the previous version produced.
parse_clause(Clauses, Acc) ->
    %% lists:map with an explicit pattern (rather than a comprehension with a
    %% filtering generator) so that a malformed clause still crashes instead
    %% of being silently dropped from the function.
    Rewrite = fun({clause, Line, Patterns, Guards, Body}) ->
        {clause, Line, Patterns, Guards, parse_return(Body, [])}
    end,
    lists:map(Rewrite, Clauses) ++ Acc.

%% Rewrites the expressions of a clause body: every top-level string literal
%% {string, Line, Value} becomes the binary literal <<"Value">>, i.e.
%% {bin, Line, [{bin_element, Line, {string, Line, Value}, default, default}]}.
%%
%% The previous version had no clause for anything other than a string, so a
%% body containing an atom, a call, a variable, ... crashed the transform with
%% function_clause. Such expressions are now passed through unchanged.
%% Only the top-level expressions of the body are inspected; strings nested
%% inside other expressions are left as they are.
%%
%% Acc holds already-processed expressions in reverse order (callers pass []).
%% Returns the processed expressions in source order.
parse_return([{string, Line, Value} | Rest], Acc) ->
    Bin = {bin, Line, [{bin_element, Line, {string, Line, Value}, default, default}]},
    parse_return(Rest, [Bin | Acc]);
parse_return([Expr | Rest], Acc) ->
    parse_return(Rest, [Expr | Acc]);
parse_return([], Acc) ->
    lists:reverse(Acc).

        3、編譯

1> c(test_parser).
{ok,test_parser}
2> c(test1).
old:[{attribute,1,file,{[116,101,115,116,49,46,101,114,108],1}},{attribute,1,module,test1},{attribute,3,export,[{start,0}]},
{function,7,start,0,[{clause,7,[],[],[{string,8,[104,101,108,108,111,32,119,111,114,108,100]}]}]},
{eof,9}]

new:[{attribute,1,file,{[116,101,115,116,49,46,101,114,108],1}},{attribute,1,module,test1},{attribute,3,export,[{start,0}]},{function,7,start,0,[{clause,7,[],[],[{bin,8,[{bin_element,8,{string,8,[104,101,108,108,111,32,119,111,114,108,100]},default,default}]}]}]},{eof,9}]

{ok,test1}

    先不管io:format的打印,我們直接執行test1:start(),

3> test1:start().
<<"hello world">>
    發現終端輸出的是二進制!而我們源碼明明寫的是個字符串!!破編譯器對我們的代碼幹了什麼!!

    別驚慌,這時候我們就可以來看看我們打印的抽象語法樹了。

    先看old,attribute都是源碼文件的屬性,例如file、export、import、module名等,可以略過,以及eof表示源碼文件的結尾,也可以忽略。直接定位到function元組,{function, Line, Name, Arity, Clauses}依次表示函數所在的源碼行、函數名、參數個數、子句(clause)列表。再定位到子句列表,其中每個clause表示函數的一個模式匹配分支,{clause, Line, Patterns, Guards, Body}依次是行號、參數模式、guard和函數體;當前start函數只有一個分支,因此列表裏只有一個clause元組,函數體(也就是返回值)就是一個{string, Line, "hello world"}。
    那麼我們是不是修改這個{string, Line, "hello world"}爲一個二進制字符串,就能達到我們演示的效果呢?是的!

    這就是test1.erl中-compile({parse_transform, test_parser}).的作用:erlang的編譯器會在語法分析完成、生成AST之後,調用我們自己指定的模塊的parse_transform/2函數對AST進行二次更改,於是test_parser.erl裏面就將start函數的返回值更改爲了一個二進制。然後返回一個新的AST,再經樹形解析編譯爲erlang彙編字節碼文件,供erlang vm調用。

四、應用
lager日誌庫 https://github.com/erlang-lager/lager
以前一直疑惑使用lager日誌庫爲什麼要在編譯選項上加一個{parse_transform, lager_transform},現在也懂了,
就是編譯我們項目每一個源文件時,都將最終的抽象語法樹再次添加一些特性代碼。

這裏我們可以打印一下,現在我在一個叫erlserver_app.erl的文件里加入lager日誌打印代碼:

-module(erlserver_app).
-behaviour(application).

-export([start/2, stop/1]).

-compile([{parse_transform, lager_transform}]).

%% Application start callback; both arguments are ignored.
%% Logs a warning through lager:warning/2 (this call is rewritten at compile
%% time by the lager_transform parse transform enabled above), then calls
%% erlserver_sup:start_link/0 and returns its result.
start(_StartType, _StartArgs) ->
    lager:warning("12345678987654321~w~n", ["abcdedeba"]),
    erlserver_sup:start_link().

%%--------------------------------------------------------------------
%% Application stop callback; the state argument is ignored and ok is returned.
stop(_State) ->
    ok.
    然後在lager_transform裏繼續像上面那樣打印,

    最終編譯輸出:

===> old:
[{attribute,1,file,{[47,104,111,109,101,47,108,105,107,117,110,47,115,116,117,100,121,47,101,114,108,97,110,103,47,101,114,108,115,101,114,118,101,114,47,95,98,117,105,108,100,47,100,101,102,97,117,108,116,47,108,105,98,47,101,114,108,115,101,114,118,101,114,47,115,114,99,47,101,114,108,115,101,114,118,101,114,95,97,112,112,46,101,114,108],1}},{attribute,6,module,erlserver_app},{attribute,8,behaviour,application},{attribute,11,export,[{start,2},{stop,1}]},{attribute,13,compile,[]},{function,19,start,2,[{clause,19,[{var,19,'_StartType'},{var,19,'_StartArgs'}],[],[{call,20,{remote,20,{atom,20,lager},{atom,20,warning}},[{string,20,[49,50,51,52,53,54,55,56,57,56,55,54,53,52,51,50,49,126,119,126,110]},{cons,20,{string,20,[97,98,99,100,101,100,101,98,97]},{nil,20}}]},{call,21,{remote,21,{atom,21,erlserver_sup},{atom,21,start_link}},[]}]}]},{function,24,stop,1,[{clause,24,[{var,24,'_State'}],[],[{atom,25,ok}]}]},{eof,30}]


===> new:
[{attribute,1,file,{[47,104,111,109,101,47,108,105,107,117,110,47,115,116,117,100,121,47,101,114,108,97,110,103,47,101,114,108,115,101,114,118,101,114,47,95,98,117,105,108,100,47,100,101,102,97,117,108,116,47,108,105,98,47,101,114,108,115,101,114,118,101,114,47,115,114,99,47,101,114,108,115,101,114,118,101,114,95,97,112,112,46,101,114,108],1}},{attribute,6,module,erlserver_app},{attribute,6,lager_records,[]},{attribute,8,behaviour,application},{attribute,11,export,[{start,2},{stop,1}]},{attribute,13,compile,[]},{function,19,start,2,[{clause,19,[{var,19,'_StartType'},{var,19,'_StartArgs'}],[],[{'case',20,{tuple,20,[{call,20,{atom,20,whereis},[{atom,20,lager_event}]},{call,20,{atom,20,whereis},[{atom,20,lager_event}]},{call,20,{remote,20,{atom,20,lager_config},{atom,20,get}},[{tuple,20,[{atom,20,lager_event},{atom,20,loglevel}]},{tuple,20,[{integer,20,0},{nil,20}]}]}]},[{clause,20,[{tuple,20,[{atom,20,undefined},{atom,20,undefined},{var,20,'_'}]}],[],[{call,20,{'fun',20,{clauses,[{clause,20,[],[],[{tuple,20,[{atom,20,error},{atom,20,lager_not_running}]}]}]}},[]}]},{clause,20,[{tuple,20,[{atom,20,undefined},{var,20,'_'},{var,20,'_'}]}],[],[{call,20,{'fun',20,{clauses,[{clause,20,[],[],[{tuple,20,[{atom,20,error},{tuple,20,[{atom,20,sink_not_configured},{atom,20,lager_event}]}]}]}]}},[]}]},{clause,20,[{tuple,20,[{var,20,'__Piderlserver_app20'},{var,20,'_'},{tuple,20,[{var,20,'__Levelerlserver_app20'},{var,20,'__Traceserlserver_app20'}]}]}],[[{op,20,'orelse',{op,20,'/=',{op,20,'band',{var,20,'__Levelerlserver_app20'},{integer,20,16}},{integer,20,0}},{op,20,'/=',{var,20,'__Traceserlserver_app20'},{nil,20}}}]],[{call,20,{remote,20,{atom,20,lager},{atom,20,do_log}},[{atom,20,warning},{cons,20,{tuple,20,[{atom,20,application},{atom,20,erlserver}]},{cons,20,{tuple,20,[{atom,20,module},{atom,20,erlserver_app}]},{cons,20,{tuple,20,[{atom,20,function},{atom,20,start}]},{cons,20,{tuple,20,[{atom,20,line},{integer,20,20}]},{cons,20,{tuple,20,[{atom,20,pid},{call,20,{atom,20,pid_to
_list},[{call,20,{atom,20,self},[]}]}]},{cons,20,{tuple,20,[{atom,20,node},{call,20,{atom,20,node},[]}]},{call,20,{remote,20,{atom,20,lager},{atom,20,md}},[]}}}}}}},{string,20,[49,50,51,52,53,54,55,56,57,56,55,54,53,52,51,50,49,126,119,126,110]},{cons,20,{string,20,[97,98,99,100,101,100,101,98,97]},{nil,20}},{integer,20,4096},{integer,20,16},{var,20,'__Levelerlserver_app20'},{var,20,'__Traceserlserver_app20'},{atom,20,lager_event},{var,20,'__Piderlserver_app20'}]}]},{clause,20,[{var,20,'_'}],[],[{atom,20,ok}]}]},{call,21,{remote,21,{atom,21,erlserver_sup},{atom,21,start_link}},[]}]}]},{function,24,stop,1,[{clause,24,[{var,24,'_State'}],[],[{atom,25,ok}]}]},{eof,30}]
    看來添加的東西挺多,沒事,慢慢看,我們根據官方文檔The Abstract Format一個一個對比看,

    最終將添加的代碼還原出來:

%% Source-level reading of the code that lager_transform generates for the
%% lager:warning/2 call, reconstructed node by node from the "new" abstract
%% forms printed above.
case {whereis(lager_event), whereis(lager_event), lager_config:get({lager_event, loglevel}, {0, []})} of
    {undefined, undefined, _} ->
        %% The transform wraps each error result in an immediately-called fun.
        (fun() -> {error, lager_not_running} end)();
    {undefined, _, _} ->
        %% A 2-tuple whose second element is itself a tuple.
        (fun() -> {error, {sink_not_configured, lager_event}} end)();
    {__Piderlserver_app20, _, {__Levelerlserver_app20, __Traceserlserver_app20}}
            when __Levelerlserver_app20 band 16 /= 0 orelse __Traceserlserver_app20 /= [] ->
        %% {nil, Line} in the abstract format is the empty list [], not the atom nil.
        lager:do_log(warning,
            %% lager:md() is the tail of the metadata list, not one of its elements.
            [{application, erlserver}, {module, erlserver_app}, {function, start}, {line, 20},
             {pid, pid_to_list(self())}, {node, node()} | lager:md()],
            %% {cons, _, {string, ...}, {nil, _}} is a one-element list holding the string.
            "12345678987654321~w~n", ["abcdedeba"], 4096, 16,
            __Levelerlserver_app20, __Traceserlserver_app20, lager_event, __Piderlserver_app20);
    _ ->
        ok
end

    小小一句lager:warning()原來有這麼多彎彎繞啊!

五、erlang彙編字節碼
一直說erlang彙編字節碼,但是具體是個什麼東西,還沒有認識。

下面新建一個文件test.erl,輸入以下內容:

-module(test).
-export([start/2]).

start(A, B) ->  % Returns abs(A) + abs(B), i.e. the sum of the absolute values of both arguments.
      	abs(A) + abs(B).
        1、編譯.S彙編碼
        erlc -S test.erl
        2、編譯.dis彙編碼
        erl
        c(test).
        erts_debug:df(test). 

        然後打開得到的test.S文件:

{module, test}.  %% version = 0
{module, test}.  %% version = 0

{exports, [{module_info,0},{module_info,1},{start,2}]}.

{attributes, []}.

{labels, 7}.


{function, start, 2, 2}.
  {label,1}.
    {line,[{location,"test.erl",4}]}.
    {func_info,{atom,test},{atom,start},2}.
  {label,2}.
    {line,[{location,"test.erl",5}]}.
    {gc_bif,abs,{f,0},2,[{x,0}],{x,0}}.
    {line,[{location,"test.erl",5}]}.
    {gc_bif,abs,{f,0},2,[{x,1}],{x,1}}.
    {line,[{location,"test.erl",5}]}.
    {gc_bif,'+',{f,0},2,[{x,0},{x,1}],{x,0}}.
    return.


{function, module_info, 0, 4}.
  {label,3}.
    {line,[]}.
    {func_info,{atom,test},{atom,module_info},0}.
  {label,4}.
    {move,{atom,test},{x,0}}.
    {line,[]}.
    {call_ext_only,1,{extfunc,erlang,get_module_info,1}}.


{function, module_info, 1, 6}.
  {label,5}.
    {line,[]}.
    {func_info,{atom,test},{atom,module_info},1}.
  {label,6}.
    {move,{x,0},{x,1}}.
    {move,{atom,test},{x,0}}.
    {line,[]}.
    {call_ext_only,2,{extfunc,erlang,get_module_info,2}}.
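    每一個function元組標明瞭模塊中的一個函數,我們直接定位到start函數。label是代碼位置的標號(跳轉目標),{function, start, 2, 2}最後的2表示函數入口是label 2;label 1下面的func_info只用於在參數匹配失敗時生成function_clause錯誤,可以跳過,我們直接看label 2。指令裏會出現許多寄存器x0,x1,...xn和y0,y1,...yn:x寄存器用來存放函數入參和臨時值,其中函數第一個入參存放在x0裏,x0同時也用來存放函數返回值;而y寄存器是在棧上分配的寄存器,用來保存需要跨函數調用保留的值。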

    有了以上知識,我們直接看彙編碼:

    (忽略某些無用行)    

    {gc_bif,abs,{f,0},2,[{x,0}],{x,0}}. //用x0去調用bif函數abs,存儲結果到x0寄存器,

    {gc_bif,abs,{f,0},2,[{x,1}],{x,1}}. //用x1去調用bif函數abs,存儲結果到x1寄存器,

    {gc_bif,'+',{f,0},2,[{x,0},{x,1}],{x,0}}. //執行bif函數+, x0+x1,結果放入x0寄存器

    return. //返回,直接返回x0的值

   這就是.S彙編字節碼,它具有一定可讀性,方便人和機器閱讀

   再打開.dis彙編碼文件:

00007FDAE5183770: i_func_info_IaaI 0 test start 2 
00007FDAE5183798: i_gc_bif1_jIsId j(0000000000000000) abs/1 x(0) 2 x(0) 
00007FDAE51837C8: i_gc_bif1_jIsId j(0000000000000000) abs/1 x(1) 2 x(1) 
00007FDAE51837F8: i_plus_jIxxd j(0000000000000000) 2 x(0) x(1) x(0) 
00007FDAE5183828: return 

00007FDAE5183830: i_func_info_IaaI 0 test module_info 0 
00007FDAE5183858: move_cr test r(0) 
00007FDAE5183868: allocate_tt 0 1 
00007FDAE5183878: call_bif_e erlang:get_module_info/1 
00007FDAE5183888: deallocate_return_Q 0 

00007FDAE5183898: i_func_info_IaaI 0 test module_info 1 
00007FDAE51838C0: move_rx r(0) x(1) 
00007FDAE51838D0: move_cr test r(0) 
00007FDAE51838E0: allocate_tt 0 2 
00007FDAE51838F0: call_bif_e erlang:get_module_info/2 
00007FDAE5183900: deallocate_return_Q 0 
其實大體上差不多,不過多了真實調用的函數和執行地址,但是erlang的彙編操作碼太多,因此這個文件可讀性很差,如果想搞懂具體指令作用可以去查閱相關文檔。
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章