詞法分析淺析

參考資料

《自制編程語言 基於C語言》
《自己動手寫編譯器、鏈接器》
《自己動手構造編譯系統:編譯、彙編與鏈接》
《編譯原理 第二版》

詞法分析

詞法分析主要就是將源碼分解成每一個定義的token,通過詞法解析分析出哪些是關鍵字,哪些是標識符,哪些是字符串等等,詞法分析是編譯的第一階段,通過生成token來爲接下來的語法分析做鋪墊。在實際的分析過程中,詞法分析其實也還包含了其他的一些如符號表相關的維護,但是由於當前只是熟悉一下詞法分析的概念所以就只是實現了一個token的生成過程。

loop:
    get_next_char(獲取字符)
    讀到 if 字符串   -> Token_IF
    讀到 else 字符串 -> Token_ELSE

舉例如上所示,基本的處理流程就是通過不斷的獲取下一個字符來進行詞法的解析。

假如輸入如下
if cond:
	print
else:
	print
	

詞法解析之後
TOKEN_IF("if")  TOKEN_VAR("cond") TOKEN_VAR("print") TOKEN_ELSE("else") TOKEN_VAR("print")

大致的流程就是每讀取完一段字符之後就輸出一個解析完成的token,並將token分成不同的類別,例如關鍵字if或者變量cond等。

腳本實現簡單詞法分析

本次分析的腳本是Python文件,主要就是將字符解析成不同類型的token,其中有關鍵字token,也有變量token,還有運算符標點符號等。

import os


class FileNotExistsError(Exception):
    """Raised when the source file handed to the lexer does not exist."""


class Token(object):
    """One lexical unit: its source text, its category name and its line."""

    def __init__(self, word, type_name, lineno):
        # word: the matched text; type_name: category label such as
        # "TOKEN_ID"; lineno: line number recorded by the lexer.
        self.word, self.type_name, self.lineno = word, type_name, lineno

    def __str__(self):
        # Human-readable form used when tokens are printed.
        return "type : {0}  word : {1}  lineno : {2} ".format(
            self.type_name, self.word, self.lineno
        )


# Maps every punctuation/operator spelling the lexer recognises to the
# type name reported on the resulting Token.  Keys are one or two
# characters long; the lexer tries the two-character spelling first.
# NOTE(review): "TOKEN_GREATE" / "TOKEN_GREATE_EQUAL" look like misspellings
# of "GREATER"; they are emitted at runtime, so they are left unchanged here.
TOKEN_LISTS = {

    # Delimiters and punctuation.
    ",": "TOKEN_COMMA",
    ":": "TOKEN_COLON",
    ";": "TOKEN_SEMI_COLON",
    "(": "TOKEN_LEFT_PAREN",
    ")": "TOKEN_RIGHT_PAREN",
    "[": "TOKEN_LEFT_BRACKET",
    "]": "TOKEN_RIGHT_BRACKET",
    "{": "TOKEN_LEFT_BRACE",
    "}": "TOKEN_RIGHT_BRACE",
    ".": "TOKEN_DOT",
    "..": "TOKEN_DOT_DOT",
    "\"": "TOKEN_DOUBLE_QUOTE",
    "\'": "TOKEN_QUOTE",
    "\t": "TOKEN_TAB",
    "#": "TOKEN_SHARP",

    # Arithmetic, assignment and logical negation.
    "+": "TOKEN_ADD",
    "-": "TOKEN_SUB",
    "*": "TOKEN_MUL",
    "/": "TOKEN_DIV",
    "%": "TOKEN_MOD",
    "=": "TOKEN_ASSIGN",
    "!": "TOKEN_LOGIC_NOT",

    # Single-character bitwise operators.
    "&": "TOKEN_BIT_AND",
    "|": "TOKEN_BIT_OR",
    "~": "TOKEN_BIT_NOT",

    # Two-character shift and logical operators.
    ">>": "TOKEN_BIT_SHIFT_RIGHT",
    "<<": "TOKEN_BIT_SHIFT_LEFT",
    "&&": "TOKEN_LOGIC_AND",
    "||": "TOKEN_LOGIC_OR",


    # Comparison operators.
    "==": "TOKEN_EQUAL",
    "!=": "TOKEN_NOT_EQUAL",
    ">": "TOKEN_GREATE",
    ">=": "TOKEN_GREATE_EQUAL",
    "<": "TOKEN_LESS",
    "<=": "TOKEN_LESS_EQUAL",

}


# Maps each reserved word the lexer recognises to the type name reported on
# the resulting Token.  An identifier whose text is a key of this table is
# emitted as that keyword token instead of a plain "TOKEN_ID".
KEYWORDS_LISTS = {
    # Loops and branching.
    "while": "KEYWORD_TOKEN_WHILE",
    "if": "KEYWORD_TOKEN_IF",
    "elif": "KEYWORD_TOKEN_ELIF",
    "else": "KEYWORD_TOKEN_ELSE",
    "for": "KEYWORD_TOKEN_FOR",
    "in": "KEYWORD_TOKEN_IN",
    "continue": "KEYWORD_TOKEN_CONTINUE",
    "break": "KEYWORD_TOKEN_BREAK",

    # Exception handling.
    "try": "KEYWORD_TOKEN_TRY",
    "except": "KEYWORD_TOKEN_EXCEPT",
    "finally": "KEYWORD_TOKEN_FINALLY",
    "raise": "KEYWORD_TOKEN_RAISE",

    # Word operators.
    "and": "KEYWORD_TOKEN_AND",
    "or": "KEYWORD_TOKEN_OR",
    "is": "KEYWORD_TOKEN_IS",

    # Context managers.
    "with": "KEYWORD_TOKEN_WITH",
    "as": "KEYWORD_TOKEN_AS",

    "assert": "KEYWORD_TOKEN_ASSERT",

    "return": "KEYWORD_TOKEN_RETURN",
    "pass": "KEYWORD_TOKEN_PASS",

    # Constants.
    "None": "KEYWORD_TOKEN_NONE",
    "True": "KEYWORD_TOKEN_TRUE",
    "False": "KEYWORD_TOKEN_FALSE",

    # NOTE: "object" is treated as a keyword by this lexer.
    "object": "KEYWORD_TOKEN_OBJECT",
    "class": "KEYWORD_TOKEN_CLASS",

    # Imports and definitions.
    "import": "KEYWORD_TOKEN_IMPORT",
    "from": "KEYWORD_TOKEN_FROM",
    "def": "KEYWORD_TOKEN_DEF",
    "lambda": "KEYWORD_TOKEN_LAMBDA",
    "del": "KEYWORD_TOKEN_DEL",

    # Scoping, generators and coroutines.
    "nonlocal": "KEYWORD_TOKEN_NONLOCAL",
    "global": "KEYWORD_TOKEN_GLOBAL",
    "yield": "KEYWORD_TOKEN_YIELD",
    "await": "KEYWORD_TOKEN_AWAIT",
    "async": "KEYWORD_TOKEN_ASYNC",
}


# Flat lists of the recognised spellings (the keys of the two tables above),
# in table order.  Built with list() rather than an append loop so that no
# loop variable is left behind in the module namespace.
TOKENS = list(TOKEN_LISTS)

KEYWORDS = list(KEYWORDS_LISTS)


class Parser(object):
    """Hand-written lexer that splits a source file into ``Token`` objects.

    The whole file is read into memory once; ``get_next_token`` then walks it
    character by character and returns one token per call, or ``None`` when
    the input is exhausted.

    Known limitations: only double-quoted strings without escape sequences
    are recognised, triple-quoted strings are not treated specially, only
    decimal integers are recognised, and indentation is ignored.
    """

    def __init__(self, path):
        """Load the file at ``path`` and reset the scanning state.

        :param path: path of the source file to tokenize
        :raises FileNotExistsError: if ``path`` does not exist
        """
        self.cur_token = None
        self.file_path = path
        self.file = None
        self.source_file = None
        self.total_source_length = None
        self.lineno = 1
        # NOTE: "postion" is a misspelling of "position"; the name is kept
        # because it is a public attribute of the instance.
        self.postion = 0
        self.stack = []
        self.init()

    def init(self):
        """Read the whole source file into ``self.source_file``.

        :raises FileNotExistsError: if ``self.file_path`` does not exist
        """
        if not os.path.exists(self.file_path):
            raise FileNotExistsError("file not exists")
        with open(self.file_path, "r") as f:
            self.source_file = f.read()
        self.total_source_length = len(self.source_file)

    def get_next_char(self, is_move=True):
        """Return the character at the current position without consuming it.

        :param is_move: unused; kept for backward compatibility
        :return: the current character, or ``None`` at end of input
        """
        if self.postion < self.total_source_length:
            return self.source_file[self.postion]
        return None

    def advance_one_char(self):
        """Consume one character; does nothing once the end is reached."""
        if self.postion < self.total_source_length:
            self.postion += 1

    def parse_str(self):
        """Scan a double-quoted string such as "abc" (no escape sequences).

        Must be called right after the opening quote has been consumed.

        :return: a ``TOKEN_STRING`` token carrying the text between the
            quotes and the line the string started on, or ``None`` if the
            input ends before the closing quote
        """
        start_line = self.lineno
        str_list = []
        while True:
            c = self.get_next_char()
            if c is None:
                return None
            self.advance_one_char()
            if c == "\"":
                return Token("".join(str_list), "TOKEN_STRING", start_line)
            if c == "\n":
                # Keep the line counter correct for strings spanning lines.
                self.lineno += 1
            str_list.append(c)

    def parse_ID(self, v):
        """Scan an identifier or keyword whose first character is ``v``.

        Names start with a letter or underscore and continue with letters,
        underscores or digits.

        :param v: the already-consumed first character of the name
        :return: a keyword token if the name is in ``KEYWORDS_LISTS``,
            otherwise a ``TOKEN_ID`` token
        """
        var_name = [v]
        while True:
            c = self.get_next_char()
            # is_first=True is what makes check_ID accept digits here; see
            # the note in check_ID about the flag's inverted name.
            if not self.check_ID(c, is_first=True):
                break
            self.advance_one_char()
            var_name.append(c)
        var_str = "".join(var_name)
        return Token(var_str, KEYWORDS_LISTS.get(var_str, "TOKEN_ID"), self.lineno)

    def check_ID(self, v, is_first=False):
        """Tell whether ``v`` may be part of an identifier or keyword.

        NOTE: the flag name is inverted relative to its effect. With the
        default ``is_first=False`` only ASCII letters and ``_`` are accepted
        (the rule for a name's first character); with ``is_first=True``
        digits are accepted as well (the rule for following characters).
        The behaviour is kept as-is so existing callers keep working.

        :param v: the character to test, or ``None`` at end of input
        :param is_first: when true, digits are accepted too
        :return: ``True`` if ``v`` is acceptable, ``False`` otherwise
        """
        if v is None:
            return False
        # Letters and underscore are always allowed.
        if "a" <= v <= "z" or "A" <= v <= "Z" or v == "_":
            return True
        # Digits are only allowed when the flag is set.
        return bool(is_first and v.isdigit())

    def parse_NUM(self, v):
        """Scan a decimal integer whose first digit is ``v``.

        :param v: the already-consumed first digit
        :return: a ``TOKEN_NUM`` token
        """
        nums_list = [v]
        while True:
            c = self.get_next_char()
            # c is None when the number is the last thing in the file.
            if c is None or not c.isdigit():
                return Token("".join(nums_list), "TOKEN_NUM", self.lineno)
            nums_list.append(c)
            self.advance_one_char()

    def skip_shard(self):
        """Skip the rest of a ``#`` comment.

        Stops in front of the terminating newline (so the caller still
        counts the line) or at end of input, whichever comes first.
        """
        while True:
            c = self.get_next_char()
            if c is None or c == "\n":
                return
            self.advance_one_char()

    def get_next_token(self):
        """Return the next token, or ``None`` when the input is exhausted.

        Characters that start no token (spaces, newlines, anything not
        recognised) are skipped; newlines advance ``self.lineno``.
        """
        while True:
            c = self.get_next_char()
            if c is None:
                return None
            self.advance_one_char()
            if c in TOKEN_LISTS:
                n_c = self.get_next_char()
                # Prefer a two-character operator when one matches.  At end
                # of input there is no second character, but the single
                # character must still be reported.
                if n_c is not None:
                    two_char = c + n_c
                    if two_char in TOKEN_LISTS:
                        self.advance_one_char()
                        return Token(two_char, TOKEN_LISTS[two_char], self.lineno)
                if c == "\"":
                    return self.parse_str()
                if c == "#":
                    self.skip_shard()
                    continue
                return Token(c, TOKEN_LISTS[c], self.lineno)
            if self.check_ID(c):
                # Identifier or keyword.
                return self.parse_ID(c)
            if c.isdigit():
                return self.parse_NUM(c)
            if c == "\n":
                self.lineno += 1


def run_file(path):
    """Tokenize the file at ``path`` and print each token on its own line.

    The ``None`` that marks the end of the token stream is printed as well.
    """
    lexer = Parser(path)
    while True:
        tok = lexer.get_next_token()
        print(tok)
        if tok is None:
            break


# Script entry point: tokenize the sample file, looked up relative to the
# current working directory.
if __name__ == '__main__':
    run_file("./python_test.py")

腳本中定義了Python文件中相關的關鍵字與關鍵字符;在詞法解析過程中不支持三個雙引號的註釋解析("""),也不支持字符串中包含轉義字符。因此測試文件開頭的三引號文檔字符串會被拆成三個TOKEN_STRING(兩個空字符串加上中間的內容),見下面輸出的前三行。

測試的python_test.py文件如下;

"""A pure-Python Python bytecode interpreter."""
# Adapted from:
# 1. pyvm2 by Paul Swartz (z3p), from http://www.twistedmatrix.com/users/z3p/
# 2. byterun by Ned Batchelder, github.com/nedbat/byterun

import dis, operator, sys, collections, inspect, types

class Frame(object):
    def __init__(self, code_obj, global_names, local_names, prev_frame):
        self.code_obj = code_obj
        self.global_names = global_names
        self.local_names = local_names
        self.prev_frame = prev_frame
        self.stack = []
        if prev_frame:
            self.builtin_names = prev_frame.builtin_names
        else:
            self.builtin_names = local_names['__builtins__']
            if hasattr(self.builtin_names, '__dict__'):
                self.builtin_names = self.builtin_names.__dict__

        self.last_instruction = 0
        self.block_stack = []

    # Data stack manipulation
    def top(self):
        return self.stack[-1]

    def pop(self):
        return self.stack.pop()

    def push(self, *vals):
        self.stack.extend(vals)

    def popn(self, n):
        if n:
            ret = self.stack[-n:]
            self.stack[-n:] = []
            return ret
        else:
            return []

    # Block stack manipulation
    def push_block(self, b_type, handler=None):
        stack_height = len(self.stack)
        self.block_stack.append(Block(b_type, handler, stack_height))

    def pop_block(self):
        return self.block_stack.pop()

    def unwind_block(self, block):
        if block.type == 'except-handler':
            offset = 3
        else:
            offset = 0

        while len(self.stack) > block.stack_height + offset:
            self.pop()

        if block.type == 'except-handler':
            traceback, value, exctype = self.popn(3)
            return exctype, value, traceback

Block = collections.namedtuple("Block", "type, handler, stack_height")

class Function(object):
    __slots__ = [
        'func_code', 'func_name', 'func_defaults', 'func_globals',
        'func_locals', 'func_dict', 'func_closure',
        '__name__', '__dict__', '__doc__',
        '_vm', '_func',
    ]


    def __init__(self, name, code, globs, defaults, closure, vm):
        self._vm = vm
        self.func_code = code
        self.func_name = self.__name__ = name or code.co_name
        self.func_defaults = tuple(defaults)
        self.func_globals = globs
        self.func_locals = self._vm.frame.local_names
        self.__dict__ = {}
        self.func_closure = closure
        self.__doc__ = code.co_consts[0] if code.co_consts else None

        # Sometimes, we need a real Python function.  This is for that.
        kw = {
            'argdefs': self.func_defaults,
        }
        if closure:
            kw['closure'] = tuple(make_cell(0) for _ in closure)
        self._func = types.FunctionType(code, globs, **kw)

    def __call__(self, *args, **kwargs):
        callargs = inspect.getcallargs(self._func, *args, **kwargs)
        frame = self._vm.make_frame(
            self.func_code, callargs, self.func_globals, {}
        )
        return self._vm.run_frame(frame)

def make_cell(value):
    # Thanks to Alex Gaynor for help with this bit of twistiness.
    fn = (lambda x: lambda: x)(value)
    return fn.__closure__[0]



class VirtualMachineError(Exception):
    pass

class VirtualMachine(object):
    def __init__(self):
        self.frames = []   # The call stack of frames.
        self.frame = None  # The current frame.
        self.return_value = None
        self.last_exception = None

    # Frame manipulation
    def make_frame(self, code, callargs={}, global_names=None, local_names=None):
        if global_names is not None and local_names is not None:
            local_names = global_names
        elif self.frames:
            global_names = self.frame.global_names
            local_names = {}
        else:
            global_names = local_names = {
                '__builtins__': __builtins__,
                '__name__': '__main__',
                '__doc__': None,
                '__package__': None,
            }
        local_names.update(callargs)
        frame = Frame(code, global_names, local_names, self.frame)
        return frame

    def push_frame(self, frame):
        self.frames.append(frame)
        self.frame = frame

    def pop_frame(self):
        self.frames.pop()
        if self.frames:
            self.frame = self.frames[-1]
        else:
            self.frame = None

    # Jumping through bytecode
    def jump(self, jump):
        self.frame.last_instruction = jump

    def run_code(self, code, global_names=None, local_names=None):
        frame = self.make_frame(code, global_names=global_names, local_names=local_names)

        self.run_frame(frame)
        # Check some invariants
        # if self.frames:
        #     raise VirtualMachineError("Frames left over!")
        # if self.frame and self.frame.stack:
        #     raise VirtualMachineError("Data left on stack! %r" % self.frame.stack)

        # for testing, was val = self.run_frame(frame)
        # return val # for testing

    def parse_byte_and_args(self):
        f = self.frame
        opoffset = f.last_instruction
        byteCode = f.code_obj.co_code[opoffset]
        f.last_instruction += 1
        byte_name = dis.opname[byteCode]
        if byteCode >= dis.HAVE_ARGUMENT:
            arg = f.code_obj.co_code[f.last_instruction:f.last_instruction+2]  # index into the bytecode
            f.last_instruction += 2   # advance the instruction pointer
            arg_val = arg[0] + (arg[1] << 8)
            if byteCode in dis.hasconst:   # Look up a constant
                arg = f.code_obj.co_consts[arg_val]
            elif byteCode in dis.hasname:  # Look up a name
                arg = f.code_obj.co_names[arg_val]
            elif byteCode in dis.haslocal: # Look up a local name
                arg = f.code_obj.co_varnames[arg_val]
            elif byteCode in dis.hasjrel:  # Calculate a relative jump
                arg = f.last_instruction + arg_val
            else:
                arg = arg_val
            argument = [arg]
        else:
            argument = []

        return byte_name, argument

    def dispatch(self, byte_name, argument):
        # When later unwinding the block stack,
        # we need to keep track of why we are doing it.
        why = None
        try:
            bytecode_fn = getattr(self, 'byte_%s' % byte_name, None)
            if bytecode_fn is None:
                if byte_name.startswith('UNARY_'):
                    self.unaryOperator(byte_name[6:])
                elif byte_name.startswith('BINARY_'):
                    self.binaryOperator(byte_name[7:])
                else:
                    raise VirtualMachineError(
                        "unsupported bytecode type: %s" % byte_name
                    )
            else:
                why = bytecode_fn(*argument)
        except:
            # deal with exceptions encountered while executing the op.
            self.last_exception = sys.exc_info()[:2] + (None,)
            why = 'exception'

        return why

    def manage_block_stack(self, why):
        block = self.frame.block_stack[-1]

        if block.type == 'loop' and why == 'continue':
            self.jump(self.return_value)
            why = None
            return why

        self.frame.pop_block()
        current_exc = self.frame.unwind_block(block)
        if current_exc is not None:
            self.last_exception = current_exc

        if block.type == 'loop' and why == 'break':
            self.jump(block.handler)
            why = None

        elif (block.type in ['setup-except', 'finally'] and why == 'exception'):
            self.frame.push_block('except-handler')
            exctype, value, tb = self.last_exception
            self.frame.push(tb, value, exctype)
            self.frame.push(tb, value, exctype) # yes, twice
            self.jump(block.handler)
            why = None

        elif block.type == 'finally':
            if why in ('return', 'continue'):
                self.frame.push(self.return_value)
            self.frame.push(why)
            self.jump(block.handler)
            why = None

        return why


    def run_frame(self, frame):
        self.push_frame(frame)
        while True:
            byte_name, argument = self.parse_byte_and_args()

            why = self.dispatch(byte_name, argument)

            # Deal with any block management we need to do
            while why and frame.block_stack:
                why = self.manage_block_stack(why)

            if why:
                break

        self.pop_frame()

        if why == 'exception':
            exc, val, tb = self.last_exception
            e = exc(val)
            e.__traceback__ = tb
            raise e

        return self.return_value

    ## Stack manipulation

    def byte_LOAD_CONST(self, const):
        self.frame.push(const)

    def byte_POP_TOP(self):
        self.frame.pop()

    def byte_DUP_TOP(self):
        self.frame.push(self.frame.top())

    ## Names
    def byte_LOAD_NAME(self, name):
        frame = self.frame
        if name in frame.local_names:
            val = frame.local_names[name]
        elif name in frame.global_names:
            val = frame.global_names[name]
        elif name in frame.builtin_names:
            val = frame.builtin_names[name]
        else:
            raise NameError("name '%s' is not defined" % name)
        self.frame.push(val)

    def byte_STORE_NAME(self, name):
        self.frame.local_names[name] = self.frame.pop()

    def byte_DELETE_NAME(self, name):
        del self.frame.local_names[name]

    def byte_LOAD_FAST(self, name):
        if name in self.frame.local_names:
            val = self.frame.local_names[name]
        else:
            raise UnboundLocalError(
                "local variable '%s' referenced before assignment" % name
            )
        self.frame.push(val)

    def byte_STORE_FAST(self, name):
        self.frame.local_names[name] = self.frame.pop()

    def byte_LOAD_GLOBAL(self, name):
        f = self.frame
        if name in f.global_names:
            val = f.global_names[name]
        elif name in f.builtin_names:
            val = f.builtin_names[name]
        else:
            raise NameError("global name '%s' is not defined" % name)
        f.push(val)

    ## Operators

    UNARY_OPERATORS = {
        'POSITIVE': operator.pos,
        'NEGATIVE': operator.neg,
        'NOT':      operator.not_,
        'INVERT':   operator.invert,
    }

    def unaryOperator(self, op):
        x = self.frame.pop()
        self.frame.push(self.UNARY_OPERATORS[op](x))

    BINARY_OPERATORS = {
        'POWER':    pow,
        'MULTIPLY': operator.mul,
        'FLOOR_DIVIDE': operator.floordiv,
        'TRUE_DIVIDE':  operator.truediv,
        'MODULO':   operator.mod,
        'ADD':      operator.add,
        'SUBTRACT': operator.sub,
        'SUBSCR':   operator.getitem,
        'LSHIFT':   operator.lshift,
        'RSHIFT':   operator.rshift,
        'AND':      operator.and_,
        'XOR':      operator.xor,
        'OR':       operator.or_,
    }

    def binaryOperator(self, op):
        x, y = self.frame.popn(2)
        self.frame.push(self.BINARY_OPERATORS[op](x, y))

    COMPARE_OPERATORS = [
        operator.lt,
        operator.le,
        operator.eq,
        operator.ne,
        operator.gt,
        operator.ge,
        lambda x, y: x in y,
        lambda x, y: x not in y,
        lambda x, y: x is y,
        lambda x, y: x is not y,
        lambda x, y: issubclass(x, Exception) and issubclass(x, y),
    ]

    def byte_COMPARE_OP(self, opnum):
        x, y = self.frame.popn(2)
        self.frame.push(self.COMPARE_OPERATORS[opnum](x, y))

    ## Attributes and indexing

    def byte_LOAD_ATTR(self, attr):
        obj = self.frame.pop()
        val = getattr(obj, attr)
        self.frame.push(val)

    def byte_STORE_ATTR(self, name):
        val, obj = self.frame.popn(2)
        setattr(obj, name, val)

    def byte_STORE_SUBSCR(self):
        val, obj, subscr = self.frame.popn(3)
        obj[subscr] = val

    ## Building

    def byte_BUILD_TUPLE(self, count):
        elts = self.frame.popn(count)
        self.frame.push(tuple(elts))

    def byte_BUILD_LIST(self, count):
        elts = self.frame.popn(count)
        self.frame.push(elts)

    def byte_BUILD_MAP(self, size):
        self.frame.push({})

    def byte_STORE_MAP(self):
        the_map, val, key = self.frame.popn(3)
        the_map[key] = val
        self.frame.push(the_map)

    def byte_UNPACK_SEQUENCE(self, count):
        seq = self.frame.pop()
        for x in reversed(seq):
            self.frame.push(x)

    def byte_BUILD_SLICE(self, count):
        if count == 2:
            x, y = self.frame.popn(2)
            self.frame.push(slice(x, y))
        elif count == 3:
            x, y, z = self.frame.popn(3)
            self.frame.push(slice(x, y, z))
        else:           # pragma: no cover
            raise VirtualMachineError("Strange BUILD_SLICE count: %r" % count)

    def byte_LIST_APPEND(self, count):
        val = self.frame.pop()
        the_list = self.frame.stack[-count] # peek
        the_list.append(val)


    ## Jumps

    def byte_JUMP_FORWARD(self, jump):
        self.jump(jump)

    def byte_JUMP_ABSOLUTE(self, jump):
        self.jump(jump)

    def byte_POP_JUMP_IF_TRUE(self, jump):
        val = self.frame.pop()
        if val:
            self.jump(jump)

    def byte_POP_JUMP_IF_FALSE(self, jump):
        val = self.frame.pop()
        if not val:
            self.jump(jump)

    def byte_JUMP_IF_TRUE_OR_POP(self, jump):
        val = self.frame.top()
        if val:
            self.jump(jump)
        else:
            self.frame.pop()

    def byte_JUMP_IF_FALSE_OR_POP(self, jump):
        val = self.frame.top()
        if not val:
            self.jump(jump)
        else:
            self.frame.pop()

    ## Blocks

    def byte_SETUP_LOOP(self, dest):
        self.frame.push_block('loop', dest)

    def byte_GET_ITER(self):
        self.frame.push(iter(self.frame.pop()))

    def byte_FOR_ITER(self, jump):
        iterobj = self.frame.top()
        try:
            v = next(iterobj)
            self.frame.push(v)
        except StopIteration:
            self.frame.pop()
            self.jump(jump)

    def byte_BREAK_LOOP(self):
        return 'break'

    def byte_CONTINUE_LOOP(self, dest):
        # This is a trick with the return value.
        # While unrolling blocks, continue and return both have to preserve
        # state as the finally blocks are executed.  For continue, it's
        # where to jump to, for return, it's the value to return.  It gets
        # pushed on the stack for both, so continue puts the jump destination
        # into return_value.
        self.return_value = dest
        return 'continue'

    def byte_SETUP_EXCEPT(self, dest):
        self.frame.push_block('setup-except', dest)

    def byte_SETUP_FINALLY(self, dest):
        self.frame.push_block('finally', dest)

    def byte_POP_BLOCK(self):
        self.frame.pop_block()

    def byte_RAISE_VARARGS(self, argc):
        cause = exc = None
        if argc == 2:
            cause = self.frame.pop()
            exc = self.frame.pop()
        elif argc == 1:
            exc = self.frame.pop()
        return self.do_raise(exc, cause)

    def do_raise(self, exc, cause):
        if exc is None:         # reraise
            exc_type, val, tb = self.last_exception

        elif type(exc) == type:  # As in `raise ValueError`
            exc_type = exc
            val = exc()             # Make an instance.
        elif isinstance(exc, BaseException):
            # As in `raise ValueError('foo')`
            exc_type = type(exc)
            val = exc
        else:
            return 'exception' # failure

        self.last_exception = exc_type, val, val.__traceback__
        return 'exception'

    def byte_POP_EXCEPT(self):
        block = self.frame.pop_block()
        if block.type != 'except-handler':
            raise Exception("popped block is not an except handler")
        current_exc = self.frame.unwind_block(block)
        if current_exc is not None:
            self.last_exception = current_exc

    ## Functions

    def byte_MAKE_FUNCTION(self, argc):
        name = self.frame.pop()
        code = self.frame.pop()
        defaults = self.frame.popn(argc)
        globs = self.frame.global_names
        #TODO: if we're not supporting kwargs, do we need the defaults?
        fn = Function(name, code, globs, defaults, None, self)
        self.frame.push(fn)

    def byte_CALL_FUNCTION(self, arg):
        lenKw, lenPos = divmod(arg, 256) # KWargs not supported in byterun
        posargs = self.frame.popn(lenPos)

        func = self.frame.pop()
        frame = self.frame
        retval = func(*posargs)
        self.frame.push(retval)

    def byte_RETURN_VALUE(self):
        self.return_value = self.frame.pop()
        return "return"

    ## Importing

    def byte_IMPORT_NAME(self, name):
        level, fromlist = self.frame.popn(2)
        frame = self.frame
        self.frame.push(__import__(name, frame.global_names, frame.local_names, fromlist, level))

    def byte_IMPORT_FROM(self, name):
        mod = self.frame.top()
        self.frame.push(getattr(mod, name))

    ## And the rest...
    def byte_LOAD_BUILD_CLASS(self):
        self.frame.push(__build_class__)

    def byte_STORE_LOCALS(self):
        self.frame.local_names = self.frame.pop()



測試輸出的類型如下;

type : TOKEN_STRING  word :   lineno : 1 
type : TOKEN_STRING  word : A pure-Python Python bytecode interpreter.  lineno : 1 
type : TOKEN_STRING  word :   lineno : 1 
type : KEYWORD_TOKEN_IMPORT  word : import  lineno : 6 
type : TOKEN_ID  word : dis  lineno : 6 
type : TOKEN_COMMA  word : ,  lineno : 6 
type : TOKEN_ID  word : operator  lineno : 6 
type : TOKEN_COMMA  word : ,  lineno : 6 
type : TOKEN_ID  word : sys  lineno : 6 
type : TOKEN_COMMA  word : ,  lineno : 6 
type : TOKEN_ID  word : collections  lineno : 6 
type : TOKEN_COMMA  word : ,  lineno : 6 
type : TOKEN_ID  word : inspect  lineno : 6 
type : TOKEN_COMMA  word : ,  lineno : 6 
type : TOKEN_ID  word : types  lineno : 6 
type : KEYWORD_TOKEN_CLASS  word : class  lineno : 8 
type : TOKEN_ID  word : Frame  lineno : 8 
type : TOKEN_LEFT_PAREN  word : (  lineno : 8 
type : KEYWORD_TOKEN_OBJECT  word : object  lineno : 8 
type : TOKEN_RIGHT_PAREN  word : )  lineno : 8 
type : TOKEN_COLON  word : :  lineno : 8 
type : KEYWORD_TOKEN_DEF  word : def  lineno : 9 
type : TOKEN_ID  word : __init__  lineno : 9 
type : TOKEN_LEFT_PAREN  word : (  lineno : 9 
type : TOKEN_ID  word : self  lineno : 9 
type : TOKEN_COMMA  word : ,  lineno : 9 
type : TOKEN_ID  word : code_obj  lineno : 9 
type : TOKEN_COMMA  word : ,  lineno : 9 
type : TOKEN_ID  word : global_names  lineno : 9 
type : TOKEN_COMMA  word : ,  lineno : 9 
type : TOKEN_ID  word : local_names  lineno : 9 
type : TOKEN_COMMA  word : ,  lineno : 9 
type : TOKEN_ID  word : prev_frame  lineno : 9 
type : TOKEN_RIGHT_PAREN  word : )  lineno : 9 
type : TOKEN_COLON  word : :  lineno : 9 

 ...
 省略
 ...

type : TOKEN_LEFT_PAREN  word : (  lineno : 571 
type : TOKEN_ID  word : self  lineno : 571 
type : TOKEN_RIGHT_PAREN  word : )  lineno : 571 
type : TOKEN_COLON  word : :  lineno : 571 
type : TOKEN_ID  word : self  lineno : 572 
type : TOKEN_DOT  word : .  lineno : 572 
type : TOKEN_ID  word : frame  lineno : 572 
type : TOKEN_DOT  word : .  lineno : 572 
type : TOKEN_ID  word : push  lineno : 572 
type : TOKEN_LEFT_PAREN  word : (  lineno : 572 
type : TOKEN_ID  word : __build_class__  lineno : 572 
type : TOKEN_RIGHT_PAREN  word : )  lineno : 572 
type : KEYWORD_TOKEN_DEF  word : def  lineno : 574 
type : TOKEN_ID  word : byte_STORE_LOCALS  lineno : 574 
type : TOKEN_LEFT_PAREN  word : (  lineno : 574 
type : TOKEN_ID  word : self  lineno : 574 
type : TOKEN_RIGHT_PAREN  word : )  lineno : 574 
type : TOKEN_COLON  word : :  lineno : 574 
type : TOKEN_ID  word : self  lineno : 575 
type : TOKEN_DOT  word : .  lineno : 575 
type : TOKEN_ID  word : frame  lineno : 575 
type : TOKEN_DOT  word : .  lineno : 575 
type : TOKEN_ID  word : local_names  lineno : 575 
type : TOKEN_ASSIGN  word : =  lineno : 575 
type : TOKEN_ID  word : self  lineno : 575 
type : TOKEN_DOT  word : .  lineno : 575 
type : TOKEN_ID  word : frame  lineno : 575 
type : TOKEN_DOT  word : .  lineno : 575 
type : TOKEN_ID  word : pop  lineno : 575 
type : TOKEN_LEFT_PAREN  word : (  lineno : 575 
type : TOKEN_RIGHT_PAREN  word : )  lineno : 575 
None

從輸出可以看出,每一次詞法解析生成的token都帶有類型標識,並標註了該token在源碼文件中的行號。但是詞法解析的過程中並沒有處理Python中一直強調的縮進(四個空格)語法格式問題,本次代碼只是把源文件切分成了對應的token。

總結

本文只是簡單的實現了一個詞法分析的流程而已,只是爲了加深對詞法解析這一基礎流程的理解程度。簡單瀏覽有關編譯器相關內容之後,確實編譯器內容比較深入與複雜,需要動手實踐理解的地方也很多,本文只是簡單的當做練習記錄。由於本人才疏學淺,如有錯誤請批評指正。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章