參考資料
《自制編程語言 基於C語言》
《自己動手寫編譯器、鏈接器》
《自己動手構造編譯系統:編譯、彙編與鏈接》
《編譯原理 第二版》
詞法分析
詞法分析主要就是將源碼分解成每一個定義的token,通過詞法解析分析出哪些是關鍵字,哪些是標識符,哪些是字符串等等,詞法分析是編譯的第一階段,通過生成token來爲接下來的語法分析做鋪墊。在實際的分析過程中,詞法分析其實也還包含了其他的一些如符號表相關的維護,但是由於當前只是熟悉一下詞法分析的概念所以就只是實現了一個token的生成過程。
舉例如上所示,基本的處理流程就是通過不斷的獲取下一個字符來進行詞法的解析。
假如輸入如下
if cond:
print
else:
print
詞法解析之後
KEYWORD_TOKEN_IF("if") TOKEN_ID("cond") TOKEN_COLON(":") TOKEN_ID("print") KEYWORD_TOKEN_ELSE("else") TOKEN_COLON(":") TOKEN_ID("print")
大致的流程就是逐個字符掃描源碼,每識別出一個完整的單詞就輸出一個解析完成的token,並將token分成不同的類別,例如關鍵字if、變量cond以及標點符號冒號等。
腳本實現簡單詞法分析
本次分析的腳本是Python文件,主要就是將字符解析成不同類型的token,其中有關鍵字token,也有變量token,還有運算符標點符號等。
import os
class FileNotExistsError(Exception):
    """Raised when the source file to be tokenized does not exist."""
class Token(object):
    """One lexical unit: its text, its category name and its source line."""

    def __init__(self, word, type_name, lineno):
        """Store the three fields as given; nothing is validated.

        :param word: the matched source text
        :param type_name: category label such as ``TOKEN_ID``
        :param lineno: line number the token was found on
        """
        self.word = word
        self.type_name = type_name
        self.lineno = lineno

    def __str__(self):
        """Return the ``type : .. word : .. lineno : .. `` display form.

        The trailing space is part of the format.
        """
        fields = (self.type_name, self.word, self.lineno)
        return "type : {0} word : {1} lineno : {2} ".format(*fields)
# Maps every punctuation / operator lexeme the lexer recognises to its token
# type name. Lexemes are one or two characters long; the lexer tries the
# two-character form first.
TOKEN_LISTS = {
    # separators and brackets
    ",": "TOKEN_COMMA",
    ":": "TOKEN_COLON",
    ";": "TOKEN_SEMI_COLON",
    "(": "TOKEN_LEFT_PAREN",
    ")": "TOKEN_RIGHT_PAREN",
    "[": "TOKEN_LEFT_BRACKET",
    "]": "TOKEN_RIGHT_BRACKET",
    "{": "TOKEN_LEFT_BRACE",
    "}": "TOKEN_RIGHT_BRACE",
    ".": "TOKEN_DOT",
    "..": "TOKEN_DOT_DOT",
    # quotes, tab and the comment marker
    '"': "TOKEN_DOUBLE_QUOTE",
    "'": "TOKEN_QUOTE",
    "\t": "TOKEN_TAB",
    "#": "TOKEN_SHARP",
    # arithmetic and assignment
    "+": "TOKEN_ADD",
    "-": "TOKEN_SUB",
    "*": "TOKEN_MUL",
    "/": "TOKEN_DIV",
    "%": "TOKEN_MOD",
    "=": "TOKEN_ASSIGN",
    # logical and bitwise operators
    "!": "TOKEN_LOGIC_NOT",
    "&": "TOKEN_BIT_AND",
    "|": "TOKEN_BIT_OR",
    "~": "TOKEN_BIT_NOT",
    ">>": "TOKEN_BIT_SHIFT_RIGHT",
    "<<": "TOKEN_BIT_SHIFT_LEFT",
    "&&": "TOKEN_LOGIC_AND",
    "||": "TOKEN_LOGIC_OR",
    # comparisons
    "==": "TOKEN_EQUAL",
    "!=": "TOKEN_NOT_EQUAL",
    ">": "TOKEN_GREATE",
    ">=": "TOKEN_GREATE_EQUAL",
    "<": "TOKEN_LESS",
    "<=": "TOKEN_LESS_EQUAL",
}
# Maps every reserved word the lexer recognises to its token type name.
# An identifier whose text is a key of this table is emitted with the mapped
# type instead of TOKEN_ID.
# "object" is not a Python keyword; it is kept because existing output
# classifies it as KEYWORD_TOKEN_OBJECT.
KEYWORDS_LISTS = {
    # control flow
    "while": "KEYWORD_TOKEN_WHILE",
    "if": "KEYWORD_TOKEN_IF",
    "elif": "KEYWORD_TOKEN_ELIF",
    "else": "KEYWORD_TOKEN_ELSE",
    "for": "KEYWORD_TOKEN_FOR",
    "in": "KEYWORD_TOKEN_IN",
    "continue": "KEYWORD_TOKEN_CONTINUE",
    "break": "KEYWORD_TOKEN_BREAK",
    # exceptions
    "try": "KEYWORD_TOKEN_TRY",
    "except": "KEYWORD_TOKEN_EXCEPT",
    "finally": "KEYWORD_TOKEN_FINALLY",
    "raise": "KEYWORD_TOKEN_RAISE",
    # boolean / identity operators
    "and": "KEYWORD_TOKEN_AND",
    "or": "KEYWORD_TOKEN_OR",
    # "not" was missing, so `not x` used to lex as a plain identifier.
    "not": "KEYWORD_TOKEN_NOT",
    "is": "KEYWORD_TOKEN_IS",
    # statements
    "with": "KEYWORD_TOKEN_WITH",
    "as": "KEYWORD_TOKEN_AS",
    "assert": "KEYWORD_TOKEN_ASSERT",
    "return": "KEYWORD_TOKEN_RETURN",
    "pass": "KEYWORD_TOKEN_PASS",
    # constants
    "None": "KEYWORD_TOKEN_NONE",
    "True": "KEYWORD_TOKEN_TRUE",
    "False": "KEYWORD_TOKEN_FALSE",
    # definitions, imports and scoping
    "object": "KEYWORD_TOKEN_OBJECT",
    "class": "KEYWORD_TOKEN_CLASS",
    "import": "KEYWORD_TOKEN_IMPORT",
    "from": "KEYWORD_TOKEN_FROM",
    "def": "KEYWORD_TOKEN_DEF",
    "lambda": "KEYWORD_TOKEN_LAMBDA",
    "del": "KEYWORD_TOKEN_DEL",
    "nonlocal": "KEYWORD_TOKEN_NONLOCAL",
    "global": "KEYWORD_TOKEN_GLOBAL",
    # generators and coroutines
    "yield": "KEYWORD_TOKEN_YIELD",
    "await": "KEYWORD_TOKEN_AWAIT",
    "async": "KEYWORD_TOKEN_ASYNC",
}
# Flat lists of the recognised lexemes, in table order; the lexer uses them
# for membership tests.
TOKENS = list(TOKEN_LISTS)
KEYWORDS = list(KEYWORDS_LISTS)
class Parser(object):
    """Hand-written lexer that turns a source file into ``Token`` objects.

    The whole file is read into memory on construction. ``get_next_token``
    then walks the text one character at a time, using ``postion`` as the
    cursor and ``lineno`` as the current 1-based line counter.
    """

    def __init__(self, path):
        """Load ``path`` and place the cursor on its first character.

        :param path: path of the source file to tokenize
        :raises FileNotExistsError: if ``path`` does not exist
        """
        self.cur_token = None
        self.file_path = path
        self.file = None
        self.source_file = None
        self.total_source_length = None
        self.lineno = 1
        # (sic) the misspelt attribute name is kept for compatibility.
        self.postion = 0
        self.stack = []
        self.init()

    def init(self):
        """Read the whole source file into ``source_file``.

        :raises FileNotExistsError: if ``file_path`` does not exist
        """
        if not os.path.exists(self.file_path):
            raise FileNotExistsError("file not exists")
        with open(self.file_path, "r") as f:
            self.source_file = f.read()
        self.total_source_length = len(self.source_file)

    def get_next_char(self, is_move=True):
        """Peek at the character under the cursor without consuming it.

        :param is_move: unused; kept so existing callers keep working
        :return: the character, or ``None`` at end of input
        """
        if self.postion < self.total_source_length:
            return self.source_file[self.postion]
        return None

    def advance_one_char(self):
        """Move the cursor one character forward; a no-op at end of input."""
        if self.postion < self.total_source_length:
            self.postion += 1

    def parse_str(self):
        """Read a double-quoted string whose opening quote is already consumed.

        Escape sequences are not supported: the string ends at the next
        ``"`` character, whatever precedes it.

        :return: a ``TOKEN_STRING`` token stamped with the line the string
            started on, or ``None`` if the input ends before the closing quote
        """
        start_line = self.lineno
        chars = []
        while True:
            c = self.get_next_char()
            if c is None:
                # Unterminated string: reported the same way as end of input.
                return None
            self.advance_one_char()
            if c == "\"":
                return Token("".join(chars), "TOKEN_STRING", start_line)
            if c == "\n":
                # Keep the line counter correct for tokens that follow a
                # string spanning several lines.
                self.lineno += 1
            chars.append(c)

    def parse_ID(self, v):
        """Read the rest of an identifier or keyword.

        Identifiers start with a letter or underscore and continue with
        letters, digits or underscores.

        :param v: the first character, already consumed by the caller
        :return: a keyword token if the text is in ``KEYWORDS_LISTS``,
            otherwise a ``TOKEN_ID`` token
        """
        chars = [v]
        while True:
            c = self.get_next_char()
            # is_first=True is the "digits allowed" mode, see check_ID.
            if not self.check_ID(c, is_first=True):
                break
            self.advance_one_char()
            chars.append(c)
        name = "".join(chars)
        return Token(name, KEYWORDS_LISTS.get(name, "TOKEN_ID"), self.lineno)

    def check_ID(self, v, is_first=False):
        """Tell whether ``v`` may appear in an identifier.

        :param v: the character to test (``None`` is accepted and rejected)
        :param is_first: despite its name, pass ``True`` when testing a
            *continuation* character: digits are then accepted as well. With
            the default ``False`` only ASCII letters and ``_`` pass, which is
            the rule for the first character. The inverted meaning is kept
            for backward compatibility.
        :return: ``True`` if the character is acceptable
        """
        if v is None:
            return False
        if "a" <= v <= "z" or "A" <= v <= "Z" or v == "_":
            return True
        return bool(is_first and v.isdigit())

    def parse_NUM(self, v):
        """Read the rest of a decimal integer literal.

        Only plain decimal digits are recognised.

        :param v: the first digit, already consumed by the caller
        :return: a ``TOKEN_NUM`` token
        """
        digits = [v]
        while True:
            c = self.get_next_char()
            # c is None when the number is the last thing in the file.
            if c is None or not c.isdigit():
                return Token("".join(digits), "TOKEN_NUM", self.lineno)
            digits.append(c)
            self.advance_one_char()

    def skip_shard(self):
        """Skip a ``#`` comment up to, but not including, the line break.

        The newline is left for ``get_next_token`` so that it still bumps
        the line counter. Triple-quoted comments are not handled here.
        """
        while True:
            c = self.get_next_char()
            # Stop at end of input too, otherwise a comment on the last line
            # without a trailing newline would loop forever.
            if c is None or c == "\n":
                return
            self.advance_one_char()

    def get_next_token(self):
        """Return the next token, or ``None`` once the input is exhausted.

        Characters that are not a known symbol, identifier character, digit
        or newline (spaces, for instance) are skipped silently.
        """
        while True:
            c = self.get_next_char()
            if c is None:
                return None
            self.advance_one_char()
            if c in TOKEN_LISTS:
                n_c = self.get_next_char()
                # Prefer the two-character operator when there is one. At end
                # of input n_c is None and the single character is still
                # emitted.
                if n_c is not None and c + n_c in TOKEN_LISTS:
                    self.advance_one_char()
                    two_char = c + n_c
                    return Token(two_char, TOKEN_LISTS[two_char], self.lineno)
                if c == "\"":
                    return self.parse_str()
                if c == "#":
                    self.skip_shard()
                    continue
                return Token(c, TOKEN_LISTS[c], self.lineno)
            if self.check_ID(c):
                # Start of an identifier or keyword.
                return self.parse_ID(c)
            if c.isdigit():
                return self.parse_NUM(c)
            if c == "\n":
                self.lineno += 1
def run_file(path):
    """Tokenize the file at ``path`` and print every token, one per line.

    The terminating ``None`` returned at end of input is printed as well.
    """
    lexer = Parser(path)
    while True:
        tok = lexer.get_next_token()
        print(tok)
        if tok is None:
            break


if __name__ == '__main__':
    run_file("./python_test.py")
腳本中定義了Python的關鍵字以及運算符、標點等符號;在詞法解析過程中不支持三個雙引號(""")的註釋解析,也不支持字符串中包含轉義字符。
測試的python_test.py文件如下;
"""A pure-Python Python bytecode interpreter."""
# Adapted from:
# 1. pyvm2 by Paul Swartz (z3p), from http://www.twistedmatrix.com/users/z3p/
# 2. byterun by Ned Batchelder, github.com/nedbat/byterun
import dis, operator, sys, collections, inspect, types
class Frame(object):
    # Execution state for one code object: its name tables, a data stack and
    # a stack of active blocks.

    def __init__(self, code_obj, global_names, local_names, prev_frame):
        self.code_obj = code_obj
        self.global_names = global_names
        self.local_names = local_names
        self.prev_frame = prev_frame
        self.stack = []
        if prev_frame:
            # Builtins are shared with the calling frame.
            self.builtin_names = prev_frame.builtin_names
        else:
            builtin_names = local_names['__builtins__']
            # Anything exposing __dict__ (a module, for instance) is
            # replaced by that dict.
            if hasattr(builtin_names, '__dict__'):
                builtin_names = builtin_names.__dict__
            self.builtin_names = builtin_names
        self.last_instruction = 0
        self.block_stack = []

    # Data stack manipulation
    def top(self):
        # Peek at the topmost value without removing it.
        return self.stack[-1]

    def pop(self):
        return self.stack.pop()

    def push(self, *vals):
        # Values are pushed in argument order, so the last one ends on top.
        self.stack.extend(vals)

    def popn(self, n):
        # Remove the top n values and return them as a list, deepest first.
        # n == 0 needs its own branch because stack[-0:] is the whole stack.
        if not n:
            return []
        tail = self.stack[-n:]
        del self.stack[-n:]
        return tail

    # Block stack manipulation
    def push_block(self, b_type, handler=None):
        # The data stack height is recorded so unwind_block can restore it.
        self.block_stack.append(Block(b_type, handler, len(self.stack)))

    def pop_block(self):
        return self.block_stack.pop()

    def unwind_block(self, block):
        # Drop data stack entries down to the height recorded in the block.
        # For an 'except-handler' block three extra entries are kept, then
        # popped and returned as (exctype, value, traceback); otherwise the
        # result is None.
        is_handler = block.type == 'except-handler'
        extra = 3 if is_handler else 0
        while len(self.stack) > block.stack_height + extra:
            self.pop()
        if not is_handler:
            return None
        tb, val, exc_type = self.popn(3)
        return exc_type, val, tb
# Record describing one entry of a frame's block stack: the block type, its
# handler and the data stack height captured when the block was pushed.
Block = collections.namedtuple("Block", "type, handler, stack_height")
class Function(object):
    # Callable wrapper around a code object; calling it builds a frame and
    # runs that frame on the owning virtual machine.
    # No class docstring on purpose: '__doc__' is listed in __slots__, and a
    # class-level docstring would conflict with that slot.
    __slots__ = [
        'func_code', 'func_name', 'func_defaults', 'func_globals',
        'func_locals', 'func_dict', 'func_closure',
        '__name__', '__dict__', '__doc__',
        '_vm', '_func',
    ]

    # name may be falsy, in which case code.co_name is used instead.
    # vm must already have a current frame: its local_names are captured.
    def __init__(self, name, code, globs, defaults, closure, vm):
        self._vm = vm
        self.func_code = code
        self.func_name = self.__name__ = name or code.co_name
        self.func_defaults = tuple(defaults)
        self.func_globals = globs
        self.func_locals = self._vm.frame.local_names
        self.__dict__ = {}
        self.func_closure = closure
        # NOTE(review): takes the first constant as the docstring; presumably
        # relies on where the compiler stores it -- confirm.
        self.__doc__ = code.co_consts[0] if code.co_consts else None
        # Sometimes, we need a real Python function. This is for that.
        kw = {
            'argdefs': self.func_defaults,
        }
        if closure:
            # One placeholder cell (holding 0) per closure entry.
            kw['closure'] = tuple(make_cell(0) for _ in closure)
        self._func = types.FunctionType(code, globs, **kw)

    # Bind the call arguments against the real function's signature, then
    # execute the code object in a fresh frame on the VM.
    def __call__(self, *args, **kwargs):
        callargs = inspect.getcallargs(self._func, *args, **kwargs)
        frame = self._vm.make_frame(
            self.func_code, callargs, self.func_globals, {}
        )
        return self._vm.run_frame(frame)
def make_cell(value):
    # Return a closure cell object whose content is ``value``.
    # Thanks to Alex Gaynor for help with this bit of twistiness.
    def capture(x):
        def inner():
            return x
        return inner
    # The inner function closes over x only, so its single cell is the one
    # that holds value.
    return capture(value).__closure__[0]
class VirtualMachineError(Exception):
    """Signals a problem detected by the virtual machine itself."""
class VirtualMachine(object):
    # Small stack-based interpreter for Python bytecode. It keeps a call
    # stack of Frame objects and executes one instruction at a time by
    # dispatching to the matching byte_<OPNAME> method.
    def __init__(self):
        self.frames = [] # The call stack of frames.
        self.frame = None # The current frame.
        # Value produced by RETURN_VALUE; CONTINUE_LOOP also stores its jump
        # target here (see byte_CONTINUE_LOOP).
        self.return_value = None
        # (type, value, traceback) triple of the most recent exception.
        self.last_exception = None

    # Frame manipulation
    # Build a Frame for code. Name tables are chosen as follows:
    #  - both global_names and local_names given: global_names is used for
    #    both (the local_names argument itself is discarded);
    #  - otherwise, if a frame is running: its globals plus fresh locals;
    #  - otherwise: a new module-like namespace used for both.
    # callargs is then merged into the locals. The shared {} default of
    # callargs is only read here, never mutated.
    def make_frame(self, code, callargs={}, global_names=None, local_names=None):
        if global_names is not None and local_names is not None:
            local_names = global_names
        elif self.frames:
            global_names = self.frame.global_names
            local_names = {}
        else:
            global_names = local_names = {
                '__builtins__': __builtins__,
                '__name__': '__main__',
                '__doc__': None,
                '__package__': None,
            }
        local_names.update(callargs)
        frame = Frame(code, global_names, local_names, self.frame)
        return frame

    def push_frame(self, frame):
        self.frames.append(frame)
        self.frame = frame

    # Drop the innermost frame and make its caller (if any) current again.
    def pop_frame(self):
        self.frames.pop()
        if self.frames:
            self.frame = self.frames[-1]
        else:
            self.frame = None

    # Jumping through bytecode
    # Set the offset of the next instruction to execute in the current frame.
    def jump(self, jump):
        self.frame.last_instruction = jump

    # Entry point: run a code object in a new frame. The result of the frame
    # is discarded (this method returns None).
    def run_code(self, code, global_names=None, local_names=None):
        frame = self.make_frame(code, global_names=global_names, local_names=local_names)
        self.run_frame(frame)
        # Check some invariants
        # if self.frames:
        # raise VirtualMachineError("Frames left over!")
        # if self.frame and self.frame.stack:
        # raise VirtualMachineError("Data left on stack! %r" % self.frame.stack)
        # for testing, was val = self.run_frame(frame)
        # return val # for testing

    # Decode the instruction at the current frame's last_instruction and
    # advance past it. Returns (opname, arguments) where arguments is a list
    # holding the decoded argument, or an empty list for opcodes without one.
    # NOTE(review): reads a 1-byte opcode followed by a 2-byte little-endian
    # argument and indexes co_code as integers; this looks like the bytecode
    # layout of older CPython 3 releases -- confirm against the target
    # interpreter.
    def parse_byte_and_args(self):
        f = self.frame
        opoffset = f.last_instruction
        byteCode = f.code_obj.co_code[opoffset]
        f.last_instruction += 1
        byte_name = dis.opname[byteCode]
        if byteCode >= dis.HAVE_ARGUMENT:
            arg = f.code_obj.co_code[f.last_instruction:f.last_instruction+2] # index into the bytecode
            f.last_instruction += 2 # advance the instruction pointer
            arg_val = arg[0] + (arg[1] << 8)
            if byteCode in dis.hasconst: # Look up a constant
                arg = f.code_obj.co_consts[arg_val]
            elif byteCode in dis.hasname: # Look up a name
                arg = f.code_obj.co_names[arg_val]
            elif byteCode in dis.haslocal: # Look up a local name
                arg = f.code_obj.co_varnames[arg_val]
            elif byteCode in dis.hasjrel: # Calculate a relative jump
                arg = f.last_instruction + arg_val
            else:
                arg = arg_val
            argument = [arg]
        else:
            argument = []
        return byte_name, argument

    # Execute one instruction. A byte_<name> method is used when it exists;
    # UNARY_* and BINARY_* opcodes without one fall back to the generic
    # operator handlers. Any exception raised on the way (bare except) is
    # recorded in last_exception with None in the traceback slot and
    # reported as why == 'exception'.
    # Returns the handler's result (a "why" string or None).
    def dispatch(self, byte_name, argument):
        # When later unwinding the block stack,
        # we need to keep track of why we are doing it.
        why = None
        try:
            bytecode_fn = getattr(self, 'byte_%s' % byte_name, None)
            if bytecode_fn is None:
                if byte_name.startswith('UNARY_'):
                    self.unaryOperator(byte_name[6:])
                elif byte_name.startswith('BINARY_'):
                    self.binaryOperator(byte_name[7:])
                else:
                    raise VirtualMachineError(
                        "unsupported bytecode type: %s" % byte_name
                    )
            else:
                why = bytecode_fn(*argument)
        except:
            # deal with exceptions encountered while executing the op.
            self.last_exception = sys.exc_info()[:2] + (None,)
            why = 'exception'
        return why

    # Handle one level of the block stack for the given "why" reason.
    # Returns None when the innermost block absorbed the reason (a jump
    # target has been set), otherwise returns why unchanged so the caller
    # can try the next enclosing block.
    def manage_block_stack(self, why):
        block = self.frame.block_stack[-1]
        if block.type == 'loop' and why == 'continue':
            # 'continue' stays inside the loop block, so it is not popped;
            # return_value holds the jump target here.
            self.jump(self.return_value)
            why = None
            return why
        self.frame.pop_block()
        current_exc = self.frame.unwind_block(block)
        if current_exc is not None:
            self.last_exception = current_exc
        if block.type == 'loop' and why == 'break':
            self.jump(block.handler)
            why = None
        elif (block.type in ['setup-except', 'finally'] and why == 'exception'):
            self.frame.push_block('except-handler')
            exctype, value, tb = self.last_exception
            self.frame.push(tb, value, exctype)
            self.frame.push(tb, value, exctype) # yes, twice
            self.jump(block.handler)
            why = None
        elif block.type == 'finally':
            if why in ('return', 'continue'):
                self.frame.push(self.return_value)
            self.frame.push(why)
            self.jump(block.handler)
            why = None
        return why

    # Run frame until an instruction yields a "why" that no block absorbs.
    # The frame is then popped; for 'exception' the recorded exception is
    # rebuilt as exc(val) and raised, otherwise return_value is returned.
    def run_frame(self, frame):
        self.push_frame(frame)
        while True:
            byte_name, argument = self.parse_byte_and_args()
            why = self.dispatch(byte_name, argument)
            # Deal with any block management we need to do
            while why and frame.block_stack:
                why = self.manage_block_stack(why)
            if why:
                break
        self.pop_frame()
        if why == 'exception':
            exc, val, tb = self.last_exception
            e = exc(val)
            e.__traceback__ = tb
            raise e
        return self.return_value

    ## Stack manipulation
    def byte_LOAD_CONST(self, const):
        self.frame.push(const)

    def byte_POP_TOP(self):
        self.frame.pop()

    def byte_DUP_TOP(self):
        self.frame.push(self.frame.top())

    ## Names
    # Look the name up in locals, then globals, then builtins.
    def byte_LOAD_NAME(self, name):
        frame = self.frame
        if name in frame.local_names:
            val = frame.local_names[name]
        elif name in frame.global_names:
            val = frame.global_names[name]
        elif name in frame.builtin_names:
            val = frame.builtin_names[name]
        else:
            raise NameError("name '%s' is not defined" % name)
        self.frame.push(val)

    def byte_STORE_NAME(self, name):
        self.frame.local_names[name] = self.frame.pop()

    def byte_DELETE_NAME(self, name):
        del self.frame.local_names[name]

    # Fast locals live in the same local_names dict as ordinary names.
    def byte_LOAD_FAST(self, name):
        if name in self.frame.local_names:
            val = self.frame.local_names[name]
        else:
            raise UnboundLocalError(
                "local variable '%s' referenced before assignment" % name
            )
        self.frame.push(val)

    def byte_STORE_FAST(self, name):
        self.frame.local_names[name] = self.frame.pop()

    # Look the name up in globals, then builtins (locals are skipped).
    def byte_LOAD_GLOBAL(self, name):
        f = self.frame
        if name in f.global_names:
            val = f.global_names[name]
        elif name in f.builtin_names:
            val = f.builtin_names[name]
        else:
            raise NameError("global name '%s' is not defined" % name)
        f.push(val)

    ## Operators
    # Keyed by the opcode name with its 'UNARY_' prefix removed.
    UNARY_OPERATORS = {
        'POSITIVE': operator.pos,
        'NEGATIVE': operator.neg,
        'NOT': operator.not_,
        'INVERT': operator.invert,
    }

    def unaryOperator(self, op):
        x = self.frame.pop()
        self.frame.push(self.UNARY_OPERATORS[op](x))

    # Keyed by the opcode name with its 'BINARY_' prefix removed.
    BINARY_OPERATORS = {
        'POWER': pow,
        'MULTIPLY': operator.mul,
        'FLOOR_DIVIDE': operator.floordiv,
        'TRUE_DIVIDE': operator.truediv,
        'MODULO': operator.mod,
        'ADD': operator.add,
        'SUBTRACT': operator.sub,
        'SUBSCR': operator.getitem,
        'LSHIFT': operator.lshift,
        'RSHIFT': operator.rshift,
        'AND': operator.and_,
        'XOR': operator.xor,
        'OR': operator.or_,
    }

    # Operands are popped so that x is the deeper (left) one.
    def binaryOperator(self, op):
        x, y = self.frame.popn(2)
        self.frame.push(self.BINARY_OPERATORS[op](x, y))

    # Indexed by the numeric argument of COMPARE_OP.
    COMPARE_OPERATORS = [
        operator.lt,
        operator.le,
        operator.eq,
        operator.ne,
        operator.gt,
        operator.ge,
        lambda x, y: x in y,
        lambda x, y: x not in y,
        lambda x, y: x is y,
        lambda x, y: x is not y,
        lambda x, y: issubclass(x, Exception) and issubclass(x, y),
    ]

    def byte_COMPARE_OP(self, opnum):
        x, y = self.frame.popn(2)
        self.frame.push(self.COMPARE_OPERATORS[opnum](x, y))

    ## Attributes and indexing
    def byte_LOAD_ATTR(self, attr):
        obj = self.frame.pop()
        val = getattr(obj, attr)
        self.frame.push(val)

    def byte_STORE_ATTR(self, name):
        val, obj = self.frame.popn(2)
        setattr(obj, name, val)

    def byte_STORE_SUBSCR(self):
        val, obj, subscr = self.frame.popn(3)
        obj[subscr] = val

    ## Building
    def byte_BUILD_TUPLE(self, count):
        elts = self.frame.popn(count)
        self.frame.push(tuple(elts))

    def byte_BUILD_LIST(self, count):
        elts = self.frame.popn(count)
        self.frame.push(elts)

    # size is ignored: an empty dict is pushed and filled by later STORE_MAP
    # instructions.
    def byte_BUILD_MAP(self, size):
        self.frame.push({})

    def byte_STORE_MAP(self):
        the_map, val, key = self.frame.popn(3)
        the_map[key] = val
        self.frame.push(the_map)

    # Push the items in reverse so the first one ends on top of the stack.
    # count is not checked against the actual length of the sequence.
    def byte_UNPACK_SEQUENCE(self, count):
        seq = self.frame.pop()
        for x in reversed(seq):
            self.frame.push(x)

    def byte_BUILD_SLICE(self, count):
        if count == 2:
            x, y = self.frame.popn(2)
            self.frame.push(slice(x, y))
        elif count == 3:
            x, y, z = self.frame.popn(3)
            self.frame.push(slice(x, y, z))
        else: # pragma: no cover
            raise VirtualMachineError("Strange BUILD_SLICE count: %r" % count)

    # The list stays on the stack, count entries below the top once the
    # value has been popped.
    def byte_LIST_APPEND(self, count):
        val = self.frame.pop()
        the_list = self.frame.stack[-count] # peek
        the_list.append(val)

    ## Jumps
    # Relative targets were already turned into absolute offsets by
    # parse_byte_and_args, so both jump kinds are handled the same way.
    def byte_JUMP_FORWARD(self, jump):
        self.jump(jump)

    def byte_JUMP_ABSOLUTE(self, jump):
        self.jump(jump)

    def byte_POP_JUMP_IF_TRUE(self, jump):
        val = self.frame.pop()
        if val:
            self.jump(jump)

    def byte_POP_JUMP_IF_FALSE(self, jump):
        val = self.frame.pop()
        if not val:
            self.jump(jump)

    # The value is kept on the stack when the jump is taken and popped
    # otherwise.
    def byte_JUMP_IF_TRUE_OR_POP(self, jump):
        val = self.frame.top()
        if val:
            self.jump(jump)
        else:
            self.frame.pop()

    def byte_JUMP_IF_FALSE_OR_POP(self, jump):
        val = self.frame.top()
        if not val:
            self.jump(jump)
        else:
            self.frame.pop()

    ## Blocks
    def byte_SETUP_LOOP(self, dest):
        self.frame.push_block('loop', dest)

    def byte_GET_ITER(self):
        self.frame.push(iter(self.frame.pop()))

    # The iterator stays on the stack while it yields values; once it is
    # exhausted it is popped and execution jumps to the given target.
    def byte_FOR_ITER(self, jump):
        iterobj = self.frame.top()
        try:
            v = next(iterobj)
            self.frame.push(v)
        except StopIteration:
            self.frame.pop()
            self.jump(jump)

    def byte_BREAK_LOOP(self):
        return 'break'

    def byte_CONTINUE_LOOP(self, dest):
        # This is a trick with the return value.
        # While unrolling blocks, continue and return both have to preserve
        # state as the finally blocks are executed. For continue, it's
        # where to jump to, for return, it's the value to return. It gets
        # pushed on the stack for both, so continue puts the jump destination
        # into return_value.
        self.return_value = dest
        return 'continue'

    def byte_SETUP_EXCEPT(self, dest):
        self.frame.push_block('setup-except', dest)

    def byte_SETUP_FINALLY(self, dest):
        self.frame.push_block('finally', dest)

    def byte_POP_BLOCK(self):
        self.frame.pop_block()

    # argc selects the form: 0 re-raises, 1 pops the exception, 2 pops the
    # cause first and then the exception.
    def byte_RAISE_VARARGS(self, argc):
        cause = exc = None
        if argc == 2:
            cause = self.frame.pop()
            exc = self.frame.pop()
        elif argc == 1:
            exc = self.frame.pop()
        return self.do_raise(exc, cause)

    # Record the exception to raise in last_exception and return 'exception'.
    # cause is accepted but not used. When exc is neither None, a class whose
    # type is exactly `type`, nor a BaseException instance, last_exception is
    # left untouched.
    def do_raise(self, exc, cause):
        if exc is None: # reraise
            exc_type, val, tb = self.last_exception
        elif type(exc) == type: # As in `raise ValueError`
            exc_type = exc
            val = exc() # Make an instance.
        elif isinstance(exc, BaseException):
            # As in `raise ValueError('foo')`
            exc_type = type(exc)
            val = exc
        else:
            return 'exception' # failure
        self.last_exception = exc_type, val, val.__traceback__
        return 'exception'

    def byte_POP_EXCEPT(self):
        block = self.frame.pop_block()
        if block.type != 'except-handler':
            raise Exception("popped block is not an except handler")
        current_exc = self.frame.unwind_block(block)
        if current_exc is not None:
            self.last_exception = current_exc

    ## Functions
    # Stack layout, top first: name, code object, then argc default values.
    # No closure is passed to the new Function.
    def byte_MAKE_FUNCTION(self, argc):
        name = self.frame.pop()
        code = self.frame.pop()
        defaults = self.frame.popn(argc)
        globs = self.frame.global_names
        #TODO: if we're not supporting kwargs, do we need the defaults?
        fn = Function(name, code, globs, defaults, None, self)
        self.frame.push(fn)

    # Only positional arguments are passed on: lenKw is computed but never
    # used, and the local `frame` is unused as well.
    def byte_CALL_FUNCTION(self, arg):
        lenKw, lenPos = divmod(arg, 256) # KWargs not supported in byterun
        posargs = self.frame.popn(lenPos)
        func = self.frame.pop()
        frame = self.frame
        retval = func(*posargs)
        self.frame.push(retval)

    def byte_RETURN_VALUE(self):
        self.return_value = self.frame.pop()
        return "return"

    ## Importing
    # Delegates to the host interpreter's __import__.
    def byte_IMPORT_NAME(self, name):
        level, fromlist = self.frame.popn(2)
        frame = self.frame
        self.frame.push(__import__(name, frame.global_names, frame.local_names, fromlist, level))

    # The module is only peeked at, so it stays on the stack for further
    # IMPORT_FROM instructions.
    def byte_IMPORT_FROM(self, name):
        mod = self.frame.top()
        self.frame.push(getattr(mod, name))

    ## And the rest...
    def byte_LOAD_BUILD_CLASS(self):
        self.frame.push(__build_class__)

    def byte_STORE_LOCALS(self):
        self.frame.local_names = self.frame.pop()
測試輸出的類型如下;
type : TOKEN_STRING word : lineno : 1
type : TOKEN_STRING word : A pure-Python Python bytecode interpreter. lineno : 1
type : TOKEN_STRING word : lineno : 1
type : KEYWORD_TOKEN_IMPORT word : import lineno : 6
type : TOKEN_ID word : dis lineno : 6
type : TOKEN_COMMA word : , lineno : 6
type : TOKEN_ID word : operator lineno : 6
type : TOKEN_COMMA word : , lineno : 6
type : TOKEN_ID word : sys lineno : 6
type : TOKEN_COMMA word : , lineno : 6
type : TOKEN_ID word : collections lineno : 6
type : TOKEN_COMMA word : , lineno : 6
type : TOKEN_ID word : inspect lineno : 6
type : TOKEN_COMMA word : , lineno : 6
type : TOKEN_ID word : types lineno : 6
type : KEYWORD_TOKEN_CLASS word : class lineno : 8
type : TOKEN_ID word : Frame lineno : 8
type : TOKEN_LEFT_PAREN word : ( lineno : 8
type : KEYWORD_TOKEN_OBJECT word : object lineno : 8
type : TOKEN_RIGHT_PAREN word : ) lineno : 8
type : TOKEN_COLON word : : lineno : 8
type : KEYWORD_TOKEN_DEF word : def lineno : 9
type : TOKEN_ID word : __init__ lineno : 9
type : TOKEN_LEFT_PAREN word : ( lineno : 9
type : TOKEN_ID word : self lineno : 9
type : TOKEN_COMMA word : , lineno : 9
type : TOKEN_ID word : code_obj lineno : 9
type : TOKEN_COMMA word : , lineno : 9
type : TOKEN_ID word : global_names lineno : 9
type : TOKEN_COMMA word : , lineno : 9
type : TOKEN_ID word : local_names lineno : 9
type : TOKEN_COMMA word : , lineno : 9
type : TOKEN_ID word : prev_frame lineno : 9
type : TOKEN_RIGHT_PAREN word : ) lineno : 9
type : TOKEN_COLON word : : lineno : 9
...
省略
...
type : TOKEN_LEFT_PAREN word : ( lineno : 571
type : TOKEN_ID word : self lineno : 571
type : TOKEN_RIGHT_PAREN word : ) lineno : 571
type : TOKEN_COLON word : : lineno : 571
type : TOKEN_ID word : self lineno : 572
type : TOKEN_DOT word : . lineno : 572
type : TOKEN_ID word : frame lineno : 572
type : TOKEN_DOT word : . lineno : 572
type : TOKEN_ID word : push lineno : 572
type : TOKEN_LEFT_PAREN word : ( lineno : 572
type : TOKEN_ID word : __build_class__ lineno : 572
type : TOKEN_RIGHT_PAREN word : ) lineno : 572
type : KEYWORD_TOKEN_DEF word : def lineno : 574
type : TOKEN_ID word : byte_STORE_LOCALS lineno : 574
type : TOKEN_LEFT_PAREN word : ( lineno : 574
type : TOKEN_ID word : self lineno : 574
type : TOKEN_RIGHT_PAREN word : ) lineno : 574
type : TOKEN_COLON word : : lineno : 574
type : TOKEN_ID word : self lineno : 575
type : TOKEN_DOT word : . lineno : 575
type : TOKEN_ID word : frame lineno : 575
type : TOKEN_DOT word : . lineno : 575
type : TOKEN_ID word : local_names lineno : 575
type : TOKEN_ASSIGN word : = lineno : 575
type : TOKEN_ID word : self lineno : 575
type : TOKEN_DOT word : . lineno : 575
type : TOKEN_ID word : frame lineno : 575
type : TOKEN_DOT word : . lineno : 575
type : TOKEN_ID word : pop lineno : 575
type : TOKEN_LEFT_PAREN word : ( lineno : 575
type : TOKEN_RIGHT_PAREN word : ) lineno : 575
None
從輸出可以看出,每一次詞法解析生成的token都帶有類型標識,並標註了該token在源碼文件中的行號。但是詞法解析的過程中並沒有處理Python一直強調的縮進(四個空格)語法格式問題,也就是不會生成與縮進相關的token。本次代碼只是對源碼文件做了最基本的詞法解析。
總結
本文只是簡單的實現了一個詞法分析的流程而已,只是爲了加深對詞法解析這一基礎流程的理解程度。簡單瀏覽有關編譯器相關內容之後,確實編譯器內容比較深入與複雜,需要動手實踐理解的地方也很多,本文只是簡單的當做練習記錄。由於本人才疏學淺,如有錯誤請批評指正。