0. arquitetura geral
O interpretador e dividido em quatro modulos independentes. Cada modulo tem uma responsabilidade unica.
+-------------+ +--------+ +---------+ +--------------+
| lexer | --> | parser | --> | ast | --> | interpreter |
| (quebra em | | (valida| | (arvore | | (executa nos)|
| tokens) | | sintaxe)| | abstrata) | |
+-------------+ +--------+ +---------+ +--------------+
1. lexer (analise lexica)
O lexer le caractere por caractere e agrupa em tokens: numeros, strings, identificadores, palavras-chave, operadores.
1.1 classe Token e TokenType
from enum import Enum
from dataclasses import dataclass
class TokenType(Enum):
# keywords
IF = 'IF'
ELSE = 'ELSE'
WHILE = 'WHILE'
VAR = 'VAR'
PRINT = 'PRINT'
# literals
NUMBER = 'NUMBER'
STRING = 'STRING'
IDENTIFIER = 'IDENTIFIER'
# operators
PLUS = 'PLUS'
MINUS = 'MINUS'
MULTIPLY = 'MULTIPLY'
DIVIDE = 'DIVIDE'
ASSIGN = 'ASSIGN'
EQUALS = 'EQUALS'
NOT_EQUALS = 'NOT_EQUALS'
LESS = 'LESS'
GREATER = 'GREATER'
LESS_EQUAL = 'LESS_EQUAL'
GREATER_EQUAL = 'GREATER_EQUAL'
# delimiters
LPAREN = 'LPAREN'
RPAREN = 'RPAREN'
LBRACE = 'LBRACE'
RBRACE = 'RBRACE'
SEMICOLON = 'SEMICOLON'
EOF = 'EOF'
@dataclass
class Token:
type: TokenType
value: any
line: int
column: int
1.2 implementacao do Lexer
class Lexer:
def __init__(self, source):
self.source = source # source code as string
self.pos = 0 # current index
self.line = 1 # current line
self.column = 1 # current column
self.current_char = source[0] if source else None
def advance(self):
# advances one character, updates line/column
if self.current_char == '\n':
self.line += 1
self.column = 1
else:
self.column += 1
self.pos += 1
if self.pos < len(self.source):
self.current_char = self.source[self.pos]
else:
self.current_char = None
def read_number(self):
# reads integer or decimal numbers
result = ''
while self.current_char and (self.current_char.isdigit() or self.current_char == '.'):
result += self.current_char
self.advance()
return float(result) if '.' in result else int(result)
def read_identifier(self):
# reads identifier or keyword
result = ''
while self.current_char and (self.current_char.isalnum() or self.current_char == '_'):
result += self.current_char
self.advance()
keywords = {
'if': TokenType.IF, 'else': TokenType.ELSE,
'while': TokenType.WHILE, 'var': TokenType.VAR,
'print': TokenType.PRINT
}
token_type = keywords.get(result, TokenType.IDENTIFIER)
return token_type, result
def get_next_token(self):
# main method: returns the next token from the stream
while self.current_char:
# skip whitespace
if self.current_char in ' \t\r':
self.skip_whitespace()
continue
# skip comments '#'
if self.current_char == '#':
self.skip_comment()
continue
if self.current_char.isdigit():
return Token(TokenType.NUMBER, self.read_number(), self.line, self.column)
if self.current_char == '"':
return Token(TokenType.STRING, self.read_string(), self.line, self.column)
if self.current_char.isalpha() or self.current_char == '_':
tok_type, value = self.read_identifier()
return Token(tok_type, value, self.line, self.column)
# simple operators
if self.current_char == '+':
self.advance(); return Token(TokenType.PLUS, '+', self.line, self.column-1)
if self.current_char == '-':
self.advance(); return Token(TokenType.MINUS, '-', self.line, self.column-1)
if self.current_char == '*':
self.advance(); return Token(TokenType.MULTIPLY, '*', self.line, self.column-1)
if self.current_char == '/':
self.advance(); return Token(TokenType.DIVIDE, '/', self.line, self.column-1)
if self.current_char == '=':
self.advance()
if self.current_char == '=':
self.advance()
return Token(TokenType.EQUALS, '==', self.line, self.column-2)
return Token(TokenType.ASSIGN, '=', self.line, self.column-1)
if self.current_char == '<':
self.advance()
if self.current_char == '=':
self.advance()
return Token(TokenType.LESS_EQUAL, '<=', self.line, self.column-2)
return Token(TokenType.LESS, '<', self.line, self.column-1)
if self.current_char == '>':
self.advance()
if self.current_char == '=':
self.advance()
return Token(TokenType.GREATER_EQUAL, '>=', self.line, self.column-2)
return Token(TokenType.GREATER, '>', self.line, self.column-1)
# parentheses, braces, semicolon
if self.current_char == '(':
self.advance(); return Token(TokenType.LPAREN, '(', self.line, self.column-1)
if self.current_char == ')':
self.advance(); return Token(TokenType.RPAREN, ')', self.line, self.column-1)
if self.current_char == '{':
self.advance(); return Token(TokenType.LBRACE, '{', self.line, self.column-1)
if self.current_char == '}':
self.advance(); return Token(TokenType.RBRACE, '}', self.line, self.column-1)
if self.current_char == ';':
self.advance(); return Token(TokenType.SEMICOLON, ';', self.line, self.column-1)
raise SyntaxError(f"invalid character {self.current_char} line {self.line}")
return Token(TokenType.EOF, None, self.line, self.column)
2. parser e arvore sintatica abstrata (AST)
2.1 no da AST
class ASTNode:
def __init__(self, node_type, value=None, children=None):
self.type = node_type # 'program', 'binop', 'var_decl', 'if', 'while', etc
self.value = value # variable name, numeric value, or operator
self.children = children if children else [] # child nodes
2.2 parser recursivo descendente
class Parser:
def __init__(self, lexer):
self.lexer = lexer
self.current_token = self.lexer.get_next_token()
def eat(self, token_type):
# consumes current token if it matches expected type
if self.current_token.type == token_type:
self.current_token = self.lexer.get_next_token()
else:
raise SyntaxError(f"expected {token_type}, got {self.current_token.type}")
def program(self):
# program : statement_list
nodes = self.statement_list()
return ASTNode('program', children=nodes)
def statement_list(self):
statements = []
while self.current_token.type not in (TokenType.EOF, TokenType.RBRACE):
statements.append(self.statement())
return statements
def statement(self):
# dispatch for each command type
if self.current_token.type == TokenType.VAR:
return self.var_decl()
if self.current_token.type == TokenType.IF:
return self.if_statement()
if self.current_token.type == TokenType.WHILE:
return self.while_statement()
if self.current_token.type == TokenType.PRINT:
return self.print_statement()
if self.current_token.type == TokenType.LBRACE:
return self.block()
return self.assignment_or_expression()
def var_decl(self):
self.eat(TokenType.VAR)
var_name = self.current_token.value
self.eat(TokenType.IDENTIFIER)
if self.current_token.type == TokenType.ASSIGN:
self.eat(TokenType.ASSIGN)
expr = self.expression()
else:
expr = ASTNode('number', value=0)
self.eat(TokenType.SEMICOLON)
return ASTNode('var_decl', value=var_name, children=[expr])
def if_statement(self):
self.eat(TokenType.IF)
self.eat(TokenType.LPAREN)
condition = self.expression()
self.eat(TokenType.RPAREN)
then_branch = self.statement()
if self.current_token.type == TokenType.ELSE:
self.eat(TokenType.ELSE)
else_branch = self.statement()
return ASTNode('if', children=[condition, then_branch, else_branch])
return ASTNode('if', children=[condition, then_branch])
def while_statement(self):
self.eat(TokenType.WHILE)
self.eat(TokenType.LPAREN)
condition = self.expression()
self.eat(TokenType.RPAREN)
body = self.statement()
return ASTNode('while', children=[condition, body])
def print_statement(self):
self.eat(TokenType.PRINT)
expr = self.expression()
self.eat(TokenType.SEMICOLON)
return ASTNode('print', children=[expr])
def block(self):
self.eat(TokenType.LBRACE)
statements = self.statement_list()
self.eat(TokenType.RBRACE)
return ASTNode('block', children=statements)
# operator precedence:
def expression(self): # == , !=
node = self.comparison()
while self.current_token.type in (TokenType.EQUALS, TokenType.NOT_EQUALS):
op = self.current_token
self.eat(op.type)
node = ASTNode('binop', value=op.type, children=[node, self.comparison()])
return node
def comparison(self): # < , > , <= , >=
node = self.term()
while self.current_token.type in (TokenType.LESS, TokenType.GREATER, TokenType.LESS_EQUAL, TokenType.GREATER_EQUAL):
op = self.current_token
self.eat(op.type)
node = ASTNode('binop', value=op.type, children=[node, self.term()])
return node
def term(self): # + , -
node = self.factor()
while self.current_token.type in (TokenType.PLUS, TokenType.MINUS):
op = self.current_token
self.eat(op.type)
node = ASTNode('binop', value=op.type, children=[node, self.factor()])
return node
def factor(self): # * , /
node = self.primary()
while self.current_token.type in (TokenType.MULTIPLY, TokenType.DIVIDE):
op = self.current_token
self.eat(op.type)
node = ASTNode('binop', value=op.type, children=[node, self.primary()])
return node
def primary(self):
token = self.current_token
if token.type == TokenType.NUMBER:
self.eat(TokenType.NUMBER)
return ASTNode('number', value=token.value)
if token.type == TokenType.STRING:
self.eat(TokenType.STRING)
return ASTNode('string', value=token.value)
if token.type == TokenType.IDENTIFIER:
self.eat(TokenType.IDENTIFIER)
return ASTNode('variable', value=token.value)
if token.type == TokenType.LPAREN:
self.eat(TokenType.LPAREN)
node = self.expression()
self.eat(TokenType.RPAREN)
return node
raise SyntaxError(f"unexpected token in primary: {token.type}")
def parse(self):
return self.program()
3. interpreter e ambiente de execucao
3.1 Environment (escopo de variaveis)
class Environment:
def __init__(self, parent=None):
self.variables = {} # name -> value dictionary
self.parent = parent # parent scope (for nested blocks)
def get(self, name):
if name in self.variables:
return self.variables[name]
if self.parent:
return self.parent.get(name)
raise NameError(f"variable '{name}' not defined")
def set(self, name, value):
self.variables[name] = value # creates in current scope
def update(self, name, value):
if name in self.variables:
self.variables[name] = value
elif self.parent:
self.parent.update(name, value)
else:
raise NameError(f"variable '{name}' not defined for update")
3.2 classe Interpreter (padrao visitor)
class Interpreter:
def __init__(self):
self.global_env = Environment()
self.current_env = self.global_env
def interpret(self, node):
method_name = f'visit_{node.type}'
visitor = getattr(self, method_name, self.generic_visit)
return visitor(node)
def visit_program(self, node):
result = None
for child in node.children:
result = self.interpret(child)
return result
def visit_block(self, node):
# creates new child scope
previous = self.current_env
self.current_env = Environment(previous)
result = None
for child in node.children:
result = self.interpret(child)
self.current_env = previous
return result
def visit_var_decl(self, node):
var_name = node.value
value = self.interpret(node.children[0]) if node.children else 0
self.current_env.set(var_name, value)
return value
def visit_assign(self, node):
var_name = node.value
value = self.interpret(node.children[0])
try:
self.current_env.get(var_name)
self.current_env.update(var_name, value)
except NameError:
self.current_env.set(var_name, value)
return value
def visit_variable(self, node):
return self.current_env.get(node.value)
def visit_number(self, node):
return node.value
def visit_string(self, node):
return node.value
def visit_binop(self, node):
from lexer import TokenType
left = self.interpret(node.children[0])
right = self.interpret(node.children[1])
op = node.value
if op == TokenType.PLUS: return left + right
if op == TokenType.MINUS: return left - right
if op == TokenType.MULTIPLY: return left * right
if op == TokenType.DIVIDE:
if right == 0: raise ZeroDivisionError("division by zero")
return left / right
if op == TokenType.EQUALS: return left == right
if op == TokenType.NOT_EQUALS: return left != right
if op == TokenType.LESS: return left < right
if op == TokenType.GREATER: return left > right
if op == TokenType.LESS_EQUAL: return left <= right
if op == TokenType.GREATER_EQUAL: return left >= right
raise Exception(f"unknown operator {op}")
def visit_if(self, node):
if self.interpret(node.children[0]):
return self.interpret(node.children[1])
elif len(node.children) > 2:
return self.interpret(node.children[2])
return None
def visit_while(self, node):
max_iter = 1000
it = 0
result = None
while self.interpret(node.children[0]) and it < max_iter:
result = self.interpret(node.children[1])
it += 1
if it >= max_iter:
raise RuntimeError("infinite loop detected (limit 1000)")
return result
def visit_print(self, node):
value = self.interpret(node.children[0])
print(value)
return value
4. main: REPL e execucao de arquivos
import sys
from lexer import Lexer
from parser import Parser
from interpreter import Interpreter
class ProgrammingLanguage:
def __init__(self):
self.interpreter = Interpreter()
def run_file(self, filename):
with open(filename, 'r', encoding='utf-8') as f:
source = f.read()
self.execute(source, filename)
def run_prompt(self):
print("bazzscript repl -> type 'exit' to quit")
while True:
try:
line = input(">>> ")
if line.lower() in ('sair', 'exit'):
break
if line.strip():
self.execute(line, "<repl>")
except KeyboardInterrupt:
print("\nuse 'exit'")
def execute(self, source, source_name):
try:
lexer = Lexer(source)
parser = Parser(lexer)
ast = parser.parse()
self.interpreter.interpret(ast)
except SyntaxError as e:
print(f"syntax error [{source_name}]: {e}")
except NameError as e:
print(f"name error: {e}")
except Exception as e:
print(f"error: {e}")
def main():
lang = ProgrammingLanguage()
if len(sys.argv) > 1:
lang.run_file(sys.argv[1])
else:
lang.run_prompt()
if __name__ == "__main__":
main()
5. gramatica formal da linguagem
program -> statement_list
statement_list -> { statement }
statement -> var_decl | assignment | if_statement | while_statement | print_statement | block | expression ';'
var_decl -> 'var' IDENTIFIER ('=' expression)? ';'
assignment -> IDENTIFIER '=' expression ';'
if_statement -> 'if' '(' expression ')' statement ('else' statement)?
while_statement -> 'while' '(' expression ')' statement
print_statement -> 'print' expression ';'
block -> '{' statement_list '}'
expression -> comparison (('==' | '!=') comparison)*
comparison -> term (('<' | '>' | '<=' | '>=') term)*
term -> factor (('+' | '-') factor)*
factor -> primary (('*' | '/') primary)*
primary -> NUMBER | STRING | IDENTIFIER | '(' expression ')'
6. expandindo a linguagem (funcoes, arrays)
Para adicionar funcoes definidas pelo usuario:
- No lexer: incluir tokens FUNC e RETURN.
- No parser: criar metodo function_decl() que retorna ASTNode('func_decl').
- No interpreter: guardar funcoes no Environment, implementar visit_func_call.
Para arrays literais: adicionar colchetes como delimitadores e no primary tratar '[' expression_list ']'.
7. personalizacao: bazzscript
Renomeie os arquivos para bazz_lexer.py, bazz_parser.py, bazz_interpreter.py. Altere o nome da classe principal para BazzScriptLanguage. Troque a extensao padrao para .bazz. Modifique as mensagens do REPL.
# custom header example
print("BazzScript v1.0 - educational language")
Para gerar um executavel: pyinstaller --onefile --name bazzscript main.py.