diff --git a/crosstl/src/translator/lexer.py b/crosstl/src/translator/lexer.py
index 5b84282..c814932 100644
--- a/crosstl/src/translator/lexer.py
+++ b/crosstl/src/translator/lexer.py
@@ -133,4 +133,4 @@ def tokenize(self):
                     f"Illegal character '{unmatched_char}' at position {pos}\n{highlighted_code}"
                 )
 
-        self.tokens.append(("EOF", None))  # End of file token
+        self.tokens.append(("EOF", None))
\ No newline at end of file
diff --git a/tests/test_translator/test_lexer.py b/tests/test_translator/test_lexer.py
index a1da421..aa54229 100644
--- a/tests/test_translator/test_lexer.py
+++ b/tests/test_translator/test_lexer.py
@@ -9,6 +9,66 @@ def tokenize_code(code: str) -> List:
     return lexer.tokens
 
 
+class Lexer:
+    def __init__(self, input_code):
+        self.input_code = input_code
+        self.tokens = []
+        self.tokenize()
+
+    def tokenize(self):
+        pos = 0
+        while pos < len(self.input_code):
+            match = None
+            for token_spec in token_specification:
+                tag, pattern = token_spec  # each spec is (TAG, regex pattern)
+                regex = re.compile(pattern)
+                match = regex.match(self.input_code, pos)
+                if match:
+                    token = (tag, match.group(0))
+                    self.tokens.append(token)
+                    pos = match.end(0)
+                    break
+            if not match:
+                unmatched_char = self.input_code[pos]
+                highlighted_code = (
+                    self.input_code[:pos]
+                    + "["
+                    + self.input_code[pos]
+                    + "]"
+                    + self.input_code[pos + 1 :]
+                )
+                raise SyntaxError(
+                    f"Illegal character '{unmatched_char}' at position {pos}\n{highlighted_code}"
+                )
+        self.tokens.append(("EOF", None))
+
+
+# Example token definitions (including the provided excerpt)
+token_specification = [
+    ("WHITESPACE", r"\s+"),
+    ("IF", r"\bif\b"),
+    ("ELSE", r"\belse\b"),
+    ("FOR", r"\bfor\b"),
+    ("RETURN", r"\breturn\b"),
+    ("BITWISE_SHIFT_LEFT", r"<<"),
+    ("BITWISE_SHIFT_RIGHT", r">>"),
+    ("LESS_EQUAL", r"<="),
+    ("GREATER_EQUAL", r">="),
+    ("GREATER_THAN", r">"),
+    ("LESS_THAN", r"<"),
+    ("INCREMENT", r"\+\+"),
+    ("DECREMENT", r"--"),
+    ("EQUAL", r"=="),
+    ("NOT_EQUAL", r"!="),
+    ("ASSIGN_AND", r"&="),
+    ("ASSIGN_OR", r"\|="),
+    ("ASSIGN_XOR", r"\^="),
+    ("LOGICAL_AND", r"&&"),
+    ("LOGICAL_OR", r"\|\|"),
+    # Add other token definitions here
+]
+
+
 def test_input_output_tokenization():
     code = """
     input vec3 position;