# Source code for crosstl.translator.lexer

"""Lexer for tokenizing CrossGL source code."""

import re
from collections import OrderedDict

# Ordered token specification: token type name -> regular expression.
#
# The patterns are joined, in this order, into a single alternation
# (one named group per entry).  Python's ``re`` takes the FIRST alternative
# that matches at a position, not the longest one, so order is significant:
#   * a multi-character operator must be listed before any shorter operator
#     that is a prefix of it (``->`` before ``-``, ``**`` before ``*``,
#     ``<=>`` before ``<=``);
#   * keyword patterns must precede IDENTIFIER.
TOKENS = OrderedDict(
    [
        # Comments
        ("COMMENT_SINGLE", r"//.*"),
        ("COMMENT_MULTI", r"/\*[\s\S]*?\*/"),
        # Preprocessor directives
        ("PREPROCESSOR", r"#[^\n]*"),
        # Keywords - Core Language
        ("SHADER", r"\bshader\b"),
        ("STRUCT", r"\bstruct\b"),
        ("ENUM", r"\benum\b"),
        ("IMPL", r"\bimpl\b"),
        ("TRAIT", r"\btrait\b"),
        ("CLASS", r"\bclass\b"),
        ("INTERFACE", r"\binterface\b"),
        ("NAMESPACE", r"\bnamespace\b"),
        ("MODULE", r"\bmodule\b"),
        ("IMPORT", r"\bimport\b"),
        ("USE", r"\buse\b"),
        ("FROM", r"\bfrom\b"),
        ("AS", r"\bas\b"),
        # Function/Method Keywords
        ("FUNCTION", r"\bfn\b"),
        ("VOID", r"\bvoid\b"),
        ("RETURN", r"\breturn\b"),
        ("YIELD", r"\byield\b"),
        ("ASYNC", r"\basync\b"),
        ("AWAIT", r"\bawait\b"),
        # Control Flow
        ("IF", r"\bif\b"),
        ("ELSE", r"\belse\b"),
        ("ELIF", r"\belif\b"),
        ("MATCH", r"\bmatch\b"),
        ("SWITCH", r"\bswitch\b"),
        ("CASE", r"\bcase\b"),
        ("DEFAULT", r"\bdefault\b"),
        ("FOR", r"\bfor\b"),
        ("WHILE", r"\bwhile\b"),
        ("LOOP", r"\bloop\b"),
        ("IN", r"\bin\b"),
        ("BREAK", r"\bbreak\b"),
        ("CONTINUE", r"\bcontinue\b"),
        # Variable/Memory Keywords
        ("LET", r"\blet\b"),
        ("VAR", r"\bvar\b"),
        ("MUT", r"\bmut\b"),
        ("CONST", r"\bconst\b"),
        ("STATIC", r"\bstatic\b"),
        ("EXTERN", r"\bextern\b"),
        ("UNIFORM", r"\buniform\b"),
        ("CBUFFER", r"\bcbuffer\b"),
        ("BUFFER", r"\bbuffer\b"),
        # Visibility/Access
        ("PUBLIC", r"\bpub\b"),
        ("PRIVATE", r"\bpriv\b"),
        ("PROTECTED", r"\bprotected\b"),
        ("INTERNAL", r"\binternal\b"),
        # Safety/Memory
        ("UNSAFE", r"\bunsafe\b"),
        ("SAFE", r"\bsafe\b"),
        ("REF", r"\bref\b"),
        ("BOX", r"\bbox\b"),
        ("MOVE", r"\bmove\b"),
        # Shader Stages
        ("VERTEX", r"\bvertex\b"),
        ("FRAGMENT", r"\bfragment\b"),
        ("COMPUTE", r"\bcompute\b"),
        ("GEOMETRY", r"\bgeometry\b"),
        ("TESSELLATION", r"\btessellation\b"),
        # GPU/Parallel Keywords
        ("KERNEL", r"\bkernel\b"),
        ("GLOBAL", r"\bglobal\b"),
        ("LOCAL", r"\blocal\b"),
        ("SHARED", r"\bshared\b"),
        ("THREADGROUP", r"\bthreadgroup\b"),
        ("WORKGROUP", r"\bworkgroup\b"),
        ("LAYOUT", r"\blayout\b"),
        # Types - Primitives
        ("BOOL", r"\bbool\b"),
        ("I8", r"\bi8\b"),
        ("I16", r"\bi16\b"),
        ("I32", r"\bi32\b"),
        ("I64", r"\bi64\b"),
        ("U8", r"\bu8\b"),
        ("U16", r"\bu16\b"),
        ("U32", r"\bu32\b"),
        ("U64", r"\bu64\b"),
        ("F16", r"\bf16\b"),
        ("F32", r"\bf32\b"),
        ("F64", r"\bf64\b"),
        ("INT", r"\bint\b"),
        ("UINT", r"\buint\b"),
        ("FLOAT", r"\bfloat\b"),
        ("DOUBLE", r"\bdouble\b"),
        ("HALF", r"\bhalf\b"),
        ("CHAR", r"\bchar\b"),
        ("STRING", r"\bstring\b"),
        # Types - Vectors
        ("VEC2", r"\bvec2\b"),
        ("VEC3", r"\bvec3\b"),
        ("VEC4", r"\bvec4\b"),
        ("IVEC2", r"\bivec2\b"),
        ("IVEC3", r"\bivec3\b"),
        ("IVEC4", r"\bivec4\b"),
        ("UVEC2", r"\buvec2\b"),
        ("UVEC3", r"\buvec3\b"),
        ("UVEC4", r"\buvec4\b"),
        ("DVEC2", r"\bdvec2\b"),
        ("DVEC3", r"\bdvec3\b"),
        ("DVEC4", r"\bdvec4\b"),
        ("BVEC2", r"\bbvec2\b"),
        ("BVEC3", r"\bbvec3\b"),
        ("BVEC4", r"\bbvec4\b"),
        # Types - Matrices
        # (the trailing \b keeps "mat2" from matching the start of "mat2x2")
        ("MAT2", r"\bmat2\b"),
        ("MAT3", r"\bmat3\b"),
        ("MAT4", r"\bmat4\b"),
        ("MAT2X2", r"\bmat2x2\b"),
        ("MAT2X3", r"\bmat2x3\b"),
        ("MAT2X4", r"\bmat2x4\b"),
        ("MAT3X2", r"\bmat3x2\b"),
        ("MAT3X3", r"\bmat3x3\b"),
        ("MAT3X4", r"\bmat3x4\b"),
        ("MAT4X2", r"\bmat4x2\b"),
        ("MAT4X3", r"\bmat4x3\b"),
        ("MAT4X4", r"\bmat4x4\b"),
        ("DMAT2", r"\bdmat2\b"),
        ("DMAT3", r"\bdmat3\b"),
        ("DMAT4", r"\bdmat4\b"),
        ("DMAT2X2", r"\bdmat2x2\b"),
        ("DMAT2X3", r"\bdmat2x3\b"),
        ("DMAT2X4", r"\bdmat2x4\b"),
        ("DMAT3X2", r"\bdmat3x2\b"),
        ("DMAT3X3", r"\bdmat3x3\b"),
        ("DMAT3X4", r"\bdmat3x4\b"),
        ("DMAT4X2", r"\bdmat4x2\b"),
        ("DMAT4X3", r"\bdmat4x3\b"),
        ("DMAT4X4", r"\bdmat4x4\b"),
        # Types - Textures/Samplers
        ("TEXTURE1D", r"\btexture1d\b"),
        ("TEXTURE2D", r"\btexture2d\b"),
        ("TEXTURE3D", r"\btexture3d\b"),
        ("TEXTURECUBE", r"\btexturecube\b"),
        ("TEXTURE2DARRAY", r"\btexture2darray\b"),
        ("SAMPLER", r"\bsampler\b"),
        ("SAMPLER1D", r"\bsampler1d\b"),
        ("SAMPLER2D", r"\bsampler2d\b"),
        ("SAMPLER3D", r"\bsampler3d\b"),
        ("SAMPLERCUBE", r"\bsamplercube\b"),
        ("SAMPLER2DARRAY", r"\bsampler2darray\b"),
        ("SAMPLER2DSHADOW", r"\bsampler2dshadow\b"),
        ("SAMPLER2DARRAYSHADOW", r"\bsampler2darrayshadow\b"),
        ("SAMPLERCUBESHADOW", r"\bsamplercubeshadow\b"),
        ("SAMPLERCUBEARRAY", r"\bsamplercubearray\b"),
        ("SAMPLERCUBEARRAYSHADOW", r"\bsamplercubearrayshadow\b"),
        ("SAMPLER2DMS", r"\bsampler2dms\b"),
        ("SAMPLER2DMSARRAY", r"\bsampler2dmsarray\b"),
        ("IIMAGE2D", r"\biimage2[Dd]\b"),
        ("IIMAGE3D", r"\biimage3[Dd]\b"),
        ("IIMAGE2DARRAY", r"\biimage2[Dd][Aa]rray\b"),
        ("IIMAGE2DMS", r"\biimage2[Dd][Mm][Ss]\b"),
        ("IIMAGE2DMSARRAY", r"\biimage2[Dd][Mm][Ss][Aa]rray\b"),
        ("UIMAGE2D", r"\buimage2[Dd]\b"),
        ("UIMAGE3D", r"\buimage3[Dd]\b"),
        ("UIMAGE2DARRAY", r"\buimage2[Dd][Aa]rray\b"),
        ("UIMAGE2DMS", r"\buimage2[Dd][Mm][Ss]\b"),
        ("UIMAGE2DMSARRAY", r"\buimage2[Dd][Mm][Ss][Aa]rray\b"),
        ("IMAGE2D", r"\bimage2[Dd]\b"),
        ("IMAGE3D", r"\bimage3[Dd]\b"),
        ("IMAGECUBE", r"\bimage[Cc]ube\b"),
        ("IMAGE2DARRAY", r"\bimage2[Dd][Aa]rray\b"),
        ("IMAGE2DMS", r"\bimage2[Dd][Mm][Ss]\b"),
        ("IMAGE2DMSARRAY", r"\bimage2[Dd][Mm][Ss][Aa]rray\b"),
        # Generics/Templates
        ("WHERE", r"\bwhere\b"),
        # NOTE: same pattern as FOR above, so FOR always wins and IMPL_FOR is
        # never produced by the regex; the name is kept so the key set of
        # TOKENS is unchanged.
        ("IMPL_FOR", r"\bfor\b"),
        # Attributes/Annotations
        ("ATTRIBUTE", r"@[a-zA-Z_][a-zA-Z_0-9]*"),
        # NOTE: PREPROCESSOR above matches any text starting with "#", so HASH
        # is never produced by the regex; kept so the key set is unchanged.
        ("HASH", r"#"),
        ("DOLLAR", r"\$"),
        # Literals
        # The (?!\.) lookahead stops "1..5" from lexing "1." as a float.
        ("FLOAT_NUMBER", r"\d*\.\d+[fF]?|\d+\.(?!\.)\d*[fF]?|\d+[fF]"),
        ("HEX_NUMBER", r"0[xX][0-9a-fA-F]+[uU]?"),
        ("BIN_NUMBER", r"0[bB][01]+[uU]?"),
        ("OCT_NUMBER", r"0[oO][0-7]+[uU]?"),
        ("NUMBER", r"\d+[uU]?"),
        ("STRING_LITERAL", r'"(?:[^"\\]|\\.)*"'),
        ("CHAR_LITERAL", r"'(?:[^'\\]|\\.)'"),
        # Operators - Assignment
        ("ASSIGN_ADD", r"\+="),
        ("ASSIGN_SUB", r"-="),
        ("ASSIGN_MUL", r"\*="),
        ("ASSIGN_DIV", r"/="),
        ("ASSIGN_MOD", r"%="),
        ("ASSIGN_AND", r"&="),
        ("ASSIGN_OR", r"\|="),
        ("ASSIGN_XOR", r"\^="),
        ("ASSIGN_SHIFT_LEFT", r"<<="),
        ("ASSIGN_SHIFT_RIGHT", r">>="),
        # Operators - Comparison
        ("EQUAL", r"=="),
        ("NOT_EQUAL", r"!="),
        # "<=>" must precede "<=", otherwise it lexes as LESS_EQUAL followed
        # by GREATER_THAN.
        ("SPACESHIP", r"<=>"),
        ("LESS_EQUAL", r"<="),
        ("GREATER_EQUAL", r">="),
        # Operators - Logical
        ("LOGICAL_AND", r"&&"),
        ("LOGICAL_OR", r"\|\|"),
        ("NOT", r"!"),
        # Operators - Bitwise
        ("BITWISE_SHIFT_LEFT", r"<<"),
        ("BITWISE_SHIFT_RIGHT", r">>"),
        ("BITWISE_AND", r"&"),
        ("BITWISE_OR", r"\|"),
        ("BITWISE_XOR", r"\^"),
        ("BITWISE_NOT", r"~"),
        # Operators - Arithmetic
        # Two-character operators first: "->" must precede "-" and "**" must
        # precede "*", otherwise the one-character pattern wins and the
        # longer operator is split into two tokens.
        ("INCREMENT", r"\+\+"),
        ("DECREMENT", r"--"),
        ("ARROW", r"->"),
        ("POWER", r"\*\*"),
        ("PLUS", r"\+"),
        ("MINUS", r"-"),
        ("MULTIPLY", r"\*"),
        ("DIVIDE", r"/"),
        ("MOD", r"%"),
        # Operators - Other
        ("FAT_ARROW", r"=>"),
        ("DOUBLE_COLON", r"::"),
        ("RANGE_INCLUSIVE", r"\.\.="),
        ("RANGE", r"\.\."),
        ("ELVIS", r"\?:"),
        ("QUESTION", r"\?"),
        # NOTE: same pattern as BITWISE_OR above, so PIPE is never produced by
        # the regex; kept so the key set is unchanged.
        ("PIPE", r"\|"),
        # Punctuation
        ("SEMICOLON", r";"),
        ("COMMA", r","),
        ("DOT", r"\."),
        ("COLON", r":"),
        ("EQUALS", r"="),
        # Brackets
        ("LBRACE", r"\{"),
        ("RBRACE", r"\}"),
        ("LPAREN", r"\("),
        ("RPAREN", r"\)"),
        ("LBRACKET", r"\["),
        ("RBRACKET", r"\]"),
        ("LESS_THAN", r"<"),
        ("GREATER_THAN", r">"),
        # Special Characters
        # AT is only reached for a bare "@" (ATTRIBUTE needs a following name).
        ("AT", r"@"),
        # NOTE: same pattern as BITWISE_AND above, so AMPERSAND is never
        # produced by the regex; kept so the key set is unchanged.
        ("AMPERSAND", r"&"),
        # Identifier (must come after every keyword pattern)
        ("IDENTIFIER", r"[a-zA-Z_][a-zA-Z_0-9]*"),
        # Whitespace
        ("WHITESPACE", r"\s+"),
    ]
)

# Reserved-word table: source spelling -> token type name.
#
# Each group below is a whitespace-separated word list.  A plain word maps to
# its own upper-cased spelling ("struct" -> "STRUCT"); "word=TOKEN" gives an
# explicit token name where the two differ ("fn" -> "FUNCTION") or where
# several spellings share one token ("hull" -> "TESSELLATION_CONTROL").
# Insertion order follows the order the words are written in.
KEYWORDS = {
    word: alias or word.upper()
    for group in (
        # Core Language
        "shader struct enum impl trait class interface namespace module "
        "import use from as",
        # Functions
        "fn=FUNCTION void return yield async await",
        # Control Flow
        "if else elif match switch case default for while loop in break "
        "continue",
        # Variables
        "let var mut const static extern uniform cbuffer buffer precision",
        # Visibility
        "pub=PUBLIC priv=PRIVATE protected internal",
        # Safety
        "unsafe safe ref box move",
        # Shader Stages
        "vertex fragment compute geometry tessellation "
        "tessellation_control tessellation_evaluation "
        "hull=TESSELLATION_CONTROL domain=TESSELLATION_EVALUATION "
        "task amplification object mesh "
        "ray_generation ray_intersection ray_closest_hit ray_miss "
        "ray_any_hit ray_callable "
        "intersection=RAY_INTERSECTION anyhit=RAY_ANY_HIT "
        "closesthit=RAY_CLOSEST_HIT miss=RAY_MISS callable=RAY_CALLABLE",
        # GPU
        "kernel global local shared threadgroup workgroup layout",
        # Types
        "bool i8 i16 i32 i64 u8 u16 u32 u64 f16 f32 f64 "
        "int uint float double half char string",
        # Vectors
        "vec2 vec3 vec4 ivec2 ivec3 ivec4 uvec2 uvec3 uvec4 "
        "dvec2 dvec3 dvec4 bvec2 bvec3 bvec4",
        # Matrices
        "mat2 mat3 mat4 "
        "mat2x2 mat2x3 mat2x4 mat3x2 mat3x3 mat3x4 mat4x2 mat4x3 mat4x4 "
        "dmat2 dmat3 dmat4 "
        "dmat2x2 dmat2x3 dmat2x4 dmat3x2 dmat3x3 dmat3x4 "
        "dmat4x2 dmat4x3 dmat4x4",
        # Textures/Samplers
        "texture1d texture2d texture3d texturecube texture2darray "
        "sampler sampler1d sampler2d sampler3d samplercube sampler2darray "
        "sampler2dshadow sampler2darrayshadow samplercubeshadow "
        "samplercubearray samplercubearrayshadow sampler2dms "
        "sampler2dmsarray "
        "iimage2d iimage3d iimage2darray iimage2dms iimage2dmsarray "
        "uimage2d uimage3d uimage2darray uimage2dms uimage2dmsarray "
        "image2d image3d imagecube image2darray image2dms image2dmsarray",
        # Generics
        "where",
        # Literals
        "true=BOOLEAN_LITERAL false=BOOLEAN_LITERAL",
    )
    for entry in group.split()
    # partition() yields (word, "=", alias), or (word, "", "") without "=".
    for word, _, alias in (entry.partition("="),)
}


class Lexer:
    """Tokenizer for CrossGL Universal IR.

    The source text is scanned once, on construction.  The result is a list
    of ``(token_type, text)`` tuples terminated by ``("EOF", None)``.
    Whitespace is dropped; every other match, including comments and
    preprocessor lines, is kept in the token list.

    Attributes:
        code: The source text being scanned.
        tokens: The token list produced by the most recent ``tokenize()``.
        token_cache: Maps ``(text, token_type)`` to the shared token tuple.
        regex_cache: The compiled combined pattern built from ``TOKENS``.
    """

    def __init__(self, code):
        """Tokenize CrossGL source text immediately on construction.

        Args:
            code: CrossGL source text to scan.

        Raises:
            SyntaxError: If ``code`` contains a character that no token
                pattern matches.
        """
        self.code = code
        self.tokens = []
        self.token_cache = {}
        self.regex_cache = self._compile_patterns()
        self.tokenize()

    def _compile_patterns(self):
        """Compile the ordered token specification into one regex.

        Each ``TOKENS`` entry becomes a named group, joined with ``|`` in
        ``TOKENS`` order, so the name of the matching group identifies the
        token type.
        """
        combined_pattern = "|".join(
            f"(?P<{name}>{pattern})" for name, pattern in TOKENS.items()
        )
        return re.compile(combined_pattern)

    def _get_cached_token(self, text, token_type):
        """Return a stable tuple object for repeated token text/type pairs."""
        cache_key = (text, token_type)
        if cache_key not in self.token_cache:
            self.token_cache[cache_key] = (token_type, text)
        return self.token_cache[cache_key]

    def _illegal_character_error(self, pos):
        """Build the SyntaxError for the unmatched character at ``pos``.

        The message carries the 1-based line and column, the offending source
        line, and a caret line pointing at the character.
        """
        code = self.code
        line_num = code.count("\n", 0, pos) + 1
        # rfind() returns -1 when there is no earlier newline, which makes
        # line_start 0 and the column 1-based in both cases.
        line_start = code.rfind("\n", 0, pos) + 1
        col_num = pos - line_start + 1
        line_end = code.find("\n", pos)
        if line_end == -1:
            line_end = len(code)
        line_content = code[line_start:line_end]
        error_pointer = " " * (col_num - 1) + "^"
        return SyntaxError(
            f"Illegal character '{code[pos]}' at line {line_num}, column {col_num}\n"
            f"{line_content}\n{error_pointer}"
        )

    def tokenize(self):
        """Scan source text into parser-ready tokens.

        The token list is rebuilt from scratch on every call, so calling this
        again (``__init__`` already calls it once) does not append a second
        copy of the token stream after the first ``EOF``.

        Raises:
            SyntaxError: If a character matches no token pattern.
        """
        self.tokens = []
        pos = 0
        length = len(self.code)
        while pos < length:
            match = self.regex_cache.match(self.code, pos)
            if match is None:
                raise self._illegal_character_error(pos)

            token_type = match.lastgroup
            text = match.group(token_type)
            # Reserved words that have no dedicated pattern in TOKENS are
            # matched as identifiers and promoted here.
            if token_type == "IDENTIFIER" and text in KEYWORDS:
                token_type = KEYWORDS[text]
            if token_type != "WHITESPACE":
                self.tokens.append(self._get_cached_token(text, token_type))
            pos = match.end(0)

        self.tokens.append(self._get_cached_token(None, "EOF"))

    def get_tokens(self):
        """Return the token list produced by the lexer."""
        return self.tokens

    def debug_print(self):
        """Print token indexes, types, and text for grammar debugging."""
        for i, (token_type, text) in enumerate(self.tokens):
            print(f"{i:3d}: {token_type:20s} '{text}'")