"""Lexer for tokenizing CrossGL source code."""
import re
from collections import OrderedDict
# Ordered token specification: token name -> regular expression.
#
# Lexer._compile_patterns joins these patterns with "|" in this order, and
# Python's regex engine accepts the FIRST alternative that matches, not the
# longest one. Ordering is therefore significant:
#   * keywords (anchored with \b) precede IDENTIFIER;
#   * a multi-character operator must precede every shorter operator that is
#     a prefix of it (e.g. "<=>" before "<=", "**" before "*", "->" before "-").
TOKENS = OrderedDict(
    [
        # Comments
        ("COMMENT_SINGLE", r"//.*"),
        ("COMMENT_MULTI", r"/\*[\s\S]*?\*/"),
        # Preprocessor directives
        ("PREPROCESSOR", r"#[^\n]*"),
        # Keywords - Core Language
        ("SHADER", r"\bshader\b"),
        ("STRUCT", r"\bstruct\b"),
        ("ENUM", r"\benum\b"),
        ("IMPL", r"\bimpl\b"),
        ("TRAIT", r"\btrait\b"),
        ("CLASS", r"\bclass\b"),
        ("INTERFACE", r"\binterface\b"),
        ("NAMESPACE", r"\bnamespace\b"),
        ("MODULE", r"\bmodule\b"),
        ("IMPORT", r"\bimport\b"),
        ("USE", r"\buse\b"),
        ("FROM", r"\bfrom\b"),
        ("AS", r"\bas\b"),
        # Function/Method Keywords
        ("FUNCTION", r"\bfn\b"),
        ("VOID", r"\bvoid\b"),
        ("RETURN", r"\breturn\b"),
        ("YIELD", r"\byield\b"),
        ("ASYNC", r"\basync\b"),
        ("AWAIT", r"\bawait\b"),
        # Control Flow
        ("IF", r"\bif\b"),
        ("ELSE", r"\belse\b"),
        ("ELIF", r"\belif\b"),
        ("MATCH", r"\bmatch\b"),
        ("SWITCH", r"\bswitch\b"),
        ("CASE", r"\bcase\b"),
        ("DEFAULT", r"\bdefault\b"),
        ("FOR", r"\bfor\b"),
        ("WHILE", r"\bwhile\b"),
        ("LOOP", r"\bloop\b"),
        ("IN", r"\bin\b"),
        ("BREAK", r"\bbreak\b"),
        ("CONTINUE", r"\bcontinue\b"),
        # Variable/Memory Keywords
        ("LET", r"\blet\b"),
        ("VAR", r"\bvar\b"),
        ("MUT", r"\bmut\b"),
        ("CONST", r"\bconst\b"),
        ("STATIC", r"\bstatic\b"),
        ("EXTERN", r"\bextern\b"),
        ("UNIFORM", r"\buniform\b"),
        ("CBUFFER", r"\bcbuffer\b"),
        ("BUFFER", r"\bbuffer\b"),
        # Visibility/Access
        ("PUBLIC", r"\bpub\b"),
        ("PRIVATE", r"\bpriv\b"),
        ("PROTECTED", r"\bprotected\b"),
        ("INTERNAL", r"\binternal\b"),
        # Safety/Memory
        ("UNSAFE", r"\bunsafe\b"),
        ("SAFE", r"\bsafe\b"),
        ("REF", r"\bref\b"),
        ("BOX", r"\bbox\b"),
        ("MOVE", r"\bmove\b"),
        # Shader Stages
        ("VERTEX", r"\bvertex\b"),
        ("FRAGMENT", r"\bfragment\b"),
        ("COMPUTE", r"\bcompute\b"),
        ("GEOMETRY", r"\bgeometry\b"),
        ("TESSELLATION", r"\btessellation\b"),
        # GPU/Parallel Keywords
        ("KERNEL", r"\bkernel\b"),
        ("GLOBAL", r"\bglobal\b"),
        ("LOCAL", r"\blocal\b"),
        ("SHARED", r"\bshared\b"),
        ("THREADGROUP", r"\bthreadgroup\b"),
        ("WORKGROUP", r"\bworkgroup\b"),
        ("LAYOUT", r"\blayout\b"),
        # Types - Primitives
        ("BOOL", r"\bbool\b"),
        ("I8", r"\bi8\b"),
        ("I16", r"\bi16\b"),
        ("I32", r"\bi32\b"),
        ("I64", r"\bi64\b"),
        ("U8", r"\bu8\b"),
        ("U16", r"\bu16\b"),
        ("U32", r"\bu32\b"),
        ("U64", r"\bu64\b"),
        ("F16", r"\bf16\b"),
        ("F32", r"\bf32\b"),
        ("F64", r"\bf64\b"),
        ("INT", r"\bint\b"),
        ("UINT", r"\buint\b"),
        ("FLOAT", r"\bfloat\b"),
        ("DOUBLE", r"\bdouble\b"),
        ("HALF", r"\bhalf\b"),
        ("CHAR", r"\bchar\b"),
        ("STRING", r"\bstring\b"),
        # Types - Vectors
        ("VEC2", r"\bvec2\b"),
        ("VEC3", r"\bvec3\b"),
        ("VEC4", r"\bvec4\b"),
        ("IVEC2", r"\bivec2\b"),
        ("IVEC3", r"\bivec3\b"),
        ("IVEC4", r"\bivec4\b"),
        ("UVEC2", r"\buvec2\b"),
        ("UVEC3", r"\buvec3\b"),
        ("UVEC4", r"\buvec4\b"),
        ("DVEC2", r"\bdvec2\b"),
        ("DVEC3", r"\bdvec3\b"),
        ("DVEC4", r"\bdvec4\b"),
        ("BVEC2", r"\bbvec2\b"),
        ("BVEC3", r"\bbvec3\b"),
        ("BVEC4", r"\bbvec4\b"),
        # Types - Matrices
        ("MAT2", r"\bmat2\b"),
        ("MAT3", r"\bmat3\b"),
        ("MAT4", r"\bmat4\b"),
        ("MAT2X2", r"\bmat2x2\b"),
        ("MAT2X3", r"\bmat2x3\b"),
        ("MAT2X4", r"\bmat2x4\b"),
        ("MAT3X2", r"\bmat3x2\b"),
        ("MAT3X3", r"\bmat3x3\b"),
        ("MAT3X4", r"\bmat3x4\b"),
        ("MAT4X2", r"\bmat4x2\b"),
        ("MAT4X3", r"\bmat4x3\b"),
        ("MAT4X4", r"\bmat4x4\b"),
        ("DMAT2", r"\bdmat2\b"),
        ("DMAT3", r"\bdmat3\b"),
        ("DMAT4", r"\bdmat4\b"),
        ("DMAT2X2", r"\bdmat2x2\b"),
        ("DMAT2X3", r"\bdmat2x3\b"),
        ("DMAT2X4", r"\bdmat2x4\b"),
        ("DMAT3X2", r"\bdmat3x2\b"),
        ("DMAT3X3", r"\bdmat3x3\b"),
        ("DMAT3X4", r"\bdmat3x4\b"),
        ("DMAT4X2", r"\bdmat4x2\b"),
        ("DMAT4X3", r"\bdmat4x3\b"),
        ("DMAT4X4", r"\bdmat4x4\b"),
        # Types - Textures/Samplers
        ("TEXTURE1D", r"\btexture1d\b"),
        ("TEXTURE2D", r"\btexture2d\b"),
        ("TEXTURE3D", r"\btexture3d\b"),
        ("TEXTURECUBE", r"\btexturecube\b"),
        ("TEXTURE2DARRAY", r"\btexture2darray\b"),
        ("SAMPLER", r"\bsampler\b"),
        ("SAMPLER1D", r"\bsampler1d\b"),
        ("SAMPLER2D", r"\bsampler2d\b"),
        ("SAMPLER3D", r"\bsampler3d\b"),
        ("SAMPLERCUBE", r"\bsamplercube\b"),
        ("SAMPLER2DARRAY", r"\bsampler2darray\b"),
        ("SAMPLER2DSHADOW", r"\bsampler2dshadow\b"),
        ("SAMPLER2DARRAYSHADOW", r"\bsampler2darrayshadow\b"),
        ("SAMPLERCUBESHADOW", r"\bsamplercubeshadow\b"),
        ("SAMPLERCUBEARRAY", r"\bsamplercubearray\b"),
        ("SAMPLERCUBEARRAYSHADOW", r"\bsamplercubearrayshadow\b"),
        ("SAMPLER2DMS", r"\bsampler2dms\b"),
        ("SAMPLER2DMSARRAY", r"\bsampler2dmsarray\b"),
        ("IIMAGE2D", r"\biimage2[Dd]\b"),
        ("IIMAGE3D", r"\biimage3[Dd]\b"),
        ("IIMAGE2DARRAY", r"\biimage2[Dd][Aa]rray\b"),
        ("IIMAGE2DMS", r"\biimage2[Dd][Mm][Ss]\b"),
        ("IIMAGE2DMSARRAY", r"\biimage2[Dd][Mm][Ss][Aa]rray\b"),
        ("UIMAGE2D", r"\buimage2[Dd]\b"),
        ("UIMAGE3D", r"\buimage3[Dd]\b"),
        ("UIMAGE2DARRAY", r"\buimage2[Dd][Aa]rray\b"),
        ("UIMAGE2DMS", r"\buimage2[Dd][Mm][Ss]\b"),
        ("UIMAGE2DMSARRAY", r"\buimage2[Dd][Mm][Ss][Aa]rray\b"),
        ("IMAGE2D", r"\bimage2[Dd]\b"),
        ("IMAGE3D", r"\bimage3[Dd]\b"),
        ("IMAGECUBE", r"\bimage[Cc]ube\b"),
        ("IMAGE2DARRAY", r"\bimage2[Dd][Aa]rray\b"),
        ("IMAGE2DMS", r"\bimage2[Dd][Mm][Ss]\b"),
        ("IMAGE2DMSARRAY", r"\bimage2[Dd][Mm][Ss][Aa]rray\b"),
        # Generics/Templates
        ("WHERE", r"\bwhere\b"),
        # Same pattern as FOR above, so the combined regex never reports this
        # name; kept so the key stays defined for existing lookups.
        ("IMPL_FOR", r"\bfor\b"),
        # Attributes/Annotations
        ("ATTRIBUTE", r"@[a-zA-Z_][a-zA-Z_0-9]*"),
        # Shadowed by PREPROCESSOR, which already matches a lone "#".
        ("HASH", r"#"),
        ("DOLLAR", r"\$"),
        # Literals (FLOAT_NUMBER first so "1.5" is not split at the dot; the
        # (?!\.) lookahead leaves "1..2" to NUMBER RANGE NUMBER)
        ("FLOAT_NUMBER", r"\d*\.\d+[fF]?|\d+\.(?!\.)\d*[fF]?|\d+[fF]"),
        ("HEX_NUMBER", r"0[xX][0-9a-fA-F]+[uU]?"),
        ("BIN_NUMBER", r"0[bB][01]+[uU]?"),
        ("OCT_NUMBER", r"0[oO][0-7]+[uU]?"),
        ("NUMBER", r"\d+[uU]?"),
        ("STRING_LITERAL", r'"(?:[^"\\]|\\.)*"'),
        ("CHAR_LITERAL", r"'(?:[^'\\]|\\.)'"),
        # Operators - Assignment
        ("ASSIGN_ADD", r"\+="),
        ("ASSIGN_SUB", r"-="),
        ("ASSIGN_MUL", r"\*="),
        ("ASSIGN_DIV", r"/="),
        ("ASSIGN_MOD", r"%="),
        ("ASSIGN_AND", r"&="),
        ("ASSIGN_OR", r"\|="),
        ("ASSIGN_XOR", r"\^="),
        ("ASSIGN_SHIFT_LEFT", r"<<="),
        ("ASSIGN_SHIFT_RIGHT", r">>="),
        # Operators - Comparison
        # SPACESHIP must precede LESS_EQUAL, otherwise "<=>" is split into
        # "<=" followed by ">".
        ("SPACESHIP", r"<=>"),
        ("EQUAL", r"=="),
        ("NOT_EQUAL", r"!="),
        ("LESS_EQUAL", r"<="),
        ("GREATER_EQUAL", r">="),
        # Operators - Logical
        ("LOGICAL_AND", r"&&"),
        ("LOGICAL_OR", r"\|\|"),
        ("NOT", r"!"),
        # Operators - Bitwise
        ("BITWISE_SHIFT_LEFT", r"<<"),
        ("BITWISE_SHIFT_RIGHT", r">>"),
        ("BITWISE_AND", r"&"),
        ("BITWISE_OR", r"\|"),
        ("BITWISE_XOR", r"\^"),
        ("BITWISE_NOT", r"~"),
        # Operators - Arithmetic
        # ARROW must precede MINUS and POWER must precede MULTIPLY, otherwise
        # "->" and "**" are split into two single-character tokens.
        ("ARROW", r"->"),
        ("INCREMENT", r"\+\+"),
        ("DECREMENT", r"--"),
        ("PLUS", r"\+"),
        ("MINUS", r"-"),
        ("POWER", r"\*\*"),
        ("MULTIPLY", r"\*"),
        ("DIVIDE", r"/"),
        ("MOD", r"%"),
        # Operators - Other
        ("FAT_ARROW", r"=>"),
        ("DOUBLE_COLON", r"::"),
        ("RANGE_INCLUSIVE", r"\.\.="),
        ("RANGE", r"\.\."),
        ("ELVIS", r"\?:"),
        ("QUESTION", r"\?"),
        # Shadowed by BITWISE_OR (identical pattern, listed earlier).
        ("PIPE", r"\|"),
        # Punctuation
        ("SEMICOLON", r";"),
        ("COMMA", r","),
        ("DOT", r"\."),
        ("COLON", r":"),
        ("EQUALS", r"="),
        # Brackets
        ("LBRACE", r"\{"),
        ("RBRACE", r"\}"),
        ("LPAREN", r"\("),
        ("RPAREN", r"\)"),
        ("LBRACKET", r"\["),
        ("RBRACKET", r"\]"),
        ("LESS_THAN", r"<"),
        ("GREATER_THAN", r">"),
        # Special Characters
        ("AT", r"@"),
        # Shadowed by BITWISE_AND (identical pattern, listed earlier).
        ("AMPERSAND", r"&"),
        # Identifier (must come after every keyword pattern)
        ("IDENTIFIER", r"[a-zA-Z_][a-zA-Z_0-9]*"),
        # Whitespace
        ("WHITESPACE", r"\s+"),
    ]
)
# Reserved words whose token type is not simply the upper-cased spelling.
_KEYWORD_ALIASES = {
    "fn": "FUNCTION",
    "pub": "PUBLIC",
    "priv": "PRIVATE",
    "hull": "TESSELLATION_CONTROL",
    "domain": "TESSELLATION_EVALUATION",
    "intersection": "RAY_INTERSECTION",
    "anyhit": "RAY_ANY_HIT",
    "closesthit": "RAY_CLOSEST_HIT",
    "miss": "RAY_MISS",
    "callable": "RAY_CALLABLE",
    "true": "BOOLEAN_LITERAL",
    "false": "BOOLEAN_LITERAL",
}

# Every reserved word, in the order it is inserted into KEYWORDS.
_KEYWORD_WORDS = (
    # Core Language
    "shader", "struct", "enum", "impl", "trait", "class", "interface",
    "namespace", "module", "import", "use", "from", "as",
    # Functions
    "fn", "void", "return", "yield", "async", "await",
    # Control Flow
    "if", "else", "elif", "match", "switch", "case", "default", "for",
    "while", "loop", "in", "break", "continue",
    # Variables
    "let", "var", "mut", "const", "static", "extern", "uniform", "cbuffer",
    "buffer", "precision",
    # Visibility
    "pub", "priv", "protected", "internal",
    # Safety
    "unsafe", "safe", "ref", "box", "move",
    # Shader Stages
    "vertex", "fragment", "compute", "geometry", "tessellation",
    "tessellation_control", "tessellation_evaluation", "hull", "domain",
    "task", "amplification", "object", "mesh",
    "ray_generation", "ray_intersection", "ray_closest_hit", "ray_miss",
    "ray_any_hit", "ray_callable",
    "intersection", "anyhit", "closesthit", "miss", "callable",
    # GPU
    "kernel", "global", "local", "shared", "threadgroup", "workgroup",
    "layout",
    # Types
    "bool", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64",
    "f16", "f32", "f64", "int", "uint", "float", "double", "half", "char",
    "string",
    # Vectors
    "vec2", "vec3", "vec4", "ivec2", "ivec3", "ivec4",
    "uvec2", "uvec3", "uvec4", "dvec2", "dvec3", "dvec4",
    "bvec2", "bvec3", "bvec4",
    # Matrices
    "mat2", "mat3", "mat4",
    "mat2x2", "mat2x3", "mat2x4",
    "mat3x2", "mat3x3", "mat3x4",
    "mat4x2", "mat4x3", "mat4x4",
    "dmat2", "dmat3", "dmat4",
    "dmat2x2", "dmat2x3", "dmat2x4",
    "dmat3x2", "dmat3x3", "dmat3x4",
    "dmat4x2", "dmat4x3", "dmat4x4",
    # Textures/Samplers
    "texture1d", "texture2d", "texture3d", "texturecube", "texture2darray",
    "sampler", "sampler1d", "sampler2d", "sampler3d", "samplercube",
    "sampler2darray", "sampler2dshadow", "sampler2darrayshadow",
    "samplercubeshadow", "samplercubearray", "samplercubearrayshadow",
    "sampler2dms", "sampler2dmsarray",
    "iimage2d", "iimage3d", "iimage2darray", "iimage2dms", "iimage2dmsarray",
    "uimage2d", "uimage3d", "uimage2darray", "uimage2dms", "uimage2dmsarray",
    "image2d", "image3d", "imagecube", "image2darray", "image2dms",
    "image2dmsarray",
    # Generics
    "where",
    # Literals
    "true", "false",
)

# Reserved word -> token type. The lexer consults this table for text that
# matched the generic IDENTIFIER pattern, promoting it to a keyword token.
# Most words map to their own upper-cased spelling; the exceptions live in
# _KEYWORD_ALIASES.
KEYWORDS = {
    word: _KEYWORD_ALIASES.get(word, word.upper()) for word in _KEYWORD_WORDS
}
class Lexer:
    """Tokenizer for CrossGL Universal IR.

    Constructing a ``Lexer`` tokenizes ``code`` immediately. The result is a
    list of ``(token_type, text)`` tuples, available as ``tokens`` or via
    ``get_tokens()``, and always terminated by ``("EOF", None)``. Whitespace
    is dropped; every other match, including comments, is kept.
    """

    def __init__(self, code):
        """Tokenize CrossGL source text immediately on construction.

        Args:
            code: The CrossGL source text to scan.

        Raises:
            SyntaxError: If some character cannot start any known token.
        """
        self.code = code
        self.tokens = []
        # Maps (text, token_type) -> the shared (token_type, text) tuple.
        self.token_cache = {}
        self.regex_cache = self._compile_patterns()
        self.tokenize()

    def _compile_patterns(self):
        """Compile the ordered token specification into one regex.

        Each entry of ``TOKENS`` becomes a named group; the groups are joined
        with ``|`` in table order, so earlier entries win over later ones.
        """
        combined_pattern = "|".join(
            f"(?P<{name}>{pattern})" for name, pattern in TOKENS.items()
        )
        return re.compile(combined_pattern)

    def _get_cached_token(self, text, token_type):
        """Return a stable tuple object for repeated token text/type pairs."""
        return self.token_cache.setdefault((text, token_type), (token_type, text))

    def _illegal_character_error(self, pos):
        """Build the SyntaxError for the unmatched character at ``pos``.

        The message names the character with its 1-based line and column,
        followed by the offending source line and a caret under the column.
        """
        code = self.code
        # rfind() yields -1 when there is no earlier newline, which makes
        # line_start 0 for the first line.
        line_start = code.rfind("\n", 0, pos) + 1
        line_end = code.find("\n", pos)
        if line_end == -1:
            line_end = len(code)
        line_num = code.count("\n", 0, pos) + 1
        col_num = pos - line_start + 1
        bad_char = code[pos]
        line_content = code[line_start:line_end]
        error_pointer = " " * (col_num - 1) + "^"
        return SyntaxError(
            f"Illegal character '{bad_char}' at line {line_num}, column {col_num}\n"
            f"{line_content}\n{error_pointer}"
        )

    def tokenize(self):
        """Scan source text into parser-ready tokens.

        Rebuilds ``self.tokens`` from ``self.code``. Text that matched the
        generic ``IDENTIFIER`` pattern is promoted to its keyword type when
        it appears in ``KEYWORDS``; whitespace is skipped; an ``EOF`` token
        is appended at the end.

        Raises:
            SyntaxError: If no token pattern matches at some position.
        """
        # Start from a fresh list: __init__ already calls tokenize(), so a
        # second explicit call must not append a duplicate token stream.
        self.tokens = tokens = []
        code = self.code
        length = len(code)
        pos = 0
        while pos < length:
            match = self.regex_cache.match(code, pos)
            if match is None:
                raise self._illegal_character_error(pos)
            token_type = match.lastgroup
            text = match.group(token_type)
            if token_type == "IDENTIFIER" and text in KEYWORDS:
                token_type = KEYWORDS[text]
            if token_type != "WHITESPACE":
                tokens.append(self._get_cached_token(text, token_type))
            pos = match.end(0)
        tokens.append(self._get_cached_token(None, "EOF"))

    def get_tokens(self):
        """Return the token list produced by the lexer."""
        return self.tokens

    def debug_print(self):
        """Print token indexes, types, and text for grammar debugging."""
        for i, (token_type, text) in enumerate(self.tokens):
            print(f"{i:3d}: {token_type:20s} '{text}'")