Skip to content

lexer

Classes:

Lexer

Lexer()

Methods:

Source code in src/arx/lexer.py
338
339
340
341
342
343
344
345
346
347
348
349
def __init__(self) -> None:
    """
    title: Initialize Lexer.
    """
    # Current read position within the input being lexed.
    self.lex_loc: SourceLocation = SourceLocation(0, 0)
    # Last character pulled from the input buffer; "" means nothing read yet.
    self.last_char: str = ""
    # True until the first non-whitespace character of the current line
    # has been consumed (drives indent-token emission in get_token).
    self.new_line: bool = True

    # Deep-copy the class-level keyword map into the instance so that
    # per-instance mutation cannot leak into other Lexer instances.
    self._keyword_map: dict[str, TokenKind] = copy.deepcopy(
        self._keyword_map
    )

advance

advance() -> str
Source code in src/arx/lexer.py
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
def advance(self) -> str:
    """
    title: Advance the token from the buffer.
    returns:
      type: str
      description: The next character read from the input buffer.
    """
    char = ArxIO.get_char()

    # NOTE(review): a "\r\n" pair increments the line counter twice,
    # since "\r" and "\n" each take this branch — confirm the input is
    # normalized to "\n" before lexing.
    if char == "\n" or char == "\r":
        self.lex_loc.line += 1
        self.lex_loc.col = 0
        return char

    # Any other character just moves the column cursor forward.
    self.lex_loc.col += 1
    return char

clean

clean() -> None
Source code in src/arx/lexer.py
351
352
353
354
355
356
357
358
def clean(self) -> None:
    """
    title: Reset the Lexer attributes.
    """
    # Restore the same initial state that __init__ establishes, so the
    # lexer can be reused for a fresh input source.
    self.lex_loc = SourceLocation(0, 0)
    self.last_char = ""
    self.new_line = True

get_token

get_token() -> Token
Source code in src/arx/lexer.py
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
def get_token(self) -> Token:
    """
    title: Get the next token.
    returns:
      type: Token
      description: The next token read from the input buffer (via advance).
    """
    # Prime the one-character lookahead on the very first call (or
    # right after clean()).
    if self.last_char == "":
        self.new_line = True
        self.last_char = self.advance()

    # Skip any whitespace.
    # While at the start of a line, each whitespace character counts
    # toward the indentation width.
    # NOTE(review): a tab contributes 1 to `indent`, same as a single
    # space — confirm that is intended.
    indent = 0
    while self.last_char.isspace():
        if self.new_line:
            indent += 1

        if self.last_char == "\n":
            # note: if it is an empty line it is not necessary to keep
            #       the record about the indentation
            self.new_line = True
            indent = 0

        self.last_char = self.advance()

    self.new_line = False

    # Leading whitespace on a fresh line is emitted as its own indent
    # token; the token that follows it comes from the next call.
    if indent:
        return Token(
            kind=TokenKind.indent, value=indent, location=self.lex_loc
        )

    # self.cur_loc = self.lex_loc

    # Identifier / keyword / word-operator: [A-Za-z_][A-Za-z0-9_]*
    if self.last_char.isalpha() or self.last_char == "_":
        # Identifier
        identifier = self.last_char
        self.last_char = self.advance()

        while self.last_char.isalnum() or self.last_char == "_":
            identifier += self.last_char
            self.last_char = self.advance()

        # Word operators lex as operator tokens, not keywords.
        if identifier in ("and", "or"):
            return Token(
                kind=TokenKind.operator,
                value=identifier,
                location=self.lex_loc,
            )

        # Literal words are checked before the keyword map so they can
        # never be shadowed by a keyword entry.
        if identifier == "true":
            return Token(
                kind=TokenKind.bool_literal,
                value=True,
                location=self.lex_loc,
            )

        if identifier == "false":
            return Token(
                kind=TokenKind.bool_literal,
                value=False,
                location=self.lex_loc,
            )

        if identifier == "none":
            return Token(
                kind=TokenKind.none_literal,
                value=None,
                location=self.lex_loc,
            )

        # Reserved words resolve through the per-instance keyword map.
        if identifier in self._keyword_map:
            return Token(
                kind=self._keyword_map[identifier],
                value=identifier,
                location=self.lex_loc,
            )

        # Anything else is a plain identifier.
        return Token(
            kind=TokenKind.identifier,
            value=identifier,
            location=self.lex_loc,
        )

    # Number: [0-9.]+
    if self.last_char.isdigit() or self.last_char == ".":
        num_str = ""
        dot_count = 0

        while self.last_char.isdigit() or self.last_char == ".":
            if self.last_char == ".":
                dot_count += 1
                # More than one decimal point in a single numeric run is
                # a hard error.
                if dot_count > 1:
                    raise LexerError(
                        "Invalid number format: multiple decimal points",
                        self.lex_loc,
                    )
            num_str += self.last_char
            self.last_char = self.advance()

        # A lone "." never started digits, so it is the member-access /
        # dot operator rather than a number.
        if num_str == ".":
            return Token(
                kind=TokenKind.operator,
                value=".",
                location=self.lex_loc,
            )

        # No decimal point -> integer literal.
        if dot_count == 0:
            return Token(
                kind=TokenKind.int_literal,
                value=int(num_str),
                location=self.lex_loc,
            )

        # Exactly one decimal point -> float literal.
        return Token(
            kind=TokenKind.float_literal,
            value=float(num_str),
            location=self.lex_loc,
        )

    # String/char literal, delegated to a helper.
    if self.last_char in ('"', "'"):
        return self._parse_quoted_literal()

    # Docstring: ```...```
    if self.last_char == "`":
        return self._parse_docstring()

    # Comment until end of line.
    if self.last_char == "#":
        while self.last_char not in (EOF, "\n", "\r"):
            self.last_char = self.advance()

        # Comment consumed: recurse for the token after the newline.
        # NOTE(review): when the comment runs into EOF, control falls
        # through to the checks below, so the EOF sentinel may be
        # re-examined as a character — confirm EOF lexes to the eof
        # token here.
        if self.last_char != EOF:
            return self.get_token()

    # Multi-character operators, delegated to a helper.
    if self.last_char in ("=", "!", "<", ">", "-", "&", "|", "+"):
        return self._parse_operator()

    # Check for end of file. Don't eat the EOF.
    # Any remaining single character is emitted as a one-char operator.
    if self.last_char:
        this_char = self.last_char
        self.last_char = self.advance()
        return Token(
            kind=TokenKind.operator, value=this_char, location=self.lex_loc
        )
    return Token(kind=TokenKind.eof, value="", location=self.lex_loc)

lex

lex() -> TokenList
Source code in src/arx/lexer.py
631
632
633
634
635
636
637
638
639
640
641
642
643
def lex(self) -> TokenList:
    """
    title: Create a list of tokens from input source.
    returns:
      type: TokenList
    """
    self.clean()
    collected: list[Token] = []
    # Pull tokens until (and including) the eof token.
    while True:
        tok = self.get_token()
        collected.append(tok)
        if tok.kind == TokenKind.eof:
            break
    return TokenList(collected)

LexerError

LexerError(message: str, location: SourceLocation)

Bases: Exception

Source code in src/arx/lexer.py
290
291
292
293
294
295
296
297
298
299
300
301
302
def __init__(self, message: str, location: SourceLocation):
    """
    title: Initialize LexerError.
    parameters:
      message:
        type: str
      location:
        type: SourceLocation
    """
    # Bake the source position into the exception text up front.
    full_message = f"{message} at line {location.line}, col {location.col}"
    super().__init__(full_message)
    self.location = location

Token dataclass

Token(
    kind: TokenKind,
    value: Any,
    location: SourceLocation = SourceLocation(0, 0),
)

Methods:

Source code in src/arx/lexer.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def __init__(
    self,
    kind: TokenKind,
    value: Any,
    location: SourceLocation = SourceLocation(0, 0),
) -> None:
    """
    title: Initialize Token.
    parameters:
      kind:
        type: TokenKind
      value:
        type: Any
      location:
        type: SourceLocation
    """
    # Snapshot the location first: deep-copying decouples the token
    # from the mutable lex_loc the Lexer keeps updating (and from the
    # shared default instance).
    loc_snapshot = copy.deepcopy(location)
    self.kind = kind
    self.value = value
    self.location = loc_snapshot

get_display_value

get_display_value() -> str
Source code in src/arx/lexer.py
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
def get_display_value(self) -> str:
    """
    title: Return the string representation of a token value.
    returns:
      type: str
      description: The string representation of the token value.
    """
    # Kinds whose raw value is shown in parentheses.
    shown_kinds = (
        TokenKind.identifier,
        TokenKind.indent,
        TokenKind.float_literal,
        TokenKind.int_literal,
        TokenKind.char_literal,
        TokenKind.bool_literal,
    )
    # Kinds whose (potentially long) value is elided.
    elided_kinds = (TokenKind.string_literal, TokenKind.docstring)

    if self.kind in shown_kinds:
        return "(" + str(self.value) + ")"
    if self.kind in elided_kinds:
        return "(...)"
    # none_literal and every unlisted kind display as empty.
    return ""

get_name

get_name() -> str
Source code in src/arx/lexer.py
155
156
157
158
159
160
161
162
def get_name(self) -> str:
    """
    title: Get the name of the specified token.
    returns:
      type: str
      description: Name of the token.
    """
    # Kinds without a mapped keyword name fall back to the raw value.
    fallback = str(self.value)
    return MAP_KW_TOKEN_TO_NAME.get(self.kind, fallback)

TokenKind

Bases: Enum

TokenList

TokenList(tokens: list[Token])

Methods:

Source code in src/arx/lexer.py
228
229
230
231
232
233
234
235
236
237
def __init__(self, tokens: list[Token]) -> None:
    """
    title: Instantiate a TokenList object.
    parameters:
      tokens:
        type: list[Token]
    """
    # Read cursor starts before the first token; cur_tok stays a
    # sentinel until get_next_token is first called.
    self.position = 0
    self.tokens = tokens
    self.cur_tok: Token = Token(kind=TokenKind.not_initialized, value="")

get_next_token

get_next_token() -> Token
Source code in src/arx/lexer.py
269
270
271
272
273
274
275
276
277
278
279
def get_next_token(self) -> Token:
    """
    title: Provide a simple token buffer.
    returns:
      type: Token
      description: >-
        The current token the parser is looking at. Reads another token
        from the lexer and updates cur_tok with its results.
    """
    # Remember the token so the parser can re-inspect it via cur_tok.
    next_tok = self.get_token()
    self.cur_tok = next_tok
    return next_tok

get_token

get_token() -> Token
Source code in src/arx/lexer.py
258
259
260
261
262
263
264
265
266
267
def get_token(self) -> Token:
    """
    title: Get the next token.
    returns:
      type: Token
      description: The token at the current read position.
    """
    # Advance the cursor after capturing its current target
    # (raises IndexError when the list is exhausted).
    current = self.position
    self.position = current + 1
    return self.tokens[current]