Skip to content

lexer

Classes:

Lexer

Lexer()

Methods:

Source code in src/arx/lexer.py
320
321
322
323
324
325
326
327
328
def __init__(self) -> None:
    """
    title: Initialize the lexer state.
    """
    # Store a deep copy of the keyword table reached through attribute
    # lookup (presumably a class-level default -- verify against the class
    # body) on the instance itself.
    keyword_table = copy.deepcopy(self._keyword_map)
    self._keyword_map: Dict[str, TokenKind] = keyword_table

    # Scanning state: nothing buffered yet, positioned at a line start.
    self.new_line: bool = True
    self.last_char: str = ""
    self.lex_loc: SourceLocation = SourceLocation(0, 0)

advance

advance() -> str
Source code in src/arx/lexer.py
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
def advance(self) -> str:
    """
    title: Read one character and update the lexer location.
    returns:
      type: str
      description: The character returned by ArxIO.get_char().
    """
    char = ArxIO.get_char()

    if char != "\n" and char != "\r":
        # Ordinary character: stay on the line, move one column right.
        self.lex_loc.col += 1
        return char

    # Line terminator: start counting columns again on the next line.
    # NOTE(review): "\n" and "\r" each bump the line counter, so a "\r\n"
    # pair counts as two lines unless ArxIO normalizes newlines -- confirm.
    self.lex_loc.line += 1
    self.lex_loc.col = 0
    return char

clean

clean() -> None
Source code in src/arx/lexer.py
330
331
332
333
334
335
336
337
def clean(self) -> None:
    """
    title: Put the lexer back into its initial scanning state.
    """
    # No buffered character, positioned at the start of a line.
    self.last_char, self.new_line = "", True
    # Fresh location object starting at (0, 0).
    self.lex_loc = SourceLocation(0, 0)

get_token

get_token() -> Token
Source code in src/arx/lexer.py
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
def get_token(self) -> Token:
    """
    title: Get the next token.
    returns:
      type: Token
      description: >-
        The next token built from the characters supplied by advance():
        an indent, identifier/keyword, literal, operator or EOF token.
    """
    # NOTE(review): every token below is stamped with self.lex_loc as it
    # stands when the token is built, i.e. after the lexeme (and usually one
    # lookahead character) has already been consumed by advance().

    # An empty last_char means no character is buffered, so prime the
    # one-character lookahead and treat the position as a line start.
    if self.last_char == "":
        self.new_line = True
        self.last_char = self.advance()

    # Skip any whitespace. While new_line is set, each whitespace character
    # (of any kind accepted by str.isspace) adds 1 to the indent width.
    indent = 0
    while self.last_char.isspace():
        if self.new_line:
            indent += 1

        if self.last_char == "\n":
            # note: if it is an empty line it is not necessary to keep
            #       the record about the indentation
            self.new_line = True
            indent = 0

        self.last_char = self.advance()

    self.new_line = False

    # Leading whitespace of a non-blank line is reported as its own token.
    # The character that ended the whitespace stays buffered in last_char
    # and is lexed by the next call.
    if indent:
        return Token(
            kind=TokenKind.indent, value=indent, location=self.lex_loc
        )

    # self.cur_loc = self.lex_loc

    # Words: [A-Za-z_][A-Za-z0-9_]* as judged by str.isalpha/str.isalnum.
    if self.last_char.isalpha() or self.last_char == "_":
        # Identifier
        identifier = self.last_char
        self.last_char = self.advance()

        while self.last_char.isalnum() or self.last_char == "_":
            identifier += self.last_char
            self.last_char = self.advance()

        # Word-shaped operators.
        if identifier in ("and", "or"):
            return Token(
                kind=TokenKind.operator,
                value=identifier,
                location=self.lex_loc,
            )

        # Literal words carry the corresponding Python value, not the text.
        if identifier == "true":
            return Token(
                kind=TokenKind.bool_literal,
                value=True,
                location=self.lex_loc,
            )

        if identifier == "false":
            return Token(
                kind=TokenKind.bool_literal,
                value=False,
                location=self.lex_loc,
            )

        if identifier == "none":
            return Token(
                kind=TokenKind.none_literal,
                value=None,
                location=self.lex_loc,
            )

        # Reserved words take their kind from the keyword table.
        if identifier in self._keyword_map:
            return Token(
                kind=self._keyword_map[identifier],
                value=identifier,
                location=self.lex_loc,
            )

        # Anything else is a plain identifier.
        return Token(
            kind=TokenKind.identifier,
            value=identifier,
            location=self.lex_loc,
        )

    # Number: [0-9.]+
    if self.last_char.isdigit() or self.last_char == ".":
        num_str = ""
        dot_count = 0

        while self.last_char.isdigit() or self.last_char == ".":
            if self.last_char == ".":
                dot_count += 1
                # A second "." in the same run is rejected immediately.
                if dot_count > 1:
                    raise LexerError(
                        "Invalid number format: multiple decimal points",
                        self.lex_loc,
                    )
            num_str += self.last_char
            self.last_char = self.advance()

        # A "." with no digits on either side is an operator, not a number.
        if num_str == ".":
            return Token(
                kind=TokenKind.operator,
                value=".",
                location=self.lex_loc,
            )

        # No dot at all: integer literal.
        if dot_count == 0:
            return Token(
                kind=TokenKind.int_literal,
                value=int(num_str),
                location=self.lex_loc,
            )

        # Exactly one dot together with at least one digit: float literal.
        return Token(
            kind=TokenKind.float_literal,
            value=float(num_str),
            location=self.lex_loc,
        )

    # Quoted literals (either quote style) are handled by a helper.
    if self.last_char in ('"', "'"):
        return self._parse_quoted_literal()

    # Docstring: ```...```
    if self.last_char == "`":
        return self._parse_docstring()

    # Comment until end of line.
    if self.last_char == "#":
        while self.last_char not in (EOF, "\n", "\r"):
            self.last_char = self.advance()

        # The comment produces no token of its own; lex whatever follows.
        # NOTE(review): this recurses once per comment, so a long run of
        # consecutive comment lines deepens the call stack accordingly.
        # On EOF execution falls through to the checks below (presumably
        # reaching the final EOF return -- depends on the value of EOF).
        if self.last_char != EOF:
            return self.get_token()

    # Characters that can start an operator are handled by a helper.
    if self.last_char in ("=", "!", "<", ">", "-", "&", "|", "+"):
        return self._parse_operator()

    # Check for end of file. Don't eat the EOF.
    # Any other non-empty character becomes a one-character operator token.
    if self.last_char:
        this_char = self.last_char
        self.last_char = self.advance()
        return Token(
            kind=TokenKind.operator, value=this_char, location=self.lex_loc
        )
    return Token(kind=TokenKind.eof, value="", location=self.lex_loc)

lex

lex() -> TokenList
Source code in src/arx/lexer.py
610
611
612
613
614
615
616
617
618
619
620
621
622
def lex(self) -> TokenList:
    """
    title: Tokenize the input from a freshly reset lexer state.
    returns:
      type: TokenList
      description: >-
        All tokens produced by get_token(), up to and including the EOF
        token.
    """
    self.clean()

    collected: List[Token] = []
    while True:
        token = self.get_token()
        collected.append(token)
        # The EOF token is kept as the final element of the list.
        if token.kind == TokenKind.eof:
            break

    return TokenList(collected)

LexerError

LexerError(message: str, location: SourceLocation)

Bases: Exception

Source code in src/arx/lexer.py
280
281
282
283
284
def __init__(self, message: str, location: SourceLocation):
    """
    title: Create a lexer error that reports where it happened.
    parameters:
      message:
        type: str
      location:
        type: SourceLocation
    """
    # The exception text embeds the line and column of the location.
    text = f"{message} at line {location.line}, col {location.col}"
    super().__init__(text)
    # Keep the location object itself available to handlers.
    self.location = location

Token dataclass

Token(
    kind: TokenKind,
    value: Any,
    location: SourceLocation = SourceLocation(0, 0),
)

Methods:

Source code in src/arx/lexer.py
127
128
129
130
131
132
133
134
135
def __init__(
    self,
    kind: TokenKind,
    value: Any,
    location: SourceLocation = SourceLocation(0, 0),
) -> None:
    """
    title: Instantiate a Token object.
    parameters:
      kind:
        type: TokenKind
      value:
        type: Any
      location:
        type: SourceLocation
    """
    # Store a deep copy so this token does not share the location object
    # passed in by the caller (or the single default instance).
    own_location = copy.deepcopy(location)

    self.location = own_location
    self.value = value
    self.kind = kind

get_display_value

get_display_value() -> str
Source code in src/arx/lexer.py
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
def get_display_value(self) -> str:
    """
    title: Return the string representation of a token value.
    returns:
      type: str
      description: >-
        The value in parentheses for identifier, indent, float, int, char
        and bool tokens; a parenthesized ellipsis for string literals and
        docstrings; an empty string for every other kind.
    """
    # Kinds whose value is shown as-is between parentheses.
    shown_in_full = (
        TokenKind.identifier,
        TokenKind.indent,
        TokenKind.float_literal,
        TokenKind.int_literal,
        TokenKind.char_literal,
        TokenKind.bool_literal,
    )
    # Kinds whose (potentially long) text is replaced by an ellipsis.
    abbreviated = (TokenKind.string_literal, TokenKind.docstring)

    if self.kind in shown_in_full:
        return f"({self.value!s})"
    if self.kind in abbreviated:
        return "(...)"

    # none_literal and all remaining kinds have nothing to display.
    return ""

get_name

get_name() -> str
Source code in src/arx/lexer.py
145
146
147
148
149
150
151
152
def get_name(self) -> str:
    """
    title: Get the name of this token.
    returns:
      type: str
      description: >-
        The MAP_KW_TOKEN_TO_NAME entry for the token kind, or the
        stringified token value when the kind has no entry.
    """
    fallback = str(self.value)
    return MAP_KW_TOKEN_TO_NAME.get(self.kind, fallback)

TokenKind

Bases: Enum

TokenList

TokenList(tokens: List[Token])

Methods:

Source code in src/arx/lexer.py
218
219
220
221
222
223
224
225
226
227
def __init__(self, tokens: List[Token]) -> None:
    """
    title: Instantiate a TokenList object.
    parameters:
      tokens:
        type: List[Token]
    """
    # Until the first token is fetched, cur_tok holds a placeholder.
    placeholder = Token(kind=TokenKind.not_initialized, value="")

    self.cur_tok: Token = placeholder
    # Index of the next token to hand out.
    self.position = 0
    self.tokens = tokens

get_next_token

get_next_token() -> Token
Source code in src/arx/lexer.py
259
260
261
262
263
264
265
266
267
268
269
def get_next_token(self) -> Token:
    """
    title: Fetch the next token and remember it as the current one.
    returns:
      type: Token
      description: >-
        The token returned by get_token(); the same token is stored in
        cur_tok.
    """
    upcoming = self.get_token()
    self.cur_tok = upcoming
    return upcoming

get_token

get_token() -> Token
Source code in src/arx/lexer.py
248
249
250
251
252
253
254
255
256
257
def get_token(self) -> Token:
    """
    title: Get the next token from the list.
    returns:
      type: Token
      description: >-
        The token stored at the current position; the position is then
        moved forward by one.
    """
    index = self.position
    # Look the token up before moving on, so a failed lookup leaves the
    # position untouched.
    token = self.tokens[index]
    self.position = index + 1
    return token