lexer

Module for handling lexical analysis.

Classes:

  • Lexer

Lexer class for tokenizing the input source.

  • Token

Token class storing the kind and the value of a token.

  • TokenKind

Enumeration of the token kinds returned by the lexer.

  • TokenList

Class for handling a list of tokens.

Lexer

Lexer()

Lexer class for tokenizing the input source.

Attributes:

  • cur_loc (SourceLocation) –

    Current source location.

  • lex_loc (SourceLocation) –

Source location tracked while lexing.

Methods:

  • advance

Advance to and return the next character from the buffer.

  • clean

    Reset the Lexer attributes.

  • get_token

    Get the next token.

  • lex

    Create a list of tokens from input source.

Source code in src/arx/lexer.py
def __init__(self) -> None:
    # self.cur_loc: SourceLocation = SourceLocation(0, 0)
    self.lex_loc: SourceLocation = SourceLocation(0, 0)
    self.last_char: str = ""
    self.new_line: bool = True

    self._keyword_map: Dict[str, TokenKind] = copy.deepcopy(
        self._keyword_map
    )

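A minimal usage sketch, assuming the input source has already been loaded into the ArxIO buffer (the loading API lives outside this module):

from arx.lexer import Lexer

lexer = Lexer()
token_list = lexer.lex()  # tokenize the ArxIO buffer up to an eof token
for tok in token_list.tokens:
    print(tok.get_name(), tok.get_display_value())
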
advance

advance() -> str

Advance to and return the next character from the buffer.

Returns:

  • str

    The next character from the input buffer.

Source code in src/arx/lexer.py
def advance(self) -> str:
    """
    Advance to and return the next character from the buffer.

    Returns
    -------
    str
        The next character from the input buffer.
    """
    last_char = ArxIO.get_char()

    if last_char in ("\n", "\r"):
        self.lex_loc.line += 1
        self.lex_loc.col = 0
    else:
        self.lex_loc.col += 1

    return last_char

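A sketch of how advance tracks the source location, assuming the buffer currently holds "a\nb":

from arx.lexer import Lexer

lexer = Lexer()
lexer.advance()  # returns "a":  lex_loc.col becomes 1
lexer.advance()  # returns "\n": lex_loc.line increments, col resets to 0
lexer.advance()  # returns "b":  lex_loc.col becomes 1 on the new line
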
clean

clean() -> None

Reset the Lexer attributes.

Source code in src/arx/lexer.py
def clean(self) -> None:
    """Reset the Lexer attributes."""
    # self.cur_loc = SourceLocation(0, 0)
    self.lex_loc = SourceLocation(0, 0)
    self.last_char = ""
    self.new_line = True

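Since lex calls clean itself before tokenizing, an explicit call is only needed when driving get_token by hand across inputs:

from arx.lexer import Lexer

lexer = Lexer()
lexer.clean()            # lex_loc back to (0, 0), last_char cleared, new_line True
tok = lexer.get_token()  # start tokenizing from a fresh state
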
get_token

get_token() -> Token

Get the next token.

Returns:

  • Token

    The next token from the input buffer.

Source code in src/arx/lexer.py
def get_token(self) -> Token:
    """
    Get the next token.

    Returns
    -------
    Token
        The next token from the input buffer.
    """
    if self.last_char == "":
        self.new_line = True
        self.last_char = self.advance()

    # Skip any whitespace.
    indent = 0
    while self.last_char.isspace():
        if self.new_line:
            indent += 1

        if self.last_char == "\n":
            # note: if it is an empty line it is not necessary to keep
            #       the record about the indentation
            self.new_line = True
            indent = 0

        self.last_char = self.advance()

    self.new_line = False

    if indent:
        return Token(
            kind=TokenKind.indent, value=indent, location=self.lex_loc
        )

    # self.cur_loc = self.lex_loc

    if self.last_char.isalpha() or self.last_char == "_":
        # Identifier
        identifier = self.last_char
        self.last_char = self.advance()

        while self.last_char.isalnum() or self.last_char == "_":
            identifier += self.last_char
            self.last_char = self.advance()

        if identifier in self._keyword_map:
            return Token(
                kind=self._keyword_map[identifier],
                value=identifier,
                location=self.lex_loc,
            )

        return Token(
            kind=TokenKind.identifier,
            value=identifier,
            location=self.lex_loc,
        )

    # Number: [0-9.]+
    if self.last_char.isdigit() or self.last_char == ".":
        num_str = ""
        while self.last_char.isdigit() or self.last_char == ".":
            num_str += self.last_char
            self.last_char = self.advance()

        return Token(
            kind=TokenKind.float_literal,
            value=float(num_str),
            location=self.lex_loc,
        )

    # Comment until end of line.
    if self.last_char == "#":
        while self.last_char not in (EOF, "\n", "\r"):
            self.last_char = self.advance()

        if self.last_char != EOF:
            return self.get_token()

    # Check for end of file. Don't eat the EOF.
    if self.last_char:
        this_char = self.last_char
        self.last_char = self.advance()
        return Token(
            kind=TokenKind.operator, value=this_char, location=self.lex_loc
        )
    return Token(kind=TokenKind.eof, value="", location=self.lex_loc)

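Tracing the branches above, a hedged sketch of the expected stream for the input "x = 1.0\n  y\n" (derived from the code, not a captured run), assuming that input is in the ArxIO buffer:

from arx.lexer import Lexer

token_list = Lexer().lex()
pairs = [(t.kind, t.value) for t in token_list.tokens]
# [(TokenKind.identifier, "x"), (TokenKind.operator, "="),
#  (TokenKind.float_literal, 1.0), (TokenKind.indent, 2),
#  (TokenKind.identifier, "y"), (TokenKind.eof, "")]
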
lex

lex() -> TokenList

Create a list of tokens from input source.

Source code in src/arx/lexer.py
def lex(self) -> TokenList:
    """Create a list of tokens from input source."""
    self.clean()
    cur_tok = Token(kind=TokenKind.not_initialized, value="")
    tokens: List[Token] = []
    while cur_tok.kind != TokenKind.eof:
        cur_tok = self.get_token()
        tokens.append(cur_tok)
    return TokenList(tokens)

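Because the loop only stops after appending the eof token, consumers can rely on the returned list being eof-terminated; `handle` below is a placeholder for parser logic:

from arx.lexer import Lexer, TokenKind

token_list = Lexer().lex()
while token_list.get_next_token().kind != TokenKind.eof:
    handle(token_list.cur_tok)  # hypothetical consumer of the current token
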
Token dataclass

Token(
    kind: TokenKind,
    value: Any,
    location: SourceLocation = SourceLocation(0, 0),
)

Token class storing the kind and the value of a token.

Methods:

  • get_display_value

    Return the string representation of a token value.

  • get_name

    Get the name of the token.

Source code in src/arx/lexer.py
def __init__(
    self,
    kind: TokenKind,
    value: Any,
    location: SourceLocation = SourceLocation(0, 0),
) -> None:
    self.kind = kind
    self.value = value
    self.location = copy.deepcopy(location)

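The location argument is deep-copied, so later mutation of the lexer's running SourceLocation does not retroactively change already-emitted tokens (assuming SourceLocation(line, col)):

from arx.lexer import SourceLocation, Token, TokenKind

loc = SourceLocation(0, 0)
tok = Token(kind=TokenKind.identifier, value="foo", location=loc)
loc.col = 10                  # the lexer keeps advancing its location...
assert tok.location.col == 0  # ...but the token holds its own copy
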
get_display_value

get_display_value() -> str

Return the string representation of a token value.

Returns:

  • str

    The string representation of the token value.

Source code in src/arx/lexer.py
def get_display_value(self) -> str:
    """
    Return the string representation of a token value.

    Returns
    -------
    str
        The string representation of the token value.
    """
    if self.kind in (
        TokenKind.identifier,
        TokenKind.indent,
        TokenKind.float_literal,
    ):
        return f"({self.value})"
    return ""

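Only identifier, indent and float_literal tokens have a displayable value; every other kind yields an empty string:

from arx.lexer import Token, TokenKind

Token(kind=TokenKind.float_literal, value=1.5).get_display_value()  # "(1.5)"
Token(kind=TokenKind.operator, value="+").get_display_value()       # ""
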
get_name

get_name() -> str

Get the name of the token.

Returns:

  • str

    Name of the token.

Source code in src/arx/lexer.py
def get_name(self) -> str:
    """
    Get the name of the token.

    Returns
    -------
    str
        Name of the token.
    """
    return MAP_KW_TOKEN_TO_NAME.get(self.kind, str(self.value))

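For kinds not present in MAP_KW_TOKEN_TO_NAME, get_name falls back to the stringified token value; assuming operators are not in that map:

from arx.lexer import Token, TokenKind

Token(kind=TokenKind.operator, value="+").get_name()  # "+"
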
TokenKind

Bases: Enum

Enumeration of the token kinds returned by the lexer.

TokenList

TokenList(tokens: List[Token])

Class for handling a list of tokens.

Methods:

  • get_next_token

    Provide a simple token buffer.

  • get_token

    Get the next token.

Source code in src/arx/lexer.py
def __init__(self, tokens: List[Token]) -> None:
    """Instantiate a TokenList object."""
    self.tokens = tokens
    self.position = 0
    self.cur_tok: Token = Token(kind=TokenKind.not_initialized, value="")

get_next_token

get_next_token() -> Token

Provide a simple token buffer.

Returns:

  • Token

    The current token the parser is looking at. Reads the next token from the list and updates cur_tok with it.

Source code in src/arx/lexer.py
def get_next_token(self) -> Token:
    """
    Provide a simple token buffer.

    Returns
    -------
    Token
        The current token the parser is looking at.
        Reads the next token from the list and updates
        cur_tok with it.
    """
    self.cur_tok = self.get_token()
    return self.cur_tok

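get_next_token keeps cur_tok in sync with the read position, which is the one-token lookahead a recursive-descent parser needs:

from arx.lexer import Token, TokenKind, TokenList

token_list = TokenList([
    Token(kind=TokenKind.identifier, value="x"),
    Token(kind=TokenKind.eof, value=""),
])
token_list.get_next_token()
assert token_list.cur_tok.value == "x"  # cur_tok mirrors the token just read
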
get_token

get_token() -> Token

Get the next token.

Returns:

  • Token

    The next token from the list.

Source code in src/arx/lexer.py
def get_token(self) -> Token:
    """
    Get the next token.

    Returns
    -------
    Token
        The next token from the list.
    """
    tok = self.tokens[self.position]
    self.position += 1
    return tok
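
Unlike Lexer.get_token, this method reads from the stored list: it advances position unconditionally and raises IndexError once the (eof-terminated) list is exhausted:

from arx.lexer import Token, TokenKind, TokenList

token_list = TokenList([Token(kind=TokenKind.eof, value="")])
token_list.get_token()  # returns the eof token; position becomes 1
token_list.get_token()  # IndexError: there is no guard past the end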