
Lexical Analysis

Module for handling lexical analysis.

Lexer

Lexer class for tokenizing the input source.

Attributes

cur_loc : SourceLocation
    Current source location.
lex_loc : SourceLocation
    Source location for the lexer.

Source code in src/arx/lexer.py
class Lexer:
    """
    Lexer class for tokenizing the input source.

    Attributes
    ----------
    cur_loc : SourceLocation
        Current source location.
    lex_loc : SourceLocation
        Source location for the lexer.
    """

    lex_loc: SourceLocation = SourceLocation(0, 0)
    last_char: str = ""
    new_line: bool = True

    _keyword_map: Dict[str, TokenKind] = {  # noqa: RUF012
        "fn": TokenKind.kw_function,
        "extern": TokenKind.kw_extern,
        "return": TokenKind.kw_return,
        "if": TokenKind.kw_if,
        "then": TokenKind.kw_then,
        "else": TokenKind.kw_else,
        "for": TokenKind.kw_for,
        "in": TokenKind.kw_in,
        "var": TokenKind.kw_var,
        "const": TokenKind.kw_const,
    }

    def __init__(self) -> None:
        # self.cur_loc: SourceLocation = SourceLocation(0, 0)
        self.lex_loc: SourceLocation = SourceLocation(0, 0)
        self.last_char: str = ""
        self.new_line: bool = True

        self._keyword_map: Dict[str, TokenKind] = copy.deepcopy(
            self._keyword_map
        )

    def clean(self) -> None:
        """Reset the Lexer attributes."""
        # self.cur_loc = SourceLocation(0, 0)
        self.lex_loc = SourceLocation(0, 0)
        self.last_char = ""
        self.new_line = True

    def get_token(self) -> Token:
        """
        Get the next token.

        Returns
        -------
        Token
            The next token from the input buffer.
        """
        if self.last_char == "":
            self.new_line = True
            self.last_char = self.advance()

        # Skip any whitespace.
        indent = 0
        while self.last_char.isspace():
            if self.new_line:
                indent += 1

            if self.last_char == "\n":
                # note: for an empty line there is no need to keep a
                #       record of the indentation
                self.new_line = True
                indent = 0

            self.last_char = self.advance()

        self.new_line = False

        if indent:
            return Token(
                kind=TokenKind.indent, value=indent, location=self.lex_loc
            )

        # self.cur_loc = self.lex_loc

        if self.last_char.isalpha() or self.last_char == "_":
            # Identifier
            identifier = self.last_char
            self.last_char = self.advance()

            while self.last_char.isalnum() or self.last_char == "_":
                identifier += self.last_char
                self.last_char = self.advance()

            if identifier in self._keyword_map:
                return Token(
                    kind=self._keyword_map[identifier],
                    value=identifier,
                    location=self.lex_loc,
                )

            return Token(
                kind=TokenKind.identifier,
                value=identifier,
                location=self.lex_loc,
            )

        # Number: [0-9.]+
        if self.last_char.isdigit() or self.last_char == ".":
            num_str = ""
            while self.last_char.isdigit() or self.last_char == ".":
                num_str += self.last_char
                self.last_char = self.advance()

            return Token(
                kind=TokenKind.float_literal,
                value=float(num_str),
                location=self.lex_loc,
            )

        # Comment until end of line.
        if self.last_char == "#":
            while (
                self.last_char != EOF
                and self.last_char != "\n"
                and self.last_char != "\r"
            ):
                self.last_char = self.advance()

            if self.last_char != EOF:
                return self.get_token()

        # Check for end of file. Don't eat the EOF.
        if self.last_char:
            this_char = self.last_char
            self.last_char = self.advance()
            return Token(
                kind=TokenKind.operator, value=this_char, location=self.lex_loc
            )
        return Token(kind=TokenKind.eof, value="", location=self.lex_loc)

    def advance(self) -> str:
        """
        Advance the token from the buffer.

        Returns
        -------
        str
            The next character read from the input buffer.
        """
        last_char = ArxIO.get_char()

        if last_char == "\n" or last_char == "\r":
            self.lex_loc.line += 1
            self.lex_loc.col = 0
        else:
            self.lex_loc.col += 1

        return last_char

    def lex(self) -> TokenList:
        """Create a list of tokens from input source."""
        self.clean()
        cur_tok = Token(kind=TokenKind.not_initialized, value="")
        tokens: List[Token] = []
        while cur_tok.kind != TokenKind.eof:
            cur_tok = self.get_token()
            tokens.append(cur_tok)
        return TokenList(tokens)
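
A minimal usage sketch. The Lexer reads characters through ArxIO.get_char(), so the input buffer must be populated first; the arx.io module path and the string_to_buffer helper below are assumptions about that API, not part of this module.

from arx.io import ArxIO  # assumed module path
from arx.lexer import Lexer

# Hypothetical helper: load source text into the buffer consumed by
# ArxIO.get_char().
ArxIO.string_to_buffer("fn add:\n  return 1 + 2\n")

lexer = Lexer()
for token in lexer.lex():
    # lex() returns a TokenList that ends with an eof token.
    print(token)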

advance()

Advance the token from the buffer.

Returns

str
    The next character read from the input buffer.

Source code in src/arx/lexer.py
def advance(self) -> str:
    """
    Advance the token from the buffer.

    Returns
    -------
    str
        The next character read from the input buffer.
    """
    last_char = ArxIO.get_char()

    if last_char == "\n" or last_char == "\r":
        self.lex_loc.line += 1
        self.lex_loc.col = 0
    else:
        self.lex_loc.col += 1

    return last_char
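
advance() keeps lex_loc in sync with the characters it reads: a newline or carriage return bumps the line and resets the column. A small sketch, reusing the hypothetical string_to_buffer helper from above:

from arx.io import ArxIO  # assumed module path
from arx.lexer import Lexer

ArxIO.string_to_buffer("ab\nc")  # hypothetical loader, as above
lexer = Lexer()
lexer.advance()  # 'a'  -> col 1
lexer.advance()  # 'b'  -> col 2
lexer.advance()  # '\n' -> line 1, col reset to 0
assert (lexer.lex_loc.line, lexer.lex_loc.col) == (1, 0)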

clean()

Reset the Lexer attributes.

Source code in src/arx/lexer.py
def clean(self) -> None:
    """Reset the Lexer attributes."""
    # self.cur_loc = SourceLocation(0, 0)
    self.lex_loc = SourceLocation(0, 0)
    self.last_char = ""
    self.new_line = True

get_token()

Get the next token.

Returns

Token
    The next token from the input buffer.

Source code in src/arx/lexer.py
def get_token(self) -> Token:
    """
    Get the next token.

    Returns
    -------
    Token
        The next token from the input buffer.
    """
    if self.last_char == "":
        self.new_line = True
        self.last_char = self.advance()

    # Skip any whitespace.
    indent = 0
    while self.last_char.isspace():
        if self.new_line:
            indent += 1

        if self.last_char == "\n":
            # note: for an empty line there is no need to keep a
            #       record of the indentation
            self.new_line = True
            indent = 0

        self.last_char = self.advance()

    self.new_line = False

    if indent:
        return Token(
            kind=TokenKind.indent, value=indent, location=self.lex_loc
        )

    # self.cur_loc = self.lex_loc

    if self.last_char.isalpha() or self.last_char == "_":
        # Identifier
        identifier = self.last_char
        self.last_char = self.advance()

        while self.last_char.isalnum() or self.last_char == "_":
            identifier += self.last_char
            self.last_char = self.advance()

        if identifier in self._keyword_map:
            return Token(
                kind=self._keyword_map[identifier],
                value=identifier,
                location=self.lex_loc,
            )

        return Token(
            kind=TokenKind.identifier,
            value=identifier,
            location=self.lex_loc,
        )

    # Number: [0-9.]+
    if self.last_char.isdigit() or self.last_char == ".":
        num_str = ""
        while self.last_char.isdigit() or self.last_char == ".":
            num_str += self.last_char
            self.last_char = self.advance()

        return Token(
            kind=TokenKind.float_literal,
            value=float(num_str),
            location=self.lex_loc,
        )

    # Comment until end of line.
    if self.last_char == "#":
        while (
            self.last_char != EOF
            and self.last_char != "\n"
            and self.last_char != "\r"
        ):
            self.last_char = self.advance()

        if self.last_char != EOF:
            return self.get_token()

    # Check for end of file. Don't eat the EOF.
    if self.last_char:
        this_char = self.last_char
        self.last_char = self.advance()
        return Token(
            kind=TokenKind.operator, value=this_char, location=self.lex_loc
        )
    return Token(kind=TokenKind.eof, value="", location=self.lex_loc)
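
As a sketch of the dispatch above, the input "if x > 1.5:" yields a keyword, an identifier, two operators, a float literal, and the final eof token (again assuming the hypothetical string_to_buffer helper):

from arx.io import ArxIO  # assumed module path
from arx.lexer import Lexer

ArxIO.string_to_buffer("if x > 1.5:\n")  # hypothetical loader, as above
kinds = [tok.kind for tok in Lexer().lex()]
# kinds == [kw_if, identifier, operator, float_literal, operator, eof]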

lex()

Create a list of tokens from input source.

Source code in src/arx/lexer.py
def lex(self) -> TokenList:
    """Create a list of tokens from input source."""
    self.clean()
    cur_tok = Token(kind=TokenKind.not_initialized, value="")
    tokens: List[Token] = []
    while cur_tok.kind != TokenKind.eof:
        cur_tok = self.get_token()
        tokens.append(cur_tok)
    return TokenList(tokens)

SourceLocation dataclass

Represents the source location with line and column information.

Attributes

line : int
    Line number.
col : int
    Column number.

Source code in src/arx/lexer.py
@dataclass
class SourceLocation:
    """
    Represents the source location with line and column information.

    Attributes
    ----------
    line : int
        Line number.
    col : int
        Column number.
    """

    line: int = 0
    col: int = 0
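
The lexer mutates a single lex_loc instance as it advances, and Token.__init__ deep-copies the location it is given, so a recorded position is unaffected by later mutation. A short sketch:

from arx.lexer import SourceLocation, Token, TokenKind

loc = SourceLocation(line=3, col=7)
tok = Token(kind=TokenKind.identifier, value="x", location=loc)
loc.col += 1  # later mutation, e.g. by the lexer advancing
assert tok.location.col == 7  # the token kept its own copy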

Token dataclass

Token class storing the kind and the value of the token.

Source code in src/arx/lexer.py
@dataclass
class Token:
    """Token class store the kind and the value of the token."""

    kind: TokenKind
    value: Any
    location: SourceLocation

    def __init__(
        self,
        kind: TokenKind,
        value: Any,
        location: SourceLocation = SourceLocation(0, 0),
    ) -> None:
        self.kind = kind
        self.value = value
        self.location = copy.deepcopy(location)

    def get_name(self) -> str:
        """
        Get the name of the specified token.

        Returns
        -------
        str
            Name of the token.
        """
        return MAP_KW_TOKEN_TO_NAME.get(self.kind, str(self.value))

    def get_display_value(self) -> str:
        """
        Return the string representation of a token value.

        Returns
        -------
        str
            The string representation of the token value.
        """
        if self.kind in (
            TokenKind.identifier,
            TokenKind.indent,
            TokenKind.float_literal,
        ):
            return f"({self.value})"
        return ""

    def __eq__(self, other: object) -> bool:
        """Overload __eq__ operator."""
        tok_other = cast(Token, other)
        return (self.kind, self.value) == (tok_other.kind, tok_other.value)

    def __str__(self) -> str:
        """Display the token in a readable way."""
        return f"{self.get_name()}{self.get_display_value()}"
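
A short sketch of Token in use: __eq__ compares only kind and value (the location is ignored), and __str__ combines get_name() with get_display_value():

from arx.lexer import SourceLocation, Token, TokenKind

a = Token(TokenKind.float_literal, 1.5, SourceLocation(1, 4))
b = Token(TokenKind.float_literal, 1.5, SourceLocation(9, 0))
assert a == b  # locations differ, but kind and value match
print(a)  # the kind's display name followed by "(1.5)"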

__eq__(other)

Overload the __eq__ operator.

Source code in src/arx/lexer.py
def __eq__(self, other: object) -> bool:
    """Overload __eq__ operator."""
    tok_other = cast(Token, other)
    return (self.kind, self.value) == (tok_other.kind, tok_other.value)

__str__()

Display the token in a readable way.

Source code in src/arx/lexer.py
def __str__(self) -> str:
    """Display the token in a readable way."""
    return f"{self.get_name()}{self.get_display_value()}"

get_display_value()

Return the string representation of a token value.

Returns

str
    The string representation of the token value.

Source code in src/arx/lexer.py
def get_display_value(self) -> str:
    """
    Return the string representation of a token value.

    Returns
    -------
    str
        The string representation of the token value.
    """
    if self.kind in (
        TokenKind.identifier,
        TokenKind.indent,
        TokenKind.float_literal,
    ):
        return f"({self.value})"
    return ""

get_name()

Get the name of the specified token.

Returns

str
    Name of the token.

Source code in src/arx/lexer.py
def get_name(self) -> str:
    """
    Get the name of the specified token.

    Returns
    -------
    str
        Name of the token.
    """
    return MAP_KW_TOKEN_TO_NAME.get(self.kind, str(self.value))

TokenKind

Bases: Enum

TokenKind enumeration for the kinds of token returned by the lexer.

Source code in src/arx/lexer.py
class TokenKind(Enum):
    """TokenKind enumeration for known variables returned by the lexer."""

    eof: int = -1

    # function
    kw_function: int = -2
    kw_extern: int = -3
    kw_return: int = -4

    # data types
    identifier: int = -10
    float_literal: int = -11

    # control flow
    kw_if: int = -20
    kw_then: int = -21
    kw_else: int = -22
    kw_for: int = -23
    kw_in: int = -24

    # operators
    binary_op: int = -30
    unary_op: int = -31
    operator: int = -32

    # variables
    kw_var: int = -40
    kw_const: int = -41

    # flow and structure control
    indent: int = -50
    dedent: int = -51

    # generic control
    not_initialized: int = -9999
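
The integer values serve only as distinct markers; comparisons are done against the enum members themselves, for example:

from arx.lexer import Token, TokenKind

tok = Token(kind=TokenKind.kw_if, value="if")
assert tok.kind == TokenKind.kw_if
assert tok.kind != TokenKind.identifier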

TokenList

Class for handling a list of tokens.

Source code in src/arx/lexer.py
class TokenList:
    """Class for handle a List of tokens."""

    tokens: List[Token]
    position: int = 0
    cur_tok: Token

    def __init__(self, tokens: List[Token]) -> None:
        """Instantiate a TokenList object."""
        self.tokens = tokens
        self.position = 0
        self.cur_tok: Token = Token(kind=TokenKind.not_initialized, value="")

    def __iter__(self) -> TokenList:
        """Overload the iterator operation."""
        self.position = 0
        return self

    def __next__(self) -> Token:
        """Overload the next method used by the iteration."""
        if self.position == len(self.tokens):
            raise StopIteration
        return self.get_token()

    def get_token(self) -> Token:
        """
        Get the next token.

        Returns
        -------
        Token
            The next token in the list.
        """
        tok = self.tokens[self.position]
        self.position += 1
        return tok

    def get_next_token(self) -> Token:
        """
        Provide a simple token buffer.

        Returns
        -------
        Token
            The current token the parser is looking at.
            Reads another token from the list and updates
            cur_tok with the result.
        """
        self.cur_tok = self.get_token()
        return self.cur_tok
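
A sketch of the two ways to consume a TokenList: plain iteration, which resets position, or the parser-style buffer get_next_token, which also records the result in cur_tok:

from arx.lexer import Token, TokenKind, TokenList

tokens = TokenList([
    Token(TokenKind.identifier, "x"),
    Token(TokenKind.operator, "+"),
    Token(TokenKind.eof, ""),
])

for tok in tokens:  # __iter__ resets position to 0
    print(tok)

tokens.position = 0
tokens.get_next_token()  # advances and stores the result
assert tokens.cur_tok.value == "x"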

__init__(tokens)

Instantiate a TokenList object.

Source code in src/arx/lexer.py
def __init__(self, tokens: List[Token]) -> None:
    """Instantiate a TokenList object."""
    self.tokens = tokens
    self.position = 0
    self.cur_tok: Token = Token(kind=TokenKind.not_initialized, value="")

__iter__()

Overload the iterator operation.

Source code in src/arx/lexer.py
def __iter__(self) -> TokenList:
    """Overload the iterator operation."""
    self.position = 0
    return self

__next__()

Overload the next method used by the iteration.

Source code in src/arx/lexer.py
def __next__(self) -> Token:
    """Overload the next method used by the iteration."""
    if self.position == len(self.tokens):
        raise StopIteration
    return self.get_token()

get_next_token()

Provide a simple token buffer.

Returns

Token
    The current token the parser is looking at. Reads another token
    from the list and updates cur_tok with the result.

Source code in src/arx/lexer.py
def get_next_token(self) -> Token:
    """
    Provide a simple token buffer.

    Returns
    -------
    Token
        The current token the parser is looking at.
        Reads another token from the list and updates
        cur_tok with the result.
    """
    self.cur_tok = self.get_token()
    return self.cur_tok

get_token()

Get the next token.

Returns

Token
    The next token in the list.

Source code in src/arx/lexer.py
def get_token(self) -> Token:
    """
    Get the next token.

    Returns
    -------
    Token
        The next token in the list.
    """
    tok = self.tokens[self.position]
    self.position += 1
    return tok