339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
def get_token(self) -> Token:
    """
    title: Get the next token.
    returns:
      type: Token
      description: The next token from standard input.
    """
    # Prime the read-ahead buffer on the very first call (or after it was
    # exhausted): mark start-of-line and fetch the first character.
    # Invariant below: self.last_char always holds the next UNconsumed char.
    if self.last_char == "":
        self.new_line = True
        self.last_char = self.advance()
    # Skip any whitespace.
    # While at the start of a line, count the whitespace characters so the
    # indentation can be emitted as its own token.
    indent = 0
    while self.last_char.isspace():
        if self.new_line:
            indent += 1
        if self.last_char == "\n":
            # note: if it is an empty line it is not necessary to keep
            # the record about the indentation
            self.new_line = True
            indent = 0
        self.last_char = self.advance()
    self.new_line = False
    if indent:
        return Token(
            kind=TokenKind.indent, value=indent, location=self.lex_loc
        )
    # self.cur_loc = self.lex_loc
    # Identifier or keyword: [A-Za-z_][A-Za-z0-9_]*
    if self.last_char.isalpha() or self.last_char == "_":
        # Identifier
        identifier = self.last_char
        self.last_char = self.advance()
        while self.last_char.isalnum() or self.last_char == "_":
            identifier += self.last_char
            self.last_char = self.advance()
        # "and"/"or" are word-shaped but lex as operators.
        if identifier in ("and", "or"):
            return Token(
                kind=TokenKind.operator,
                value=identifier,
                location=self.lex_loc,
            )
        # Literal keywords carry their Python value directly
        # (note: lowercase spellings only — "True"/"None" would lex as
        # plain identifiers).
        if identifier == "true":
            return Token(
                kind=TokenKind.bool_literal,
                value=True,
                location=self.lex_loc,
            )
        if identifier == "false":
            return Token(
                kind=TokenKind.bool_literal,
                value=False,
                location=self.lex_loc,
            )
        if identifier == "none":
            return Token(
                kind=TokenKind.none_literal,
                value=None,
                location=self.lex_loc,
            )
        # Reserved words map to their dedicated token kinds.
        if identifier in self._keyword_map:
            return Token(
                kind=self._keyword_map[identifier],
                value=identifier,
                location=self.lex_loc,
            )
        return Token(
            kind=TokenKind.identifier,
            value=identifier,
            location=self.lex_loc,
        )
    # Number: [0-9.]+
    if self.last_char.isdigit() or self.last_char == ".":
        num_str = ""
        dot_count = 0
        while self.last_char.isdigit() or self.last_char == ".":
            if self.last_char == ".":
                dot_count += 1
                # A second decimal point is malformed, e.g. "1.2.3".
                if dot_count > 1:
                    raise LexerError(
                        "Invalid number format: multiple decimal points",
                        self.lex_loc,
                    )
            num_str += self.last_char
            self.last_char = self.advance()
        # A lone "." (no digits consumed) is the member-access operator,
        # not a number.
        if num_str == ".":
            return Token(
                kind=TokenKind.operator,
                value=".",
                location=self.lex_loc,
            )
        # No decimal point -> integer literal; exactly one -> float.
        if dot_count == 0:
            return Token(
                kind=TokenKind.int_literal,
                value=int(num_str),
                location=self.lex_loc,
            )
        return Token(
            kind=TokenKind.float_literal,
            value=float(num_str),
            location=self.lex_loc,
        )
    # String literal: "..." or '...' — delegated to a dedicated parser.
    if self.last_char in ('"', "'"):
        return self._parse_quoted_literal()
    # Docstring: ```...```
    if self.last_char == "`":
        return self._parse_docstring()
    # Comment until end of line.
    if self.last_char == "#":
        while self.last_char not in (EOF, "\n", "\r"):
            self.last_char = self.advance()
        if self.last_char != EOF:
            # Recurse to produce the first token after the comment;
            # the pending newline is consumed by the whitespace skip above.
            return self.get_token()
    # Characters that may start a multi-character operator (==, !=, <=,
    # ->, &&, ...) are delegated to a dedicated parser.
    if self.last_char in ("=", "!", "<", ">", "-", "&", "|", "+"):
        return self._parse_operator()
    # Check for end of file. Don't eat the EOF.
    # NOTE(review): this truthiness test assumes the EOF sentinel is falsy
    # (e.g. "") — confirm against the EOF definition. Any other remaining
    # character is emitted as a one-character operator token.
    if self.last_char:
        this_char = self.last_char
        self.last_char = self.advance()
        return Token(
            kind=TokenKind.operator, value=this_char, location=self.lex_loc
        )
    return Token(kind=TokenKind.eof, value="", location=self.lex_loc)
|