Module refinery.lib.scripts.js.lexer

Expand source code Browse git
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Generator

from refinery.lib.scripts.js.token import KEYWORDS, JsToken, JsTokenKind

# Single-character escapes: maps the character that follows a backslash to the
# text it stands for. `JsLexer._read_string_escape` consults this table first
# and only falls back to its `\x` / `\u` / line-continuation handling when the
# character is not listed here.
_ESCAPE_MAP: dict[str, str] = {
    'b'  : '\b',
    'f'  : '\f',
    'n'  : '\n',
    'r'  : '\r',
    't'  : '\t',
    'v'  : '\v',
    '0'  : '\0',
    '\\' : '\\',
    "'"  : "'",
    '"'  : '"',
    '`'  : '`',
}

# Operator and punctuator tables, grouped by length. `JsLexer.tokenize` probes
# them from longest to shortest (four, three, two, then one character) so that
# the longest operator starting at the current position is the one emitted.

# Four-character operators.
_FOUR_CHAR_OPS: dict[str, JsTokenKind] = {
    '>>>=' : JsTokenKind.GT3_ASSIGN,
}

# Three-character operators.
_THREE_CHAR_OPS: dict[str, JsTokenKind] = {
    '===' : JsTokenKind.EQ3,
    '!==' : JsTokenKind.BANG_EQ2,
    '>>>' : JsTokenKind.GT3,
    '**=' : JsTokenKind.STAR2_ASSIGN,
    '<<=' : JsTokenKind.LT2_ASSIGN,
    '>>=' : JsTokenKind.GT2_ASSIGN,
    '&&=' : JsTokenKind.AND_ASSIGN,
    '||=' : JsTokenKind.OR_ASSIGN,
    '??=' : JsTokenKind.NULLISH_ASSIGN,
    '...' : JsTokenKind.ELLIPSIS,
}

# Two-character operators. The '/=' entry is never reached through this table
# by `tokenize`, which handles every token starting with '/' in its own branch.
_TWO_CHAR_OPS: dict[str, JsTokenKind] = {
    '==' : JsTokenKind.EQ2,
    '!=' : JsTokenKind.BANG_EQ,
    '<=' : JsTokenKind.LT_EQ,
    '>=' : JsTokenKind.GT_EQ,
    '+=' : JsTokenKind.PLUS_ASSIGN,
    '-=' : JsTokenKind.MINUS_ASSIGN,
    '*=' : JsTokenKind.STAR_ASSIGN,
    '/=' : JsTokenKind.SLASH_ASSIGN,
    '%=' : JsTokenKind.PERCENT_ASSIGN,
    '&=' : JsTokenKind.AMP_ASSIGN,
    '|=' : JsTokenKind.PIPE_ASSIGN,
    '^=' : JsTokenKind.CARET_ASSIGN,
    '**' : JsTokenKind.STAR2,
    '++' : JsTokenKind.INC,
    '--' : JsTokenKind.DEC,
    '&&' : JsTokenKind.AND,
    '||' : JsTokenKind.OR,
    '??' : JsTokenKind.QQ,
    '?.' : JsTokenKind.QUESTION_DOT,
    '=>' : JsTokenKind.ARROW,
    '<<' : JsTokenKind.LT2,
    '>>' : JsTokenKind.GT2,
}

# One-character operators and punctuators. '/' is intentionally absent: whether
# it is a division operator or starts a regular expression literal depends on
# the preceding token, which `tokenize` decides separately.
_ONE_CHAR_OPS: dict[str, JsTokenKind] = {
    '+' : JsTokenKind.PLUS,
    '-' : JsTokenKind.MINUS,
    '*' : JsTokenKind.STAR,
    '%' : JsTokenKind.PERCENT,
    '=' : JsTokenKind.EQUALS,
    '!' : JsTokenKind.BANG,
    '<' : JsTokenKind.LT,
    '>' : JsTokenKind.GT,
    '&' : JsTokenKind.AMP,
    '|' : JsTokenKind.PIPE,
    '^' : JsTokenKind.CARET,
    '~' : JsTokenKind.TILDE,
    '.' : JsTokenKind.DOT,
    '?' : JsTokenKind.QUESTION,
    ':' : JsTokenKind.COLON,
    '(' : JsTokenKind.LPAREN,
    ')' : JsTokenKind.RPAREN,
    '{' : JsTokenKind.LBRACE,
    '}' : JsTokenKind.RBRACE,
    '[' : JsTokenKind.LBRACKET,
    ']' : JsTokenKind.RBRACKET,
    ';' : JsTokenKind.SEMICOLON,
    ',' : JsTokenKind.COMMA,
}

# Token kinds after which `JsLexer.tokenize` treats a following '/' as a
# division operator rather than the start of a regular expression literal.
# `tokenize` tests identifier/keyword tokens and one-character punctuators
# against this set to update its "previous token allows a regex" flag.
_EXPR_END_KINDS = frozenset({
    JsTokenKind.IDENTIFIER,
    JsTokenKind.INTEGER,
    JsTokenKind.FLOAT,
    JsTokenKind.BIGINT,
    JsTokenKind.STRING_SINGLE,
    JsTokenKind.STRING_DOUBLE,
    JsTokenKind.TEMPLATE_FULL,
    JsTokenKind.TEMPLATE_TAIL,
    JsTokenKind.REGEXP,
    JsTokenKind.RPAREN,
    JsTokenKind.RBRACKET,
    JsTokenKind.INC,
    JsTokenKind.DEC,
    JsTokenKind.TRUE,
    JsTokenKind.FALSE,
    JsTokenKind.NULL,
    JsTokenKind.THIS,
    JsTokenKind.SUPER,
})


@dataclass
class JsLexer:
    """
    A hand-written lexer that splits JavaScript source text into `JsToken` objects.

    The lexer is lenient: malformed input (unterminated strings, comments, templates or regular
    expressions, stray characters) never raises. Unterminated constructs end at the line break or
    at the end of input, and characters that start no known token are emitted as `ERROR` tokens.
    Use `JsLexer.tokenize` to obtain the token stream; the remaining methods are scanning helpers
    that each consume one construct starting at `pos`.
    """
    # The complete source text to tokenize.
    source: str
    # Offset into `source` of the next character that has not been consumed yet.
    pos: int = 0
    # Number of template literals that have emitted a TEMPLATE_HEAD but no TEMPLATE_TAIL yet.
    _template_depth: int = 0
    # One counter per open `${` substitution: the number of plain `{` currently open inside it.
    _brace_stack: list[int] = field(default_factory=list)

    def _peek(self, count: int = 1) -> str:
        """
        Return up to `count` characters starting at `pos` without consuming them.
        """
        return self.source[self.pos:self.pos + count]

    def _at_end(self) -> bool:
        """
        Return whether `pos` has reached or passed the end of the source text.
        """
        return self.pos >= len(self.source)

    def _skip_whitespace(self) -> bool:
        """
        Advance `pos` past spaces and tabs. Line breaks are not skipped because `tokenize` emits
        them as NEWLINE tokens. Returns whether any character was skipped.
        """
        start = self.pos
        src = self.source
        length = len(src)
        while self.pos < length and src[self.pos] in ' \t':
            self.pos += 1
        return self.pos > start

    def _read_line_comment(self) -> str:
        """
        Consume a `//` comment starting at `pos` and return its text. The terminating line break
        is not consumed and not part of the returned text.
        """
        start = self.pos
        src = self.source
        length = len(src)
        self.pos += 2
        # Stop at either line break character so that a CRLF (or a lone CR) is left in place
        # for `tokenize`, which turns it into a single NEWLINE token.
        while self.pos < length and src[self.pos] not in '\r\n':
            self.pos += 1
        return src[start:self.pos]

    def _read_block_comment(self) -> tuple[str, bool]:
        """
        Consume a `/* ... */` comment starting at `pos`. Returns the comment text and a flag that
        tells whether a line break was seen while scanning it. An unterminated comment extends to
        the end of the input.
        """
        start = self.pos
        src = self.source
        length = len(src)
        self.pos += 2
        has_newline = False
        while self.pos < length - 1:
            if src[self.pos] == '*' and src[self.pos + 1] == '/':
                self.pos += 2
                return src[start:self.pos], has_newline
            if src[self.pos] in '\r\n':
                has_newline = True
            self.pos += 1
        self.pos = length
        return src[start:self.pos], has_newline

    def _read_string_escape(self) -> str:
        """
        Consume one escape sequence whose backslash is at `pos` and return the text it denotes.

        Handles the single-character escapes in `_ESCAPE_MAP`, `\\xHH`, `\\uHHHH`, `\\u{H...}` and
        line continuations (backslash followed by a line break), which yield the empty string.
        A backslash at the very end of the input also yields the empty string. For a malformed
        `\\x` or `\\u` escape only the backslash and the letter are consumed and the letter itself
        is returned; any other escaped character is returned unchanged.
        """
        src = self.source
        length = len(src)
        hexdigits = '0123456789abcdefABCDEF'
        self.pos += 1
        if self.pos >= length:
            return ''
        c = src[self.pos]
        self.pos += 1
        mapped = _ESCAPE_MAP.get(c)
        if mapped is not None:
            return mapped
        if c == 'x' and self.pos + 1 < length:
            hexstr = src[self.pos:self.pos + 2]
            if len(hexstr) == 2 and all(h in hexdigits for h in hexstr):
                self.pos += 2
                return chr(int(hexstr, 16))
            return 'x'
        if c == 'u':
            if self.pos < length and src[self.pos] == '{':
                # Scan only the run of hex digits that directly follows the brace. Searching the
                # rest of the input for '}' would let a malformed escape drag the position far
                # beyond the literal that contains it.
                end = self.pos + 1
                while end < length and src[end] in hexdigits:
                    end += 1
                if end > self.pos + 1 and end < length and src[end] == '}':
                    code = int(src[self.pos + 1:end], 16)
                    # chr() raises for values above the Unicode range; such an escape is
                    # malformed and is handled like any other invalid one.
                    if code <= 0x10FFFF:
                        self.pos = end + 1
                        return chr(code)
            elif self.pos + 3 < length:
                hexstr = src[self.pos:self.pos + 4]
                if len(hexstr) == 4 and all(h in hexdigits for h in hexstr):
                    self.pos += 4
                    return chr(int(hexstr, 16))
            return 'u'
        if c in '\r\n':
            # Line continuation: a CRLF pair counts as one line break.
            if c == '\r' and self.pos < length and src[self.pos] == '\n':
                self.pos += 1
            return ''
        return c

    def _read_single_string(self) -> str:
        """
        Consume a single-quoted string literal starting at `pos` and return its raw source text
        including the quotes. An unterminated literal ends after the first unescaped line break
        character or at the end of the input.
        """
        start = self.pos
        src = self.source
        length = len(src)
        self.pos += 1
        while self.pos < length:
            c = src[self.pos]
            if c == '\\':
                self._read_string_escape()
                continue
            self.pos += 1
            if c == "'":
                return src[start:self.pos]
            if c in '\r\n':
                return src[start:self.pos]
        return src[start:self.pos]

    def _read_double_string(self) -> str:
        """
        Consume a double-quoted string literal starting at `pos` and return its raw source text
        including the quotes. An unterminated literal ends after the first unescaped line break
        character or at the end of the input.
        """
        start = self.pos
        src = self.source
        length = len(src)
        self.pos += 1
        while self.pos < length:
            c = src[self.pos]
            if c == '\\':
                self._read_string_escape()
                continue
            self.pos += 1
            if c == '"':
                return src[start:self.pos]
            if c in '\r\n':
                return src[start:self.pos]
        return src[start:self.pos]

    def _read_template(self) -> JsToken:
        """
        Consume the beginning of a template literal whose opening backtick is at `pos`.

        Returns a TEMPLATE_FULL token when the closing backtick is reached before any `${`, or
        when the input ends first. Returns a TEMPLATE_HEAD token when a `${` substitution starts;
        in that case the template depth is increased and a fresh brace counter is pushed.
        """
        start = self.pos
        src = self.source
        length = len(src)
        self.pos += 1
        while self.pos < length:
            c = src[self.pos]
            if c == '\\':
                self._read_string_escape()
                continue
            if c == '`':
                self.pos += 1
                return JsToken(JsTokenKind.TEMPLATE_FULL, src[start:self.pos], start)
            if c == '$' and self.pos + 1 < length and src[self.pos + 1] == '{':
                self.pos += 2
                self._template_depth += 1
                self._brace_stack.append(0)
                return JsToken(JsTokenKind.TEMPLATE_HEAD, src[start:self.pos], start)
            self.pos += 1
        return JsToken(JsTokenKind.TEMPLATE_FULL, src[start:self.pos], start)

    def _resume_template(self) -> JsToken:
        """
        Continue a template literal at the `}` (at `pos`) that closes a substitution.

        Returns a TEMPLATE_MIDDLE token when another `${` follows, pushing a fresh brace counter.
        Returns a TEMPLATE_TAIL token and decreases the template depth when the closing backtick
        or the end of the input is reached.
        """
        start = self.pos
        src = self.source
        length = len(src)
        self.pos += 1
        while self.pos < length:
            c = src[self.pos]
            if c == '\\':
                self._read_string_escape()
                continue
            if c == '`':
                self.pos += 1
                self._template_depth -= 1
                return JsToken(JsTokenKind.TEMPLATE_TAIL, src[start:self.pos], start)
            if c == '$' and self.pos + 1 < length and src[self.pos + 1] == '{':
                self.pos += 2
                self._brace_stack.append(0)
                return JsToken(JsTokenKind.TEMPLATE_MIDDLE, src[start:self.pos], start)
            self.pos += 1
        self._template_depth -= 1
        return JsToken(JsTokenKind.TEMPLATE_TAIL, src[start:self.pos], start)

    def _read_regexp(self) -> str:
        """
        Consume a regular expression literal whose opening `/` is at `pos` and return its raw
        source text including any trailing alphabetic flags. A `/` inside a `[...]` character
        class does not terminate the literal. An unterminated literal ends before the first line
        break or at the end of the input.
        """
        start = self.pos
        src = self.source
        length = len(src)
        self.pos += 1
        in_class = False
        while self.pos < length:
            c = src[self.pos]
            if c == '\\' and self.pos + 1 < length:
                self.pos += 2
                continue
            if c == '[':
                in_class = True
                self.pos += 1
                continue
            if c == ']' and in_class:
                in_class = False
                self.pos += 1
                continue
            if c == '/' and not in_class:
                self.pos += 1
                while self.pos < length and src[self.pos].isalpha():
                    self.pos += 1
                return src[start:self.pos]
            if c in '\r\n':
                break
            self.pos += 1
        return src[start:self.pos]

    def _read_number(self) -> JsToken:
        """
        Consume a numeric literal starting at `pos` and return an INTEGER, FLOAT or BIGINT token.

        Recognizes `0x`, `0o` and `0b` prefixed integers, decimal integers, decimals with a
        fractional part and/or exponent, `_` digit separators, and the `n` BigInt suffix on
        non-float literals. A `.` only counts as a decimal point when a digit follows it.
        """
        start = self.pos
        src = self.source
        length = len(src)

        if src[self.pos] == '0' and self.pos + 1 < length:
            nc = src[self.pos + 1]
            if nc in 'xX':
                self.pos += 2
                while self.pos < length and (
                    src[self.pos] in '0123456789abcdefABCDEF_'
                ):
                    self.pos += 1
                if self.pos < length and src[self.pos] == 'n':
                    self.pos += 1
                    return JsToken(JsTokenKind.BIGINT, src[start:self.pos], start)
                return JsToken(JsTokenKind.INTEGER, src[start:self.pos], start)
            if nc in 'oO':
                self.pos += 2
                while self.pos < length and (
                    src[self.pos] in '01234567_'
                ):
                    self.pos += 1
                if self.pos < length and src[self.pos] == 'n':
                    self.pos += 1
                    return JsToken(JsTokenKind.BIGINT, src[start:self.pos], start)
                return JsToken(JsTokenKind.INTEGER, src[start:self.pos], start)
            if nc in 'bB':
                self.pos += 2
                while self.pos < length and src[self.pos] in '01_':
                    self.pos += 1
                if self.pos < length and src[self.pos] == 'n':
                    self.pos += 1
                    return JsToken(JsTokenKind.BIGINT, src[start:self.pos], start)
                return JsToken(JsTokenKind.INTEGER, src[start:self.pos], start)

        while self.pos < length and (src[self.pos].isdigit() or src[self.pos] == '_'):
            self.pos += 1

        is_float = False
        if self.pos < length and src[self.pos] == '.':
            next_pos = self.pos + 1
            if next_pos < length and src[next_pos].isdigit():
                is_float = True
                self.pos += 1
                while self.pos < length and (
                    src[self.pos].isdigit() or src[self.pos] == '_'
                ):
                    self.pos += 1

        if self.pos < length and src[self.pos] in 'eE':
            is_float = True
            self.pos += 1
            if self.pos < length and src[self.pos] in '+-':
                self.pos += 1
            while self.pos < length and (
                src[self.pos].isdigit() or src[self.pos] == '_'
            ):
                self.pos += 1

        if not is_float and self.pos < length and src[self.pos] == 'n':
            self.pos += 1
            return JsToken(JsTokenKind.BIGINT, src[start:self.pos], start)

        kind = JsTokenKind.FLOAT if is_float else JsTokenKind.INTEGER
        return JsToken(kind, src[start:self.pos], start)

    def _read_identifier_or_keyword(self) -> JsToken:
        """
        Consume an identifier starting at `pos`; `\\u` escapes are accepted as identifier parts.
        Returns the keyword token registered in `KEYWORDS` for the raw text if there is one, and
        an IDENTIFIER token otherwise. The caller must ensure that `pos` is at an identifier
        start character or at a `\\u` escape, since nothing is consumed otherwise.
        """
        start = self.pos
        src = self.source
        length = len(src)
        while self.pos < length:
            c = src[self.pos]
            if c.isalnum() or c == '_' or c == '$':
                self.pos += 1
            elif c == '\\' and self.pos + 1 < length and src[self.pos + 1] == 'u':
                self._read_string_escape()
            else:
                break
        word = src[start:self.pos]
        kw = KEYWORDS.get(word)
        if kw is not None:
            return JsToken(kw, word, start)
        return JsToken(JsTokenKind.IDENTIFIER, word, start)

    def tokenize(self) -> Generator[JsToken, None, None]:
        """
        Generate the tokens of `source`, starting at `pos` and ending with a single EOF token.

        Spaces and tabs are dropped; line breaks and comments are emitted as NEWLINE and COMMENT
        tokens. A block comment that spans a line break is followed by a NEWLINE token with empty
        text. Whether a `/` starts a regular expression or is a division operator is decided from
        the previous significant token. Characters that start no known token are emitted one at a
        time as ERROR tokens, so the generator always makes progress.
        """
        src = self.source
        length = len(src)
        # Tracks whether a '/' at the current position would start a regular expression.
        prev_allows_regex = True

        while True:
            self._skip_whitespace()
            if self._at_end():
                yield JsToken(JsTokenKind.EOF, '', self.pos)
                return

            start = self.pos
            c = src[self.pos]
            c2 = src[self.pos:self.pos + 2]

            if c == '\r' and self.pos + 1 < length and src[self.pos + 1] == '\n':
                self.pos += 2
                yield JsToken(JsTokenKind.NEWLINE, '\r\n', start)
                continue
            if c in '\r\n':
                self.pos += 1
                yield JsToken(JsTokenKind.NEWLINE, c, start)
                continue

            if c2 == '//':
                text = self._read_line_comment()
                yield JsToken(JsTokenKind.COMMENT, text, start)
                continue
            if c2 == '/*':
                text, has_newline = self._read_block_comment()
                yield JsToken(JsTokenKind.COMMENT, text, start)
                if has_newline:
                    # Synthetic marker so consumers see the line break hidden in the comment.
                    yield JsToken(JsTokenKind.NEWLINE, '', self.pos)
                continue

            if c == "'":
                text = self._read_single_string()
                prev_allows_regex = False
                yield JsToken(JsTokenKind.STRING_SINGLE, text, start)
                continue
            if c == '"':
                text = self._read_double_string()
                prev_allows_regex = False
                yield JsToken(JsTokenKind.STRING_DOUBLE, text, start)
                continue
            if c == '`':
                tok = self._read_template()
                prev_allows_regex = False
                yield tok
                continue

            if c == '}' and self._template_depth > 0 and self._brace_stack:
                if self._brace_stack[-1] == 0:
                    # This brace closes the innermost `${` substitution.
                    self._brace_stack.pop()
                    tok = self._resume_template()
                    prev_allows_regex = False
                    yield tok
                    continue
                else:
                    # An ordinary closing brace inside a substitution; it is emitted as
                    # RBRACE by the one-character operator branch below.
                    self._brace_stack[-1] -= 1

            if c.isdigit() or (
                c == '.' and self.pos + 1 < length and src[self.pos + 1].isdigit()
            ):
                tok = self._read_number()
                prev_allows_regex = False
                yield tok
                continue

            # A backslash only starts an identifier when it begins a `\u` escape. Any other
            # backslash would make the identifier reader consume nothing, and this loop would
            # then yield empty tokens forever; it is reported as an ERROR token below instead.
            if c.isalpha() or c == '_' or c == '$' or (
                c == '\\' and src[self.pos + 1:self.pos + 2] == 'u'
            ):
                tok = self._read_identifier_or_keyword()
                prev_allows_regex = tok.kind not in _EXPR_END_KINDS
                yield tok
                continue

            if c == '/':
                if prev_allows_regex:
                    text = self._read_regexp()
                    prev_allows_regex = False
                    yield JsToken(JsTokenKind.REGEXP, text, start)
                    continue
                if c2 == '/=':
                    self.pos += 2
                    prev_allows_regex = True
                    yield JsToken(JsTokenKind.SLASH_ASSIGN, '/=', start)
                    continue
                self.pos += 1
                prev_allows_regex = True
                yield JsToken(JsTokenKind.SLASH, '/', start)
                continue

            c4 = src[self.pos:self.pos + 4]
            if c4 in _FOUR_CHAR_OPS:
                self.pos += 4
                kind = _FOUR_CHAR_OPS[c4]
                prev_allows_regex = True
                yield JsToken(kind, c4, start)
                continue

            c3 = src[self.pos:self.pos + 3]
            if c3 in _THREE_CHAR_OPS:
                self.pos += 3
                kind = _THREE_CHAR_OPS[c3]
                prev_allows_regex = True
                yield JsToken(kind, c3, start)
                continue

            # `?.` followed by a digit is not optional chaining: `a?.5:b` is a conditional
            # expression whose middle operand is the number `.5`.
            if c2 in _TWO_CHAR_OPS and not (
                c2 == '?.' and src[self.pos + 2:self.pos + 3].isdigit()
            ):
                self.pos += 2
                kind = _TWO_CHAR_OPS[c2]
                # `++` and `--` may be prefix or postfix operators, so they leave the regex
                # flag as the previous token set it.
                if kind not in (JsTokenKind.INC, JsTokenKind.DEC):
                    prev_allows_regex = True
                yield JsToken(kind, c2, start)
                continue

            if c in _ONE_CHAR_OPS:
                self.pos += 1
                kind = _ONE_CHAR_OPS[c]
                if kind in (
                    JsTokenKind.RPAREN,
                    JsTokenKind.RBRACKET,
                ):
                    prev_allows_regex = False
                elif kind == JsTokenKind.RBRACE:
                    prev_allows_regex = True
                else:
                    prev_allows_regex = kind not in _EXPR_END_KINDS
                if kind == JsTokenKind.LBRACE and self._brace_stack:
                    # Count braces nested in a template substitution so that the matching
                    # closing brace is not mistaken for the end of the substitution.
                    self._brace_stack[-1] += 1
                yield JsToken(kind, c, start)
                continue

            self.pos += 1
            prev_allows_regex = True
            yield JsToken(JsTokenKind.ERROR, c, start)

Classes

class JsLexer (source, pos=0)

JsLexer(source: 'str', pos: 'int' = 0, _template_depth: 'int' = 0, _brace_stack: 'list[int]' = <factory>)

Expand source code Browse git
@dataclass
class JsLexer:
    """
    A hand-written lexer that splits JavaScript source text into `JsToken` objects.

    The lexer is lenient: malformed input (unterminated strings, comments, templates or regular
    expressions, stray characters) never raises. Unterminated constructs end at the line break or
    at the end of input, and characters that start no known token are emitted as `ERROR` tokens.
    Use `JsLexer.tokenize` to obtain the token stream; the remaining methods are scanning helpers
    that each consume one construct starting at `pos`.
    """
    # The complete source text to tokenize.
    source: str
    # Offset into `source` of the next character that has not been consumed yet.
    pos: int = 0
    # Number of template literals that have emitted a TEMPLATE_HEAD but no TEMPLATE_TAIL yet.
    _template_depth: int = 0
    # One counter per open `${` substitution: the number of plain `{` currently open inside it.
    _brace_stack: list[int] = field(default_factory=list)

    def _peek(self, count: int = 1) -> str:
        """
        Return up to `count` characters starting at `pos` without consuming them.
        """
        return self.source[self.pos:self.pos + count]

    def _at_end(self) -> bool:
        """
        Return whether `pos` has reached or passed the end of the source text.
        """
        return self.pos >= len(self.source)

    def _skip_whitespace(self) -> bool:
        """
        Advance `pos` past spaces and tabs. Line breaks are not skipped because `tokenize` emits
        them as NEWLINE tokens. Returns whether any character was skipped.
        """
        start = self.pos
        src = self.source
        length = len(src)
        while self.pos < length and src[self.pos] in ' \t':
            self.pos += 1
        return self.pos > start

    def _read_line_comment(self) -> str:
        """
        Consume a `//` comment starting at `pos` and return its text. The terminating line break
        is not consumed and not part of the returned text.
        """
        start = self.pos
        src = self.source
        length = len(src)
        self.pos += 2
        # Stop at either line break character so that a CRLF (or a lone CR) is left in place
        # for `tokenize`, which turns it into a single NEWLINE token.
        while self.pos < length and src[self.pos] not in '\r\n':
            self.pos += 1
        return src[start:self.pos]

    def _read_block_comment(self) -> tuple[str, bool]:
        """
        Consume a `/* ... */` comment starting at `pos`. Returns the comment text and a flag that
        tells whether a line break was seen while scanning it. An unterminated comment extends to
        the end of the input.
        """
        start = self.pos
        src = self.source
        length = len(src)
        self.pos += 2
        has_newline = False
        while self.pos < length - 1:
            if src[self.pos] == '*' and src[self.pos + 1] == '/':
                self.pos += 2
                return src[start:self.pos], has_newline
            if src[self.pos] in '\r\n':
                has_newline = True
            self.pos += 1
        self.pos = length
        return src[start:self.pos], has_newline

    def _read_string_escape(self) -> str:
        """
        Consume one escape sequence whose backslash is at `pos` and return the text it denotes.

        Handles the single-character escapes in `_ESCAPE_MAP`, `\\xHH`, `\\uHHHH`, `\\u{H...}` and
        line continuations (backslash followed by a line break), which yield the empty string.
        A backslash at the very end of the input also yields the empty string. For a malformed
        `\\x` or `\\u` escape only the backslash and the letter are consumed and the letter itself
        is returned; any other escaped character is returned unchanged.
        """
        src = self.source
        length = len(src)
        hexdigits = '0123456789abcdefABCDEF'
        self.pos += 1
        if self.pos >= length:
            return ''
        c = src[self.pos]
        self.pos += 1
        mapped = _ESCAPE_MAP.get(c)
        if mapped is not None:
            return mapped
        if c == 'x' and self.pos + 1 < length:
            hexstr = src[self.pos:self.pos + 2]
            if len(hexstr) == 2 and all(h in hexdigits for h in hexstr):
                self.pos += 2
                return chr(int(hexstr, 16))
            return 'x'
        if c == 'u':
            if self.pos < length and src[self.pos] == '{':
                # Scan only the run of hex digits that directly follows the brace. Searching the
                # rest of the input for '}' would let a malformed escape drag the position far
                # beyond the literal that contains it.
                end = self.pos + 1
                while end < length and src[end] in hexdigits:
                    end += 1
                if end > self.pos + 1 and end < length and src[end] == '}':
                    code = int(src[self.pos + 1:end], 16)
                    # chr() raises for values above the Unicode range; such an escape is
                    # malformed and is handled like any other invalid one.
                    if code <= 0x10FFFF:
                        self.pos = end + 1
                        return chr(code)
            elif self.pos + 3 < length:
                hexstr = src[self.pos:self.pos + 4]
                if len(hexstr) == 4 and all(h in hexdigits for h in hexstr):
                    self.pos += 4
                    return chr(int(hexstr, 16))
            return 'u'
        if c in '\r\n':
            # Line continuation: a CRLF pair counts as one line break.
            if c == '\r' and self.pos < length and src[self.pos] == '\n':
                self.pos += 1
            return ''
        return c

    def _read_single_string(self) -> str:
        """
        Consume a single-quoted string literal starting at `pos` and return its raw source text
        including the quotes. An unterminated literal ends after the first unescaped line break
        character or at the end of the input.
        """
        start = self.pos
        src = self.source
        length = len(src)
        self.pos += 1
        while self.pos < length:
            c = src[self.pos]
            if c == '\\':
                self._read_string_escape()
                continue
            self.pos += 1
            if c == "'":
                return src[start:self.pos]
            if c in '\r\n':
                return src[start:self.pos]
        return src[start:self.pos]

    def _read_double_string(self) -> str:
        """
        Consume a double-quoted string literal starting at `pos` and return its raw source text
        including the quotes. An unterminated literal ends after the first unescaped line break
        character or at the end of the input.
        """
        start = self.pos
        src = self.source
        length = len(src)
        self.pos += 1
        while self.pos < length:
            c = src[self.pos]
            if c == '\\':
                self._read_string_escape()
                continue
            self.pos += 1
            if c == '"':
                return src[start:self.pos]
            if c in '\r\n':
                return src[start:self.pos]
        return src[start:self.pos]

    def _read_template(self) -> JsToken:
        """
        Consume the beginning of a template literal whose opening backtick is at `pos`.

        Returns a TEMPLATE_FULL token when the closing backtick is reached before any `${`, or
        when the input ends first. Returns a TEMPLATE_HEAD token when a `${` substitution starts;
        in that case the template depth is increased and a fresh brace counter is pushed.
        """
        start = self.pos
        src = self.source
        length = len(src)
        self.pos += 1
        while self.pos < length:
            c = src[self.pos]
            if c == '\\':
                self._read_string_escape()
                continue
            if c == '`':
                self.pos += 1
                return JsToken(JsTokenKind.TEMPLATE_FULL, src[start:self.pos], start)
            if c == '$' and self.pos + 1 < length and src[self.pos + 1] == '{':
                self.pos += 2
                self._template_depth += 1
                self._brace_stack.append(0)
                return JsToken(JsTokenKind.TEMPLATE_HEAD, src[start:self.pos], start)
            self.pos += 1
        return JsToken(JsTokenKind.TEMPLATE_FULL, src[start:self.pos], start)

    def _resume_template(self) -> JsToken:
        """
        Continue a template literal at the `}` (at `pos`) that closes a substitution.

        Returns a TEMPLATE_MIDDLE token when another `${` follows, pushing a fresh brace counter.
        Returns a TEMPLATE_TAIL token and decreases the template depth when the closing backtick
        or the end of the input is reached.
        """
        start = self.pos
        src = self.source
        length = len(src)
        self.pos += 1
        while self.pos < length:
            c = src[self.pos]
            if c == '\\':
                self._read_string_escape()
                continue
            if c == '`':
                self.pos += 1
                self._template_depth -= 1
                return JsToken(JsTokenKind.TEMPLATE_TAIL, src[start:self.pos], start)
            if c == '$' and self.pos + 1 < length and src[self.pos + 1] == '{':
                self.pos += 2
                self._brace_stack.append(0)
                return JsToken(JsTokenKind.TEMPLATE_MIDDLE, src[start:self.pos], start)
            self.pos += 1
        self._template_depth -= 1
        return JsToken(JsTokenKind.TEMPLATE_TAIL, src[start:self.pos], start)

    def _read_regexp(self) -> str:
        """
        Consume a regular expression literal whose opening `/` is at `pos` and return its raw
        source text including any trailing alphabetic flags. A `/` inside a `[...]` character
        class does not terminate the literal. An unterminated literal ends before the first line
        break or at the end of the input.
        """
        start = self.pos
        src = self.source
        length = len(src)
        self.pos += 1
        in_class = False
        while self.pos < length:
            c = src[self.pos]
            if c == '\\' and self.pos + 1 < length:
                self.pos += 2
                continue
            if c == '[':
                in_class = True
                self.pos += 1
                continue
            if c == ']' and in_class:
                in_class = False
                self.pos += 1
                continue
            if c == '/' and not in_class:
                self.pos += 1
                while self.pos < length and src[self.pos].isalpha():
                    self.pos += 1
                return src[start:self.pos]
            if c in '\r\n':
                break
            self.pos += 1
        return src[start:self.pos]

    def _read_number(self) -> JsToken:
        """
        Consume a numeric literal starting at `pos` and return an INTEGER, FLOAT or BIGINT token.

        Recognizes `0x`, `0o` and `0b` prefixed integers, decimal integers, decimals with a
        fractional part and/or exponent, `_` digit separators, and the `n` BigInt suffix on
        non-float literals. A `.` only counts as a decimal point when a digit follows it.
        """
        start = self.pos
        src = self.source
        length = len(src)

        if src[self.pos] == '0' and self.pos + 1 < length:
            nc = src[self.pos + 1]
            if nc in 'xX':
                self.pos += 2
                while self.pos < length and (
                    src[self.pos] in '0123456789abcdefABCDEF_'
                ):
                    self.pos += 1
                if self.pos < length and src[self.pos] == 'n':
                    self.pos += 1
                    return JsToken(JsTokenKind.BIGINT, src[start:self.pos], start)
                return JsToken(JsTokenKind.INTEGER, src[start:self.pos], start)
            if nc in 'oO':
                self.pos += 2
                while self.pos < length and (
                    src[self.pos] in '01234567_'
                ):
                    self.pos += 1
                if self.pos < length and src[self.pos] == 'n':
                    self.pos += 1
                    return JsToken(JsTokenKind.BIGINT, src[start:self.pos], start)
                return JsToken(JsTokenKind.INTEGER, src[start:self.pos], start)
            if nc in 'bB':
                self.pos += 2
                while self.pos < length and src[self.pos] in '01_':
                    self.pos += 1
                if self.pos < length and src[self.pos] == 'n':
                    self.pos += 1
                    return JsToken(JsTokenKind.BIGINT, src[start:self.pos], start)
                return JsToken(JsTokenKind.INTEGER, src[start:self.pos], start)

        while self.pos < length and (src[self.pos].isdigit() or src[self.pos] == '_'):
            self.pos += 1

        is_float = False
        if self.pos < length and src[self.pos] == '.':
            next_pos = self.pos + 1
            if next_pos < length and src[next_pos].isdigit():
                is_float = True
                self.pos += 1
                while self.pos < length and (
                    src[self.pos].isdigit() or src[self.pos] == '_'
                ):
                    self.pos += 1

        if self.pos < length and src[self.pos] in 'eE':
            is_float = True
            self.pos += 1
            if self.pos < length and src[self.pos] in '+-':
                self.pos += 1
            while self.pos < length and (
                src[self.pos].isdigit() or src[self.pos] == '_'
            ):
                self.pos += 1

        if not is_float and self.pos < length and src[self.pos] == 'n':
            self.pos += 1
            return JsToken(JsTokenKind.BIGINT, src[start:self.pos], start)

        kind = JsTokenKind.FLOAT if is_float else JsTokenKind.INTEGER
        return JsToken(kind, src[start:self.pos], start)

    def _read_identifier_or_keyword(self) -> JsToken:
        """
        Consume an identifier starting at `pos`; `\\u` escapes are accepted as identifier parts.
        Returns the keyword token registered in `KEYWORDS` for the raw text if there is one, and
        an IDENTIFIER token otherwise. The caller must ensure that `pos` is at an identifier
        start character or at a `\\u` escape, since nothing is consumed otherwise.
        """
        start = self.pos
        src = self.source
        length = len(src)
        while self.pos < length:
            c = src[self.pos]
            if c.isalnum() or c == '_' or c == '$':
                self.pos += 1
            elif c == '\\' and self.pos + 1 < length and src[self.pos + 1] == 'u':
                self._read_string_escape()
            else:
                break
        word = src[start:self.pos]
        kw = KEYWORDS.get(word)
        if kw is not None:
            return JsToken(kw, word, start)
        return JsToken(JsTokenKind.IDENTIFIER, word, start)

    def tokenize(self) -> Generator[JsToken, None, None]:
        """
        Generate the tokens of `source`, starting at `pos` and ending with a single EOF token.

        Spaces and tabs are dropped; line breaks and comments are emitted as NEWLINE and COMMENT
        tokens. A block comment that spans a line break is followed by a NEWLINE token with empty
        text. Whether a `/` starts a regular expression or is a division operator is decided from
        the previous significant token. Characters that start no known token are emitted one at a
        time as ERROR tokens, so the generator always makes progress.
        """
        src = self.source
        length = len(src)
        # Tracks whether a '/' at the current position would start a regular expression.
        prev_allows_regex = True

        while True:
            self._skip_whitespace()
            if self._at_end():
                yield JsToken(JsTokenKind.EOF, '', self.pos)
                return

            start = self.pos
            c = src[self.pos]
            c2 = src[self.pos:self.pos + 2]

            if c == '\r' and self.pos + 1 < length and src[self.pos + 1] == '\n':
                self.pos += 2
                yield JsToken(JsTokenKind.NEWLINE, '\r\n', start)
                continue
            if c in '\r\n':
                self.pos += 1
                yield JsToken(JsTokenKind.NEWLINE, c, start)
                continue

            if c2 == '//':
                text = self._read_line_comment()
                yield JsToken(JsTokenKind.COMMENT, text, start)
                continue
            if c2 == '/*':
                text, has_newline = self._read_block_comment()
                yield JsToken(JsTokenKind.COMMENT, text, start)
                if has_newline:
                    # Synthetic marker so consumers see the line break hidden in the comment.
                    yield JsToken(JsTokenKind.NEWLINE, '', self.pos)
                continue

            if c == "'":
                text = self._read_single_string()
                prev_allows_regex = False
                yield JsToken(JsTokenKind.STRING_SINGLE, text, start)
                continue
            if c == '"':
                text = self._read_double_string()
                prev_allows_regex = False
                yield JsToken(JsTokenKind.STRING_DOUBLE, text, start)
                continue
            if c == '`':
                tok = self._read_template()
                prev_allows_regex = False
                yield tok
                continue

            if c == '}' and self._template_depth > 0 and self._brace_stack:
                if self._brace_stack[-1] == 0:
                    # This brace closes the innermost `${` substitution.
                    self._brace_stack.pop()
                    tok = self._resume_template()
                    prev_allows_regex = False
                    yield tok
                    continue
                else:
                    # An ordinary closing brace inside a substitution; it is emitted as
                    # RBRACE by the one-character operator branch below.
                    self._brace_stack[-1] -= 1

            if c.isdigit() or (
                c == '.' and self.pos + 1 < length and src[self.pos + 1].isdigit()
            ):
                tok = self._read_number()
                prev_allows_regex = False
                yield tok
                continue

            # A backslash only starts an identifier when it begins a `\u` escape. Any other
            # backslash would make the identifier reader consume nothing, and this loop would
            # then yield empty tokens forever; it is reported as an ERROR token below instead.
            if c.isalpha() or c == '_' or c == '$' or (
                c == '\\' and src[self.pos + 1:self.pos + 2] == 'u'
            ):
                tok = self._read_identifier_or_keyword()
                prev_allows_regex = tok.kind not in _EXPR_END_KINDS
                yield tok
                continue

            if c == '/':
                if prev_allows_regex:
                    text = self._read_regexp()
                    prev_allows_regex = False
                    yield JsToken(JsTokenKind.REGEXP, text, start)
                    continue
                if c2 == '/=':
                    self.pos += 2
                    prev_allows_regex = True
                    yield JsToken(JsTokenKind.SLASH_ASSIGN, '/=', start)
                    continue
                self.pos += 1
                prev_allows_regex = True
                yield JsToken(JsTokenKind.SLASH, '/', start)
                continue

            c4 = src[self.pos:self.pos + 4]
            if c4 in _FOUR_CHAR_OPS:
                self.pos += 4
                kind = _FOUR_CHAR_OPS[c4]
                prev_allows_regex = True
                yield JsToken(kind, c4, start)
                continue

            c3 = src[self.pos:self.pos + 3]
            if c3 in _THREE_CHAR_OPS:
                self.pos += 3
                kind = _THREE_CHAR_OPS[c3]
                prev_allows_regex = True
                yield JsToken(kind, c3, start)
                continue

            # `?.` followed by a digit is not optional chaining: `a?.5:b` is a conditional
            # expression whose middle operand is the number `.5`.
            if c2 in _TWO_CHAR_OPS and not (
                c2 == '?.' and src[self.pos + 2:self.pos + 3].isdigit()
            ):
                self.pos += 2
                kind = _TWO_CHAR_OPS[c2]
                # `++` and `--` may be prefix or postfix operators, so they leave the regex
                # flag as the previous token set it.
                if kind not in (JsTokenKind.INC, JsTokenKind.DEC):
                    prev_allows_regex = True
                yield JsToken(kind, c2, start)
                continue

            if c in _ONE_CHAR_OPS:
                self.pos += 1
                kind = _ONE_CHAR_OPS[c]
                if kind in (
                    JsTokenKind.RPAREN,
                    JsTokenKind.RBRACKET,
                ):
                    prev_allows_regex = False
                elif kind == JsTokenKind.RBRACE:
                    prev_allows_regex = True
                else:
                    prev_allows_regex = kind not in _EXPR_END_KINDS
                if kind == JsTokenKind.LBRACE and self._brace_stack:
                    # Count braces nested in a template substitution so that the matching
                    # closing brace is not mistaken for the end of the substitution.
                    self._brace_stack[-1] += 1
                yield JsToken(kind, c, start)
                continue

            self.pos += 1
            prev_allows_regex = True
            yield JsToken(JsTokenKind.ERROR, c, start)

Instance variables

var source

The JavaScript source text that the lexer tokenizes.

var pos

The current read offset into source; it advances as tokens are consumed.

Methods

def tokenize(self)
Expand source code Browse git
def tokenize(self) -> Generator[JsToken, None, None]:
    """
    Lazily split `self.source` into a stream of `JsToken` objects.

    Scanning starts at the current value of `self.pos`, which is advanced past each token before
    that token is yielded. The stream always ends with a single EOF token, after which the
    generator returns. Line breaks are reported as NEWLINE tokens, comments as COMMENT tokens,
    and any character that matches no rule is emitted as a one-character ERROR token.

    A local flag records whether a `/` at the current position starts a regular expression
    literal or is a division operator. It is updated by every token except NEWLINE, COMMENT,
    `++` and `--`, which leave it as it was.
    """
    src = self.source
    regex_ok = True

    while True:
        self._skip_whitespace()
        if self._at_end():
            yield JsToken(JsTokenKind.EOF, '', self.pos)
            return

        begin = self.pos
        head = src[begin]
        pair = src[begin:begin + 2]

        # Line terminators; a CR LF pair is reported as a single token.
        if head == '\r' or head == '\n':
            text = pair if pair == '\r\n' else head
            self.pos += len(text)
            yield JsToken(JsTokenKind.NEWLINE, text, begin)
            continue

        # Comments are checked before the generic slash handling further down.
        if pair == '//':
            text = self._read_line_comment()
            yield JsToken(JsTokenKind.COMMENT, text, begin)
            continue
        if pair == '/*':
            text, has_newline = self._read_block_comment()
            yield JsToken(JsTokenKind.COMMENT, text, begin)
            if has_newline:
                # Zero-width NEWLINE placed right after the comment.
                yield JsToken(JsTokenKind.NEWLINE, '', self.pos)
            continue

        # Quoted string literals.
        if head in ("'", '"'):
            if head == "'":
                text = self._read_single_string()
                kind = JsTokenKind.STRING_SINGLE
            else:
                text = self._read_double_string()
                kind = JsTokenKind.STRING_DOUBLE
            regex_ok = False
            yield JsToken(kind, text, begin)
            continue

        if head == '`':
            tok = self._read_template()
            regex_ok = False
            yield tok
            continue

        # A closing brace while template brace counters are active: a zero counter means this
        # brace hands control back to the template reader; otherwise the counter is decremented
        # and the brace is tokenized as an ordinary operator below.
        if head == '}' and self._template_depth > 0 and self._brace_stack:
            if self._brace_stack[-1] != 0:
                self._brace_stack[-1] -= 1
            else:
                self._brace_stack.pop()
                tok = self._resume_template()
                regex_ok = False
                yield tok
                continue

        # Numeric literals, including the form with a leading dot. The slice is empty at the end
        # of input, and the empty string is not a digit.
        if head.isdigit() or (head == '.' and pair[1:].isdigit()):
            tok = self._read_number()
            regex_ok = False
            yield tok
            continue

        if head.isalpha() or head in ('_', '$', '\\'):
            tok = self._read_identifier_or_keyword()
            regex_ok = tok.kind not in _EXPR_END_KINDS
            yield tok
            continue

        # A slash is either the start of a regular expression literal or a division operator,
        # depending on the token that preceded it.
        if head == '/':
            if regex_ok:
                text = self._read_regexp()
                regex_ok = False
                yield JsToken(JsTokenKind.REGEXP, text, begin)
            elif pair == '/=':
                self.pos += 2
                regex_ok = True
                yield JsToken(JsTokenKind.SLASH_ASSIGN, '/=', begin)
            else:
                self.pos += 1
                regex_ok = True
                yield JsToken(JsTokenKind.SLASH, '/', begin)
            continue

        # Operators and punctuation, longest match first.
        quad = src[begin:begin + 4]
        if quad in _FOUR_CHAR_OPS:
            self.pos += 4
            regex_ok = True
            yield JsToken(_FOUR_CHAR_OPS[quad], quad, begin)
            continue

        triple = src[begin:begin + 3]
        if triple in _THREE_CHAR_OPS:
            self.pos += 3
            regex_ok = True
            yield JsToken(_THREE_CHAR_OPS[triple], triple, begin)
            continue

        if pair in _TWO_CHAR_OPS:
            self.pos += 2
            kind = _TWO_CHAR_OPS[pair]
            # Increment and decrement keep the flag unchanged.
            if kind not in (JsTokenKind.INC, JsTokenKind.DEC):
                regex_ok = True
            yield JsToken(kind, pair, begin)
            continue

        if head in _ONE_CHAR_OPS:
            self.pos += 1
            kind = _ONE_CHAR_OPS[head]
            if kind in (JsTokenKind.RPAREN, JsTokenKind.RBRACKET):
                regex_ok = False
            else:
                regex_ok = kind == JsTokenKind.RBRACE or kind not in _EXPR_END_KINDS
            # Count braces opened while a brace counter is active.
            if kind == JsTokenKind.LBRACE and self._brace_stack:
                self._brace_stack[-1] += 1
            yield JsToken(kind, head, begin)
            continue

        # No rule matched: consume one character and report it as an error token.
        self.pos += 1
        regex_ok = True
        yield JsToken(JsTokenKind.ERROR, head, begin)