Module refinery.lib.scripts.vba.lexer

Expand source code Browse git
from __future__ import annotations

from dataclasses import dataclass
from typing import Generator

from refinery.lib.scripts.vba.token import _KEYWORDS, VbaToken, VbaTokenKind

# Lookup table from a single-character operator or punctuation mark to the kind
# of token it produces. VbaLexer.tokenize consults it only after the
# two-character operators (<>, <=, >=, :=) have been ruled out; a character
# that is missing here is emitted by tokenize as a one-character IDENTIFIER.
_ONE_CHAR_OPS: dict[str, VbaTokenKind] = {
    '+' : VbaTokenKind.PLUS,
    '-' : VbaTokenKind.MINUS,
    '*' : VbaTokenKind.STAR,
    '/' : VbaTokenKind.SLASH,
    '\\': VbaTokenKind.BACKSLASH,
    '^' : VbaTokenKind.CARET,
    '&' : VbaTokenKind.AMPERSAND,
    '=' : VbaTokenKind.EQ,
    '<' : VbaTokenKind.LT,
    '>' : VbaTokenKind.GT,
    '.' : VbaTokenKind.DOT,
    '!' : VbaTokenKind.BANG,
    '(' : VbaTokenKind.LPAREN,
    ')' : VbaTokenKind.RPAREN,
    ',' : VbaTokenKind.COMMA,
    ';' : VbaTokenKind.SEMICOLON,
    ':' : VbaTokenKind.COLON,
}


@dataclass
class VbaLexer:
    """
    Hand-written scanner that turns VBA source text into a stream of `VbaToken` objects.

    The lexer keeps one cursor, `pos`, into `source`. Every private `_read_*` helper consumes a
    single lexeme that starts at the cursor and leaves the cursor directly behind it. The public
    entry point is `VbaLexer.tokenize`.
    """
    # Complete VBA source text that is being scanned.
    source: str
    # Offset of the next unread character in `source`.
    pos: int = 0

    def _peek(self, count: int = 1) -> str:
        """
        Return up to `count` characters starting at the cursor without consuming them.
        """
        return self.source[self.pos:self.pos + count]

    def _at_end(self) -> bool:
        """
        Return whether the cursor has reached the end of the source text.
        """
        return self.pos >= len(self.source)

    def _skip_whitespace(self) -> bool:
        """
        Advance the cursor past spaces and tabs. Line breaks are not skipped. The return value
        tells whether the cursor moved.
        """
        start = self.pos
        src = self.source
        length = len(src)
        while self.pos < length and src[self.pos] in ' \t':
            self.pos += 1
        return self.pos > start

    def _read_string(self) -> str:
        """
        Consume a string literal whose opening quote is at the cursor and return its raw text,
        quotes included. Two consecutive quotes inside the literal are kept as an escaped quote.
        An unterminated literal ends in front of the next line break or at the end of input.
        """
        start = self.pos
        src = self.source
        length = len(src)
        self.pos += 1
        while self.pos < length:
            c = src[self.pos]
            if c == '"':
                self.pos += 1
                if self.pos < length and src[self.pos] == '"':
                    # A doubled quote is an escaped quote, the literal goes on.
                    self.pos += 1
                    continue
                return src[start:self.pos]
            if c in '\r\n':
                return src[start:self.pos]
            self.pos += 1
        return src[start:self.pos]

    def _read_date_literal(self) -> str:
        """
        Consume a `#...#` literal whose opening hash is at the cursor and return its raw text.
        The closing hash is included when present; otherwise the literal ends in front of the
        next line break or at the end of input.
        """
        start = self.pos
        src = self.source
        length = len(src)
        self.pos += 1
        while self.pos < length:
            c = src[self.pos]
            if c == '#':
                self.pos += 1
                return src[start:self.pos]
            if c in '\r\n':
                return src[start:self.pos]
            self.pos += 1
        return src[start:self.pos]

    def _read_number(self) -> VbaToken:
        """
        Consume a numeric literal at the cursor and return it as an INTEGER or FLOAT token.

        Recognized forms are `&H` hexadecimal and `&O` octal literals with an optional `&` or `%`
        suffix, and decimal literals with an optional fraction, an optional exponent introduced
        by `e` or `d`, and an optional type suffix out of `%&!#@`. A literal is a FLOAT when it
        has a fraction or an exponent.
        """
        start = self.pos
        src = self.source
        length = len(src)

        if src[self.pos] == '&' and self.pos + 1 < length:
            nc = src[self.pos + 1].lower()
            if nc == 'h':
                self.pos += 2
                while self.pos < length and src[self.pos] in '0123456789abcdefABCDEF':
                    self.pos += 1
                if self.pos < length and src[self.pos] in '&%':
                    self.pos += 1
                return VbaToken(VbaTokenKind.INTEGER, src[start:self.pos], start)
            if nc == 'o':
                self.pos += 2
                while self.pos < length and src[self.pos] in '01234567':
                    self.pos += 1
                if self.pos < length and src[self.pos] in '&%':
                    self.pos += 1
                return VbaToken(VbaTokenKind.INTEGER, src[start:self.pos], start)

        while self.pos < length and src[self.pos].isdigit():
            self.pos += 1

        is_float = False
        if self.pos < length and src[self.pos] == '.':
            # The dot only belongs to the number when a digit follows it.
            next_pos = self.pos + 1
            if next_pos < length and src[next_pos].isdigit():
                is_float = True
                self.pos += 1
                while self.pos < length and src[self.pos].isdigit():
                    self.pos += 1

        if self.pos < length and src[self.pos] in 'eEdD':
            is_float = True
            self.pos += 1
            if self.pos < length and src[self.pos] in '+-':
                self.pos += 1
            while self.pos < length and src[self.pos].isdigit():
                self.pos += 1

        if self.pos < length and src[self.pos] in '%&!#@':
            self.pos += 1

        kind = VbaTokenKind.FLOAT if is_float else VbaTokenKind.INTEGER
        return VbaToken(kind, src[start:self.pos], start)

    def _read_identifier_or_keyword(self) -> VbaToken:
        """
        Consume a run of alphanumeric characters and underscores at the cursor, optionally
        followed by one type suffix out of `%&!#@$`.

        The word is looked up case-insensitively in `_KEYWORDS`. A keyword token is returned only
        when no suffix was consumed; in every other case the result is an IDENTIFIER whose value
        is the word plus the suffix.
        """
        start = self.pos
        src = self.source
        length = len(src)
        while self.pos < length:
            c = src[self.pos]
            if c.isalnum() or c == '_':
                self.pos += 1
            else:
                break
        word = src[start:self.pos]
        suffix = ''
        if self.pos < length and src[self.pos] in '%&!#@$':
            c_suffix = src[self.pos]
            consume = True
            # An ampersand is a suffix only at the end of input or in front of one of the listed
            # delimiters; otherwise it is left in place for the main loop to tokenize.
            if c_suffix == '&' and not (
                self.pos + 1 >= length
                or src[self.pos + 1] in ' \t\r\n)],;:\x00(.!'
            ):
                consume = False
            # A bang directly followed by a letter, an underscore or an opening bracket is left
            # in place as well (presumably it is the member access operator in that position).
            elif c_suffix == '!' and (
                self.pos + 1 < length
                and (src[self.pos + 1].isalpha() or src[self.pos + 1] in '_[')
            ):
                consume = False
            if consume:
                suffix = c_suffix
                self.pos += 1
        kw = _KEYWORDS.get(word.lower())
        if kw is not None and not suffix:
            return VbaToken(kw, word, start)
        return VbaToken(VbaTokenKind.IDENTIFIER, word + suffix, start)

    def _read_bracket_identifier(self) -> str:
        """
        Consume a bracketed name whose opening bracket is at the cursor and return its raw text.
        The closing bracket is included when present; otherwise the name ends in front of the
        next line break or at the end of input.
        """
        start = self.pos
        src = self.source
        length = len(src)
        self.pos += 1
        while self.pos < length:
            c = src[self.pos]
            if c == ']':
                self.pos += 1
                return src[start:self.pos]
            if c in '\r\n':
                return src[start:self.pos]
            self.pos += 1
        return src[start:self.pos]

    def _read_comment(self) -> str:
        """
        Consume a comment body starting at the cursor and return its raw text.

        The comment normally ends in front of the next line break. When a physical line ends in
        whitespace, an underscore and optional trailing whitespace, it is a VBA line continuation
        and the comment extends across the line break onto the next physical line. The returned
        text then contains the embedded line break. The line break that finally terminates the
        comment is not consumed.
        """
        start = self.pos
        src = self.source
        length = len(src)
        while True:
            while self.pos < length and src[self.pos] not in '\r\n':
                self.pos += 1
            if self.pos >= length:
                break
            # Ignore trailing blanks to find the last significant character of this line. The
            # scan cannot cross a line break because line breaks are not blanks.
            tail = self.pos
            while tail > start and src[tail - 1] in ' \t':
                tail -= 1
            continued = (
                tail - 2 >= start
                and src[tail - 1] == '_'
                and src[tail - 2] in ' \t'
            )
            if not continued:
                break
            # Swallow the line terminator (CRLF counts as one) and keep reading the comment.
            if src[self.pos] == '\r' and self.pos + 1 < length and src[self.pos + 1] == '\n':
                self.pos += 2
            else:
                self.pos += 1
        return src[start:self.pos]

    def tokenize(self) -> Generator[VbaToken, None, None]:
        """
        Scan the source from the current cursor and yield tokens until the end of input. The last
        token is always an EOF token.

        Line continuations and the conditional compilation directives `#If`, `#ElseIf`, `#Else`,
        `#End`, `#EndIf` and `#Const` at the start of a line produce no token. Runs of line breaks
        are collapsed: a NEWLINE token is only produced after a token other than COMMENT.
        """
        src = self.source
        length = len(src)
        # True until the first token that is neither NEWLINE nor COMMENT has been produced on the
        # current logical line; used to collapse blank lines and to recognize directives.
        last_was_newline = True

        while True:
            self._skip_whitespace()
            if self._at_end():
                yield VbaToken(VbaTokenKind.EOF, '', self.pos)
                return

            start = self.pos
            c = src[self.pos]

            if c == '_':
                # An underscore followed only by blanks up to the line break is a line
                # continuation: swallow it together with the line break. Any other underscore
                # falls through and starts an identifier.
                p = self.pos + 1
                while p < length and src[p] in ' \t':
                    p += 1
                if p >= length or src[p] in '\r\n':
                    self.pos = p
                    if self.pos < length and src[self.pos] == '\r':
                        self.pos += 1
                    if self.pos < length and src[self.pos] == '\n':
                        self.pos += 1
                    continue

            if c == '\r' or c == '\n':
                if c == '\r' and self.pos + 1 < length and src[self.pos + 1] == '\n':
                    self.pos += 2
                else:
                    self.pos += 1
                if not last_was_newline:
                    yield VbaToken(VbaTokenKind.NEWLINE, '\n', start)
                    last_was_newline = True
                continue

            if c == "'":
                # Comments deliberately leave last_was_newline untouched.
                text = self._read_comment()
                yield VbaToken(VbaTokenKind.COMMENT, text, start)
                continue

            if c == '"':
                text = self._read_string()
                last_was_newline = False
                yield VbaToken(VbaTokenKind.STRING, text, start)
                continue

            if c == '#':
                if last_was_newline and self.pos + 1 < length and src[self.pos + 1].isalpha():
                    peek = self.pos + 1
                    while peek < length and src[peek].isalpha():
                        peek += 1
                    word = src[self.pos + 1:peek].lower()
                    if word in ('if', 'elseif', 'else', 'end', 'endif', 'const'):
                        # Conditional compilation directive: drop the rest of the line.
                        while self.pos < length and src[self.pos] not in '\r\n':
                            self.pos += 1
                        continue
                if self.pos + 1 < length and not src[self.pos + 1].isspace():
                    text = self._read_date_literal()
                    last_was_newline = False
                    yield VbaToken(VbaTokenKind.DATE_LITERAL, text, start)
                    continue
                self.pos += 1
                last_was_newline = False
                yield VbaToken(VbaTokenKind.IDENTIFIER, '#', start)
                continue

            if c == '&' and self.pos + 1 < length and src[self.pos + 1].lower() in 'ho':
                tok = self._read_number()
                last_was_newline = False
                yield tok
                continue

            if c.isdigit() or (c == '.' and self.pos + 1 < length and src[self.pos + 1].isdigit()):
                tok = self._read_number()
                last_was_newline = False
                yield tok
                continue

            if c.isalpha() or c == '_':
                tok = self._read_identifier_or_keyword()
                if tok.kind in (
                    VbaTokenKind.BOOLEAN_TRUE,
                    VbaTokenKind.BOOLEAN_FALSE,
                ):
                    last_was_newline = False
                    yield tok
                    continue
                if tok.value.lower() == 'rem':
                    # A Rem statement is a comment that runs to the end of the line.
                    text = self._read_comment()
                    yield VbaToken(VbaTokenKind.COMMENT, tok.value + text, start)
                    continue
                last_was_newline = False
                yield tok
                continue

            if c == '[':
                text = self._read_bracket_identifier()
                last_was_newline = False
                yield VbaToken(VbaTokenKind.IDENTIFIER, text, start)
                continue

            # Two-character operators take precedence over their one-character prefixes.
            c2 = src[self.pos:self.pos + 2]
            if c2 == '<>':
                self.pos += 2
                last_was_newline = False
                yield VbaToken(VbaTokenKind.NEQ, '<>', start)
                continue
            if c2 == '<=':
                self.pos += 2
                last_was_newline = False
                yield VbaToken(VbaTokenKind.LTE, '<=', start)
                continue
            if c2 == '>=':
                self.pos += 2
                last_was_newline = False
                yield VbaToken(VbaTokenKind.GTE, '>=', start)
                continue
            if c2 == ':=':
                self.pos += 2
                last_was_newline = False
                yield VbaToken(VbaTokenKind.ASSIGN, ':=', start)
                continue

            op_kind = _ONE_CHAR_OPS.get(c)
            if op_kind is not None:
                self.pos += 1
                last_was_newline = False
                yield VbaToken(op_kind, c, start)
                continue

            # Anything unrecognized is passed on as a one-character IDENTIFIER.
            self.pos += 1
            last_was_newline = False
            yield VbaToken(VbaTokenKind.IDENTIFIER, c, start)

Classes

class VbaLexer (source, pos=0)

VbaLexer(source: 'str', pos: 'int' = 0)

Expand source code Browse git
@dataclass
class VbaLexer:
    """
    Hand-written scanner that turns VBA source text into a stream of `VbaToken` objects.

    The lexer keeps one cursor, `pos`, into `source`. Every private `_read_*` helper consumes a
    single lexeme that starts at the cursor and leaves the cursor directly behind it. The public
    entry point is `VbaLexer.tokenize`.
    """
    # Complete VBA source text that is being scanned.
    source: str
    # Offset of the next unread character in `source`.
    pos: int = 0

    def _peek(self, count: int = 1) -> str:
        """
        Return up to `count` characters starting at the cursor without consuming them.
        """
        return self.source[self.pos:self.pos + count]

    def _at_end(self) -> bool:
        """
        Return whether the cursor has reached the end of the source text.
        """
        return self.pos >= len(self.source)

    def _skip_whitespace(self) -> bool:
        """
        Advance the cursor past spaces and tabs. Line breaks are not skipped. The return value
        tells whether the cursor moved.
        """
        start = self.pos
        src = self.source
        length = len(src)
        while self.pos < length and src[self.pos] in ' \t':
            self.pos += 1
        return self.pos > start

    def _read_string(self) -> str:
        """
        Consume a string literal whose opening quote is at the cursor and return its raw text,
        quotes included. Two consecutive quotes inside the literal are kept as an escaped quote.
        An unterminated literal ends in front of the next line break or at the end of input.
        """
        start = self.pos
        src = self.source
        length = len(src)
        self.pos += 1
        while self.pos < length:
            c = src[self.pos]
            if c == '"':
                self.pos += 1
                if self.pos < length and src[self.pos] == '"':
                    # A doubled quote is an escaped quote, the literal goes on.
                    self.pos += 1
                    continue
                return src[start:self.pos]
            if c in '\r\n':
                return src[start:self.pos]
            self.pos += 1
        return src[start:self.pos]

    def _read_date_literal(self) -> str:
        """
        Consume a `#...#` literal whose opening hash is at the cursor and return its raw text.
        The closing hash is included when present; otherwise the literal ends in front of the
        next line break or at the end of input.
        """
        start = self.pos
        src = self.source
        length = len(src)
        self.pos += 1
        while self.pos < length:
            c = src[self.pos]
            if c == '#':
                self.pos += 1
                return src[start:self.pos]
            if c in '\r\n':
                return src[start:self.pos]
            self.pos += 1
        return src[start:self.pos]

    def _read_number(self) -> VbaToken:
        """
        Consume a numeric literal at the cursor and return it as an INTEGER or FLOAT token.

        Recognized forms are `&H` hexadecimal and `&O` octal literals with an optional `&` or `%`
        suffix, and decimal literals with an optional fraction, an optional exponent introduced
        by `e` or `d`, and an optional type suffix out of `%&!#@`. A literal is a FLOAT when it
        has a fraction or an exponent.
        """
        start = self.pos
        src = self.source
        length = len(src)

        if src[self.pos] == '&' and self.pos + 1 < length:
            nc = src[self.pos + 1].lower()
            if nc == 'h':
                self.pos += 2
                while self.pos < length and src[self.pos] in '0123456789abcdefABCDEF':
                    self.pos += 1
                if self.pos < length and src[self.pos] in '&%':
                    self.pos += 1
                return VbaToken(VbaTokenKind.INTEGER, src[start:self.pos], start)
            if nc == 'o':
                self.pos += 2
                while self.pos < length and src[self.pos] in '01234567':
                    self.pos += 1
                if self.pos < length and src[self.pos] in '&%':
                    self.pos += 1
                return VbaToken(VbaTokenKind.INTEGER, src[start:self.pos], start)

        while self.pos < length and src[self.pos].isdigit():
            self.pos += 1

        is_float = False
        if self.pos < length and src[self.pos] == '.':
            # The dot only belongs to the number when a digit follows it.
            next_pos = self.pos + 1
            if next_pos < length and src[next_pos].isdigit():
                is_float = True
                self.pos += 1
                while self.pos < length and src[self.pos].isdigit():
                    self.pos += 1

        if self.pos < length and src[self.pos] in 'eEdD':
            is_float = True
            self.pos += 1
            if self.pos < length and src[self.pos] in '+-':
                self.pos += 1
            while self.pos < length and src[self.pos].isdigit():
                self.pos += 1

        if self.pos < length and src[self.pos] in '%&!#@':
            self.pos += 1

        kind = VbaTokenKind.FLOAT if is_float else VbaTokenKind.INTEGER
        return VbaToken(kind, src[start:self.pos], start)

    def _read_identifier_or_keyword(self) -> VbaToken:
        """
        Consume a run of alphanumeric characters and underscores at the cursor, optionally
        followed by one type suffix out of `%&!#@$`.

        The word is looked up case-insensitively in `_KEYWORDS`. A keyword token is returned only
        when no suffix was consumed; in every other case the result is an IDENTIFIER whose value
        is the word plus the suffix.
        """
        start = self.pos
        src = self.source
        length = len(src)
        while self.pos < length:
            c = src[self.pos]
            if c.isalnum() or c == '_':
                self.pos += 1
            else:
                break
        word = src[start:self.pos]
        suffix = ''
        if self.pos < length and src[self.pos] in '%&!#@$':
            c_suffix = src[self.pos]
            consume = True
            # An ampersand is a suffix only at the end of input or in front of one of the listed
            # delimiters; otherwise it is left in place for the main loop to tokenize.
            if c_suffix == '&' and not (
                self.pos + 1 >= length
                or src[self.pos + 1] in ' \t\r\n)],;:\x00(.!'
            ):
                consume = False
            # A bang directly followed by a letter, an underscore or an opening bracket is left
            # in place as well (presumably it is the member access operator in that position).
            elif c_suffix == '!' and (
                self.pos + 1 < length
                and (src[self.pos + 1].isalpha() or src[self.pos + 1] in '_[')
            ):
                consume = False
            if consume:
                suffix = c_suffix
                self.pos += 1
        kw = _KEYWORDS.get(word.lower())
        if kw is not None and not suffix:
            return VbaToken(kw, word, start)
        return VbaToken(VbaTokenKind.IDENTIFIER, word + suffix, start)

    def _read_bracket_identifier(self) -> str:
        """
        Consume a bracketed name whose opening bracket is at the cursor and return its raw text.
        The closing bracket is included when present; otherwise the name ends in front of the
        next line break or at the end of input.
        """
        start = self.pos
        src = self.source
        length = len(src)
        self.pos += 1
        while self.pos < length:
            c = src[self.pos]
            if c == ']':
                self.pos += 1
                return src[start:self.pos]
            if c in '\r\n':
                return src[start:self.pos]
            self.pos += 1
        return src[start:self.pos]

    def _read_comment(self) -> str:
        """
        Consume a comment body starting at the cursor and return its raw text.

        The comment normally ends in front of the next line break. When a physical line ends in
        whitespace, an underscore and optional trailing whitespace, it is a VBA line continuation
        and the comment extends across the line break onto the next physical line. The returned
        text then contains the embedded line break. The line break that finally terminates the
        comment is not consumed.
        """
        start = self.pos
        src = self.source
        length = len(src)
        while True:
            while self.pos < length and src[self.pos] not in '\r\n':
                self.pos += 1
            if self.pos >= length:
                break
            # Ignore trailing blanks to find the last significant character of this line. The
            # scan cannot cross a line break because line breaks are not blanks.
            tail = self.pos
            while tail > start and src[tail - 1] in ' \t':
                tail -= 1
            continued = (
                tail - 2 >= start
                and src[tail - 1] == '_'
                and src[tail - 2] in ' \t'
            )
            if not continued:
                break
            # Swallow the line terminator (CRLF counts as one) and keep reading the comment.
            if src[self.pos] == '\r' and self.pos + 1 < length and src[self.pos + 1] == '\n':
                self.pos += 2
            else:
                self.pos += 1
        return src[start:self.pos]

    def tokenize(self) -> Generator[VbaToken, None, None]:
        """
        Scan the source from the current cursor and yield tokens until the end of input. The last
        token is always an EOF token.

        Line continuations and the conditional compilation directives `#If`, `#ElseIf`, `#Else`,
        `#End`, `#EndIf` and `#Const` at the start of a line produce no token. Runs of line breaks
        are collapsed: a NEWLINE token is only produced after a token other than COMMENT.
        """
        src = self.source
        length = len(src)
        # True until the first token that is neither NEWLINE nor COMMENT has been produced on the
        # current logical line; used to collapse blank lines and to recognize directives.
        last_was_newline = True

        while True:
            self._skip_whitespace()
            if self._at_end():
                yield VbaToken(VbaTokenKind.EOF, '', self.pos)
                return

            start = self.pos
            c = src[self.pos]

            if c == '_':
                # An underscore followed only by blanks up to the line break is a line
                # continuation: swallow it together with the line break. Any other underscore
                # falls through and starts an identifier.
                p = self.pos + 1
                while p < length and src[p] in ' \t':
                    p += 1
                if p >= length or src[p] in '\r\n':
                    self.pos = p
                    if self.pos < length and src[self.pos] == '\r':
                        self.pos += 1
                    if self.pos < length and src[self.pos] == '\n':
                        self.pos += 1
                    continue

            if c == '\r' or c == '\n':
                if c == '\r' and self.pos + 1 < length and src[self.pos + 1] == '\n':
                    self.pos += 2
                else:
                    self.pos += 1
                if not last_was_newline:
                    yield VbaToken(VbaTokenKind.NEWLINE, '\n', start)
                    last_was_newline = True
                continue

            if c == "'":
                # Comments deliberately leave last_was_newline untouched.
                text = self._read_comment()
                yield VbaToken(VbaTokenKind.COMMENT, text, start)
                continue

            if c == '"':
                text = self._read_string()
                last_was_newline = False
                yield VbaToken(VbaTokenKind.STRING, text, start)
                continue

            if c == '#':
                if last_was_newline and self.pos + 1 < length and src[self.pos + 1].isalpha():
                    peek = self.pos + 1
                    while peek < length and src[peek].isalpha():
                        peek += 1
                    word = src[self.pos + 1:peek].lower()
                    if word in ('if', 'elseif', 'else', 'end', 'endif', 'const'):
                        # Conditional compilation directive: drop the rest of the line.
                        while self.pos < length and src[self.pos] not in '\r\n':
                            self.pos += 1
                        continue
                if self.pos + 1 < length and not src[self.pos + 1].isspace():
                    text = self._read_date_literal()
                    last_was_newline = False
                    yield VbaToken(VbaTokenKind.DATE_LITERAL, text, start)
                    continue
                self.pos += 1
                last_was_newline = False
                yield VbaToken(VbaTokenKind.IDENTIFIER, '#', start)
                continue

            if c == '&' and self.pos + 1 < length and src[self.pos + 1].lower() in 'ho':
                tok = self._read_number()
                last_was_newline = False
                yield tok
                continue

            if c.isdigit() or (c == '.' and self.pos + 1 < length and src[self.pos + 1].isdigit()):
                tok = self._read_number()
                last_was_newline = False
                yield tok
                continue

            if c.isalpha() or c == '_':
                tok = self._read_identifier_or_keyword()
                if tok.kind in (
                    VbaTokenKind.BOOLEAN_TRUE,
                    VbaTokenKind.BOOLEAN_FALSE,
                ):
                    last_was_newline = False
                    yield tok
                    continue
                if tok.value.lower() == 'rem':
                    # A Rem statement is a comment that runs to the end of the line.
                    text = self._read_comment()
                    yield VbaToken(VbaTokenKind.COMMENT, tok.value + text, start)
                    continue
                last_was_newline = False
                yield tok
                continue

            if c == '[':
                text = self._read_bracket_identifier()
                last_was_newline = False
                yield VbaToken(VbaTokenKind.IDENTIFIER, text, start)
                continue

            # Two-character operators take precedence over their one-character prefixes.
            c2 = src[self.pos:self.pos + 2]
            if c2 == '<>':
                self.pos += 2
                last_was_newline = False
                yield VbaToken(VbaTokenKind.NEQ, '<>', start)
                continue
            if c2 == '<=':
                self.pos += 2
                last_was_newline = False
                yield VbaToken(VbaTokenKind.LTE, '<=', start)
                continue
            if c2 == '>=':
                self.pos += 2
                last_was_newline = False
                yield VbaToken(VbaTokenKind.GTE, '>=', start)
                continue
            if c2 == ':=':
                self.pos += 2
                last_was_newline = False
                yield VbaToken(VbaTokenKind.ASSIGN, ':=', start)
                continue

            op_kind = _ONE_CHAR_OPS.get(c)
            if op_kind is not None:
                self.pos += 1
                last_was_newline = False
                yield VbaToken(op_kind, c, start)
                continue

            # Anything unrecognized is passed on as a one-character IDENTIFIER.
            self.pos += 1
            last_was_newline = False
            yield VbaToken(VbaTokenKind.IDENTIFIER, c, start)

Instance variables

var source

The VBA source text to be tokenized.

var pos

The offset of the next unread character in source; defaults to 0.

Methods

def tokenize(self)
Expand source code Browse git
def tokenize(self) -> Generator[VbaToken, None, None]:
    """
    Scan the source from the current cursor and yield tokens until the end of input. The last
    token is always an EOF token.

    Line continuations and the conditional compilation directives `#If`, `#ElseIf`, `#Else`,
    `#End`, `#EndIf` and `#Const` at the start of a line produce no token. Runs of line breaks
    are collapsed: a NEWLINE token is only produced after a token other than COMMENT.
    """
    text = self.source
    end = len(text)
    # Stays set until a token other than NEWLINE or COMMENT was produced on the current line.
    at_line_start = True

    while True:
        self._skip_whitespace()
        if self._at_end():
            yield VbaToken(VbaTokenKind.EOF, '', self.pos)
            return

        begin = self.pos
        ch = text[begin]
        # The character after `ch`, or the empty string at the end of input.
        follow = text[begin + 1:begin + 2]

        # Line continuation: an underscore with nothing but blanks up to the line break.
        if ch == '_':
            probe = begin + 1
            while probe < end and text[probe] in ' \t':
                probe += 1
            if probe >= end or text[probe] in '\r\n':
                if text.startswith('\r', probe):
                    probe += 1
                if text.startswith('\n', probe):
                    probe += 1
                self.pos = probe
                continue

        # Line break; CRLF counts as a single one.
        if ch in ('\r', '\n'):
            self.pos = begin + (2 if ch == '\r' and follow == '\n' else 1)
            if not at_line_start:
                yield VbaToken(VbaTokenKind.NEWLINE, '\n', begin)
                at_line_start = True
            continue

        # Apostrophe comment; it does not touch the line-start flag.
        if ch == "'":
            yield VbaToken(VbaTokenKind.COMMENT, self._read_comment(), begin)
            continue

        # Every remaining branch either builds exactly one token, which is emitted at the bottom
        # of the loop, or bails out with `continue`.
        if ch == '"':
            token = VbaToken(VbaTokenKind.STRING, self._read_string(), begin)
        elif ch == '#':
            if at_line_start and follow.isalpha():
                stop = begin + 1
                while stop < end and text[stop].isalpha():
                    stop += 1
                directive = text[begin + 1:stop].lower()
                if directive in ('if', 'elseif', 'else', 'end', 'endif', 'const'):
                    # Conditional compilation directive: drop the rest of the line.
                    while self.pos < end and text[self.pos] not in '\r\n':
                        self.pos += 1
                    continue
            if follow and not follow.isspace():
                token = VbaToken(VbaTokenKind.DATE_LITERAL, self._read_date_literal(), begin)
            else:
                self.pos += 1
                token = VbaToken(VbaTokenKind.IDENTIFIER, '#', begin)
        elif (
            (ch == '&' and follow and follow.lower() in 'ho')
            or ch.isdigit()
            or (ch == '.' and follow.isdigit())
        ):
            token = self._read_number()
        elif ch.isalpha() or ch == '_':
            token = self._read_identifier_or_keyword()
            is_boolean = token.kind in (
                VbaTokenKind.BOOLEAN_TRUE,
                VbaTokenKind.BOOLEAN_FALSE,
            )
            if not is_boolean and token.value.lower() == 'rem':
                # A Rem statement is a comment that runs to the end of the line.
                remainder = self._read_comment()
                yield VbaToken(VbaTokenKind.COMMENT, token.value + remainder, begin)
                continue
        elif ch == '[':
            token = VbaToken(VbaTokenKind.IDENTIFIER, self._read_bracket_identifier(), begin)
        else:
            # Operators: the two-character ones win over their one-character prefixes, and a
            # character without an entry in the operator table becomes an IDENTIFIER.
            lexeme = text[begin:begin + 2]
            if lexeme == '<>':
                kind = VbaTokenKind.NEQ
            elif lexeme == '<=':
                kind = VbaTokenKind.LTE
            elif lexeme == '>=':
                kind = VbaTokenKind.GTE
            elif lexeme == ':=':
                kind = VbaTokenKind.ASSIGN
            else:
                lexeme = ch
                kind = _ONE_CHAR_OPS.get(ch)
                if kind is None:
                    kind = VbaTokenKind.IDENTIFIER
            self.pos += len(lexeme)
            token = VbaToken(kind, lexeme, begin)

        at_line_start = False
        yield token