Module `refinery.lib.scripts.vba.lexer`

Expand source code Browse git

from __future__ import annotations

from dataclasses import dataclass
from typing import Generator

from refinery.lib.scripts.vba.token import _KEYWORDS, VbaToken, VbaTokenKind

# Operators and punctuation spelled with exactly one character, keyed by that
# character. The lexer consults this table only after the two-character table
# below has failed to match.
_ONE_CHAR_OPS: dict[str, VbaTokenKind] = {
    # arithmetic and concatenation
    '+': VbaTokenKind.PLUS,
    '-': VbaTokenKind.MINUS,
    '*': VbaTokenKind.STAR,
    '/': VbaTokenKind.SLASH,
    '\\': VbaTokenKind.BACKSLASH,
    '^': VbaTokenKind.CARET,
    '&': VbaTokenKind.AMPERSAND,
    # comparison
    '=': VbaTokenKind.EQ,
    '<': VbaTokenKind.LT,
    '>': VbaTokenKind.GT,
    # member access
    '.': VbaTokenKind.DOT,
    '!': VbaTokenKind.BANG,
    # grouping and separators
    '(': VbaTokenKind.LPAREN,
    ')': VbaTokenKind.RPAREN,
    ',': VbaTokenKind.COMMA,
    ';': VbaTokenKind.SEMICOLON,
    ':': VbaTokenKind.COLON,
}

# Operators spelled with two characters; these take precedence over the
# single-character table so that e.g. `<=` is not split into `<` and `=`.
_TWO_CHAR_OPS: dict[str, VbaTokenKind] = {
    '<>': VbaTokenKind.NEQ,
    '<=': VbaTokenKind.LTE,
    '>=': VbaTokenKind.GTE,
    ':=': VbaTokenKind.ASSIGN,
}

# Type-declaration suffix characters accepted after a numeric literal.
_TYPE_SUFFIXES_NO_DOLLAR = '%&!#@'
# Identifiers accept the same suffixes and additionally `$`.
_TYPE_SUFFIXES = _TYPE_SUFFIXES_NO_DOLLAR + '$'
# An `&` directly after an identifier is only taken as a type suffix when it is
# the last character of the input or is followed by one of these characters.
_AMPERSAND_STOP_CHARS = ' \t\r\n)],;:\x00(.!'


@dataclass
class VbaLexer:
    """
    Hand-written lexer that converts VBA source text into a stream of `VbaToken` objects.

    Attributes:
        source: The text to tokenize.
        pos: Index into `source` of the next unread character. Every read helper advances it.
    """
    source: str
    pos: int = 0

    def _at_end(self) -> bool:
        """Return whether the read position has reached or passed the end of the source."""
        return self.pos >= len(self.source)

    def _skip_whitespace(self) -> None:
        """Advance past spaces and tabs. Line breaks are significant and are not skipped."""
        src = self.source
        length = len(src)
        while self.pos < length and src[self.pos] in ' \t':
            self.pos += 1

    def _read_delimited(self, terminator: str, allow_doubling: bool = False) -> str:
        """
        Read a single-line delimited literal that begins at the current position.

        The character at the current position is taken to be the opening delimiter and is
        consumed unconditionally. Reading stops after the closing `terminator`, or right before
        the first line break, or at the end of the input, whichever comes first. When
        `allow_doubling` is set, two consecutive terminators are an escaped terminator and do
        not end the literal.

        Args:
            terminator: The single, non-line-break character that closes the literal.
            allow_doubling: Whether a doubled terminator is an escape sequence.

        Returns:
            The raw source text of the literal, including the opening delimiter and, if one was
            found, the closing terminator.
        """
        start = self.pos
        src = self.source
        length = len(src)
        pos = start + 1
        # The literal can never extend past the end of the current line, so that boundary is
        # located once instead of being searched again after every doubled terminator.
        line_end = src.find('\n', pos)
        if line_end < 0:
            line_end = length
        cr = src.find('\r', pos, line_end)
        if cr >= 0:
            line_end = cr
        while True:
            hit = src.find(terminator, pos, line_end)
            if hit < 0:
                # unterminated: the literal runs up to the line break or the end of the input
                pos = line_end
                break
            pos = hit + 1
            if allow_doubling and pos < line_end and src[pos] == terminator:
                # doubled terminator: part of the literal, keep scanning behind it
                pos += 1
                continue
            break
        self.pos = pos
        return src[start:pos]

    def _read_number(self) -> VbaToken:
        """
        Read a numeric literal that begins at the current position.

        Recognized forms are `&H` (hexadecimal) and `&O` (octal) literals with an optional `&`
        or `%` suffix, and decimal literals with an optional fraction, an optional exponent
        introduced by `e`, `E`, `d` or `D`, and an optional type suffix out of `%&!#@`.

        Returns:
            A FLOAT token when a fraction or an exponent was read, otherwise an INTEGER token.
            The token value is the raw source text of the literal.
        """
        start = self.pos
        src = self.source
        length = len(src)

        if src[self.pos] == '&' and self.pos + 1 < length:
            nc = src[self.pos + 1].lower()
            if nc == 'h':
                digits = '0123456789abcdefABCDEF'
            elif nc == 'o':
                digits = '01234567'
            else:
                digits = ''
            if digits:
                self.pos += 2
                while self.pos < length and src[self.pos] in digits:
                    self.pos += 1
                if self.pos < length and src[self.pos] in '&%':
                    self.pos += 1
                return VbaToken(VbaTokenKind.INTEGER, src[start:self.pos], start)

        while self.pos < length and src[self.pos].isdigit():
            self.pos += 1

        is_float = False
        if self.pos < length and src[self.pos] == '.':
            # A dot only belongs to the number when a digit follows; otherwise it is left for
            # the caller to lex as a separate token.
            next_pos = self.pos + 1
            if next_pos < length and src[next_pos].isdigit():
                is_float = True
                self.pos += 1
                while self.pos < length and src[self.pos].isdigit():
                    self.pos += 1

        if self.pos < length and src[self.pos] in 'eEdD':
            # Only commit to an exponent when at least one digit follows the marker and the
            # optional sign. Otherwise the letter starts the next token: `1Else` is the
            # integer `1` followed by the word `Else`, not the float `1E` followed by `lse`.
            exp = self.pos + 1
            if exp < length and src[exp] in '+-':
                exp += 1
            if exp < length and src[exp].isdigit():
                is_float = True
                self.pos = exp
                while self.pos < length and src[self.pos].isdigit():
                    self.pos += 1

        if self.pos < length and src[self.pos] in _TYPE_SUFFIXES_NO_DOLLAR:
            self.pos += 1

        kind = VbaTokenKind.FLOAT if is_float else VbaTokenKind.INTEGER
        return VbaToken(kind, src[start:self.pos], start)

    def _read_identifier_or_keyword(self) -> VbaToken:
        """
        Read a word of alphanumeric characters and underscores, plus an optional type suffix.

        Returns:
            A keyword token when the lower-cased word is listed in `_KEYWORDS` and no type suffix
            was consumed; otherwise an IDENTIFIER token whose value includes the suffix.
        """
        start = self.pos
        src = self.source
        length = len(src)
        while self.pos < length:
            c = src[self.pos]
            if c.isalnum() or c == '_':
                self.pos += 1
            else:
                break
        word = src[start:self.pos]
        suffix = ''
        if self.pos < length and src[self.pos] in _TYPE_SUFFIXES:
            c_suffix = src[self.pos]
            consume = True
            # `&` is only a suffix when it ends the input or is followed by one of the stop
            # characters; in any other position it is left to be lexed as an operator.
            if c_suffix == '&' and not (
                self.pos + 1 >= length
                or src[self.pos + 1] in _AMPERSAND_STOP_CHARS
            ):
                consume = False
            # `!` directly followed by a letter, an underscore or `[` is not a suffix; it is
            # left in place and later produces a BANG token.
            elif c_suffix == '!' and (
                self.pos + 1 < length
                and (src[self.pos + 1].isalpha() or src[self.pos + 1] in '_[')
            ):
                consume = False
            if consume:
                suffix = c_suffix
                self.pos += 1
        kw = _KEYWORDS.get(word.lower())
        if kw is not None and not suffix:
            return VbaToken(kw, word, start)
        return VbaToken(VbaTokenKind.IDENTIFIER, word + suffix, start)

    def _read_comment(self) -> str:
        """Consume and return everything from the current position up to the next line break."""
        start = self.pos
        src = self.source
        length = len(src)
        while self.pos < length and src[self.pos] not in '\r\n':
            self.pos += 1
        return src[start:self.pos]

    def tokenize(self) -> Generator[VbaToken, None, None]:
        """
        Lex the source from the current position onward, yielding one `VbaToken` at a time.

        The stream always ends with a single EOF token. Runs of line breaks collapse into one
        NEWLINE token, and line breaks that occur before any other token has been produced are
        dropped. COMMENT tokens and skipped `#` directives do not count as "other tokens" for
        that purpose, so a line holding only a comment produces no NEWLINE token.
        """
        src = self.source
        length = len(src)
        # True while no token other than a comment has been emitted on the current logical line.
        last_was_newline = True

        while True:
            self._skip_whitespace()
            if self._at_end():
                yield VbaToken(VbaTokenKind.EOF, '', self.pos)
                return

            start = self.pos
            c = src[self.pos]

            # Line continuation: an underscore followed only by blanks up to the end of the
            # line. The line break is swallowed without emitting a NEWLINE token.
            if c == '_':
                p = self.pos + 1
                while p < length and src[p] in ' \t':
                    p += 1
                if p >= length or src[p] in '\r\n':
                    self.pos = p
                    if self.pos < length and src[self.pos] == '\r':
                        self.pos += 1
                    if self.pos < length and src[self.pos] == '\n':
                        self.pos += 1
                    continue

            # Line break: CRLF counts as one break; consecutive breaks yield a single token.
            if c == '\r' or c == '\n':
                if c == '\r' and self.pos + 1 < length and src[self.pos + 1] == '\n':
                    self.pos += 2
                else:
                    self.pos += 1
                if not last_was_newline:
                    yield VbaToken(VbaTokenKind.NEWLINE, '\n', start)
                    last_was_newline = True
                continue

            # Apostrophe comment; deliberately leaves last_was_newline untouched.
            if c == "'":
                text = self._read_comment()
                yield VbaToken(VbaTokenKind.COMMENT, text, start)
                continue

            # String literal; a doubled quote inside the literal is an escaped quote.
            if c == '"':
                text = self._read_delimited('"', allow_doubling=True)
                last_was_newline = False
                yield VbaToken(VbaTokenKind.STRING, text, start)
                continue

            if c == '#':
                # At the start of a logical line, `#` plus one of the listed words is a
                # conditional-compilation directive: the rest of the line is skipped and no
                # token is produced for it.
                if last_was_newline and self.pos + 1 < length and src[self.pos + 1].isalpha():
                    peek = self.pos + 1
                    while peek < length and src[peek].isalpha():
                        peek += 1
                    word = src[self.pos + 1:peek].lower()
                    if word in ('if', 'elseif', 'else', 'end', 'endif', 'const'):
                        while self.pos < length and src[self.pos] not in '\r\n':
                            self.pos += 1
                        continue
                # NOTE(review): every `#` followed by a non-space character takes this path,
                # including file-number syntax such as `#1`, which then extends to the next
                # `#` or the end of the line. Confirm that the parser expects this.
                if self.pos + 1 < length and not src[self.pos + 1].isspace():
                    text = self._read_delimited('#')
                    last_was_newline = False
                    yield VbaToken(VbaTokenKind.DATE_LITERAL, text, start)
                    continue
                self.pos += 1
                last_was_newline = False
                yield VbaToken(VbaTokenKind.IDENTIFIER, '#', start)
                continue

            # `&H...` / `&O...` literals must be tried before `&` is treated as an operator.
            if c == '&' and self.pos + 1 < length and src[self.pos + 1].lower() in 'ho':
                tok = self._read_number()
                last_was_newline = False
                yield tok
                continue

            # Decimal literal, including the leading-dot form such as `.5`.
            if c.isdigit() or (c == '.' and self.pos + 1 < length and src[self.pos + 1].isdigit()):
                tok = self._read_number()
                last_was_newline = False
                yield tok
                continue

            if c.isalpha() or c == '_':
                tok = self._read_identifier_or_keyword()
                # The word `Rem` turns the remainder of the line into a comment.
                if tok.value.lower() == 'rem':
                    text = self._read_comment()
                    yield VbaToken(VbaTokenKind.COMMENT, tok.value + text, start)
                    continue
                last_was_newline = False
                yield tok
                continue

            # Bracketed name: everything up to the closing `]` becomes one IDENTIFIER token.
            if c == '[':
                text = self._read_delimited(']')
                last_was_newline = False
                yield VbaToken(VbaTokenKind.IDENTIFIER, text, start)
                continue

            # Two-character operators take precedence over their one-character prefixes.
            c2 = src[self.pos:self.pos + 2]
            two_kind = _TWO_CHAR_OPS.get(c2)
            if two_kind is not None:
                self.pos += 2
                last_was_newline = False
                yield VbaToken(two_kind, c2, start)
                continue

            op_kind = _ONE_CHAR_OPS.get(c)
            if op_kind is not None:
                self.pos += 1
                last_was_newline = False
                yield VbaToken(op_kind, c, start)
                continue

            # Any other character is passed through as a one-character IDENTIFIER token so
            # that the lexer always makes progress.
            self.pos += 1
            last_was_newline = False
            yield VbaToken(VbaTokenKind.IDENTIFIER, c, start)

Classes

class VbaLexer (source, pos=0)

VbaLexer(source: 'str', pos: 'int' = 0)

Expand source code Browse git

@dataclass
class VbaLexer:
    """
    Hand-written lexer that converts VBA source text into a stream of `VbaToken` objects.

    Attributes:
        source: The text to tokenize.
        pos: Index into `source` of the next unread character. Every read helper advances it.
    """
    source: str
    pos: int = 0

    def _at_end(self) -> bool:
        """Return whether the read position has reached or passed the end of the source."""
        return self.pos >= len(self.source)

    def _skip_whitespace(self) -> None:
        """Advance past spaces and tabs. Line breaks are significant and are not skipped."""
        src = self.source
        length = len(src)
        while self.pos < length and src[self.pos] in ' \t':
            self.pos += 1

    def _read_delimited(self, terminator: str, allow_doubling: bool = False) -> str:
        """
        Read a single-line delimited literal that begins at the current position.

        The character at the current position is taken to be the opening delimiter and is
        consumed unconditionally. Reading stops after the closing `terminator`, or right before
        the first line break, or at the end of the input, whichever comes first. When
        `allow_doubling` is set, two consecutive terminators are an escaped terminator and do
        not end the literal.

        Args:
            terminator: The single, non-line-break character that closes the literal.
            allow_doubling: Whether a doubled terminator is an escape sequence.

        Returns:
            The raw source text of the literal, including the opening delimiter and, if one was
            found, the closing terminator.
        """
        start = self.pos
        src = self.source
        length = len(src)
        pos = start + 1
        # The literal can never extend past the end of the current line, so that boundary is
        # located once instead of being searched again after every doubled terminator.
        line_end = src.find('\n', pos)
        if line_end < 0:
            line_end = length
        cr = src.find('\r', pos, line_end)
        if cr >= 0:
            line_end = cr
        while True:
            hit = src.find(terminator, pos, line_end)
            if hit < 0:
                # unterminated: the literal runs up to the line break or the end of the input
                pos = line_end
                break
            pos = hit + 1
            if allow_doubling and pos < line_end and src[pos] == terminator:
                # doubled terminator: part of the literal, keep scanning behind it
                pos += 1
                continue
            break
        self.pos = pos
        return src[start:pos]

    def _read_number(self) -> VbaToken:
        """
        Read a numeric literal that begins at the current position.

        Recognized forms are `&H` (hexadecimal) and `&O` (octal) literals with an optional `&`
        or `%` suffix, and decimal literals with an optional fraction, an optional exponent
        introduced by `e`, `E`, `d` or `D`, and an optional type suffix out of `%&!#@`.

        Returns:
            A FLOAT token when a fraction or an exponent was read, otherwise an INTEGER token.
            The token value is the raw source text of the literal.
        """
        start = self.pos
        src = self.source
        length = len(src)

        if src[self.pos] == '&' and self.pos + 1 < length:
            nc = src[self.pos + 1].lower()
            if nc == 'h':
                digits = '0123456789abcdefABCDEF'
            elif nc == 'o':
                digits = '01234567'
            else:
                digits = ''
            if digits:
                self.pos += 2
                while self.pos < length and src[self.pos] in digits:
                    self.pos += 1
                if self.pos < length and src[self.pos] in '&%':
                    self.pos += 1
                return VbaToken(VbaTokenKind.INTEGER, src[start:self.pos], start)

        while self.pos < length and src[self.pos].isdigit():
            self.pos += 1

        is_float = False
        if self.pos < length and src[self.pos] == '.':
            # A dot only belongs to the number when a digit follows; otherwise it is left for
            # the caller to lex as a separate token.
            next_pos = self.pos + 1
            if next_pos < length and src[next_pos].isdigit():
                is_float = True
                self.pos += 1
                while self.pos < length and src[self.pos].isdigit():
                    self.pos += 1

        if self.pos < length and src[self.pos] in 'eEdD':
            # Only commit to an exponent when at least one digit follows the marker and the
            # optional sign. Otherwise the letter starts the next token: `1Else` is the
            # integer `1` followed by the word `Else`, not the float `1E` followed by `lse`.
            exp = self.pos + 1
            if exp < length and src[exp] in '+-':
                exp += 1
            if exp < length and src[exp].isdigit():
                is_float = True
                self.pos = exp
                while self.pos < length and src[self.pos].isdigit():
                    self.pos += 1

        if self.pos < length and src[self.pos] in _TYPE_SUFFIXES_NO_DOLLAR:
            self.pos += 1

        kind = VbaTokenKind.FLOAT if is_float else VbaTokenKind.INTEGER
        return VbaToken(kind, src[start:self.pos], start)

    def _read_identifier_or_keyword(self) -> VbaToken:
        """
        Read a word of alphanumeric characters and underscores, plus an optional type suffix.

        Returns:
            A keyword token when the lower-cased word is listed in `_KEYWORDS` and no type suffix
            was consumed; otherwise an IDENTIFIER token whose value includes the suffix.
        """
        start = self.pos
        src = self.source
        length = len(src)
        while self.pos < length:
            c = src[self.pos]
            if c.isalnum() or c == '_':
                self.pos += 1
            else:
                break
        word = src[start:self.pos]
        suffix = ''
        if self.pos < length and src[self.pos] in _TYPE_SUFFIXES:
            c_suffix = src[self.pos]
            consume = True
            # `&` is only a suffix when it ends the input or is followed by one of the stop
            # characters; in any other position it is left to be lexed as an operator.
            if c_suffix == '&' and not (
                self.pos + 1 >= length
                or src[self.pos + 1] in _AMPERSAND_STOP_CHARS
            ):
                consume = False
            # `!` directly followed by a letter, an underscore or `[` is not a suffix; it is
            # left in place and later produces a BANG token.
            elif c_suffix == '!' and (
                self.pos + 1 < length
                and (src[self.pos + 1].isalpha() or src[self.pos + 1] in '_[')
            ):
                consume = False
            if consume:
                suffix = c_suffix
                self.pos += 1
        kw = _KEYWORDS.get(word.lower())
        if kw is not None and not suffix:
            return VbaToken(kw, word, start)
        return VbaToken(VbaTokenKind.IDENTIFIER, word + suffix, start)

    def _read_comment(self) -> str:
        """Consume and return everything from the current position up to the next line break."""
        start = self.pos
        src = self.source
        length = len(src)
        while self.pos < length and src[self.pos] not in '\r\n':
            self.pos += 1
        return src[start:self.pos]

    def tokenize(self) -> Generator[VbaToken, None, None]:
        """
        Lex the source from the current position onward, yielding one `VbaToken` at a time.

        The stream always ends with a single EOF token. Runs of line breaks collapse into one
        NEWLINE token, and line breaks that occur before any other token has been produced are
        dropped. COMMENT tokens and skipped `#` directives do not count as "other tokens" for
        that purpose, so a line holding only a comment produces no NEWLINE token.
        """
        src = self.source
        length = len(src)
        # True while no token other than a comment has been emitted on the current logical line.
        last_was_newline = True

        while True:
            self._skip_whitespace()
            if self._at_end():
                yield VbaToken(VbaTokenKind.EOF, '', self.pos)
                return

            start = self.pos
            c = src[self.pos]

            # Line continuation: an underscore followed only by blanks up to the end of the
            # line. The line break is swallowed without emitting a NEWLINE token.
            if c == '_':
                p = self.pos + 1
                while p < length and src[p] in ' \t':
                    p += 1
                if p >= length or src[p] in '\r\n':
                    self.pos = p
                    if self.pos < length and src[self.pos] == '\r':
                        self.pos += 1
                    if self.pos < length and src[self.pos] == '\n':
                        self.pos += 1
                    continue

            # Line break: CRLF counts as one break; consecutive breaks yield a single token.
            if c == '\r' or c == '\n':
                if c == '\r' and self.pos + 1 < length and src[self.pos + 1] == '\n':
                    self.pos += 2
                else:
                    self.pos += 1
                if not last_was_newline:
                    yield VbaToken(VbaTokenKind.NEWLINE, '\n', start)
                    last_was_newline = True
                continue

            # Apostrophe comment; deliberately leaves last_was_newline untouched.
            if c == "'":
                text = self._read_comment()
                yield VbaToken(VbaTokenKind.COMMENT, text, start)
                continue

            # String literal; a doubled quote inside the literal is an escaped quote.
            if c == '"':
                text = self._read_delimited('"', allow_doubling=True)
                last_was_newline = False
                yield VbaToken(VbaTokenKind.STRING, text, start)
                continue

            if c == '#':
                # At the start of a logical line, `#` plus one of the listed words is a
                # conditional-compilation directive: the rest of the line is skipped and no
                # token is produced for it.
                if last_was_newline and self.pos + 1 < length and src[self.pos + 1].isalpha():
                    peek = self.pos + 1
                    while peek < length and src[peek].isalpha():
                        peek += 1
                    word = src[self.pos + 1:peek].lower()
                    if word in ('if', 'elseif', 'else', 'end', 'endif', 'const'):
                        while self.pos < length and src[self.pos] not in '\r\n':
                            self.pos += 1
                        continue
                # NOTE(review): every `#` followed by a non-space character takes this path,
                # including file-number syntax such as `#1`, which then extends to the next
                # `#` or the end of the line. Confirm that the parser expects this.
                if self.pos + 1 < length and not src[self.pos + 1].isspace():
                    text = self._read_delimited('#')
                    last_was_newline = False
                    yield VbaToken(VbaTokenKind.DATE_LITERAL, text, start)
                    continue
                self.pos += 1
                last_was_newline = False
                yield VbaToken(VbaTokenKind.IDENTIFIER, '#', start)
                continue

            # `&H...` / `&O...` literals must be tried before `&` is treated as an operator.
            if c == '&' and self.pos + 1 < length and src[self.pos + 1].lower() in 'ho':
                tok = self._read_number()
                last_was_newline = False
                yield tok
                continue

            # Decimal literal, including the leading-dot form such as `.5`.
            if c.isdigit() or (c == '.' and self.pos + 1 < length and src[self.pos + 1].isdigit()):
                tok = self._read_number()
                last_was_newline = False
                yield tok
                continue

            if c.isalpha() or c == '_':
                tok = self._read_identifier_or_keyword()
                # The word `Rem` turns the remainder of the line into a comment.
                if tok.value.lower() == 'rem':
                    text = self._read_comment()
                    yield VbaToken(VbaTokenKind.COMMENT, tok.value + text, start)
                    continue
                last_was_newline = False
                yield tok
                continue

            # Bracketed name: everything up to the closing `]` becomes one IDENTIFIER token.
            if c == '[':
                text = self._read_delimited(']')
                last_was_newline = False
                yield VbaToken(VbaTokenKind.IDENTIFIER, text, start)
                continue

            # Two-character operators take precedence over their one-character prefixes.
            c2 = src[self.pos:self.pos + 2]
            two_kind = _TWO_CHAR_OPS.get(c2)
            if two_kind is not None:
                self.pos += 2
                last_was_newline = False
                yield VbaToken(two_kind, c2, start)
                continue

            op_kind = _ONE_CHAR_OPS.get(c)
            if op_kind is not None:
                self.pos += 1
                last_was_newline = False
                yield VbaToken(op_kind, c, start)
                continue

            # Any other character is passed through as a one-character IDENTIFIER token so
            # that the lexer always makes progress.
            self.pos += 1
            last_was_newline = False
            yield VbaToken(VbaTokenKind.IDENTIFIER, c, start)

Instance variables

var source: str — The VBA source text to be tokenized.
var pos: int — Index into `source` of the next character to be read; defaults to 0 and advances as tokens are consumed.

Methods

def tokenize(self)

Expand source code Browse git

def tokenize(self) -> Generator[VbaToken, None, None]:
    """
    Lex the source from the current position onward, yielding one `VbaToken` at a time.

    The stream always ends with a single EOF token. Runs of line breaks collapse into one
    NEWLINE token, and line breaks that occur before any other token has been produced are
    dropped. COMMENT tokens and skipped `#` directives do not count as "other tokens" for
    that purpose, so a line holding only a comment produces no NEWLINE token.
    """
    src = self.source
    length = len(src)
    # True while no token other than a comment has been emitted on the current logical line.
    last_was_newline = True

    while True:
        self._skip_whitespace()
        if self._at_end():
            yield VbaToken(VbaTokenKind.EOF, '', self.pos)
            return

        start = self.pos
        c = src[self.pos]

        # Line continuation: an underscore followed only by blanks up to the end of the
        # line. The line break is swallowed without emitting a NEWLINE token.
        if c == '_':
            p = self.pos + 1
            while p < length and src[p] in ' \t':
                p += 1
            if p >= length or src[p] in '\r\n':
                self.pos = p
                if self.pos < length and src[self.pos] == '\r':
                    self.pos += 1
                if self.pos < length and src[self.pos] == '\n':
                    self.pos += 1
                continue

        # Line break: CRLF counts as one break; consecutive breaks yield a single token.
        if c == '\r' or c == '\n':
            if c == '\r' and self.pos + 1 < length and src[self.pos + 1] == '\n':
                self.pos += 2
            else:
                self.pos += 1
            if not last_was_newline:
                yield VbaToken(VbaTokenKind.NEWLINE, '\n', start)
                last_was_newline = True
            continue

        # Apostrophe comment; deliberately leaves last_was_newline untouched.
        if c == "'":
            text = self._read_comment()
            yield VbaToken(VbaTokenKind.COMMENT, text, start)
            continue

        # String literal; a doubled quote inside the literal is an escaped quote.
        if c == '"':
            text = self._read_delimited('"', allow_doubling=True)
            last_was_newline = False
            yield VbaToken(VbaTokenKind.STRING, text, start)
            continue

        if c == '#':
            # At the start of a logical line, `#` plus one of the listed words is a
            # conditional-compilation directive: the rest of the line is skipped and no
            # token is produced for it.
            if last_was_newline and self.pos + 1 < length and src[self.pos + 1].isalpha():
                peek = self.pos + 1
                while peek < length and src[peek].isalpha():
                    peek += 1
                word = src[self.pos + 1:peek].lower()
                if word in ('if', 'elseif', 'else', 'end', 'endif', 'const'):
                    while self.pos < length and src[self.pos] not in '\r\n':
                        self.pos += 1
                    continue
            # NOTE(review): every `#` followed by a non-space character takes this path,
            # including file-number syntax such as `#1`, which then extends to the next
            # `#` or the end of the line. Confirm that the parser expects this.
            if self.pos + 1 < length and not src[self.pos + 1].isspace():
                text = self._read_delimited('#')
                last_was_newline = False
                yield VbaToken(VbaTokenKind.DATE_LITERAL, text, start)
                continue
            self.pos += 1
            last_was_newline = False
            yield VbaToken(VbaTokenKind.IDENTIFIER, '#', start)
            continue

        # `&H...` / `&O...` literals must be tried before `&` is treated as an operator.
        if c == '&' and self.pos + 1 < length and src[self.pos + 1].lower() in 'ho':
            tok = self._read_number()
            last_was_newline = False
            yield tok
            continue

        # Decimal literal, including the leading-dot form such as `.5`.
        if c.isdigit() or (c == '.' and self.pos + 1 < length and src[self.pos + 1].isdigit()):
            tok = self._read_number()
            last_was_newline = False
            yield tok
            continue

        if c.isalpha() or c == '_':
            tok = self._read_identifier_or_keyword()
            # The word `Rem` turns the remainder of the line into a comment.
            if tok.value.lower() == 'rem':
                text = self._read_comment()
                yield VbaToken(VbaTokenKind.COMMENT, tok.value + text, start)
                continue
            last_was_newline = False
            yield tok
            continue

        # Bracketed name: everything up to the closing `]` becomes one IDENTIFIER token.
        if c == '[':
            text = self._read_delimited(']')
            last_was_newline = False
            yield VbaToken(VbaTokenKind.IDENTIFIER, text, start)
            continue

        # Two-character operators take precedence over their one-character prefixes.
        c2 = src[self.pos:self.pos + 2]
        two_kind = _TWO_CHAR_OPS.get(c2)
        if two_kind is not None:
            self.pos += 2
            last_was_newline = False
            yield VbaToken(two_kind, c2, start)
            continue

        op_kind = _ONE_CHAR_OPS.get(c)
        if op_kind is not None:
            self.pos += 1
            last_was_newline = False
            yield VbaToken(op_kind, c, start)
            continue

        # Any other character is passed through as a one-character IDENTIFIER token so
        # that the lexer always makes progress.
        self.pos += 1
        last_was_newline = False
        yield VbaToken(VbaTokenKind.IDENTIFIER, c, start)