Module `refinery.units.pattern.carve_xml`

Expand source code Browse git

from __future__ import annotations

import re

from contextlib import suppress

import defusedxml.minidom

from refinery.units import Unit


class XMLTag:
    """
    Minimal description of a single XML tag, parsed from the decoded text of that tag.

    Attributes:
        name: The tag name, including a namespace prefix if there is one.
        mod: The character that follows the opening angle bracket when it is a slash (closing
            tag) or a question mark (processing instruction or XML declaration); empty otherwise.
        delta: The change in nesting depth that the tag causes: +1 for an opening tag, -1 for a
            closing tag, and 0 for processing instructions and self-closing elements.
    """

    def __init__(self, tag: str):
        """
        Parse the given tag text, which has to start with the opening angle bracket.

        Raises:
            ValueError: If the text does not start like an XML tag.
        """
        match = re.search(R'\A<([/\?]?)([^\W\d][-:\.\w]*)', tag)
        if not match:
            raise ValueError
        self.name = match[2]
        self.mod = match[1]
        self.delta = {'/': -1, '?': 0}.get(self.mod, 1)
        if not self.mod and tag.rstrip().endswith('/>'):
            # An empty-element tag like <item/> opens and closes at once. Counting it as an
            # opening tag would leave the nesting depth unbalanced for the rest of the scan.
            self.delta = 0

    def __repr__(self) -> str:
        return F'<{self.mod}{self.name}>'


class XMLCarver:
    """
    Iterator over the XML documents that are embedded in a buffer.

    Every iteration step returns a tuple that contains the offset where a document starts and
    the text of that document, encoded with `Unit.codec`. A candidate is only returned when
    `defusedxml.minidom` parses it without raising an exception.
    """

    # Upper bound for the distance between the opening and the closing bracket of one tag.
    _MAX_TAG_SIZE = 0x8000

    # Pairs of (opener, closer) for constructs whose content is not markup. They are skipped
    # as a whole while searching for the end of a document.
    _OPAQUE_MARKUP = (
        (B'<!--', B'-->'),
        (B'<![CDATA[', B']]>'),
    )

    def __init__(self, data: bytearray):
        self.data = data
        # Offset in the data where the next search operation begins.
        self.cursor = 0

    def __iter__(self):
        return self

    def _try_decode(self, data: bytes) -> str | None:
        """
        Decode the input as text, trying the codecs UTF8, CP1252, LATIN-1, and UTF-16LE in this
        order. The first result that consists only of whitespace and printable characters is
        returned. The return value is None when no codec produces such a result.
        """
        def printable(s):
            return re.sub('\\s+', '', s).isprintable()

        for codec in ('UTF8', 'CP1252', 'LATIN-1'):
            with suppress(UnicodeDecodeError):
                if printable(decoded := data.decode(codec)):
                    return decoded
        if len(data) % 2 == 1:
            # Pad to an even length so that the last byte forms a complete UTF-16 code unit.
            data = data + B'\0'
        with suppress(UnicodeDecodeError):
            if printable(decoded := data.decode('UTF-16LE')):
                return decoded
        return None

    def _seek_tag(self, start: int):
        """
        Given the offset of an opening angle bracket, return the offset that immediately follows
        the closing angle bracket of the same tag. Angle brackets inside quoted attribute values
        are ignored. The return value is None when another opening bracket occurs first, or when
        no closing bracket is found before `_MAX_TAG_SIZE` bytes or the end of the data.
        """
        quote = None
        for end in range(start + 1, min(start + self._MAX_TAG_SIZE, len(self.data))):
            byte = self.data[end]
            if quote is not None:
                # XML has no backslash escapes: a quoted value always ends at the next
                # occurrence of the quote character that opened it.
                if byte == quote:
                    quote = None
            elif byte == B'>'[0]:
                return end + 1
            elif byte == B'<'[0]:
                return None
            elif byte in B'\'"':
                quote = byte
        return None

    def _read_tag(self):
        """
        Parse the tag that starts at the cursor. On success, the cursor is moved behind the tag
        and the parsed `XMLTag` is returned. Otherwise, the cursor remains where it was and the
        return value is None.
        """
        end = self._seek_tag(self.cursor)
        if end is None:
            return None
        decoded = self._try_decode(self.data[self.cursor:end])
        if decoded is None:
            return None
        try:
            tag = XMLTag(decoded)
        except ValueError:
            return None
        else:
            self.cursor = end
            return tag

    def _find_xml_end(self, tag: XMLTag):
        """
        Starting at the cursor, which is expected to be located behind the given opening tag,
        advance until the element is closed again. Only tags with the same name as the given one
        change the nesting depth. Returns True when the depth reaches zero; the cursor is then
        located behind the closing tag. Returns False when the data ends before that, or when
        something that follows an opening angle bracket cannot be parsed.
        """
        stack = 1
        while stack:
            self.cursor = self.data.find(B'<', self.cursor)
            if self.cursor < 0:
                return False
            for opener, closer in self._OPAQUE_MARKUP:
                if self.data.startswith(opener, self.cursor):
                    # Comments and CDATA sections may contain arbitrary angle brackets; they
                    # are skipped without interpreting their content.
                    end = self.data.find(closer, self.cursor + len(opener))
                    if end < 0:
                        return False
                    self.cursor = end + len(closer)
                    break
            else:
                t = self._read_tag()
                if t is None:
                    return False
                if t.name == tag.name:
                    stack += t.delta
        return True

    def __next__(self):
        """
        Return the next document as a tuple of its start offset and its encoded text. Raises
        StopIteration when no further opening angle bracket is found in the data.
        """
        while True:
            start = self.data.find(B'<', self.cursor)
            if start < 0:
                raise StopIteration
            self.cursor = start
            tag = self._read_tag()
            if tag and tag.mod == '?' and tag.name.lower() == 'xml':
                # This is an XML declaration: the candidate for the root element is the next
                # tag, but the carved document still begins at the declaration.
                self.cursor = self.data.find(B'<', self.cursor)
                if self.cursor < 0:
                    raise StopIteration
                tag = self._read_tag()
            if tag is None:
                self.cursor += 1
                continue
            if tag.mod:
                # A closing tag or processing instruction cannot be the root element, and a
                # slice that starts with one never parses; skip the search for its end.
                self.cursor = start + 1
                continue
            if self._find_xml_end(tag):
                try:
                    decoded = self._try_decode(self.data[start:self.cursor])
                    if decoded is not None:
                        defusedxml.minidom.parseString(decoded)
                        return start, decoded.encode(Unit.codec)
                except Exception:
                    # Deliberately broad: whatever the parser rejects is not a document, and
                    # the search continues right behind the current start offset.
                    pass
            self.cursor = start + 1


class carve_xml(Unit):
    """
    Extracts anything from the input data that looks like XML.
    """

    def process(self, data):
        # Every document found by the carver is emitted as a separate chunk which is labelled
        # with the offset where the document starts in the input.
        carver = XMLCarver(data)
        for position, document in carver:
            yield self.labelled(document, offset=position)

Classes

class XMLTag (tag)

Expand source code Browse git

class XMLTag:
    """
    Minimal description of a single XML tag, parsed from the decoded text of that tag.

    Attributes:
        name: The tag name, including a namespace prefix if there is one.
        mod: The character that follows the opening angle bracket when it is a slash (closing
            tag) or a question mark (processing instruction or XML declaration); empty otherwise.
        delta: The change in nesting depth that the tag causes: +1 for an opening tag, -1 for a
            closing tag, and 0 for processing instructions and self-closing elements.
    """

    def __init__(self, tag: str):
        """
        Parse the given tag text, which has to start with the opening angle bracket.

        Raises:
            ValueError: If the text does not start like an XML tag.
        """
        match = re.search(R'\A<([/\?]?)([^\W\d][-:\.\w]*)', tag)
        if not match:
            raise ValueError
        self.name = match[2]
        self.mod = match[1]
        self.delta = {'/': -1, '?': 0}.get(self.mod, 1)
        if not self.mod and tag.rstrip().endswith('/>'):
            # An empty-element tag like <item/> opens and closes at once. Counting it as an
            # opening tag would leave the nesting depth unbalanced for the rest of the scan.
            self.delta = 0

    def __repr__(self) -> str:
        return F'<{self.mod}{self.name}>'

class XMLCarver (data)

Expand source code Browse git

class XMLCarver:
    """
    Iterator over the XML documents that are embedded in a buffer.

    Every iteration step returns a tuple that contains the offset where a document starts and
    the text of that document, encoded with `Unit.codec`. A candidate is only returned when
    `defusedxml.minidom` parses it without raising an exception.
    """

    # Upper bound for the distance between the opening and the closing bracket of one tag.
    _MAX_TAG_SIZE = 0x8000

    # Pairs of (opener, closer) for constructs whose content is not markup. They are skipped
    # as a whole while searching for the end of a document.
    _OPAQUE_MARKUP = (
        (B'<!--', B'-->'),
        (B'<![CDATA[', B']]>'),
    )

    def __init__(self, data: bytearray):
        self.data = data
        # Offset in the data where the next search operation begins.
        self.cursor = 0

    def __iter__(self):
        return self

    def _try_decode(self, data: bytes) -> str | None:
        """
        Decode the input as text, trying the codecs UTF8, CP1252, LATIN-1, and UTF-16LE in this
        order. The first result that consists only of whitespace and printable characters is
        returned. The return value is None when no codec produces such a result.
        """
        def printable(s):
            return re.sub('\\s+', '', s).isprintable()

        for codec in ('UTF8', 'CP1252', 'LATIN-1'):
            with suppress(UnicodeDecodeError):
                if printable(decoded := data.decode(codec)):
                    return decoded
        if len(data) % 2 == 1:
            # Pad to an even length so that the last byte forms a complete UTF-16 code unit.
            data = data + B'\0'
        with suppress(UnicodeDecodeError):
            if printable(decoded := data.decode('UTF-16LE')):
                return decoded
        return None

    def _seek_tag(self, start: int):
        """
        Given the offset of an opening angle bracket, return the offset that immediately follows
        the closing angle bracket of the same tag. Angle brackets inside quoted attribute values
        are ignored. The return value is None when another opening bracket occurs first, or when
        no closing bracket is found before `_MAX_TAG_SIZE` bytes or the end of the data.
        """
        quote = None
        for end in range(start + 1, min(start + self._MAX_TAG_SIZE, len(self.data))):
            byte = self.data[end]
            if quote is not None:
                # XML has no backslash escapes: a quoted value always ends at the next
                # occurrence of the quote character that opened it.
                if byte == quote:
                    quote = None
            elif byte == B'>'[0]:
                return end + 1
            elif byte == B'<'[0]:
                return None
            elif byte in B'\'"':
                quote = byte
        return None

    def _read_tag(self):
        """
        Parse the tag that starts at the cursor. On success, the cursor is moved behind the tag
        and the parsed `XMLTag` is returned. Otherwise, the cursor remains where it was and the
        return value is None.
        """
        end = self._seek_tag(self.cursor)
        if end is None:
            return None
        decoded = self._try_decode(self.data[self.cursor:end])
        if decoded is None:
            return None
        try:
            tag = XMLTag(decoded)
        except ValueError:
            return None
        else:
            self.cursor = end
            return tag

    def _find_xml_end(self, tag: XMLTag):
        """
        Starting at the cursor, which is expected to be located behind the given opening tag,
        advance until the element is closed again. Only tags with the same name as the given one
        change the nesting depth. Returns True when the depth reaches zero; the cursor is then
        located behind the closing tag. Returns False when the data ends before that, or when
        something that follows an opening angle bracket cannot be parsed.
        """
        stack = 1
        while stack:
            self.cursor = self.data.find(B'<', self.cursor)
            if self.cursor < 0:
                return False
            for opener, closer in self._OPAQUE_MARKUP:
                if self.data.startswith(opener, self.cursor):
                    # Comments and CDATA sections may contain arbitrary angle brackets; they
                    # are skipped without interpreting their content.
                    end = self.data.find(closer, self.cursor + len(opener))
                    if end < 0:
                        return False
                    self.cursor = end + len(closer)
                    break
            else:
                t = self._read_tag()
                if t is None:
                    return False
                if t.name == tag.name:
                    stack += t.delta
        return True

    def __next__(self):
        """
        Return the next document as a tuple of its start offset and its encoded text. Raises
        StopIteration when no further opening angle bracket is found in the data.
        """
        while True:
            start = self.data.find(B'<', self.cursor)
            if start < 0:
                raise StopIteration
            self.cursor = start
            tag = self._read_tag()
            if tag and tag.mod == '?' and tag.name.lower() == 'xml':
                # This is an XML declaration: the candidate for the root element is the next
                # tag, but the carved document still begins at the declaration.
                self.cursor = self.data.find(B'<', self.cursor)
                if self.cursor < 0:
                    raise StopIteration
                tag = self._read_tag()
            if tag is None:
                self.cursor += 1
                continue
            if tag.mod:
                # A closing tag or processing instruction cannot be the root element, and a
                # slice that starts with one never parses; skip the search for its end.
                self.cursor = start + 1
                continue
            if self._find_xml_end(tag):
                try:
                    decoded = self._try_decode(self.data[start:self.cursor])
                    if decoded is not None:
                        defusedxml.minidom.parseString(decoded)
                        return start, decoded.encode(Unit.codec)
                except Exception:
                    # Deliberately broad: whatever the parser rejects is not a document, and
                    # the search continues right behind the current start offset.
                    pass
            self.cursor = start + 1

class carve_xml

Extracts anything from the input data that looks like XML.

Expand source code Browse git

class carve_xml(Unit):
    """
    Extracts anything from the input data that looks like XML.
    """

    def process(self, data):
        # Every document found by the carver is emitted as a separate chunk which is labelled
        # with the offset where the document starts in the input.
        carver = XMLCarver(data)
        for position, document in carver:
            yield self.labelled(document, offset=position)

Ancestors

Subclasses

carve_xml

Class variables

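var reverse: Set to `None`. (The generated description, "The type of the None singleton.", is the docstring of `NoneType` and does not describe this attribute.)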

Inherited members

Unit:
- console
- optional_dependencies
- required_dependencies
Unit:
- FilterEverything
- Requires
- act
- assemble
- codec
- finish
- handles
- is_quiet
- is_reversible
- isatty
- labelled
- leniency
- log_always
- log_debug
- log_detach
- log_fail
- log_info
- log_level
- log_warn
- logger
- name
- nozzle
- read
- read1
- reset
- run
- source
- superinit
UnitBase:
- process
UnitBase:
- filter