Module `refinery.units.formats.xml`

Expand source code Browse git

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from refinery.lib.structures import MemoryFile
from refinery.lib.meta import metavars, is_valid_variable_name
from refinery.lib import xml
from refinery.units.sinks.ppxml import ppxml
from refinery.units.formats import XMLToPathExtractorUnit, UnpackResult


class xtxml(XMLToPathExtractorUnit):
    """
    Extract values from an XML document.
    """
    def unpack(self, data):
        """
        Parse the stripped input as XML and yield one `UnpackResult` per node,
        visiting every node before its children. The path of each result is the
        '/'-joined chain of components that the path builder returned for the
        nodes from the root down to the node in question.
        """
        document = xml.parse(data.strip())
        variables = metavars(data)
        component = self._make_path_builder(variables, document)

        def emit(element: xml.XMLNode, trail: tuple):
            def extract(node: xml.XMLNode = element):
                # Nodes with children are written out in full and combined with
                # ppxml; childless nodes only contribute their encoded content.
                if node.children:
                    with MemoryFile() as stream:
                        node.write(stream)
                        return bytes(stream.getbuffer() | ppxml)
                return node.content.encode(self.codec)

            labels = {}
            for name, value in element.attributes.items():
                key = self._normalize_key(name)
                labels[key] = self._normalize_val(value)

            # A single unusable key causes every key to receive an underscore prefix.
            if any(not is_valid_variable_name(key) for key in labels):
                labels = {F'_{key}': value for key, value in labels.items()}

            yield UnpackResult('/'.join(trail), extract, **labels)

            # The component of a child is computed only once the traversal gets
            # to it, i.e. after all earlier siblings have been emitted in full.
            for child in element.children:
                yield from emit(child, trail + (component(child),))

        yield from emit(document, (component(document),))

    @classmethod
    def handles(self, data):
        """
        Return the verdict of `xml.is_xml` for the given data.
        """
        return xml.is_xml(data)

Classes

class xtxml (*paths, format=None, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

Extract values from an XML document.

Expand source code Browse git

class xtxml(XMLToPathExtractorUnit):
    """
    Extract values from an XML document.
    """
    def unpack(self, data):
        """
        Parse the stripped input as XML and yield one `UnpackResult` per node,
        visiting every node before its children. The path of each result is the
        '/'-joined chain of components that the path builder returned for the
        nodes from the root down to the node in question.
        """
        document = xml.parse(data.strip())
        variables = metavars(data)
        component = self._make_path_builder(variables, document)

        def emit(element: xml.XMLNode, trail: tuple):
            def extract(node: xml.XMLNode = element):
                # Nodes with children are written out in full and combined with
                # ppxml; childless nodes only contribute their encoded content.
                if node.children:
                    with MemoryFile() as stream:
                        node.write(stream)
                        return bytes(stream.getbuffer() | ppxml)
                return node.content.encode(self.codec)

            labels = {}
            for name, value in element.attributes.items():
                key = self._normalize_key(name)
                labels[key] = self._normalize_val(value)

            # A single unusable key causes every key to receive an underscore prefix.
            if any(not is_valid_variable_name(key) for key in labels):
                labels = {F'_{key}': value for key, value in labels.items()}

            yield UnpackResult('/'.join(trail), extract, **labels)

            # The component of a child is computed only once the traversal gets
            # to it, i.e. after all earlier siblings have been emitted in full.
            for child in element.children:
                yield from emit(child, trail + (component(child),))

        yield from emit(document, (component(document),))

    @classmethod
    def handles(self, data):
        """
        Return the verdict of `xml.is_xml` for the given data.
        """
        return xml.is_xml(data)

Ancestors

Class variables

var required_dependencies
var optional_dependencies

Methods

def unpack(self, data)

Expand source code Browse git

def unpack(self, data):
    """
    Parse the stripped input as XML and yield one `UnpackResult` per node,
    visiting every node before its children. The path of each result is the
    '/'-joined chain of components that the path builder returned for the
    nodes from the root down to the node in question.
    """
    document = xml.parse(data.strip())
    variables = metavars(data)
    component = self._make_path_builder(variables, document)

    def emit(element: xml.XMLNode, trail: tuple):
        def extract(node: xml.XMLNode = element):
            # Nodes with children are written out in full and combined with
            # ppxml; childless nodes only contribute their encoded content.
            if node.children:
                with MemoryFile() as stream:
                    node.write(stream)
                    return bytes(stream.getbuffer() | ppxml)
            return node.content.encode(self.codec)

        labels = {}
        for name, value in element.attributes.items():
            key = self._normalize_key(name)
            labels[key] = self._normalize_val(value)

        # A single unusable key causes every key to receive an underscore prefix.
        if any(not is_valid_variable_name(key) for key in labels):
            labels = {F'_{key}': value for key, value in labels.items()}

        yield UnpackResult('/'.join(trail), extract, **labels)

        # The component of a child is computed only once the traversal gets
        # to it, i.e. after all earlier siblings have been emitted in full.
        for child in element.children:
            yield from emit(child, trail + (component(child),))

    yield from emit(document, (component(document),))

Inherited members

XMLToPathExtractorUnit:
- Arg
- CustomPathSeparator
- assemble
- filter
- finish
- handles
- is_quiet
- labelled
- leniency
- log_always
- log_debug
- log_detach
- log_fail
- log_info
- log_level
- log_warn
- nozzle
- process
- read
- read1
- reverse
- run
- source
- superinit