Module refinery.units.formats.office.docmeta

Expand source code Browse git
from __future__ import annotations

from enum import Enum
from pathlib import Path

from refinery.lib import xml
from refinery.lib.dt import isodate
from refinery.units.formats import JSONTableUnit
from refinery.units.formats.office.xtdoc import xtdoc


class _Prop(str, Enum):
    """
    The property parts that `docmeta` knows how to read. Each value is compared against the
    file name of a part found under `docProps/`; the member name becomes the key under which
    the extracted properties are reported.
    """
    # Read by collecting the tag and text content of every child of the `properties` element.
    app = 'app.xml'
    # Read from the `cp:coreproperties` element; the namespace prefix of each tag is dropped.
    core = 'core.xml'
    # Read from `property` nodes, keyed by their `name` attribute.
    custom = 'custom.xml'


class docmeta(JSONTableUnit):
    """
    Extract metadata from Word Documents such as custom document properties.
    """
    def json(self, data: bytearray):
        """
        Collect the document properties stored in the `docProps/*.xml` parts of the input.

        The parts are obtained by piping `data` through `xtdoc`. Every part whose file name
        matches a `_Prop` member contributes one entry to the returned dictionary, keyed by
        the member name (`app`, `core`, `custom`). Each entry maps property names to values,
        where the raw text is converted to `int`, `bool`, or the result of `isodate` when it
        has that shape, and is kept as a string otherwise.

        Parts that cannot be parsed, have an unknown name, or lack the expected container
        element are reported via `log_info` and skipped; the same holds for individual
        malformed custom properties. A single bad node therefore does not discard the
        metadata that could be read.
        """
        def interpret(value: str | dict):
            # Turn raw property text into the most specific type it clearly represents.
            if isinstance(value, dict):
                return {k: interpret(v) for k, v in value.items()}
            if value.isdigit():
                try:
                    return int(value)
                except ValueError:
                    # str.isdigit() is also true for characters like superscript digits,
                    # which int() rejects; such values are kept as text below.
                    pass
            casefold = value.lower()
            if casefold == 'false':
                return False
            if casefold == 'true':
                return True
            return isodate(value) or value

        props = data | xtdoc('docProps/*.xml', exact=True, path=b'path') | {'path': bytearray}
        result = {}

        for path, page in props.items():
            name = Path(path).name
            if (dom := xml.parse(page)) is None:
                self.log_info(F'failed to parse as XML: {path}')
                continue
            try:
                prop = _Prop(name)
            except ValueError:
                self.log_info(F'skipped unknown property: {name}')
                continue

            root_tag = 'cp:coreproperties' if prop == _Prop.core else 'properties'
            try:
                # Follow the chain of first children until the container element is reached.
                while dom.tag.lower() != root_tag:
                    dom = dom.children[0]
            except IndexError:
                # Ran out of children without ever seeing the container element.
                self.log_info(F'missing {root_tag} element: {path}')
                continue

            contents = {}

            if prop == _Prop.custom:
                for node in dom:
                    # Explicit checks instead of assert: the document is untrusted input and
                    # assert statements are removed when Python runs with -O.
                    if node.tag.lower() != 'property' or len(node.children) != 1:
                        self.log_info(F'skipped malformed custom property in: {path}')
                        continue
                    content = node.children[0].content
                    if content is None:
                        continue
                    try:
                        key = node.attributes['name']
                    except KeyError:
                        self.log_info(F'skipped custom property without name in: {path}')
                        continue
                    contents[key] = content.strip()
            elif prop == _Prop.app:
                for node in dom:
                    if content := node.content:
                        contents[node.tag] = content
            elif prop == _Prop.core:
                for node in dom:
                    # Only tags of the form prefix:name are reported, under the bare name.
                    _, _, key = node.tag.partition(':')
                    if key and (content := node.content):
                        contents[key] = content

            result[prop.name] = {key: interpret(value) for key, value in contents.items()}

        return result

Classes

class docmeta (tabular=False, minimal=False)

Extract metadata from Word Documents such as custom document properties.

Expand source code Browse git
class docmeta(JSONTableUnit):
    """
    Extract metadata from Word Documents such as custom document properties.
    """
    def json(self, data: bytearray):
        """
        Collect the document properties stored in the `docProps/*.xml` parts of the input.

        The parts are obtained by piping `data` through `xtdoc`. Every part whose file name
        matches a `_Prop` member contributes one entry to the returned dictionary, keyed by
        the member name (`app`, `core`, `custom`). Each entry maps property names to values,
        where the raw text is converted to `int`, `bool`, or the result of `isodate` when it
        has that shape, and is kept as a string otherwise.

        Parts that cannot be parsed, have an unknown name, or lack the expected container
        element are reported via `log_info` and skipped; the same holds for individual
        malformed custom properties. A single bad node therefore does not discard the
        metadata that could be read.
        """
        def interpret(value: str | dict):
            # Turn raw property text into the most specific type it clearly represents.
            if isinstance(value, dict):
                return {k: interpret(v) for k, v in value.items()}
            if value.isdigit():
                try:
                    return int(value)
                except ValueError:
                    # str.isdigit() is also true for characters like superscript digits,
                    # which int() rejects; such values are kept as text below.
                    pass
            casefold = value.lower()
            if casefold == 'false':
                return False
            if casefold == 'true':
                return True
            return isodate(value) or value

        props = data | xtdoc('docProps/*.xml', exact=True, path=b'path') | {'path': bytearray}
        result = {}

        for path, page in props.items():
            name = Path(path).name
            if (dom := xml.parse(page)) is None:
                self.log_info(F'failed to parse as XML: {path}')
                continue
            try:
                prop = _Prop(name)
            except ValueError:
                self.log_info(F'skipped unknown property: {name}')
                continue

            root_tag = 'cp:coreproperties' if prop == _Prop.core else 'properties'
            try:
                # Follow the chain of first children until the container element is reached.
                while dom.tag.lower() != root_tag:
                    dom = dom.children[0]
            except IndexError:
                # Ran out of children without ever seeing the container element.
                self.log_info(F'missing {root_tag} element: {path}')
                continue

            contents = {}

            if prop == _Prop.custom:
                for node in dom:
                    # Explicit checks instead of assert: the document is untrusted input and
                    # assert statements are removed when Python runs with -O.
                    if node.tag.lower() != 'property' or len(node.children) != 1:
                        self.log_info(F'skipped malformed custom property in: {path}')
                        continue
                    content = node.children[0].content
                    if content is None:
                        continue
                    try:
                        key = node.attributes['name']
                    except KeyError:
                        self.log_info(F'skipped custom property without name in: {path}')
                        continue
                    contents[key] = content.strip()
            elif prop == _Prop.app:
                for node in dom:
                    if content := node.content:
                        contents[node.tag] = content
            elif prop == _Prop.core:
                for node in dom:
                    # Only tags of the form prefix:name are reported, under the bare name.
                    _, _, key = node.tag.partition(':')
                    if key and (content := node.content):
                        contents[key] = content

            result[prop.name] = {key: interpret(value) for key, value in contents.items()}

        return result

Ancestors

Subclasses

Methods

def json(self, data)
Expand source code Browse git
def json(self, data: bytearray):
    """
    Collect the document properties stored in the `docProps/*.xml` parts of the input.

    The parts are obtained by piping `data` through `xtdoc`. Every part whose file name
    matches a `_Prop` member contributes one entry to the returned dictionary, keyed by
    the member name (`app`, `core`, `custom`). Each entry maps property names to values,
    where the raw text is converted to `int`, `bool`, or the result of `isodate` when it
    has that shape, and is kept as a string otherwise.

    Parts that cannot be parsed, have an unknown name, or lack the expected container
    element are reported via `log_info` and skipped; the same holds for individual
    malformed custom properties. A single bad node therefore does not discard the
    metadata that could be read.
    """
    def interpret(value: str | dict):
        # Turn raw property text into the most specific type it clearly represents.
        if isinstance(value, dict):
            return {k: interpret(v) for k, v in value.items()}
        if value.isdigit():
            try:
                return int(value)
            except ValueError:
                # str.isdigit() is also true for characters like superscript digits,
                # which int() rejects; such values are kept as text below.
                pass
        casefold = value.lower()
        if casefold == 'false':
            return False
        if casefold == 'true':
            return True
        return isodate(value) or value

    props = data | xtdoc('docProps/*.xml', exact=True, path=b'path') | {'path': bytearray}
    result = {}

    for path, page in props.items():
        name = Path(path).name
        if (dom := xml.parse(page)) is None:
            self.log_info(F'failed to parse as XML: {path}')
            continue
        try:
            prop = _Prop(name)
        except ValueError:
            self.log_info(F'skipped unknown property: {name}')
            continue

        root_tag = 'cp:coreproperties' if prop == _Prop.core else 'properties'
        try:
            # Follow the chain of first children until the container element is reached.
            while dom.tag.lower() != root_tag:
                dom = dom.children[0]
        except IndexError:
            # Ran out of children without ever seeing the container element.
            self.log_info(F'missing {root_tag} element: {path}')
            continue

        contents = {}

        if prop == _Prop.custom:
            for node in dom:
                # Explicit checks instead of assert: the document is untrusted input and
                # assert statements are removed when Python runs with -O.
                if node.tag.lower() != 'property' or len(node.children) != 1:
                    self.log_info(F'skipped malformed custom property in: {path}')
                    continue
                content = node.children[0].content
                if content is None:
                    continue
                try:
                    key = node.attributes['name']
                except KeyError:
                    self.log_info(F'skipped custom property without name in: {path}')
                    continue
                contents[key] = content.strip()
        elif prop == _Prop.app:
            for node in dom:
                if content := node.content:
                    contents[node.tag] = content
        elif prop == _Prop.core:
            for node in dom:
                # Only tags of the form prefix:name are reported, under the bare name.
                _, _, key = node.tag.partition(':')
                if key and (content := node.content):
                    contents[key] = content

        result[prop.name] = {key: interpret(value) for key, value in contents.items()}

    return result

Inherited members