Module refinery.units.formats.office.docmeta
Expand source code Browse git
from __future__ import annotations
from enum import Enum
from pathlib import Path
from refinery.lib import xml
from refinery.lib.dt import isodate
from refinery.lib.types import Param
from refinery.units import Arg, Unit
from refinery.units.formats.office.xtdoc import xtdoc
from refinery.units.sinks.ppjson import ppjson
class _Prop(str, Enum):
app = 'app.xml'
core = 'core.xml'
custom = 'custom.xml'
class docmeta(Unit):
    """
    Extract metadata from Word Documents such as custom document properties.
    """

    def __init__(
        self,
        tabular: Param[bool, Arg('-t', help='Print information in a table rather than as JSON')] = False,
    ):
        super().__init__(tabular=tabular)

    def process(self, data: bytearray):
        """
        Collect the properties stored in the `docProps/*.xml` parts of the input document and emit
        them through `ppjson`, as a table when `tabular` is set. The result maps the `_Prop` member
        name of each recognised part to a dictionary of its properties.

        Parts that do not parse as XML, have an unrecognised file name, or lack the expected root
        element are logged and skipped. Malformed entries inside a part are skipped individually
        so that one broken property does not discard everything else.
        """
        def interpret(value: str | dict):
            # Turn raw property text into an int, a bool, or whatever isodate() makes of it;
            # dictionaries are converted recursively and anything else stays a string.
            if isinstance(value, dict):
                return {k: interpret(v) for k, v in value.items()}
            # isdecimal() rather than isdigit(): the latter also accepts characters such as
            # superscript digits, which int() rejects with a ValueError.
            if value.isdecimal():
                try:
                    return int(value)
                except ValueError:
                    # int() may still refuse, for example very long digit strings on
                    # interpreters that limit integer string conversion; keep the text then.
                    return value
            casefold = value.lower()
            if casefold == 'false':
                return False
            if casefold == 'true':
                return True
            return isodate(value) or value

        def descend(node, tag: str):
            # Follow the chain of first children until an element named `tag` (lower case) is
            # reached. Returns None when the chain ends before such an element is found.
            while node.tag.lower() != tag:
                children = node.children
                if not children:
                    return None
                node = children[0]
            return node

        props = data | xtdoc('docProps/*.xml', exact=True, path=b'path') | {'path': bytearray}
        result = {}

        for path, page in props.items():
            name = Path(path).name
            if (dom := xml.parse(page)) is None:
                self.log_info(F'failed to parse as XML: {path}')
                continue
            try:
                prop = _Prop(name)
            except ValueError:
                self.log_info(F'skipped unknown property: {name}')
                continue

            # Only the core part uses a namespaced root element.
            root_tag = 'cp:coreproperties' if prop == _Prop.core else 'properties'
            if (root := descend(dom, root_tag)) is None:
                self.log_info(F'no {root_tag} element found: {path}')
                continue

            contents = {}

            if prop == _Prop.custom:
                # Each <property name="..."> is expected to wrap exactly one typed value element.
                for node in root:
                    if node.tag.lower() != 'property' or len(node.children) != 1:
                        self.log_info(F'skipped malformed custom property: {node.tag}')
                        continue
                    content = node.children[0].content
                    if content is None:
                        continue
                    try:
                        key = node.attributes['name']
                    except KeyError:
                        self.log_info('skipped custom property without a name')
                        continue
                    contents[key] = content.strip()
            elif prop == _Prop.app:
                for node in root:
                    if content := node.content:
                        contents[node.tag] = content
            elif prop == _Prop.core:
                for node in root:
                    # Tags look like "prefix:name"; tags without a prefix are ignored.
                    _, _, key = node.tag.partition(':')
                    if key and (content := node.content):
                        contents[key] = content

            result[prop.name] = {key: interpret(value) for key, value in contents.items()}

        yield from ppjson(tabular=self.args.tabular)._pretty_output(result)
Classes
class docmeta (tabular=False)-
Extract metadata from Word Documents such as custom document properties.
Expand source code Browse git
class docmeta(Unit):
    """
    Extract metadata from Word Documents such as custom document properties.
    """

    def __init__(
        self,
        tabular: Param[bool, Arg('-t', help='Print information in a table rather than as JSON')] = False,
    ):
        super().__init__(tabular=tabular)

    def process(self, data: bytearray):
        """
        Collect the properties stored in the `docProps/*.xml` parts of the input document and emit
        them through `ppjson`, as a table when `tabular` is set. The result maps the `_Prop` member
        name of each recognised part to a dictionary of its properties.

        Parts that do not parse as XML, have an unrecognised file name, or lack the expected root
        element are logged and skipped. Malformed entries inside a part are skipped individually
        so that one broken property does not discard everything else.
        """
        def interpret(value: str | dict):
            # Turn raw property text into an int, a bool, or whatever isodate() makes of it;
            # dictionaries are converted recursively and anything else stays a string.
            if isinstance(value, dict):
                return {k: interpret(v) for k, v in value.items()}
            # isdecimal() rather than isdigit(): the latter also accepts characters such as
            # superscript digits, which int() rejects with a ValueError.
            if value.isdecimal():
                try:
                    return int(value)
                except ValueError:
                    # int() may still refuse, for example very long digit strings on
                    # interpreters that limit integer string conversion; keep the text then.
                    return value
            casefold = value.lower()
            if casefold == 'false':
                return False
            if casefold == 'true':
                return True
            return isodate(value) or value

        def descend(node, tag: str):
            # Follow the chain of first children until an element named `tag` (lower case) is
            # reached. Returns None when the chain ends before such an element is found.
            while node.tag.lower() != tag:
                children = node.children
                if not children:
                    return None
                node = children[0]
            return node

        props = data | xtdoc('docProps/*.xml', exact=True, path=b'path') | {'path': bytearray}
        result = {}

        for path, page in props.items():
            name = Path(path).name
            if (dom := xml.parse(page)) is None:
                self.log_info(F'failed to parse as XML: {path}')
                continue
            try:
                prop = _Prop(name)
            except ValueError:
                self.log_info(F'skipped unknown property: {name}')
                continue

            # Only the core part uses a namespaced root element.
            root_tag = 'cp:coreproperties' if prop == _Prop.core else 'properties'
            if (root := descend(dom, root_tag)) is None:
                self.log_info(F'no {root_tag} element found: {path}')
                continue

            contents = {}

            if prop == _Prop.custom:
                # Each <property name="..."> is expected to wrap exactly one typed value element.
                for node in root:
                    if node.tag.lower() != 'property' or len(node.children) != 1:
                        self.log_info(F'skipped malformed custom property: {node.tag}')
                        continue
                    content = node.children[0].content
                    if content is None:
                        continue
                    try:
                        key = node.attributes['name']
                    except KeyError:
                        self.log_info('skipped custom property without a name')
                        continue
                    contents[key] = content.strip()
            elif prop == _Prop.app:
                for node in root:
                    if content := node.content:
                        contents[node.tag] = content
            elif prop == _Prop.core:
                for node in root:
                    # Tags look like "prefix:name"; tags without a prefix are ignored.
                    _, _, key = node.tag.partition(':')
                    if key and (content := node.content):
                        contents[key] = content

            result[prop.name] = {key: interpret(value) for key, value in contents.items()}

        yield from ppjson(tabular=self.args.tabular)._pretty_output(result)
Subclasses
Inherited members