Module refinery.units.formats.office.docmeta
Expand source code Browse git
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from refinery.lib import xml
from refinery.units.formats import PathExtractorUnit, UnpackResult
from refinery.units.formats.office.xtdoc import xtdoc
class docmeta(PathExtractorUnit):
    """
    Extract metadata from Word Documents such as custom document properties.
    """

    # NOTE(review): presumably registers `olefile` as a dependency of this unit
    # and defers the import until it is first needed; confirm against the
    # implementation of PathExtractorUnit.Requires.
    @PathExtractorUnit.Requires('olefile', 'formats', 'office')
    def _olefile():
        import olefile
        return olefile

    def unpack(self, data: bytearray):
        """
        Yield one `UnpackResult` for each custom property of the document.

        The part `docProps/custom.xml` is pulled out of the input via `xtdoc`
        and parsed as XML. Every child of the `properties` element is expected
        to be a `property` element with exactly one child; the value of its
        `name` attribute becomes the result path, and the content of that one
        child, encoded with `self.codec`, becomes the result data. Nothing is
        yielded when the extracted part is empty.
        """
        extracted = data | xtdoc('docProps/custom.xml')
        document = extracted | str
        if not document:
            return
        root = xml.parse(document)
        # Follow the chain of first children down to the <properties> element.
        while root.tag.lower() != 'properties':
            root = root.children[0]
        for entry in root:
            assert entry.tag.lower() == 'property'
            assert len(entry.children) == 1
            value = entry.children[0].content
            assert value is not None
            name = entry.attributes['name']
            payload = value.encode(self.codec)
            yield UnpackResult(name, payload)
Classes
class docmeta (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
Extract metadata from Word Documents such as custom document properties.
Expand source code Browse git
class docmeta(PathExtractorUnit):
    """
    Extract metadata from Word Documents such as custom document properties.
    """

    # NOTE(review): presumably registers `olefile` as a dependency of this unit
    # and defers the import until it is first needed; confirm against the
    # implementation of PathExtractorUnit.Requires.
    @PathExtractorUnit.Requires('olefile', 'formats', 'office')
    def _olefile():
        import olefile
        return olefile

    def unpack(self, data: bytearray):
        """
        Yield one `UnpackResult` for each custom property of the document.

        The part `docProps/custom.xml` is pulled out of the input via `xtdoc`
        and parsed as XML. Every child of the `properties` element is expected
        to be a `property` element with exactly one child; the value of its
        `name` attribute becomes the result path, and the content of that one
        child, encoded with `self.codec`, becomes the result data. Nothing is
        yielded when the extracted part is empty.
        """
        extracted = data | xtdoc('docProps/custom.xml')
        document = extracted | str
        if not document:
            return
        root = xml.parse(document)
        # Follow the chain of first children down to the <properties> element.
        while root.tag.lower() != 'properties':
            root = root.children[0]
        for entry in root:
            assert entry.tag.lower() == 'property'
            assert len(entry.children) == 1
            value = entry.children[0].content
            assert value is not None
            name = entry.attributes['name']
            payload = value.encode(self.codec)
            yield UnpackResult(name, payload)
Ancestors
Class variables
var required_dependencies
var optional_dependencies
Methods
def unpack(self, data)
-
Expand source code Browse git
def unpack(self, data: bytearray):
    """
    Yield one `UnpackResult` for each custom property of the document.

    The part `docProps/custom.xml` is pulled out of the input via `xtdoc` and
    parsed as XML. Every child of the `properties` element is expected to be a
    `property` element with exactly one child; the value of its `name`
    attribute becomes the result path, and the content of that one child,
    encoded with `self.codec`, becomes the result data. Nothing is yielded
    when the extracted part is empty.
    """
    extracted = data | xtdoc('docProps/custom.xml')
    document = extracted | str
    if not document:
        return
    root = xml.parse(document)
    # Follow the chain of first children down to the <properties> element.
    while root.tag.lower() != 'properties':
        root = root.children[0]
    for entry in root:
        assert entry.tag.lower() == 'property'
        assert len(entry.children) == 1
        value = entry.children[0].content
        assert value is not None
        name = entry.attributes['name']
        payload = value.encode(self.codec)
        yield UnpackResult(name, payload)
Inherited members