Module refinery.units.formats.office.xtdoc

Expand source code Browse git
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from typing import Optional

from refinery.units.formats import PathExtractorUnit, UnpackResult
from refinery.units.formats.archive.xtzip import xtzip
from refinery.lib.structures import MemoryFile


def convert_msi_name(name: str):
    def _decode(alphabet='0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz._!'):
        for character in name:
            code = ord(character)
            if 0x3800 <= code < 0x4800:
                yield alphabet[(code - 0x3800) & 0x3F] + alphabet[((code - 0x3800) >> 6) & 0x3F]
            elif 0x4800 <= code <= 0x4840:
                yield alphabet[code - 0x4800]
            else:
                yield character
    return ''.join(_decode())


class xtdoc(PathExtractorUnit):
    """
    Extract files from an OLE document such as a Microsoft Word DOCX file.
    """

    @PathExtractorUnit.Requires('olefile', 'formats', 'office', 'extended')
    def _olefile():
        import olefile
        return olefile

    def unpack(self, data):
        with MemoryFile(data) as stream:
            try:
                oledoc = self._olefile.OleFileIO(stream)
            except OSError as error:
                self.log_info(F'error, {error}, treating input as zip file')
                yield from xtzip().unpack(data)
                return
            for item in oledoc.listdir():
                if not item or not item[-1]:
                    continue
                path = '/'.join(item)
                olestream = oledoc.openstream(path)
                c0 = ord(item[-1][:1])
                if c0 < 20:
                    item[-1] = F'[{c0:d}]{item[-1][1:]}'
                    path = '/'.join(item)
                path = convert_msi_name(path)
                self.log_debug('exploring:', path)
                yield UnpackResult(path, olestream.read())

    @classmethod
    def handles(self, data: bytearray) -> Optional[bool]:
        if data.startswith(B'\xD0\xCF\x11\xE0'):
            return True
        if xtzip.handles(data):
            return sum(1 for marker in [
                B'[Content_Types].xml',
                B'word/document.xml',
                B'docProps/core.xml',
            ] if marker in data) >= 2

Functions

def convert_msi_name(name)
Expand source code Browse git
def convert_msi_name(name: str):
    def _decode(alphabet='0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz._!'):
        for character in name:
            code = ord(character)
            if 0x3800 <= code < 0x4800:
                yield alphabet[(code - 0x3800) & 0x3F] + alphabet[((code - 0x3800) >> 6) & 0x3F]
            elif 0x4800 <= code <= 0x4840:
                yield alphabet[code - 0x4800]
            else:
                yield character
    return ''.join(_decode())

Classes

class xtdoc (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

Extract files from an OLE document such as a Microsoft Word DOCX file.

Expand source code Browse git
class xtdoc(PathExtractorUnit):
    """
    Extract files from an OLE document such as a Microsoft Word DOCX file.
    """

    @PathExtractorUnit.Requires('olefile', 'formats', 'office', 'extended')
    def _olefile():
        import olefile
        return olefile

    def unpack(self, data):
        with MemoryFile(data) as stream:
            try:
                oledoc = self._olefile.OleFileIO(stream)
            except OSError as error:
                self.log_info(F'error, {error}, treating input as zip file')
                yield from xtzip().unpack(data)
                return
            for item in oledoc.listdir():
                if not item or not item[-1]:
                    continue
                path = '/'.join(item)
                olestream = oledoc.openstream(path)
                c0 = ord(item[-1][:1])
                if c0 < 20:
                    item[-1] = F'[{c0:d}]{item[-1][1:]}'
                    path = '/'.join(item)
                path = convert_msi_name(path)
                self.log_debug('exploring:', path)
                yield UnpackResult(path, olestream.read())

    @classmethod
    def handles(self, data: bytearray) -> Optional[bool]:
        if data.startswith(B'\xD0\xCF\x11\xE0'):
            return True
        if xtzip.handles(data):
            return sum(1 for marker in [
                B'[Content_Types].xml',
                B'word/document.xml',
                B'docProps/core.xml',
            ] if marker in data) >= 2

Ancestors

Subclasses

Class variables

var required_dependencies
var optional_dependencies

Methods

def unpack(self, data)
Expand source code Browse git
def unpack(self, data):
    with MemoryFile(data) as stream:
        try:
            oledoc = self._olefile.OleFileIO(stream)
        except OSError as error:
            self.log_info(F'error, {error}, treating input as zip file')
            yield from xtzip().unpack(data)
            return
        for item in oledoc.listdir():
            if not item or not item[-1]:
                continue
            path = '/'.join(item)
            olestream = oledoc.openstream(path)
            c0 = ord(item[-1][:1])
            if c0 < 20:
                item[-1] = F'[{c0:d}]{item[-1][1:]}'
                path = '/'.join(item)
            path = convert_msi_name(path)
            self.log_debug('exploring:', path)
            yield UnpackResult(path, olestream.read())

Inherited members