Module refinery.units.formats.office.xtdoc

Expand source code Browse git
from __future__ import annotations

from refinery.lib.id import is_likely_doc
from refinery.lib.ole.file import OleFile
from refinery.lib.types import buf
from refinery.units.formats import PathExtractorUnit, UnpackResult
from refinery.units.formats.archive.xtzip import xtzip


def convert_msi_name(name: str):
    """
    Decode a name whose characters may carry packed alphabet indices.

    Each character of `name` is handled by its code point:

    - 0x3800 to 0x47FF: the offset from 0x3800 holds two 6-bit indices; the
      character for the low index is emitted first, then the one for the high
      index.
    - 0x4800 to 0x4840: the offset from 0x4800 is a single alphabet index.
    - anything else is copied through unchanged.

    The function name suggests this is the encoding used for MSI stream names;
    that is not verifiable from this module alone.
    """
    alphabet = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz._!'
    pieces = []
    for character in name:
        code = ord(character)
        if code < 0x3800 or code > 0x4840:
            # Outside both encoded ranges: keep the character as it is.
            pieces.append(character)
        elif code < 0x4800:
            # Two packed indices; the low six bits come first in the output.
            high, low = divmod(code - 0x3800, 0x40)
            pieces.append(alphabet[low] + alphabet[high])
        else:
            pieces.append(alphabet[code - 0x4800])
    return ''.join(pieces)


class xtdoc(PathExtractorUnit):
    """
    Extract files from an OLE document such as a Microsoft Word DOCX file.
    """

    def unpack(self, data: buf):
        """
        Yield one `UnpackResult` per entry listed by the OLE container.

        When `OleFile` raises an `OSError` for the input, the error is logged
        and the data is passed to `xtzip` instead. Entries that are empty or
        whose last path component is empty are skipped. A leading character
        with a code point below 20 in the last component is rewritten as its
        decimal value in square brackets, and the joined path is passed through
        `convert_msi_name` before being reported.
        """
        try:
            container = OleFile(data)
        except OSError as error:
            self.log_info(F'error, {error}, treating input as zip file')
            yield from xtzip().unpack(data)
        else:
            for entry in container.listdir():
                if not (entry and entry[-1]):
                    continue
                # The stream has to be opened under its original, unmodified name.
                stream = container.openstream('/'.join(entry))
                leaf = entry[-1]
                lead = ord(leaf[:1])
                if lead < 20:
                    entry[-1] = F'[{lead:d}]{leaf[1:]}'
                name = convert_msi_name('/'.join(entry))
                self.log_debug('exploring:', name)
                yield UnpackResult(name, stream.read())

    @classmethod
    def handles(cls, data) -> bool | None:
        """
        Return `True` if `data` starts with the eight-byte signature
        D0 CF 11 E0 A1 B1 1A E1; otherwise return whatever `is_likely_doc`
        reports for the data.
        """
        if data[:8] != B'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1':
            return is_likely_doc(data)
        return True

Functions

def convert_msi_name(name)
Expand source code Browse git
def convert_msi_name(name: str):
    """
    Decode a name whose characters may carry packed alphabet indices.

    Each character of `name` is handled by its code point:

    - 0x3800 to 0x47FF: the offset from 0x3800 holds two 6-bit indices; the
      character for the low index is emitted first, then the one for the high
      index.
    - 0x4800 to 0x4840: the offset from 0x4800 is a single alphabet index.
    - anything else is copied through unchanged.

    The function name suggests this is the encoding used for MSI stream names;
    that is not verifiable from this module alone.
    """
    alphabet = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz._!'
    pieces = []
    for character in name:
        code = ord(character)
        if code < 0x3800 or code > 0x4840:
            # Outside both encoded ranges: keep the character as it is.
            pieces.append(character)
        elif code < 0x4800:
            # Two packed indices; the low six bits come first in the output.
            high, low = divmod(code - 0x3800, 0x40)
            pieces.append(alphabet[low] + alphabet[high])
        else:
            pieces.append(alphabet[code - 0x4800])
    return ''.join(pieces)

Classes

class xtdoc (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

Extract files from an OLE document such as a Microsoft Word DOCX file.

Expand source code Browse git
class xtdoc(PathExtractorUnit):
    """
    Extract files from an OLE document such as a Microsoft Word DOCX file.
    """

    def unpack(self, data: buf):
        """
        Yield one `UnpackResult` per entry listed by the OLE container.

        When `OleFile` raises an `OSError` for the input, the error is logged
        and the data is passed to `xtzip` instead. Entries that are empty or
        whose last path component is empty are skipped. A leading character
        with a code point below 20 in the last component is rewritten as its
        decimal value in square brackets, and the joined path is passed through
        `convert_msi_name` before being reported.
        """
        try:
            container = OleFile(data)
        except OSError as error:
            self.log_info(F'error, {error}, treating input as zip file')
            yield from xtzip().unpack(data)
        else:
            for entry in container.listdir():
                if not (entry and entry[-1]):
                    continue
                # The stream has to be opened under its original, unmodified name.
                stream = container.openstream('/'.join(entry))
                leaf = entry[-1]
                lead = ord(leaf[:1])
                if lead < 20:
                    entry[-1] = F'[{lead:d}]{leaf[1:]}'
                name = convert_msi_name('/'.join(entry))
                self.log_debug('exploring:', name)
                yield UnpackResult(name, stream.read())

    @classmethod
    def handles(cls, data) -> bool | None:
        """
        Return `True` if `data` starts with the eight-byte signature
        D0 CF 11 E0 A1 B1 1A E1; otherwise return whatever `is_likely_doc`
        reports for the data.
        """
        if data[:8] != B'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1':
            return is_likely_doc(data)
        return True

Ancestors

Subclasses

Class variables

var reverse

The type of the None singleton.

Methods

def unpack(self, data)
Expand source code Browse git
def unpack(self, data: buf):
    """
    Yield one `UnpackResult` per entry listed by the OLE container.

    When `OleFile` raises an `OSError` for the input, the error is logged and
    the data is passed to `xtzip` instead. Entries that are empty or whose last
    path component is empty are skipped. A leading character with a code point
    below 20 in the last component is rewritten as its decimal value in square
    brackets, and the joined path is passed through `convert_msi_name` before
    being reported.
    """
    try:
        container = OleFile(data)
    except OSError as error:
        self.log_info(F'error, {error}, treating input as zip file')
        yield from xtzip().unpack(data)
    else:
        for entry in container.listdir():
            if not (entry and entry[-1]):
                continue
            # The stream has to be opened under its original, unmodified name.
            stream = container.openstream('/'.join(entry))
            leaf = entry[-1]
            lead = ord(leaf[:1])
            if lead < 20:
                entry[-1] = F'[{lead:d}]{leaf[1:]}'
            name = convert_msi_name('/'.join(entry))
            self.log_debug('exploring:', name)
            yield UnpackResult(name, stream.read())

Inherited members