Module refinery.units.formats.office.xtdoc
Expand source code Browse git
from __future__ import annotations
from refinery.lib.id import is_likely_doc
from refinery.lib.structures import MemoryFile
from refinery.units.formats import PathExtractorUnit, UnpackResult
from refinery.units.formats.archive.xtzip import xtzip
def convert_msi_name(name: str):
    """
    Decode packed characters in a stream name and return the readable string.

    Each character of `name` is handled according to its code point:

    - 0x3800 up to (but excluding) 0x4800 expands to two alphabet characters; the low six bits
      of the offset from 0x3800 select the first one and the next six bits select the second.
    - 0x4800 through 0x4840 (inclusive) expands to the single alphabet character at that offset
      from 0x4800.
    - Any other character is copied through unchanged.

    NOTE(review): judging by the function name, this is presumably the name packing used for
    streams inside MSI files; confirm against the caller.
    """
    alphabet = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz._!'
    pieces = []
    for symbol in name:
        offset = ord(symbol) - 0x3800
        if 0 <= offset < 0x1000:
            # Two packed six-bit indices; the low index is emitted first.
            upper, lower = divmod(offset, 0x40)
            pieces.append(alphabet[lower] + alphabet[upper])
        elif 0x1000 <= offset <= 0x1040:
            # One index; offset 0x40 maps to the final alphabet entry, '!'.
            pieces.append(alphabet[offset - 0x1000])
        else:
            pieces.append(symbol)
    return ''.join(pieces)
class xtdoc(PathExtractorUnit):
    """
    Extract files from an OLE document such as a Microsoft Word DOCX file. Input that cannot be
    opened as an OLE container is handed to the ZIP extractor instead.
    """

    @PathExtractorUnit.Requires('olefile', ['formats', 'office', 'extended'])
    def _olefile():
        # The import is deferred into this accessor rather than done at module level.
        import olefile
        return olefile

    def unpack(self, data):
        """
        Yield one `UnpackResult` for every entry listed by the OLE container in `data`.

        When opening the container raises `OSError`, the error is logged and the results of
        `xtzip().unpack(data)` are yielded instead.
        """
        with MemoryFile(data) as stream:
            try:
                oledoc = self._olefile.OleFileIO(stream)
            except OSError as error:
                self.log_info(F'error, {error}, treating input as zip file')
                yield from xtzip().unpack(data)
                return
            for parts in oledoc.listdir():
                # Skip empty listings and entries whose last component is empty.
                if not (parts and parts[-1]):
                    continue
                # The stream is opened with the raw, unmodified path.
                olestream = oledoc.openstream('/'.join(parts))
                *parents, leaf = parts
                marker = ord(leaf[0])
                if marker < 20:
                    # Show a low leading code point as a bracketed decimal number.
                    leaf = F'[{marker:d}]{leaf[1:]}'
                path = convert_msi_name('/'.join([*parents, leaf]))
                self.log_debug('exploring:', path)
                yield UnpackResult(path, olestream.read())

    @classmethod
    def handles(cls, data) -> bool | None:
        """
        Return `True` when `data` begins with the eight-byte signature checked below; otherwise
        return whatever `is_likely_doc` reports for it.
        """
        signature = B'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'
        if data[:8] == signature:
            return True
        return is_likely_doc(data)
Functions
def convert_msi_name(name)
-
Expand source code Browse git
def convert_msi_name(name: str):
    """
    Translate packed code points in `name` into plain alphabet characters.

    Code points in [0x3800, 0x4800) yield two characters (low six bits of the offset first, then
    the next six bits); code points in [0x4800, 0x4840] yield one character; all other
    characters pass through untouched.
    """
    alphabet = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz._!'
    decoded = []
    for symbol in name:
        offset = ord(symbol) - 0x3800
        if 0 <= offset < 0x1000:
            upper, lower = divmod(offset, 0x40)
            decoded.append(alphabet[lower] + alphabet[upper])
        elif 0x1000 <= offset <= 0x1040:
            decoded.append(alphabet[offset - 0x1000])
        else:
            decoded.append(symbol)
    return ''.join(decoded)
Classes
class xtdoc (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
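Extract files from an OLE document such as a legacy Microsoft Word DOC file. Input that cannot be parsed as OLE, such as a regular DOCX file, is treated as a ZIP archive instead.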
Expand source code Browse git
class xtdoc(PathExtractorUnit):
    """
    Extract files from an OLE document such as a Microsoft Word DOCX file. Input that cannot be
    opened as an OLE container is handed to the ZIP extractor instead.
    """

    @PathExtractorUnit.Requires('olefile', ['formats', 'office', 'extended'])
    def _olefile():
        # The import is deferred into this accessor rather than done at module level.
        import olefile
        return olefile

    def unpack(self, data):
        """
        Yield one `UnpackResult` for every entry listed by the OLE container in `data`, or the
        results of `xtzip().unpack(data)` when opening the container raises `OSError`.
        """
        with MemoryFile(data) as stream:
            try:
                oledoc = self._olefile.OleFileIO(stream)
            except OSError as error:
                self.log_info(F'error, {error}, treating input as zip file')
                yield from xtzip().unpack(data)
                return
            for parts in oledoc.listdir():
                if not (parts and parts[-1]):
                    continue
                # The stream is opened with the raw, unmodified path.
                olestream = oledoc.openstream('/'.join(parts))
                *parents, leaf = parts
                marker = ord(leaf[0])
                if marker < 20:
                    leaf = F'[{marker:d}]{leaf[1:]}'
                path = convert_msi_name('/'.join([*parents, leaf]))
                self.log_debug('exploring:', path)
                yield UnpackResult(path, olestream.read())

    @classmethod
    def handles(cls, data) -> bool | None:
        """
        Return `True` when `data` begins with the eight-byte signature checked below; otherwise
        return whatever `is_likely_doc` reports for it.
        """
        signature = B'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'
        if data[:8] == signature:
            return True
        return is_likely_doc(data)
Ancestors
Subclasses
Class variables
var required_dependencies
var console
var reverse
var optional_dependencies
Methods
def unpack(self, data)
-
Expand source code Browse git
def unpack(self, data):
    """
    Yield one `UnpackResult` for every entry listed by the OLE container in `data`.

    When opening the container raises `OSError`, the error is logged and the results of
    `xtzip().unpack(data)` are yielded instead.
    """
    with MemoryFile(data) as stream:
        try:
            oledoc = self._olefile.OleFileIO(stream)
        except OSError as error:
            self.log_info(F'error, {error}, treating input as zip file')
            yield from xtzip().unpack(data)
            return
        for parts in oledoc.listdir():
            # Skip empty listings and entries whose last component is empty.
            if not (parts and parts[-1]):
                continue
            # The stream is opened with the raw, unmodified path.
            olestream = oledoc.openstream('/'.join(parts))
            *parents, leaf = parts
            marker = ord(leaf[0])
            if marker < 20:
                # Show a low leading code point as a bracketed decimal number.
                leaf = F'[{marker:d}]{leaf[1:]}'
            path = convert_msi_name('/'.join([*parents, leaf]))
            self.log_debug('exploring:', path)
            yield UnpackResult(path, olestream.read())
Inherited members