Module `refinery`

        __     __  High Octane Triage Analysis          __
        ||    _||______ __       __________     _____   ||
        ||    \||___   \__| ____/   ______/___ / ____\  ||
========||=====||  | __/  |/    \  /==|  / __ \   __\===]|
        '======||  |   \  |   |  \_  _| \  ___/|  |     ||
               ||____  /__|___|__/  / |  \____]|  |     ||
===============''====\/=========/  /==|__|=====|__|======'
                               \  /
                                \/

This is the binary refinery package documentation; see GitHub and PyPi for more information.

The package refinery exports all Units which are of type Entry; this marker implies that the unit exposes a shell command. The command line interface for each of these units is given below, this is the same text as would be available by executing the command with the -h or --help option. The documentation for this module only lists the classes that correspond to exported refinery units, but for convenience, the refinery module also exports the classes Unit and Arg.

To better understand how the command line parameters are parsed, it is also recommended to study the module documentation of the following library modules, as their content is relevant for how the various Units can be combined.

refinery.lib.frame: framing syntax for working on lists of binary chunks
refinery.lib.argformats: the multibin syntax for refinery arguments
refinery.lib.meta: defining and using metadata variables within frames
refinery.units: writing custom units, add command-line arguments, and how to use refinery units within Python code.

Expand source code Browse git

R"""
    ----------------------------------------------------------
            __     __  High Octane Triage Analysis          __
            ||    _||______ __       __________     _____   ||
            ||    \||___   \__| ____/   ______/___ / ____\  ||
    ========||=====||  | __/  |/    \  /==|  / __ \   __\===]|
            '======||  |   \  |   |  \_  _| \  ___/|  |     ||
                   ||____  /__|___|__/  / |  \____]|  |     ||
    ===============''====\/=========/  /==|__|=====|__|======'
                                   \  /
                                    \/

This is the binary refinery package documentation; see
 [GitHub](https://github.com/binref/refinery/) and
 [PyPi](https://pypi.org/project/binary-refinery/)
for more information.

The package `refinery` exports all `refinery.units.Unit`s which are of type `refinery.units.Entry`;
this marker implies that the unit exposes a shell command. The command line interface for each of
these units is given below, this is the same text as would be available by executing the command
with the `-h` or `--help` option. The documentation for this module only lists the classes that
correspond to exported refinery units, but for convenience, the `refinery` module also exports the
classes `refinery.units.Unit` and `refinery.units.Arg`.

To better understand how the command line parameters are parsed, it is also recommended to study
the module documentation of the following library modules, as their content is relevant for how the
various `refinery.units.Unit`s can be combined.

1. `refinery.lib.frame`: framing syntax for working on lists of binary chunks
2. `refinery.lib.argformats`: the multibin syntax for refinery arguments
3. `refinery.lib.meta`: defining and using metadata variables within frames
4. `refinery.units`: writing custom units, add command-line arguments, and how to use refinery
   units within Python code.
"""
from __future__ import annotations

# Version of this package; the unit loader compares it against the version stored in the pickled
# unit map and rebuilds the map when the two differ.
__version__ = '0.9.8'
# Distribution name of the package (cf. the PyPi link in the module docstring).
__distribution__ = 'binary-refinery'

import pickle

from datetime import datetime
from threading import RLock
from typing import TYPE_CHECKING, Iterable, TypeVar, cast

from refinery.lib import resources
from refinery.units import Arg, Unit

if TYPE_CHECKING:
    from pathlib import Path


# Generic type variable; used to annotate the argument and return value of `_singleton`.
_T = TypeVar('_T')


def _singleton(cls: type[_T]) -> _T:
    """
    Class decorator that calls the decorated class without arguments and returns the resulting
    instance, so that the decorated name is bound to that instance instead of the class.
    """
    instance = cls()
    return instance


@_singleton
class __unit_loader__:
    """
    Every unit can be imported from the refinery base module. The import is performed on demand to
    reduce import times. The library ships with a pickled dictionary that maps unit names to their
    corresponding module path. This data is stored as `units.pkl` in the data directory.

    The loader can be used as a context manager, which acquires a re-entrant lock for the duration
    of the `with` block.
    """
    # Maps each unit name to the dotted path of the module that defines it.
    units: dict[str, str]
    # Unit classes that have been imported already, keyed by unit name.
    cache: dict[str, type[Unit]]
    # Class-level lock; acquired by __enter__ and released by __exit__.
    _lock: RLock = RLock()

    def __init__(self):
        self.path = resources.datapath('units.pkl')
        # Guards against re-entering reload() while a reload is in progress.
        self.reloading = False
        # Set once the unit map was read from disk or written back to disk.
        self.loaded = False
        self.units = {}
        self.cache = {}
        # Initialized to a fixed date in the past; not read anywhere within this class.
        self.last_reload = datetime(1985, 8, 5)
        self.load()

    def __enter__(self):
        self._lock.__enter__()
        return self

    def __exit__(self, et, ev, tb):
        return self._lock.__exit__(et, ev, tb)

    def load(self):
        """
        Read the unit map from the pickle file. The map is rebuilt via `reload` when the file is
        missing, empty, not a valid pickle, not a mapping with a version entry, or was written by
        a different version of the package.
        """
        try:
            # The context manager closes the file handle deterministically.
            # NOTE(review): pickle must never be fed untrusted data; this file is expected to be
            # shipped with the package or written by `save`.
            with self.path.open('rb') as stream:
                cache = pickle.load(stream)
        except (FileNotFoundError, EOFError, pickle.UnpicklingError):
            cache = None
        else:
            try:
                version = cache['version']
            except (KeyError, TypeError):
                # TypeError covers unpickled objects that do not support string subscripts.
                cache = None
            else:
                if version != __version__:
                    cache = None
        if cache is None:
            self.reload()
        else:
            self.units = cache['units']
            self.loaded = True

    def clear(self):
        """
        Mark the loader as not loaded and empty both the unit map and the class cache.
        """
        self.loaded = False
        self.units.clear()
        self.cache.clear()

    def save(self):
        """
        Write the unit map together with the current package version to the pickle file. This is
        best-effort: any failure is ignored, and the loader is only marked as loaded on success.
        """
        try:
            with cast('Path', self.path).open('wb') as out:
                pickle.dump({'units': self.units, 'version': __version__}, out)
        except Exception:
            # Deliberately ignored; the in-memory map remains usable without the file.
            # NOTE(review): `loaded` stays False in this case, so `resolve` will trigger another
            # load attempt on its next call - confirm that this is intended.
            pass
        else:
            self.loaded = True

    def reload(self):
        """
        Rebuild the unit map and the class cache from all entry points reported by
        `refinery.lib.loader.get_all_entry_points` and save the result. Does nothing when a reload
        is already in progress.
        """
        if self.reloading:
            return
        from refinery.lib.loader import get_all_entry_points
        self.reloading = True
        try:
            self.clear()
            for executable in get_all_entry_points():
                name = executable.__name__
                self.units[name] = executable.__module__
                self.cache[name] = executable
            self.save()
        finally:
            # Reset the flag even when discovery fails; otherwise no later reload could ever run.
            self.reloading = False

    def resolve(self, name) -> type[Unit] | None:
        """
        Import the module that defines the unit with the given name and return the unit class.
        Returns `None` when the name is not in the unit map or the module cannot be found.
        """
        if not self.loaded:
            self.load()
        try:
            module_path = self.units[name]
            # A non-empty fromlist makes __import__ return the submodule itself rather than the
            # top-level package.
            module = __import__(module_path, None, None, [name])
            entry = getattr(module, name)
            self.cache[name] = entry
            return entry
        except (KeyError, ModuleNotFoundError):
            return None


# Dictionary of pdoc overrides that is filled lazily: nothing is computed until `items` is called
# for the first time.
@_singleton
class __pdoc__(dict):
    def __init__(self, *a, **kw):
        # Arguments are accepted but ignored; the dictionary always starts out empty.
        super().__init__()
        self._loaded = False

    def _strip_globals(self, hlp: str):
        # Removes every line starting with 'global options:' (case-insensitive) along with the
        # lines directly below it that begin with whitespace.
        kept = []
        skipping = False
        for line in hlp.splitlines(keepends=True):
            if skipping and line.lstrip() != line:
                continue
            skipping = line.lower().startswith('global options:')
            if not skipping:
                kept.append(line)
        return ''.join(kept)

    def _load(self):
        # Populates the dictionary once; subsequent calls return immediately.
        if self._loaded:
            return
        from .explore import get_help_string
        for hidden in ('Unit', 'Arg'):
            self[hidden] = False
        with __unit_loader__ as ul:
            for name in ul.units:
                unit = ul.resolve(name)
                if unit is None:
                    continue
                # Walk the MRO until a class without __abstractmethods__ is reached and hide the
                # public abstract methods that the unit does not share with its direct parent.
                for base in unit.mro():
                    try:
                        abstract_names: list[str] = base.__abstractmethods__
                    except AttributeError:
                        break
                    for method in abstract_names:
                        if method.startswith('_'):
                            continue
                        at = getattr(unit, method, NotImplemented)
                        bt = getattr(unit.mro()[1], method, None)
                        if at is NotImplemented or at is None or at is bt:
                            continue
                        self[F'{name}.{method}'] = False
                hlp = get_help_string(unit, width=97)
                if not hlp:
                    continue
                # Backticks are removed before the text is placed inside a fenced block.
                hlp = self._strip_globals(hlp.replace('\x60', '')).strip()
                self[name] = (
                    F'This unit is implemented in `{unit.__module__}` and has the following '
                    F'commandline Interface:\n```text\n{hlp}\n```'
                )
        self._loaded = True

    def items(self):
        # Accessing the items is what triggers the lazy population.
        self._load()
        return super().items()


# Exported names: all unit names known to the loader in case-insensitive order, followed by the
# names of the Unit and Arg classes and the two module-level singletons.
__all__ = sorted(__unit_loader__.units, key=lambda x: x.lower()) + [
    Unit.__name__, Arg.__name__, '__unit_loader__', '__pdoc__']


def load(name) -> type[Unit] | None:
    """
    Resolve the unit with the given name through the unit loader while holding its lock. The
    result is the unit class, or `None` when the loader cannot resolve the name.
    """
    with __unit_loader__ as loader:
        unit = loader.resolve(name)
    return unit


def __getattr__(name):
    """
    Module-level attribute hook: resolves the requested name as a unit through the unit loader
    and raises `AttributeError` when the loader returns nothing.
    """
    with __unit_loader__ as loader:
        resolved = loader.resolve(name)
    if resolved is not None:
        return resolved
    raise AttributeError(name)


def __dir__():
    """
    Report the exported names in `__all__` as the directory listing of this module.
    """
    exported = __all__
    return exported

Sub-modules

refinery.data: This module contains data resources.
refinery.explore: A commandline script to search for binary refinery units based on keywords.
refinery.lib: Library functions used by various refinery units.
refinery.shell: Shell-Like Unit Interface …
refinery.units: This package contains all refinery units. To write an executable refinery unit, it is sufficient to write a class inheriting from …

Functions

def cast(typ, val)

This unit is implemented in refinery.units.crypto.cipher.cast and has the following commandline Interface:

usage: cast [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-t TAG]
            [-a AAD]
            key

CAST encryption and decryption.

positional arguments:
  key                   The encryption key.

options:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, EAX,
                        ECB, OFB. By default, the CBC mode is used when an IV is provided, and
                        ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of segmentation bits. It must be a multiple of 8.
                        The default of 0 means that the block size will be used as the segment
                        size.
  -t, --tag TAG         Only for EAX, GCM, OCB, and CCM: An authentication tag to verify the
                        message. For encryption, this parameter specifies the tag length, and the
                        tag is provided as a meta variable named "tag".
  -a, --aad AAD         Only for EAX, GCM, OCB, and CCM: Set additional authenticated data.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation.

Expand source code

def cast(typ, val):
    """Return *val* as it is while declaring its type to be *typ*.

    The call only carries information for the type checker: it tells the
    checker to treat the result as having the designated type. Nothing is
    verified or converted at runtime, which keeps the call as cheap as
    possible.
    """
    return val

Units

class a3x (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.a3x and has the following commandline Interface:

usage: a3x [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path ...]

Extracts embedded resources from compiled AutoIt scripts and decompiles the embedded script
bytecode. The unit also works on compiled AutoIt executables.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class a3x(PathExtractorUnit):
    """
    Extracts embedded resources from compiled AutoIt scripts and decompiles the embedded script
    bytecode. The unit also works on compiled AutoIt executables.
    """

    def unpack(self, data: bytearray):
        # Scans the input for scripts, yields the records of every intact script, and afterwards
        # yields those records of truncated scripts that no intact script contained.
        view = memoryview(data)
        cursor = 0
        # Parse errors by the offset at which parsing was attempted.
        errors: dict[int, Exception] = {}
        # Counts only scripts that are not truncated and have a non-empty body.
        script_count = 0
        truncated: set[A3xRecord] = set()
        intact: set[A3xRecord] = set()

        def _package(records: Iterable[A3xRecord]) -> Generator[UnpackResult]:
            # Logs type and source path of each record; records without a path are skipped, all
            # others are converted to an UnpackResult carrying the timestamps as metadata.
            for k, record in enumerate(records, 1):
                self.log_info(F'record {k} type:', record.type)
                self.log_info(F'record {k} path:', record.src_path)
                if record.path is None:
                    continue
                yield UnpackResult(
                    record.path,
                    record.extract,
                    srcpath=record.src_path,
                    created=record.created.isoformat(' ', 'seconds'),
                    written=record.written.isoformat(' ', 'seconds'),
                )

        while cursor < len(view):
            self.log_debug(F'searching at offset 0x{cursor:08X}')
            # Prefer the next script magic. When there is none, look for the next record magic
            # instead and step back by the script header width from that position.
            nc = data.find(A3xScript.MAGIC, cursor)
            if nc >= 0:
                cursor = nc
            else:
                rp = data.find(A3xRecord.MAGIC, cursor) - A3xScript.WIDTH
                # Ends the scan when no progress is possible; this includes a failed search,
                # which yields a negative position (assuming WIDTH is not negative).
                if rp <= cursor:
                    break
                cursor = rp
            try:
                script = A3xScript(view[cursor:])
            except Exception as E:
                # Remember the failure and retry one byte further.
                errors[cursor] = E
                cursor += 1
                continue
            else:
                # The validity of the magic is only used for the log message below.
                valid = script.has_valid_magic()
                if valid:
                    _m = 'correct'
                else:
                    _m = 'invalid'
                if not script.body:
                    # Nothing to extract: skip the header, and additionally the length of the
                    # record magic when the script magic was invalid.
                    cursor += A3xScript.WIDTH
                    if not script.has_valid_magic():
                        cursor += len(A3xRecord.MAGIC)
                    continue
                if script.truncated:
                    _a = 'truncated'
                    truncated.update(script.body)
                else:
                    script_count += 1
                    _a = 'intact'
                    intact.update(script.body)
                self.log_info(
                    F'{_a} script of type', script.type,
                    F'and length 0x{len(script):08X}',
                    F'with {len(script.body)} records and {_m} magic:',
                    script.magic
                )
                cursor += len(script)
                if script.truncated:
                    # Records of truncated scripts are not emitted here; see the end of the method.
                    if not script.has_valid_magic():
                        cursor += len(A3xRecord.MAGIC)
                    continue

            yield from _package(script.body)

        # Only records that no intact script provided are emitted from truncated scripts.
        remaining = truncated - intact
        if remaining:
            self.log_warn('emitting records from truncated scripts')
            yield from _package(remaining)
            return
        elif truncated:
            self.log_debug('good news: intact scripts contained all records from truncated scripts')
        if script_count == 0:
            # No intact script was found: log every recorded parse error and raise the last one
            # in the iteration order of the dictionary, if there is any.
            error = None
            for offset, error in errors.items():
                self.log_warn(F'error at offset 0x{offset:08X}:', error)
            if error:
                raise error

    @classmethod
    def handles(cls, data) -> bool | None:
        # The input qualifies when it contains either the script magic or the record magic.
        return buffer_contains(data, A3xScript.MAGIC) or buffer_contains(data, A3xRecord.MAGIC)

class a85

This unit is implemented in refinery.units.encoding.a85 and has the following commandline Interface:

usage: a85 [-h] [-L] [-Q] [-0] [-v] [-R] [-F]

Ascii85 encoding and decoding, the predecessor variant of Base85 with a different alphabet.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class a85(Unit):
    """
    Ascii85 encoding and decoding, the predecessor variant of Base85 with a different alphabet.
    """
    def reverse(self, data):
        # The reverse operation encodes the input as Ascii85.
        encoded = base64.a85encode(data)
        return encoded

    def process(self, data):
        # All whitespace is removed from the input before it is decoded.
        contains_whitespace = re.search(BR'\s', data) is not None
        if contains_whitespace:
            data = re.sub(BR'\s+', B'', data)
        decoded = base64.a85decode(data)
        return decoded

    @classmethod
    def handles(cls, data):
        # The input is handled when the a85s format pattern matches it in its entirety.
        from refinery.lib.patterns import formats
        pattern = formats.a85s.value.bin
        return pattern.fullmatch(data) is not None

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    # The reverse operation encodes the input as Ascii85.
    encoded = base64.a85encode(data)
    return encoded

class add (*argument, bigendian=False, blocksize=0)

This unit is implemented in refinery.units.blockwise.add and has the following commandline Interface:

usage: add [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] [argument ...]

Add the given argument to each block.

positional arguments:
  argument           A single numeric expression which provides the right argument to the
                     operation, where the left argument is each block in the input data. This
                     argument can also contain a sequence of bytes which is then split into
                     blocks of the same size as the input data and used cyclically.

options:
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes. It is chosen, by default, to be the
                     smallest size that can hold the provided argument without loss of precision.
                     For example, passing the value 0x1234 will result in a default block size of
                     2, while passing the value 12 will mean that the default block size is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class add(BinaryOperationWithAutoBlockAdjustment):
    """
    Add the given argument to each block.
    """
    @staticmethod
    def operate(a, b):
        # Result for a single block: the sum of both operands.
        return a + b

    @staticmethod
    def inplace(a, b):
        # Augmented addition on the first operand; no value is returned.
        a += b

class adler32 (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.checksums and has the following commandline Interface:

usage: adler32 [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the Adler32 hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class adler32(HashUnit):
    """
    Returns the Adler32 hash of the input data.
    """
    def _algorithm(self, data) -> bytes:
        # zlib returns the checksum as an integer; it is serialized as four big-endian bytes.
        checksum = zlib.adler32(data)
        return checksum.to_bytes(4, 'big')

class aes (key, *, iv=b'', padding=None, mode=None, raw=False, little_endian=False, segment_size=0, tag=(), aad=b'')

This unit is implemented in refinery.units.crypto.cipher.aes and has the following commandline Interface:

usage: aes [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-t TAG] [-a AAD]
           key

AES encryption and decryption.

positional arguments:
  key                   The encryption key.

options:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CCM, CFB, CTR,
                        EAX, ECB, GCM, KW, KWP, OCB, OFB. By default, the CBC mode is used when
                        an IV is provided, and ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of segmentation bits. It must be a multiple of 8.
                        The default of 0 means that the block size will be used as the segment
                        size.
  -t, --tag TAG         Only for EAX, GCM, OCB, and CCM: An authentication tag to verify the
                        message. For encryption, this parameter specifies the tag length, and the
                        tag is provided as a meta variable named "tag".
  -a, --aad AAD         Only for EAX, GCM, OCB, and CCM: Set additional authenticated data.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation.

Expand source code Browse git

# The class body is empty apart from its docstring: everything is inherited from
# StandardBlockCipherUnit, which receives AES wrapped in PyCryptoFactoryWrapper through the
# `cipher` class keyword.
class aes(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(AES)):
    """
    AES encryption and decryption.
    """

class alu (operator, *argument, seed=0, prologue='', epilogue='', inc=False, dec=False, cbc=False, ctr=False, bigendian=False, blocksize=1, precision=-1)

This unit is implemented in refinery.units.blockwise.alu and has the following commandline Interface:

usage: alu [-h] [-L] [-Q] [-0] [-v] [-s STR] [-p E] [-e E | -I | -D | -X | -T] [-E] [-B N] [-P N]
           operator [argument ...]

The arithmetic-logical unit. It allows you to specify a custom Python expression where the
following variables are allowed:

- the variable A: same as V[0]
- the variable B: current block
- the variable E: block value of encoded input (not changed after update)
- the variable N: number of bytes in the input
- the variable K: current index in the input
- the variable S: the internal state value
- the variable V: the vector of arguments
- the variable I: function that casts to a signed int in current precision
- the variable U: function that casts to unsigned int in current precision
- the variable R: function; R(x,4) rotates x by 4 to the right
- the variable L: function; L(x,4) rotates x by 4 to the left
- the variable M: function; M(x,8) picks the lower 8 bits of x
- the variable X: function that negates the bits of the input

(The rotation operations are interpreted as shifts when arbitrary precision is used.)

Each block of the input is replaced by the value of this expression. Additionally, it is possible
to specify prologue and epilogue expressions which are used to update the state variable S before
and after the update of each block, respectively.

positional arguments:
  operator           A Python expression defining the operation.
  argument           A single numeric expression which provides the right argument to the
                     operation, where the left argument is each block in the input data. This
                     argument can also contain a sequence of bytes which is then split into
                     blocks of the same size as the input data and used cyclically.

options:
  -s, --seed STR     Optional seed value for the state variable S. The default is zero. This can
                     be an expression involving the variable N.
  -p, --prologue E   Optional expression with which the state variable S is updated before a
                     block is operated on.
  -e, --epilogue E   Optional expression with which the state variable S is updated after a block
                     was operated on.
  -I, --inc          equivalent to --epilogue=S+1
  -D, --dec          equivalent to --epilogue=S-1
  -X, --cbc          equivalent to --epilogue=(B)
  -T, --ctr          equivalent to --epilogue=S+B
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes. The default is 1.
  -P, --precision N  The size of the variables used for computing the result. By default, this is
                     equal to the block size. The value may be zero, indicating that arbitrary
                     precision is required.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class alu(ArithmeticUnit):
    """
    The arithmetic-logical unit. It allows you to specify a custom Python expression where the following
    variables are allowed:

    - the variable `A`: same as `V[0]`
    - the variable `B`: current block
    - the variable `E`: block value of encoded input (not changed after update)
    - the variable `N`: number of bytes in the input
    - the variable `K`: current index in the input
    - the variable `S`: the internal state value
    - the variable `V`: the vector of arguments
    - the variable `I`: function that casts to a signed int in current precision
    - the variable `U`: function that casts to unsigned int in current precision
    - the variable `R`: function; `R(x,4)` rotates x by 4 to the right
    - the variable `L`: function; `L(x,4)` rotates x by 4 to the left
    - the variable `M`: function; `M(x,8)` picks the lower 8 bits of x
    - the variable `X`: function that negates the bits of the input

    (The rotation operations are interpreted as shifts when arbitrary precision is used.)

    Each block of the input is replaced by the value of this expression. Additionally, it is possible to
    specify prologue and epilogue expressions which are used to update the state variable `S` before and
    after the update of each block, respectively.
    """

    @staticmethod
    def _parse_op(definition, default=None):
        # Returns the definition when it is non-empty and the default otherwise; a ValueError is
        # raised when neither of them is given.
        if definition:
            return definition
        elif not default:
            raise ValueError('No definition given')
        else:
            return default

    def __init__(
        self,
        operator: Param[str, Arg.String(help='A Python expression defining the operation.')],
        *argument,
        seed: Param[int | str, Arg.String('-s', help=(
            'Optional seed value for the state variable S. The default is zero. This can be an expression '
            'involving the variable N.'))] = 0,
        prologue: Param[str, Arg.String('-p', metavar='E', help=(
            'Optional expression with which the state variable S is updated before a block is operated on.'))] = '',
        epilogue: Param[str, Arg.String('-e', metavar='E', group='EPI', help=(
            'Optional expression with which the state variable S is updated after a block was operated on.'))] = '',
        inc: Param[bool, Arg.Switch('-I', group='EPI', help='equivalent to --epilogue=S+1')] = False,
        dec: Param[bool, Arg.Switch('-D', group='EPI', help='equivalent to --epilogue=S-1')] = False,
        cbc: Param[bool, Arg.Switch('-X', group='EPI', help='equivalent to --epilogue=(B)')] = False,
        ctr: Param[bool, Arg.Switch('-T', group='EPI', help='equivalent to --epilogue=S+B')] = False,
        bigendian=False, blocksize=1, precision=-1
    ):
        # Each shortcut switch stands for a fixed epilogue expression. Combining a switch with an
        # explicit epilogue, or two switches with each other, is rejected as ambiguous.
        for flag, flag_is_set, expression in [
            ('--cbc', cbc, '(B)'),
            ('--inc', inc, 'S+1'),
            ('--dec', dec, 'S-1'),
            ('--ctr', ctr, 'S+B'),
        ]:
            if flag_is_set:
                if epilogue:
                    raise ValueError(
                        F'Ambiguous specification; epilogue was already set to {epilogue} '
                        F'when {flag} was parsed.')
                epilogue = expression

        # The index counter is handed to the base class as the first positional argument, ahead
        # of the user-supplied arguments.
        self._index = IndexCounter()

        # Prologue and epilogue fall back to 'S', which leaves the state variable unchanged.
        super().__init__(
            self._index,
            *argument,
            bigendian=bigendian,
            blocksize=blocksize,
            precision=precision,
            seed=seed,
            operator=self._parse_op(operator),
            prologue=self._parse_op(prologue, 'S'),
            epilogue=self._parse_op(epilogue, 'S'),
        )

    @property
    def _is_ecb(self):
        # True only when both the epilogue and the prologue argument are empty.
        return not self.args.epilogue and not self.args.prologue

    def _fastblock(self, data):
        # Always raises; presumably this opts out of a fast path of the base class - TODO confirm.
        raise FastBlockError

    def process(self, data):
        # Compiles the three expressions, assembles the evaluation context, and then delegates to
        # the base class with `operate` temporarily replaced by a closure over that context.
        context = dict(metavars(data))
        seed = self.args.seed
        fbits = self.fbits
        fmask = self.fmask

        self._index.init(self.fmask)

        def _expression(definition: str):
            return PythonExpression(definition, *'IBEASMNVRLX', all_variables_allowed=True, mask=fmask)

        prologue = _expression(self.args.prologue).expression
        epilogue = _expression(self.args.epilogue).expression
        operator = _expression(self.args.operator).expression

        def cast_unsigned(n) -> int:
            return int(n) & fmask

        def cast_signed(n) -> int:
            # Interprets the masked value as two's complement: when the top bit is set, the
            # corresponding negative number is returned.
            n = int(n) & fmask
            if n >> (fbits - 1):
                return -((~n + 1) & fmask)
            else:
                return n

        # With arbitrary precision, the rotations are plain shifts.
        if fbits is INF:
            def rotate_r(n, k): return n >> k
            def rotate_l(n, k): return n << k
        else:
            # `&` binds tighter than `|`: the mask is applied to the right operand of `|` only.
            def rotate_r(n, k): return (n >> k) | (n << (fbits - k)) & fmask
            def rotate_l(n, k): return (n << k) | (n >> (fbits - k)) & fmask

        def negate_bits(n):
            return n ^ fmask

        def mask_to_bits(x, b):
            return x & ((1 << b) - 1)

        context.update(
            N=len(data),
            I=cast_signed,
            U=cast_unsigned,
            R=rotate_r,
            L=rotate_l,
            X=negate_bits,
            M=mask_to_bits,
        )
        args = [
            self._infinitize_argument(len(data), *self._argument_parse_hook(a))
            for a in self.args.argument]
        if args:
            # Only the first value of each argument is used to evaluate the seed expression.
            args = [next(iter(a)) for a in args]
            context['A'] = args[0]
            context['V'] = args

        if isinstance(seed, str):
            # NOTE(review): the variable names are passed as one string here, whereas the helper
            # `_expression` above unpacks them into single characters - confirm which is intended.
            seed = PythonExpression(seed, 'IAMNVRLX', constants=context, mask=fmask)
        if callable(seed):
            seed = seed(context, N=len(data))

        # NOTE(review): the index counter was already initialized with the same mask above.
        self._index.init(self.fmask)
        context.update(S=seed)

        def operate(block, index, *args):
            # Evaluation order per block: the prologue updates S, the operator computes the new
            # block value B, and the epilogue updates S again. E keeps the original block value.
            context.update(K=index, B=block, E=block, V=args)
            if args:
                context['A'] = args[0]
            context['S'] = eval(prologue, None, context)
            context['B'] = eval(operator, None, context)
            context['S'] = eval(epilogue, None, context)
            return context['B']

        # The closure replaces `operate` on this instance only while the base class processes
        # the data; the previous attribute is restored even when processing fails.
        placeholder = self.operate
        self.operate = operate

        try:
            result = super().process(data)
        finally:
            self.operate = placeholder

        return result

    @staticmethod
    def operate(block, index, *args):
        # Placeholder; `process` installs the actual implementation on the instance.
        raise RuntimeError('This operate method cannot be called.')

    def inplace(self, block, *args) -> None:
        # Delegates to the base class without any changes.
        super().inplace(block, *args)

class aplib

This unit is implemented in refinery.units.compression.ap and has the following commandline Interface:

usage: aplib [-h] [-L] [-Q] [-0] [-v] [-R] [-F]

APLib compression and decompression.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class aplib(Unit):
    """
    APLib compression and decompression.
    """

    def reverse(self, buf):
        """
        Compress the input buffer.
        """
        packer = compressor(buf)
        return packer.compress()

    def process(self, buf):
        """
        Decompress the input buffer. When the input starts with the magic bytes `AP32`, the
        little-endian 32-bit integer that follows is read as the header size and that many bytes
        are skipped, unless the value exceeds 0x80, in which case nothing is skipped.
        """
        view = memoryview(buf)
        size = 0
        if view[:4] == B'AP32':
            size = int.from_bytes(buf[4:8], 'little')
            if size <= 0x80:
                self.log_info(F'detected aPLib header of size {size}')
            else:
                # An implausibly large header size is ignored; decompress from the start.
                size = 0
        payload = view[size:]
        return decompressor(payload).decompress()

    @classmethod
    def handles(cls, data):
        """
        Returns `False` for inputs shorter than two bytes, `True` when the input begins with
        the `AP32` magic, and `None` otherwise.
        """
        if len(data) < 2:
            return False
        return True if data[:4] == B'AP32' else None

Methods

def reverse(self, buf)

Expand source code Browse git

def reverse(self, buf):
    """
    Compress the input buffer.
    """
    packer = compressor(buf)
    return packer.compress()

class argon2id (size, salt, iter=1, jobs=1, cost=None, skey=None, more=None)

This unit is implemented in refinery.units.crypto.keyderive.argon2id and has the following commandline Interface:

usage: argon2id [-h] [-L] [-Q] [-0] [-v] n S [t] [p] [m] [K] [X]

Implements Argon2id-based key derivation.

positional arguments:
  n              number of bytes to generate
  S              salt bytes
  t              number of iterations, defaults to 1
  p              parallelism, defaults to 1
  m              memory cost in kibibytes, defaults to the minimum of 8192 per job.
  K              optional secret key
  X              optional additional data

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class argon2id(Unit):
    """
    Implements Argon2id-based key derivation.
    """

    def __init__(
        self,
        size: Param[int, Arg.Number(metavar='n', help='number of bytes to generate')],
        salt: Param[buf, Arg.Binary(metavar='S', help='salt bytes')],
        iter: Param[int, Arg.Number(metavar='t', help='number of iterations, defaults to {default}')] = 1,
        jobs: Param[int, Arg.Number(metavar='p', help='parallelism, defaults to {default}')] = 1,
        cost: Param[int, Arg.Number(metavar='m', help='memory cost in kibibytes, defaults to the minimum of 8192 per job.')] = None,
        skey: Param[buf, Arg.Binary(metavar='K', help='optional secret key')] = None,
        more: Param[buf, Arg.Binary(metavar='X', help='optional additional data')] = None,
    ):
        super().__init__(size=size, salt=salt, iter=iter, skey=skey, jobs=jobs, cost=cost, more=more)

    @Unit.Requires('cryptography', ['default', 'extended'])
    def _argon2id():
        from cryptography.hazmat.primitives.kdf.argon2 import Argon2id
        return Argon2id

    def process(self, data):
        """
        Derive `size` bytes from the input data. Empty or missing values for the secret key and
        the additional data are passed on as `None`, and a missing memory cost defaults to 8192
        times the number of jobs.
        """
        args = self.args
        memory = args.cost
        lanes = args.jobs
        salt = args.salt
        secret = args.skey
        length = args.size
        extra = args.more
        rounds = args.iter
        secret = bytes(secret) if secret else None
        extra = bytes(extra) if extra else None
        if not memory:
            memory = 8192 * lanes
        kdf = self._argon2id(
            salt=bytes(salt),
            length=length,
            iterations=rounds,
            lanes=lanes,
            memory_cost=memory,
            ad=extra,
            secret=secret,
        )
        return kdf.derive(data)

class asm (mode='x32', *, count=None, until=None, no_address=False, no_hexdump=False)

This unit is implemented in refinery.units.sinks.asm and has the following commandline Interface:

usage: asm [-h] [-L] [-Q] [-0] [-v] [-c N] [-u STR] [-A] [-H] [[x32|x64|..]]

Disassembles the input data using capstone and produces a human-readable disassembly listing. It
internally uses the opc unit for this, which is an alternative option if you are looking for more
programmatic disassembly.

positional arguments:
  [x32|x64|..]      Machine code architecture, default is x32. Select from the following list:
                    x16, x32, x64, ppc32, ppc64, mips32, mips64.

options:
  -c, --count N     Maximum number of bytes to disassemble, infinite by default.
  -u, --until STR   Disassemble until the given string appears among the disassembly.
  -A, --no-address  Disable address display.
  -H, --no-hexdump  Disable opcodes hexdump.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class asm(opc):
    """
    Disassembles the input data using capstone and produces a human-readable disassembly listing.
    It internally uses the `refinery.opc` unit for this, which is an alternative option if you are
    looking for more programmatic disassembly.
    """
    def __init__(
        self, mode='x32', *, count=None, until=None,
        no_address: Param[bool, Arg.Switch('-A', help='Disable address display.')] = False,
        no_hexdump: Param[bool, Arg.Switch('-H', help='Disable opcodes hexdump.')] = False,
    ):
        super().__init__(
            mode=mode,
            nvar='_name',
            avar='_addr',
            ovar='_arg',
            count=count,
            until=until,
            no_address=no_address,
            no_hexdump=no_hexdump,
        )

    def process(self, data):
        """
        Collect all instructions produced by the parent unit and emit one formatted listing line
        per instruction. Columns are padded to the widest entry across all instructions.
        """
        insns = list(super().process(data))
        if not insns:
            return

        show_address = not self.args.no_address
        show_hexdump = not self.args.no_hexdump

        # Column widths are determined from the widest value of each field.
        args_width = max(len(insn['_args']) for insn in insns)
        memo_width = max(len(insn['_name']) for insn in insns)
        addr_width = max(len(hex(insn['_addr'])) for insn in insns)

        if not show_address:
            addr_width = 0
            memo_width += 2

        longest_insn = max(len(insn) for insn in insns)
        metrics_opc = HexDumpMetrics(
            longest_insn,
            padding=addr_width + memo_width + args_width + 10,
        )

        for insn in insns:
            hd = one(hexdump(insn, metrics_opc))
            # The temporary variables are removed from the chunk metadata while formatting.
            name = insn.meta.pop('_name')
            args = insn.meta.pop('_args')
            addr = insn.meta.pop('_addr')
            msg = F' {name:<{memo_width}}  {args:<{args_width}}'
            if show_hexdump:
                msg = F'{msg}  ; {hd}'
            if show_address:
                msg = F'{addr:0{addr_width}X}: {msg}'
            yield msg.encode(self.codec)

class atbash

This unit is implemented in refinery.units.encoding.atbash and has the following commandline Interface:

usage: atbash [-h] [-L] [-Q] [-0] [-v] [-R]

https://en.wikipedia.org/wiki/Atbash Atbash encoding and decoding. Fairly useless in the 21st
century, except for picking out crypto nerds.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class atbash(Unit):
    """
    https://en.wikipedia.org/wiki/Atbash
    Atbash encoding and decoding. Fairly useless in the 21st century, except
    for picking out crypto nerds.
    """

    def process(self, data: bytearray):
        """
        Mirror every ASCII letter within its own alphabet (A to Z, B to Y, and so on), leaving
        all other bytes untouched. The buffer is modified in place and returned.
        """
        alphabets = (
            range(B'A'[0], B'Z'[0] + 1),
            range(B'a'[0], B'z'[0] + 1),
        )
        for position, value in enumerate(data):
            for alphabet in alphabets:
                if value in alphabet:
                    # Pick the letter at the same distance from the opposite end.
                    data[position] = alphabet[-1 - alphabet.index(value)]
                    break
        return data

    reverse = process

Methods

def reverse(self, data)

Expand source code Browse git

def process(self, data: bytearray):
    """
    Mirror every ASCII letter within its own alphabet (A to Z, B to Y, and so on), leaving
    all other bytes untouched. The buffer is modified in place and returned.
    """
    alphabets = (
        range(B'A'[0], B'Z'[0] + 1),
        range(B'a'[0], B'z'[0] + 1),
    )
    for position, value in enumerate(data):
        for alphabet in alphabets:
            if value in alphabet:
                # Pick the letter at the same distance from the opposite end.
                data[position] = alphabet[-1 - alphabet.index(value)]
                break
    return data

class autoxor (range=slice(1, 32, None), plaintext=b'', searchpos=slice(0, None, None), alph=False, crib=False, freq=False)

This unit is implemented in refinery.units.misc.autoxor and has the following commandline Interface:

usage: autoxor [-h] [-L] [-Q] [-0] [-v] [-p B] [-s S:E] [-a] [-c] [-f] [start:end:step]

Assumes input that was encrypted with a polyalphabetic block cipher, like XOR-ing each byte with
successive bytes from a key or by subtracting the respective key byte value from each input byte.
It uses the xkey unit to attack the cipher and attempts to recover the plaintext automatically.

The unit expects encrypted input which was encrypted byte-wise with a polyalphabetic key. For
both bit-wise and byte-wise addition, it can attempt to determine this key by three methods:

1. Known plaintext cribs: The unit contains a library of file signatures that are expected to
   occur at specific offsets. It uses these to attempt a known-plaintext attack against the input.
   If a key is found that is at most half the size of such a crib, it is returned.
2. Known alphabets: For each given key length, the input is split into slices that would have
   been encrypted with a single byte for keys of that length. Each such slice undergoes a
   character frequency analysis. If the histogram indicates that an alphabet of a small size was
   used (i.e. base64), then the unit attempts to determine the key based on this.
3. Known high frequency glyph: Works if the plaintext contains one letter that occurs with very
   high frequency, i.e. zero padding in PE or ELF files, and the space character in text. Based on
   this assumption, the unit computes the most likely key. This method will work best on
   uncompressed files that were encrypted with a short key.

When no option is set, the unit uses all the above methods by default. When at least one of the
methods is selected, it will attempt only selected methods. When a custom plaintext is given, the
other methods are disabled by default.

positional arguments:
  start:end:step       range of length values to try in Python slice syntax, the default is 1:32.

options:
  -p, --plaintext B    Provide a buffer of known plaintext. Without a search position, this can
                       slow down the key search significantly.
  -s, --searchpos S:E  Only used when a known plaintext buffer is provided; In this case it
                       narrows the search range for the offset of that data to between S and E.
  -a, --alph           Enable search for keys via known encoder alphabets.
  -c, --crib           Enable search for keys via known plaintext cribs.
  -f, --freq           Enable search for keys via frequency analysis.

generic options:
  -h, --help           Show this help message and exit.
  -L, --lenient        Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet          Disables all log output.
  -0, --devnull        Do not produce any output.
  -v, --verbose        Specify up to two times to increase log level.

Expand source code Browse git

class autoxor(xkey, docs='{0}{p}{1}'):
    """
    Assumes input that was encrypted with a polyalphabetic block cipher, like XOR-ing each byte
    with successive bytes from a key or by subtracting the respective key byte value from each
    input byte. It uses the `refinery.xkey` unit to attack the cipher and attempts to recover the
    plaintext automatically.
    """
    def process(self, data: bytearray):
        # Overview: take the first key candidate produced by the inherited attack, decode the
        # input with each applicable unit (xor and/or sub), and return as soon as one decoding
        # is recognized as a known format or as text. If nothing is recognized, the first
        # decoding that was attempted is returned as a fallback.

        # Holds (unit name, key, decoded buffer) of the first decoding attempt.
        fallback: tuple[str, bytes, bytearray] | None = None

        try:
            # Only the first result of the attack is considered.
            result = next(self._attack(data))
        except StopIteration:
            result = None
        else:
            key = result.key
            units: list[type[xor] | type[sub]] = []

            # xor is tried unless result.xor is exactly False, and sub is tried unless it is
            # exactly True; any other value (such as None) results in both being tried.
            if result.xor is not False:
                units.append(xor)
            if result.xor is not True:
                units.append(sub)

            for unit in units:
                self.log_debug(F'attempting {unit.name} for detected key')

                name = unit.name
                # Decode the input by applying the unit with the detected key.
                bin = data | unit(key) | bytearray
                mem = memoryview(bin)
                # The result of applying the unit with argument 0x20 to a single zero byte; it
                # is used further below as a key to shift the decoded output.
                space = B'\0' | unit(0x20) | bytes
                check = get_structured_data_type

                # Probe the first 0x1000 offsets of the decoded buffer for recognizable data.
                # Offset zero is checked with get_structured_data_type; all subsequent offsets
                # are checked with get_executable_type only.
                for k in range(0x1000):
                    if t := check(mem[k:]):
                        self.log_info(F'method {name} resulted in non-blob data ({t.mnemonic}) at offset 0x{k:X}; returning buffer')
                        return self.labelled(bin, key=key, method=name)
                    if k == 0:
                        check = get_executable_type

                # Remember only the very first attempt as the fallback.
                if not fallback:
                    fallback = name, key, bin

                # A buffer consisting only of zero bytes is not checked for text.
                if not any(bin):
                    continue

                # Apply the unit once more using the shifted space value as the key and test
                # whether the outcome is text.
                as_text = bin | unit(space) | bytearray

                try:
                    decoded = as_text.decode('utf8')
                except UnicodeDecodeError:
                    is_text = False
                else:
                    import re
                    # Text means: only whitespace, word characters, and printable ASCII.
                    is_text = bool(re.fullmatch(r'[\s\w!-~]+', decoded))

                if is_text:
                    self.log_info('detected likely text input; automatically shifting towards space character')
                    # Report the key adjusted by the same shift: it is obtained by applying the
                    # unit with the detected key to a run of space characters.
                    key = (b'\x20' * len(key)) | unit(key) | bytes
                    return self.labelled(as_text, key=key, method=name)

        if fallback is None:
            self.log_warn('no key was found; returning original data')
            return data
        else:
            # The fallback is only ever assigned after an attack result was obtained.
            assert result is not None
            name, key, bin = fallback
            # Warn when the key stems from frequency analysis with a score below 8.
            if result.how == self._rt.freq and result.score < 8:
                self.log_warn(
                    F'unrecognized format, no confirmed crib, low score ({result.score:.2f}%); '
                    'the output is likely junk'
                )
            return self.labelled(bin, key=key)

class b2f

This unit is implemented in refinery.units.meta.pick and has the following commandline Interface:

usage: b2f [-h] [-L] [-Q] [-0] [-v]

Short for "back to front". This unit is a shortcut for pick with argument ::-1: It will reorder
the chunks in the current frame in reverse order.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class b2f(pick):
    """
    Short for "back to front". This unit is a shortcut for `refinery.pick` with argument `::-1`:
    It will reorder the chunks in the current frame in reverse order.
    """
    def __init__(self):
        # The slice equivalent of the expression ::-1.
        back_to_front = slice(None, None, -1)
        super().__init__(back_to_front)

class b32

This unit is implemented in refinery.units.encoding.b32 and has the following commandline Interface:

usage: b32 [-h] [-L] [-Q] [-0] [-v] [-R] [-F]

Base32 encoding and decoding.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class b32(Unit):
    """
    Base32 encoding and decoding.
    """
    def reverse(self, data):
        """
        Return the Base32 encoding of the input.
        """
        return base64.b32encode(data)

    def process(self, data: bytearray):
        """
        Decode Base32 input, case-insensitively. Before decoding, the trailing padding is
        repaired in place: missing padding characters are appended and superfluous ones are
        removed, so that the length of the buffer becomes a multiple of 8.
        """
        # Find the offset just past the last byte that is not a padding character. The count
        # goes all the way down to zero, so an input that consists only of padding characters
        # is reduced to the empty string rather than being re-padded to an undecodable buffer.
        pad = B'='[0]
        before_padding = len(data)
        while before_padding > 0 and data[before_padding - 1] == pad:
            before_padding -= 1
        # Number of padding characters required to reach the next multiple of 8.
        padding_size = -before_padding % 8
        missing = before_padding + padding_size - len(data)
        if missing > 0:
            self.log_info(F'detected incorrect padding: added {missing} padding characters')
            data.extend(B'=' * missing)
        if missing < 0:
            self.log_info(F'detected incorrect padding: removed {-missing} padding characters')
            data[padding_size + before_padding:] = []
        return base64.b32decode(data, casefold=True)

    @classmethod
    def handles(cls, data):
        """
        The input is handled when it matches the Base32 pattern in its entirety, but does not
        also match the hex pattern in its entirety.
        """
        from refinery.lib.patterns import formats
        if not formats.b32.value.bin.fullmatch(data):
            return False
        return not formats.hex.value.bin.fullmatch(data)

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Return the Base32 encoding of the input.
    """
    encoded = base64.b32encode(data)
    return encoded

class b58

This unit is implemented in refinery.units.encoding.b58 and has the following commandline Interface:

usage: b58 [-h] [-L] [-Q] [-0] [-v] [-R] [-F]

Base58 encoding and decoding. It is famously used as an encoding in Bitcoin addresses because the
alphabet omits digits and letters that look similar.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class b58(base):
    """
    Base58 encoding and decoding. It is famously used as an encoding in Bitcoin addresses
    because the alphabet omits digits and letters that look similar.
    """
    def __init__(self):
        alphabet = b'123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
        super().__init__(alphabet)

    @classmethod
    def handles(cls, data):
        """
        The input is handled when it matches the Base58 pattern in its entirety, but matches
        neither the hex nor the Base32 pattern in its entirety.
        """
        from refinery.lib.patterns import formats
        matched = formats.b58.value.bin.fullmatch(data)
        if not matched:
            return matched
        if formats.hex.value.bin.fullmatch(data):
            return False
        return not formats.b32.value.bin.fullmatch(data)

class b62

This unit is implemented in refinery.units.encoding.b62 and has the following commandline Interface:

usage: b62 [-h] [-L] [-Q] [-0] [-v] [-R] [-F]

Base62 encoding and decoding.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class b62(base):
    """
    Base62 encoding and decoding.
    """
    def __init__(self):
        alphabet = b'0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
        super().__init__(alphabet)

    @classmethod
    def handles(cls, data):
        """
        The input is handled when it matches the Base62 pattern in its entirety, but matches
        neither the hex nor the Base32 pattern in its entirety.
        """
        from refinery.lib.patterns import formats
        matched = formats.b62.value.bin.fullmatch(data)
        if not matched:
            return matched
        if formats.hex.value.bin.fullmatch(data):
            return False
        return not formats.b32.value.bin.fullmatch(data)

class b64 (urlsafe=False)

This unit is implemented in refinery.units.encoding.b64 and has the following commandline Interface:

usage: b64 [-h] [-L] [-Q] [-0] [-v] [-R] [-F] [-u]

Base64 encoding and decoding.

options:
  -u, --urlsafe  use URL-safe alphabet

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class b64(Unit):
    """
    Base64 encoding and decoding.
    """
    def __init__(self, urlsafe: Param[bool, Arg.Switch('-u', help='use URL-safe alphabet')] = False):
        super().__init__(urlsafe=urlsafe)

    def reverse(self, data):
        """
        Return the Base64 encoding of the input; the URL-safe alphabet is used on request.
        """
        altchars = B'-_' if self.args.urlsafe else None
        return base64.b64encode(data, altchars=altchars)

    def process(self, data: bytearray):
        """
        Decode Base64 input. Empty input is returned as it is and a single byte is rejected.
        The URL-safe alphabet is used when it was requested, or when the input contains any of
        its two special characters and none of the two standard ones.
        """
        if not data:
            return data
        if len(data) == 1:
            raise ValueError('single byte can not be base64-decoded.')
        # Appended in place so that input with missing padding still decodes.
        data.extend(B'===')
        has_urlsafe_chars = B'-' in data or B'_' in data
        has_standard_chars = B'+' in data or B'/' in data
        altchars = None
        if self.args.urlsafe or (has_urlsafe_chars and not has_standard_chars):
            altchars = B'-_'
        return base64.b64decode(data, altchars=altchars)

    @Unit.Requires('numpy', ['speed', 'default', 'extended'])
    def _numpy():
        import numpy
        return numpy

    @classmethod
    def handles(cls, data) -> bool:
        """
        The input must match the Base64 pattern in its entirety. On top of that, a character
        count is used as a heuristic: each of the three classes lowercase, uppercase, and digit
        (byte values 0x30 through 0x3F) has to account for at least one 64th of all
        non-whitespace characters, and at most half of them may belong to neither class.
        """
        from refinery.lib.patterns import formats
        if not formats.b64s.value.bin.fullmatch(data):
            return False
        try:
            np = cls._numpy
        except ImportError:
            # Pure Python counting; more than 60 distinct byte values are accepted right away.
            seen = set()
            lcase_count = ucase_count = digit_count = other_count = 0
            total_count = len(data)
            for byte in data:
                seen.add(byte)
                if len(seen) > 60:
                    return True
                if 0x61 <= byte < 0x7B:
                    lcase_count += 1
                elif 0x41 <= byte < 0x5B:
                    ucase_count += 1
                elif 0x30 <= byte < 0x40:
                    digit_count += 1
                elif byte in B'\v\f\t\r\n\x20':
                    total_count -= 1
                else:
                    other_count += 1
        else:
            octets = np.frombuffer(memoryview(data), np.uint8)
            hist, _ = np.histogram(octets, range(0x101))

            def tally(values):
                return sum(hist[k] for k in values)

            lcase_count = tally(range(0x61, 0x7B))
            ucase_count = tally(range(0x41, 0x5B))
            digit_count = tally(range(0x30, 0x40))
            space_count = tally(B'\v\f\t\r\n\x20')
            total_count = len(data) - space_count
            other_count = total_count - (digit_count + ucase_count + lcase_count)

        minimum = total_count // 64
        if min(lcase_count, ucase_count, digit_count) < minimum:
            return False
        if other_count * 2 > total_count:
            return False

        return True

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Return the Base64 encoding of the input; the URL-safe alphabet is used on request.
    """
    altchars = B'-_' if self.args.urlsafe else None
    return base64.b64encode(data, altchars=altchars)

class b65536

This unit is implemented in refinery.units.encoding.b65536 and has the following commandline Interface:

usage: b65536 [-h] [-L] [-Q] [-0] [-v] [-R]

Base65536 encoding and decoding. A relatively esoteric encoding scheme utilizing the UTF-16 /
UTF-32 character set.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class b65536(Unit):
    """
    Base65536 encoding and decoding.
    A relatively esoteric encoding scheme utilizing the UTF-16 / UTF-32 character set.
    """
    def reverse(self, data):
        """
        Encode the input two bytes at a time: the second byte of each pair selects a block
        start from the lookup table, and the first byte is added to it to obtain the code point.
        A trailing single byte uses the table entry for -1.
        """
        if not data:
            return B''

        length = len(data)
        output = MemoryFile()
        for offset in range(0, length, 2):
            following = offset + 1
            low = data[offset]
            high = data[following] if following < length else -1
            glyph = chr(_BLOCK_START[high] + low)
            output.write(glyph.encode())
        return output.getvalue()

    def process(self, data):
        """
        Decode the input character by character. Code points that are not part of the lookup
        table are skipped with a log message. A character that encodes only a single byte is
        permitted once; a second one raises a `ValueError`.
        """
        if not data:
            return B''

        output = MemoryFile()
        done = False
        for ch in data.decode():
            code_point = ord(ch)
            # The low 8 bits hold the first byte; the remainder identifies the block.
            b1 = code_point & 0xFF
            try:
                b2 = _B2[code_point - b1]
            except KeyError:
                self.log_info('Invalid base65536 code point: %d, skipping' % code_point)
                continue
            b = b1.to_bytes(1, "little")
            if b2 == -1:
                # This character carries a single byte only.
                if done:
                    raise ValueError('base65536 sequence continued after final byte')
                done = True
            else:
                b += b2.to_bytes(1, "little")
            output.write(b)
        return output.getvalue()

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Encode the input two bytes at a time: the second byte of each pair selects a block
    start from the lookup table, and the first byte is added to it to obtain the code point.
    A trailing single byte uses the table entry for -1.
    """
    if not data:
        return B''

    length = len(data)
    output = MemoryFile()
    for offset in range(0, length, 2):
        following = offset + 1
        low = data[offset]
        high = data[following] if following < length else -1
        glyph = chr(_BLOCK_START[high] + low)
        output.write(glyph.encode())
    return output.getvalue()

class b85

This unit is implemented in refinery.units.encoding.b85 and has the following commandline Interface:

usage: b85 [-h] [-L] [-Q] [-0] [-v] [-R] [-F]

Base85 encoding and decoding.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class b85(Unit):
    """
    Base85 encoding and decoding.
    """
    def reverse(self, data):
        """
        Return the Base85 encoding of the input.
        """
        encoded = base64.b85encode(data)
        return encoded

    def process(self, data):
        """
        Decode Base85 input after removing all whitespace from it.
        """
        whitespace = re.search(BR'\s', data)
        if whitespace is not None:
            # Substitution only happens when there is whitespace to remove.
            data = re.sub(BR'\s+', B'', data)
        return base64.b85decode(data)

    @classmethod
    def handles(cls, data):
        """
        The input is handled when it matches the Base85 pattern in its entirety.
        """
        from refinery.lib.patterns import formats
        matched = formats.b85s.value.bin.fullmatch(data)
        return matched is not None

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Return the Base85 encoding of the input.
    """
    encoded = base64.b85encode(data)
    return encoded

class b92

This unit is implemented in refinery.units.encoding.b92 and has the following commandline Interface:

usage: b92 [-h] [-L] [-Q] [-0] [-v] [-R] [-F]

Base92 encoding and decoding.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class b92(Unit):
    """
    Base92 encoding and decoding.
    """
    def reverse(self, data):
        """
        Encode the input. Empty input is encoded as the single character `~`. Otherwise, the
        input is consumed in blocks of 13 bits, each of which is written as two alphabet
        characters. A final block of at most 6 bits is written as a single character.
        """
        if not data:
            return B'~'

        reader = StructReader(data, bigendian=True)
        output = MemoryFile()
        while reader.remaining_bits > 0:
            try:
                block = reader.read_integer(13)
            except EOFError:
                # Fewer than 13 bits are left: read what remains and pad it with zero bits.
                count = reader.remaining_bits
                block = reader.read_integer(count)
                self.log_debug(F'reading {count} remaining bits: {block:0{count}b}')
                shift = 6 - count
                if shift >= 0:
                    # Up to 6 bits fit into one character.
                    block <<= shift
                    self.log_debug(F'encoding block: {block:06b}')
                    output.write_byte(_B92_ALPHABET[block])
                    break
                # More than 6 bits: pad to a full 13-bit block and encode it like any other.
                block <<= 13 - count
            self.log_debug(F'encoding block: {block:013b}')
            hi, lo = divmod(block, 91)
            output.write_byte(_B92_ALPHABET[hi])
            output.write_byte(_B92_ALPHABET[lo])
        return output.getvalue()

    def process(self, data):
        """
        Decode the input. Both the single character `~` and empty input decode to the empty
        string. A `RefineryPartialResult` is raised when the final character or character pair
        does not carry enough bits to complete the last byte, or when its padding bits are not
        all zero.
        """
        # Empty input would otherwise fail with an IndexError when the tail is extracted.
        if not data or data == B'~':
            return B''

        output = MemoryFile()
        buffer = 0  # bits that were decoded but not written yet
        length = 0  # number of bits currently held in the buffer

        view = memoryview(data)
        q, r = divmod(len(view), 2)

        # The final unit is handled separately after the loop because it may contain padding:
        # it is a single character worth 6 bits for input of odd length, and a character pair
        # worth 13 bits otherwise.
        if r > 0:
            bits = 6
            tail = _B92_DECODING[data[~0]]
        else:
            bits = 13
            tail = _B92_DECODING[data[~1]] * 91 + _B92_DECODING[data[~0]]
            view = view[:(q - 1) * 2]

        it = iter(view)

        # Consume character pairs; for odd lengths, the unpaired last character is left over.
        for a, b in zip(it, it):
            block = _B92_DECODING[a] * 91 + _B92_DECODING[b]
            assert length < 8
            buffer <<= 13
            buffer |= block
            length += 13
            # Write all complete bytes and keep the remaining bits in the buffer.
            size, length = divmod(length, 8)
            assert size > 0
            output.write((buffer >> length).to_bytes(size, 'big'))
            buffer &= (1 << length) - 1

        # Number of bits required to complete the current byte, and the number of surplus
        # low bits of the tail that remain after providing them.
        missing = 8 - length
        shift = bits - missing

        if shift < 8:
            bytecount = 1
        else:
            # The tail has enough bits to complete yet another byte.
            bytecount = 2
            shift -= 8
            missing += 8

        if shift < 0:
            raise RefineryPartialResult(
                F'Invalid padding, missing {-shift} bits.',
                output.getvalue())

        buffer <<= missing
        buffer |= tail >> shift
        length += missing
        output.write(buffer.to_bytes(bytecount, 'big'))

        # The surplus bits of the tail are padding and have to be zero.
        if tail & ((1 << shift) - 1) != 0:
            raise RefineryPartialResult(
                F'Invalid padding, lower {shift} bits of {tail:0{bits}b} are not zero.',
                output.getvalue())

        return output.getvalue()

    @classmethod
    def handles(cls, data):
        """
        The input is handled when it matches the Base92 pattern in its entirety.
        """
        from refinery.lib.patterns import formats
        return formats.b92.value.bin.fullmatch(data) is not None

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Encode the input. Empty input is encoded as the single character `~`. Otherwise, the
    input is consumed in blocks of 13 bits, each of which is written as two alphabet
    characters. A final block of at most 6 bits is written as a single character.
    """
    if not data:
        return B'~'

    reader = StructReader(data, bigendian=True)
    output = MemoryFile()
    while reader.remaining_bits > 0:
        try:
            block = reader.read_integer(13)
        except EOFError:
            # Fewer than 13 bits are left: read what remains and pad it with zero bits.
            count = reader.remaining_bits
            block = reader.read_integer(count)
            self.log_debug(F'reading {count} remaining bits: {block:0{count}b}')
            shift = 6 - count
            if shift >= 0:
                # Up to 6 bits fit into one character.
                block <<= shift
                self.log_debug(F'encoding block: {block:06b}')
                output.write_byte(_B92_ALPHABET[block])
                break
            # More than 6 bits: pad to a full 13-bit block and encode it like any other.
            block <<= 13 - count
        self.log_debug(F'encoding block: {block:013b}')
        # Each 13-bit block is written as two characters, using the quotient and remainder
        # of a division by 91 as alphabet indices.
        hi, lo = divmod(block, 91)
        output.write_byte(_B92_ALPHABET[hi])
        output.write_byte(_B92_ALPHABET[lo])
    return output.getvalue()

class base (base=0, strip_padding=False, little_endian=False, strict_digits=False)

This unit is implemented in refinery.units.encoding.base and has the following commandline Interface:

usage: base [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-e] [-d] [base|alphabet]

Encodes and decodes integers in arbitrary base.

positional arguments:
  base|alphabet        Either the base to be used or an alphabet. If an explicit alphabet is
                       given, its length determines the base. The default base 0 treats the input
                       as a Python integer literal. If a numeric base is given, digits from the
                       alphabet "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" are used.

options:
  -s, --strip-padding  Do not add leading zeros to the output.
  -e, --little-endian  Use little endian byte order instead of big endian.
  -d, --strict-digits  Check that all input digits are part of the alphabet.

generic options:
  -h, --help           Show this help message and exit.
  -L, --lenient        Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet          Disables all log output.
  -0, --devnull        Do not produce any output.
  -v, --verbose        Specify up to two times to increase log level.
  -R, --reverse        Use the reverse operation.

Expand source code Browse git

class base(Unit):
    """
    Encodes and decodes integers in arbitrary base.
    """
    # NOTE(review): the class docstring above is identical to the description shown in the
    # command-line help of this unit, so it is presumably emitted at runtime; edit with care.
    def __init__(
        self,
        base: Param[isq, Arg.NumSeq(metavar='base|alphabet', help=(
            R'Either the base to be used or an alphabet. If an explicit alphabet is given, its length '
            R'determines the base. The default base 0 treats the input as a Python integer literal. If '
            F'a numeric base is given, digits from the alphabet "{_DEFAULT_ALPH_STR}" are used. '))] = 0,
        strip_padding: Param[bool, Arg.Switch('-s', help='Do not add leading zeros to the output.')] = False,
        little_endian: Param[bool, Arg.Switch('-e', help='Use little endian byte order instead of big endian.')] = False,
        strict_digits: Param[bool, Arg.Switch('-d', help='Check that all input digits are part of the alphabet.')] = False,
    ):
        super().__init__(
            base=base,
            strip_padding=strip_padding,
            little_endian=little_endian,
            strict_digits=strict_digits,
        )

    @property
    def _args(self):
        """
        Resolve the `base` argument into a pair `(base, alphabet)`:

        - The integer 0 yields `(0, B'')`.
        - An integer that is a key of `_LARGER_ALPHABETS` yields the alphabet stored there.
        - Any other integer must be between 2 and `len(_DEFAULT_ALPHABET)` and yields the
          corresponding prefix of `_DEFAULT_ALPHABET`.
        - Every other value is treated as an explicit alphabet whose length is the base.

        Raises `ValueError` for an integer outside the permitted range and for an explicit
        alphabet that contains duplicate letters.
        """
        base = self.args.base
        if isinstance(base, int):
            if not base:
                return 0, B''
            if base in _LARGER_ALPHABETS:
                return base, _LARGER_ALPHABETS[base]
            if base not in range(2, len(_DEFAULT_ALPHABET) + 1):
                raise ValueError(F'base may only be an integer between 2 and {len(_DEFAULT_ALPHABET)}')
            return base, _DEFAULT_ALPHABET[:base]
        if len(set(base)) != len(base):
            raise ValueError('the given alphabet contains duplicate letters')
        return len(base), bytearray(base)

    @property
    def byteorder(self):
        """
        The byte order name (`'little'` or `'big'`) selected by the `little_endian` switch.
        """
        return 'little' if self.args.little_endian else 'big'

    def reverse(self, data):
        """
        Interpret `data` as an unsigned integer in the configured byte order and return its
        digits in the configured base. For base 0, the result is an uppercase hexadecimal
        literal with the prefix `0x`.
        """
        base, alphabet = self._args
        self.log_info('using byte order', self.byteorder)
        number = int.from_bytes(data, byteorder=self.byteorder)

        if base == 0:
            return B'0x%X' % number
        if base > len(alphabet):
            raise ValueError(F'Only {len(alphabet)} available; not enough to encode base {base}')

        # Estimate how many digits are required to represent any integer that has as many bits
        # as the input; the estimate is increased by one when the actual number does not fit.
        log2n = len(data) * 8
        logBn = int(log2n / math.log2(base))
        if base ** logBn <= number:
            logBn += 1
        result = bytearray()
        no_pad = self.args.strip_padding

        # Digits are produced least significant first; the buffer is reversed afterwards.
        for _ in range(logBn):
            number, k = divmod(number, base)
            result.append(alphabet[k])
            # Without padding, stop as soon as all remaining digits would be zero.
            if no_pad and number <= 0:
                break

        result.reverse()
        return result

    def process(self, data: bytearray):
        """
        Decode the digits in `data` to an integer and return that integer as a byte string in
        the configured byte order. Empty input is returned unchanged. For alphabets of length
        64 and 85, sufficiently long inputs are instead passed to `base64.b64decode` and
        `base64.b85decode`, respectively, and the decoded bytes are returned directly.
        """
        if not data:
            return data
        base, alphabet = self._args
        self.log_debug(F'decoding data using base {base}; alphabet {alphabet!r}')
        be_lenient = not self.args.strict_digits
        # In lenient mode, when the alphabet has no lowercase letters and the input has no ASCII
        # uppercase letters, the input is converted to uppercase.
        if be_lenient and alphabet.upper() == alphabet:
            lcased = (c + 0x20 if 0x41 <= c <= 0x5a else c for c in data)
            if all(x == y for x, y in zip(data, lcased)):
                data = data.upper()
        # In lenient mode, all bytes that are not part of the alphabet are removed. The pattern
        # is a negated character class listing each alphabet byte as a hexadecimal escape. Note
        # that the slice assignment below overwrites the contents of the buffer in place.
        if base and base != 64 and be_lenient:
            check = '[^{}]'.format(
                ''.join(F'\\x{c:02x}' for c in sorted(set(alphabet)))).encode('ascii')
            if re.search(check, data) is not None:
                stripped = re.sub(check, B'', data)
                self.log_info(F'stripped {len(data) - len(stripped)} invalid digits from input data')
                data[:] = stripped
        if len(alphabet) <= len(_DEFAULT_ALPHABET):
            # The alphabet is short enough for the built-in integer parser; digits of a custom
            # alphabet are first translated to the corresponding default digits.
            defaults = _DEFAULT_ALPHABET[:base]
            if alphabet != defaults:
                self.log_info('translating input data to a default alphabet for faster conversion')
                data_translated = data.translate(bytes.maketrans(alphabet, defaults))
                result = int(data_translated, base)
            else:
                result = int(data, base)
        elif len(alphabet) == 64 and len(data) >= 4:
            import base64
            _b64_alphabet = _LARGER_ALPHABETS[64]
            if alphabet != _b64_alphabet:
                data = data.translate(bytes.maketrans(alphabet, _b64_alphabet))
            # NOTE(review): the extra padding is presumably appended so that inputs with missing
            # padding characters still decode.
            return base64.b64decode(data + b'===', validate=self.args.strict_digits)
        elif len(alphabet) == 85 and len(data) >= 5:
            import base64
            _b85_alphabet = _LARGER_ALPHABETS[85]
            if alphabet != _b85_alphabet:
                data = data.translate(bytes.maketrans(alphabet, _b85_alphabet))
            return base64.b85decode(data)
        else:
            # Generic fallback: accumulate the integer one digit at a time.
            if len(data) > 100_000:
                self.log_warn('long alphabet & unable to use built-ins; reverting to (slow) fallback.')
            result = 0
            lookup = {digit: k for k, digit in enumerate(alphabet)}
            for digit in data:
                result *= base
                result += lookup[digit]
        if not base or self.args.strip_padding:
            # Use the least number of bytes that can hold the result.
            size, r = divmod(result.bit_length(), 8)
            size += int(bool(r))
        else:
            # Start with a bit count estimated from the number of input digits and lower it
            # until the corresponding power of two no longer exceeds the result.
            log2n = int(len(data) * math.log2(base))
            test = 1 << log2n
            while test > result:
                log2n -= 1
                test >>= 1
            size = log2n // 8 + 1
        return result.to_bytes(size, byteorder=self.byteorder)

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Interpret `data` as an unsigned integer in the configured byte order and return the digits
    of that integer in the configured base as a `bytearray`. For base 0, the result is an
    uppercase hexadecimal literal with the prefix `0x`. Unless padding is stripped, the output
    has as many digits as are needed for any integer with the bit size of the input.
    """
    base, alphabet = self._args
    self.log_info('using byte order', self.byteorder)
    number = int.from_bytes(data, byteorder=self.byteorder)

    if base == 0:
        return B'0x%X' % number
    if base > len(alphabet):
        raise ValueError(F'Only {len(alphabet)} available; not enough to encode base {base}')

    # Estimated digit count for the input size, corrected by one if the number does not fit.
    digit_count = int(len(data) * 8 / math.log2(base))
    if base ** digit_count <= number:
        digit_count += 1

    digits = bytearray()
    strip = self.args.strip_padding
    pending = digit_count

    # Digits are collected least significant first.
    while pending > 0:
        pending -= 1
        number, index = divmod(number, base)
        digits.append(alphabet[index])
        if strip and number <= 0:
            # Everything that would follow is a leading zero.
            break

    return digits[::-1]

class bat

This unit is implemented in refinery.units.formats.bat and has the following commandline Interface:

usage: bat [-h] [-L] [-Q] [-0] [-v]

Emulates the execution of a batch file. Each command line that would be executed is emitted as an
individual chunk. This can remove simple obfuscation based on expansion of environment variables.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class bat(Unit):
    """
    Emulates the execution of a batch file. Each command line that would be executed is emitted
    as an individual chunk. This can remove simple obfuscation based on expansion of environment
    variables.
    """

    def process(self, data):
        # Every command that the emulator reports is encoded with the codec of this unit and
        # emitted as a separate output chunk.
        yield from (
            command.encode(self.codec)
            for command in BatchFileEmulator(data).emulate()
        )

class bitrev (bigendian=False, blocksize=1)

This unit is implemented in refinery.units.blockwise.bitrev and has the following commandline Interface:

usage: bitrev [-h] [-L] [-Q] [-0] [-v] [-E] [-B N]

Reverse the bits of every block. Any excess bytes at the end of the input that are not an integer
multiple of the block size are ignored.

options:
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes. The default is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class bitrev(UnaryOperation):
    """
    Reverse the bits of every block. Any excess bytes at the end of the input that are not
    an integer multiple of the block size are ignored.
    """
    # Placeholder only: every instance replaces this attribute with one of the closures that
    # are defined in the constructor below.
    @staticmethod
    def operate(arg):
        raise RuntimeError('operate was called before the unit was initialized')

    def __init__(self, bigendian=False, blocksize=1):
        """
        Unreadable bit reversal operations due to:
        https://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64BitsDiv
        https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
        """
        # NOTE(review): `_truncate=1` presumably makes the base class drop the excess bytes
        # mentioned in the class documentation; confirm against the base class.
        super().__init__(bigendian=bigendian, blocksize=blocksize, _truncate=1)

        if self.bytestream:
            # Single-expression byte reversal (first link above).
            # NOTE(review): `bytestream` presumably indicates a block size of one byte.
            def operate(v):
                return ((v * 0x202020202) & 0x10884422010) % 1023
        elif self.blocksize in (2, 4, 8):
            # Parallel reversal (second link above): swap halves, then quarters, and so on. The
            # shift width is halved in each round until single bits have been swapped.
            # NOTE(review): `fbits` and `fmask` are presumably the bit width of a block and the
            # mask of that many one-bits, respectively.
            def operate(v):
                s = self.fbits
                m = self.fmask
                w = v
                while s > 1:
                    s >>= 1
                    m = m ^ (m << s)
                    w = ((w << s) & ~m) | ((w >> s) & m)
                return w
        else:
            # Generic fallback for all other block sizes: move each bit individually to its
            # mirrored position.
            def operate(v):
                w = v & 0
                for s in range(self.fbits):
                    w |= ((v >> s) & 1) << (self.fbits - s - 1)
                return w
        self.operate = operate

class bitsnip (slices=[slice(0, 1, None)], bigendian=False, blocksize=1)

This unit is implemented in refinery.units.blockwise.bitsnip and has the following commandline Interface:

usage: bitsnip [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] [slices ...]

Pick a certain range of bits from each block of the input. The extracted ranges of bits are
concatenated. Leftover bits that do not form at least one full byte are discarded. Bits are
indexed from least significant at index 0 to most significant in each block. When the unit
operates in big endian mode, the internal bit buffer is shifted left in each step and new bits
are inserted as the least significant portion. Conversely, in default (little endian) mode, newly
extracted bits are added as the now most significant ones. After concatenating all bit slices
into a large integer, this integer is converted into a byte string according to the given byte
ordering.

positional arguments:
  slices             Specify start:stop:size, where size can be used to pad or truncate the
                     extracted bits. If size is omitted, it defaults to (stop-start). If no slice
                     is specified, it defaults to 0, which corresponds to 0:1:1, i.e. extracting
                     the lowest bit.

options:
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes. The default is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class bitsnip(BlockTransformationBase):
    """
    Pick a certain range of bits from each block of the input. The extracted ranges of bits are
    concatenated. Leftover bits that do not form at least one full byte are discarded. Bits are
    indexed from least significant at index 0 to most significant in each block. When the unit
    operates in big endian mode, the internal bit buffer is shifted left in each step and new bits
    are inserted as the least significant portion. Conversely, in default (little endian) mode,
    newly extracted bits are added as the now most significant ones. After concatenating all bit
    slices into a large integer, this integer is converted into a byte string according to the
    given byte ordering.
    """
    # NOTE: the default for `slices` is a list object that is shared between calls; it is only
    # ever read by this class.
    def __init__(
        self, slices: Param[list[slice], Arg(help=(
            'Specify start:stop:size, where size can be used to pad or truncate the extracted '
            'bits. If size is omitted, it defaults to (stop-start). If no slice is specified, '
            'it defaults to 0, which corresponds to 0:1:1, i.e. extracting the lowest bit.')
        )] = [slice(0, 1)],
        bigendian=False, blocksize=1
    ):
        super().__init__(slices=slices, bigendian=bigendian, blocksize=blocksize)

    def process(self, data: bytearray):
        """
        Extract the configured bit ranges from every block of `data` and return the concatenated
        bits as a byte string. Each slice `start:stop:size` contributes exactly `size` bits per
        block: if `size` exceeds the width of the range, the value is zero-padded, and if it is
        smaller, only the `size` least significant bits of the range are kept.

        Raises `ValueError` when a slice ends beyond the last bit of a block.
        """
        bitsnip_data = 0
        bitsnip_size = 0
        slices: list[tuple[int, int, int]] = []
        maxbits = 8 * self.blocksize
        bigendian: bool = self.args.bigendian

        for s in self.args.slices:
            start = s.start
            stop = s.stop
            if start is None:
                start = 0
            if stop is None:
                stop = maxbits
            elif stop > maxbits:
                raise ValueError(F'the selection {start}:{stop} is out of bounds for the block size {self.blocksize}')
            if start >= stop:
                # Empty selections contribute nothing.
                continue
            width = stop - start
            mask = (1 << width) - 1
            size = s.step or width
            if 0 < size < width:
                # Truncation: the value may not occupy more than the `size` bits that are
                # reserved for it in the buffer. Otherwise, the excess bits would be merged
                # into the bits of neighbouring slices and could make the final integer too
                # large for the computed output length.
                mask = (1 << size) - 1
            slices.append((start, mask, size))

        for item in self.chunk(data):
            for shift, mask, size in slices:
                bits = (item >> shift) & mask
                if bigendian:
                    # Make room at the bottom and insert the new bits there.
                    bitsnip_data <<= size
                    bitsnip_data |= bits
                else:
                    # Place the new bits above everything collected so far.
                    bitsnip_data |= bits << bitsnip_size
                bitsnip_size += size

        length, remainder = divmod(bitsnip_size, 8)

        if remainder != 0:
            self.log_info(F'discarding {remainder} bits')
            if bigendian:
                # The incomplete byte consists of the most recently added (lowest) bits.
                bitsnip_data >>= remainder
            else:
                # The incomplete byte consists of the most recently added (highest) bits.
                bitsnip_data &= (1 << (8 * length)) - 1

        return bitsnip_data.to_bytes(length, 'big' if bigendian else 'little')

class blabla (key, nonce=b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', rounds=10, discard=0, stateful=False)

This unit is implemented in refinery.units.crypto.cipher.blabla and has the following commandline Interface:

usage: blabla [-h] [-L] [-Q] [-0] [-v] [-R] [-r N] [-d N] [-s] key [nonce]

Implements the BlaBla cipher, a 256-bit stream cipher designed by Jean-Philippe Aumasson. It is
similar to ChaCha in design but operates on 64-bit blocks.

positional arguments:
  key              The encryption key.
  nonce            The 16-byte nonce. The default is 16 null bytes.

options:
  -r, --rounds N   The number of rounds, default is 10.
  -d, --discard N  Discard the first N bytes of the keystream, 0 by default.
  -s, --stateful   Do not reset the key stream while processing the chunks of one frame.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation.

Expand source code Browse git

class blabla(StreamCipherUnit):
    """
    Implements the BlaBla cipher, a 256-bit stream cipher designed by Jean-Philippe Aumasson. It
    is similar to ChaCha in design but operates on 64-bit blocks.
    """
    key_size = {32}

    def __init__(
        self, key,
        nonce: Param[buf, Arg(help='The 16-byte nonce. The default are 16 null bytes.')] = bytes(16),
        rounds: Param[int, Arg.Number('-r', help='The number of rounds, default is {default}.')] = 10,
        discard=0, stateful=False
    ):
        super().__init__(key=key, nonce=nonce, rounds=rounds, discard=discard, stateful=stateful)

    def keystream(self):
        """
        Generate the key stream one byte at a time. The state consists of sixteen 64-bit words:
        fixed constants, the four key words, a counter at index 0xD that starts at 1 and is
        incremented after every block, and the two nonce words. Each block of 128 key stream
        bytes is the little endian encoding of the word-wise sum of the state and its mixed copy.

        Raises `ValueError` when the nonce cannot be unpacked into two 64-bit words.
        """
        rounds = self.args.rounds
        nonce = self.args.nonce
        key_words = struct.unpack('<4Q', self.args.key)

        try:
            nonce_words = struct.unpack('<2Q', nonce)
        except Exception:
            raise ValueError(F'The given nonce has invalid length of {len(nonce)}, it must be 16 bytes in size.')

        state = [
            0x6170786593810fab,  # 0x0
            0x3320646ec7398aee,  # 0x1
            0x79622d3217318274,  # 0x2
            0x6b206574babadada,  # 0x3
            *key_words,          # 0x4 .. 0x7
            0x2ae36e593e46ad5f,  # 0x8
            0xb68f143029225fc9,  # 0x9
            0x8da1e08468303aa6,  # 0xA
            0xa48a209acd50a4a7,  # 0xB
            0x7fdc12f23f90778c,  # 0xC
            1,                   # 0xD
            *nonce_words         # 0xE .. 0xF
        ]

        # Viewing the state as a 4x4 matrix, the first four index tuples are its columns and the
        # last four are its diagonals.
        mixing_schedule = (
            (0x0, 0x4, 0x8, 0xC),
            (0x1, 0x5, 0x9, 0xD),
            (0x2, 0x6, 0xA, 0xE),
            (0x3, 0x7, 0xB, 0xF),
            (0x0, 0x5, 0xA, 0xF),
            (0x1, 0x6, 0xB, 0xC),
            (0x2, 0x7, 0x8, 0xD),
            (0x3, 0x4, 0x9, 0xE),
        )

        while True:
            work = list(state)
            for _ in range(rounds):
                for a, b, c, d in mixing_schedule:
                    # Each step adds one word onto another modulo 2^64 and then rotates a third
                    # word after mixing the sum into it.
                    for target, addend, mixed, rotation in (
                        (a, b, d, 32),
                        (c, d, b, 24),
                        (a, b, d, 16),
                        (c, d, b, 63),
                    ):
                        work[target] = (work[target] + work[addend]) & _M64
                        work[mixed] = rotr64(work[mixed] ^ work[target], rotation)
            block = [(x + y) & _M64 for x, y in zip(state, work)]
            state[0xD] += 1
            yield from struct.pack('<16Q', *block)

class blake2b (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: blake2b [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the BLAKE2B hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

class blake2s (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: blake2s [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the BLAKE2S hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

class blowfish (key, *, iv=b'', padding=None, mode=None, raw=False, little_endian=False, segment_size=0, tag=(), aad=b'')

This unit is implemented in refinery.units.crypto.cipher.blowfish and has the following commandline Interface:

usage: blowfish [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-t TAG]
                [-a AAD]
                key

Blowfish encryption and decryption.

positional arguments:
  key                   The encryption key.

options:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, EAX,
                        ECB, OFB. By default, the CBC mode is used when an IV is provided, and
                        ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of segmentation bits. It must be a multiple of 8.
                        The default of 0 means that the block size will be used as the segment
                        size.
  -t, --tag TAG         Only for EAX, GCM, OCB, and CCM: An authentication tag to verify the
                        message. For encryption, this parameter specifies the tag length, and the
                        tag is provided as a meta variable named "tag".
  -a, --aad AAD         Only for EAX, GCM, OCB, and CCM: Set additional authenticated data.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation.

Expand source code Browse git

class blowfish(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(Blowfish)):
    """
    Blowfish encryption and decryption.
    """
    # The class body is intentionally empty: the cipher is selected via the `cipher` keyword in
    # the class header, and no methods are overridden here.
    # NOTE(review): all behavior presumably comes from `StandardBlockCipherUnit`; confirm there.

class blz

This unit is implemented in refinery.units.compression.blz and has the following commandline Interface:

usage: blz [-h] [-L] [-Q] [-0] [-v] [-R]

BriefLZ compression and decompression. The compression algorithm uses a pure Python suffix tree
implementation: It requires a lot of time & memory.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class blz(Unit):
    """
    BriefLZ compression and decompression. The compression algorithm uses a pure Python suffix tree
    implementation: It requires a lot of time & memory.
    """
    def _begin(self, data):
        """
        Create a reader for the input and an empty output buffer. Returns the unit itself so that
        calls can be chained.
        """
        self._src = StructReader(memoryview(data))
        self._dst = MemoryFile(bytearray())
        return self

    def _reset(self):
        """
        Rewind the input reader and discard all output that was produced so far. Returns the
        unit itself so that calls can be chained.
        """
        self._src.seek(0)
        self._dst.seek(0)
        self._dst.truncate()
        return self

    def _decompress(self):
        """
        Decompress data that starts with a header of six big endian 32-bit integers: signature,
        version, compressed size, CRC32 of the compressed data, decompressed size, and CRC32 of
        the decompressed data. A checksum of zero is not verified. Size and checksum mismatches
        only produce warnings.

        Raises `ValueError` for an unexpected signature or version number.
        """
        (
            signature,
            version,
            src_count,
            src_crc32,
            dst_count,
            dst_crc32,
        ) = self._src.read_struct('>6L')
        if signature != 0x626C7A1A:
            raise ValueError(F'Invalid BriefLZ signature: {signature:08X}, should be 626C7A1A.')
        if version > 10:
            raise ValueError(F'Invalid version number {version}, should be at most 10.')
        self.log_debug(F'signature: 0x{signature:08X} V{version}')
        self.log_debug(F'src count: 0x{src_count:08X}')
        self.log_debug(F'src crc32: 0x{src_crc32:08X}')
        self.log_debug(F'dst count: 0x{dst_count:08X}')
        self.log_debug(F'dst crc32: 0x{dst_crc32:08X}')
        # The compressed payload follows the 24 header bytes.
        src = self._src.getvalue()
        src = src[24:24 + src_count]
        if len(src) < src_count:
            self.log_warn(F'Only {len(src)} bytes in buffer, but header announced a length of {src_count}.')
        if src_crc32:
            check = zlib.crc32(src)
            if check != src_crc32:
                self.log_warn(F'Invalid source data CRC {check:08X}, should be {src_crc32:08X}.')
        dst = self._decompress_chunk(dst_count)
        if not dst_crc32:
            return dst
        check = zlib.crc32(dst)
        if check != dst_crc32:
            self.log_warn(F'Invalid result data CRC {check:08X}, should be {dst_crc32:08X}.')
        return dst

    def _decompress_modded(self):
        """
        Decompress the modified format that is attempted when `_decompress` fails: eight bytes
        are skipped, then the total size and the chunk size are read, and the data is
        decompressed chunk by chunk into the shared output buffer.
        """
        self._src.seekrel(8)
        total_size = self._src.u64()
        chunk_size = self._src.u64()
        remaining = total_size
        self.log_debug(F'total size: 0x{total_size:016X}')
        self.log_debug(F'chunk size: 0x{chunk_size:016X}')
        while remaining > chunk_size:
            # The return value is not needed: all chunks accumulate in the output buffer.
            self._decompress_chunk(chunk_size)
            remaining -= chunk_size
        return self._decompress_chunk(remaining)

    def _decompress_chunk(self, size=None):
        """
        Decompress one chunk from the current input position into the output buffer and return
        the entire contents of the output buffer. Decompression stops once `size` bytes have been
        produced; without a size, it continues until the input is exhausted.

        Raises `RefineryPartialResult` when the input ends prematurely or a back reference
        points before the start of the output, and `RuntimeError` when more than `size` bytes
        were produced.
        """
        bitcount = 0
        bitstore = 0
        decompressed = 1

        def readbit():
            # Control bits are stored in little endian 16-bit words that are interleaved with
            # the data bytes; each word is consumed from the highest to the lowest bit.
            nonlocal bitcount, bitstore
            if not bitcount:
                bitstore = int.from_bytes(self._src.read_exactly(2), 'little')
                bitcount = 0xF
            else:
                bitcount = bitcount - 1
            return (bitstore >> bitcount) & 1

        def readint():
            # Variable-length integer with an implicit leading one bit: value bits alternate
            # with continuation flags. The smallest possible value is 2.
            result = 2 + readbit()
            while readbit():
                result <<= 1
                result += readbit()
            return result

        # The first byte of each chunk is always a literal; it is accounted for by the initial
        # value of the counter above.
        self._dst.write(self._src.read_exactly(1))

        try:
            while not size or decompressed < size:
                if readbit():
                    # Back reference: copy `length` bytes starting `delta` bytes before the end
                    # of the output.
                    length = readint() + 2
                    sector = readint() - 2
                    offset = self._src.read_byte() + 1
                    delta = offset + 0x100 * sector
                    available = self._dst.tell()
                    if delta not in range(available + 1):
                        raise RefineryPartialResult(
                            F'Requested rewind by 0x{delta:08X} bytes with only 0x{available:08X} bytes in output buffer.',
                            partial=self._dst.getvalue())
                    # When the copy is longer than the distance, the referenced bytes repeat.
                    quotient, remainder = divmod(length, delta)
                    replay = memoryview(self._dst.getvalue())
                    replay = bytes(replay[-delta:] if quotient else replay[-delta:length - delta])
                    replay = quotient * replay + replay[:remainder]
                    self._dst.write(replay)
                    decompressed += length
                else:
                    # Literal byte.
                    self._dst.write(self._src.read_exactly(1))
                    decompressed += 1
        except EOFError as E:
            raise RefineryPartialResult(str(E), partial=self._dst.getvalue())
        dst = self._dst.getvalue()
        if decompressed < size:
            raise RefineryPartialResult(
                F'Attempted to decompress {size} bytes, got only {len(dst)}.', dst)
        if decompressed > size:
            raise RuntimeError('Decompressed buffer contained more bytes than expected.')
        return dst

    def _compress(self):
        """
        Compress the input and return the result, which starts with the same 24-byte header
        that `_decompress` parses. Matches are located by means of a suffix tree of the input.
        """
        from refinery.lib.suffixtree import SuffixTree

        self.log_info('computing suffix tree')
        tree = SuffixTree(self._src.getvalue())

        bitstore = 0  # The bit stream to be written
        bitcount = 0  # The number of bits in the bit stream
        buffer = MemoryFile(bytearray())

        # Write empty header and first byte of source
        self._dst.write(bytearray(24))
        self._dst.write(self._src.read_exactly(1))

        def writeint(n: int) -> None:
            """
            Write an integer to the bit stream.
            """
            nonlocal bitstore, bitcount
            nbits = n.bit_length()
            if nbits < 2:
                raise ValueError
            # The highest bit is implicitly assumed:
            n ^= 1 << (nbits - 1)
            remaining = nbits - 2
            while remaining:
                remaining -= 1
                bitstore <<= 2
                bitcount += 2
                bitstore |= ((n >> remaining) & 3) | 1
            bitstore <<= 2
            bitcount += 2
            bitstore |= (n & 1) << 1

        src = self._src.getvalue()
        remaining = len(src) - 1
        self.log_info('compressing data')

        while True:
            cursor = len(src) - remaining
            rest = src[cursor:]
            if bitcount >= 0x10:
                # At least one complete 16-bit control word is available: flush all complete
                # words together with the data bytes that were buffered in the meantime.
                block_count, bitcount = divmod(bitcount, 0x10)
                info_channel = bitstore >> bitcount
                bitstore = info_channel << bitcount ^ bitstore
                # The decompressor will read bits from top to bottom, and each 16 bit block has to be
                # little-endian encoded. The bit stream is encoded top to bottom bit in the bitstore
                # variable, and by encoding it as a big endian integer, the stream is in the correct
                # order. However, we need to swap adjacent bytes to achieve little endian encoding for
                # each of the blocks:
                info_channel = bytearray(info_channel.to_bytes(block_count * 2, 'big'))
                for k in range(block_count):
                    k0 = 2 * k + 0
                    k1 = 2 * k + 1
                    info_channel[k0], info_channel[k1] = info_channel[k1], info_channel[k0]
                info_channel = memoryview(info_channel)
                data_channel = memoryview(buffer.getvalue())
                self._dst.write(info_channel[:2])
                self._dst.write(data_channel[:-1])
                self._dst.write(info_channel[2:])
                data_channel = bytes(data_channel[-1:])
                buffer.truncate(0)
                store = buffer if bitcount else self._dst
                store.write(data_channel)
            if remaining + bitcount < 0x10:
                # Final flush: pad the pending control bits to a full word and append the
                # buffered data bytes as well as the unprocessed rest of the input.
                buffer = buffer.getvalue()
                if rest or buffer:
                    bitstore <<= 0x10 - bitcount
                    self._dst.write(bitstore.to_bytes(2, 'little'))
                    self._dst.write(buffer)
                    self._dst.write(rest)
                elif bitcount:
                    raise RuntimeError('Bitbuffer Overflow')
                break
            # Walk down the suffix tree along the upcoming input to find an earlier occurrence.
            node = tree.root
            length = 0
            offset = 0
            sector = None
            while node.children and length < len(rest):
                for child in node.children.values():
                    if tree.data[child.start] == rest[length]:
                        node = child
                        break
                if node.start >= cursor:
                    break
                offset = node.start - length
                length = node.end + 1 - offset
            length = min(remaining, length)
            if length >= 4:
                # Only matches of at least four bytes are encoded as back references.
                sector, offset = divmod(cursor - offset - 1, 0x100)
            bitcount += 1
            bitstore <<= 1
            if sector is None:
                # Literal: a zero control bit and the byte itself.
                buffer.write(rest[:1])
                remaining -= 1
                continue
            # Back reference: a one control bit, the low offset byte, and two encoded integers.
            bitstore |= 1
            buffer.write(bytes((offset,)))
            writeint(length - 2)
            writeint(sector + 2)
            remaining -= length

        # Fill in the header now that the compressed payload is complete.
        self._dst.seek(24)
        dst = self._dst.peek()
        self._dst.seek(0)
        self._dst.write(struct.pack('>6L', 0x626C7A1A, 1, len(dst), zlib.crc32(dst), len(src), zlib.crc32(src)))
        return self._dst.getvalue()

    def process(self, data):
        """
        Decompress `data`. The regular format is attempted first; when this fails with a
        `ValueError`, the modified format is attempted. If the second attempt fails with
        anything but a partial result, a partial result of the first attempt is raised instead
        when there is one.
        """
        self._begin(data)
        partial = None
        try:
            return self._decompress()
        except ValueError as error:
            if isinstance(error, RefineryPartialResult):
                partial = error
            self.log_warn(F'Reverting to modified BriefLZ after decompression error: {error!s}')
            self._reset()

        try:
            return self._decompress_modded()
        except RefineryPartialResult:
            raise
        except Exception as error:
            if not partial:
                raise
            raise partial from error

    def reverse(self, data):
        """
        Compress `data`.
        """
        return self._begin(data)._compress()

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    # Load the input via _begin, then run the compressor on the object it returns.
    state = self._begin(data)
    return state._compress()

class brotli

This unit is implemented in refinery.units.compression.brotli and has the following commandline Interface:

usage: brotli [-h] [-L] [-Q] [-0] [-v] [-R]

Brotli compression and decompression.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class brotli(Unit):
    """
    Brotli compression and decompression.
    """

    @Unit.Requires('brotlipy', ['all'])
    def _brotli():
        # The import lives inside this function so that the optional dependency is only
        # touched when the decorated accessor is evaluated.
        import brotli as module
        return module

    def process(self, data):
        # decompression; the input is converted to an immutable bytes object first
        compressed = bytes(data)
        return self._brotli.decompress(compressed)

    def reverse(self, data):
        # compression; the input is converted to an immutable bytes object first
        plain = bytes(data)
        return self._brotli.compress(plain)

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    # compression; the input is converted to an immutable bytes object first
    plain = bytes(data)
    return self._brotli.compress(plain)

class bruteforce (name, length=slice(1, None, None), format=None, alphabet=None, pattern=None, printable=False, digits=False, identifier=False, letters=False)

This unit is implemented in refinery.units.strings.bruteforce and has the following commandline Interface:

usage: bruteforce [-h] [-L] [-Q] [-0] [-v] [-a B | -r PATTERN | -p | -d | -i | -l]
                  name [length] [format]

Generates all possible combinations of letters in a given alphabet. For each generated string,
one copy of each input chunk is generated and populated with a meta variable containing that
string. This can be used for simple brute forcing checks.

positional arguments:
  name                   Name of the meta variable to be populated.
  length                 Specifies the range of characters to brute force, default is 1:.
  format                 Optional format expression for the output string. The format sequence
                         "{0}" is the current brute force string, the sequence "{1}" represents
                         the input data.

options:
  -a, --alphabet B       The alphabet from which to choose the letters. Entire byte range by
                         default.
  -r, --pattern PATTERN  Provide a regular expression pattern to define the alphabet.
  -p, --printable        Equivalent to --pattern=[\s\x20-\x7E]
  -d, --digits           Equivalent to --pattern=\d
  -i, --identifier       Equivalent to --pattern=\w
  -l, --letters          Equivalent to --pattern=[a-zA-Z]

generic options:
  -h, --help             Show this help message and exit.
  -L, --lenient          Increase the leniency, allowing partial results and ignoring more
                         errors.
  -Q, --quiet            Disables all log output.
  -0, --devnull          Do not produce any output.
  -v, --verbose          Specify up to two times to increase log level.

Expand source code Browse git

class bruteforce(Unit):
    """
    Generates all possible combinations of letters in a given alphabet. For each generated string,
    one copy of each input chunk is generated and populated with a meta variable containing that
    string. This can be used for simple brute forcing checks.
    """
    def __init__(
        self,
        name: Param[str, Arg.String(help='Name of the meta variable to be populated.')],
        length: Param[slice, Arg.Bounds(metavar='length', help=(
            'Specifies the range of characters to brute force, default is {default}.'
        ))] = slice(1, None),
        format: Param[str, Arg.String(help=(
            'Optional format expression for the output string. The format sequence "{0}" is the '
            'current brute force string, the sequence "{1}" represents the input data.'
        ))] = None,
        alphabet: Param[buf, Arg.Binary('-a', group='ALPH', help=(
            'The alphabet from which to choose the letters. Entire byte range by default.'
        ))] = None,
        pattern: Param[str, Arg.RegExp('-r', group='ALPH',
            help='Provide a regular expression pattern to define the alphabet.')] = None,
        printable: Param[bool, Arg.Switch('-p', group='ALPH',
            help='Equivalent to --pattern=[\\s\\x20-\\x7E]')] = False,
        digits: Param[bool, Arg.Switch('-d', group='ALPH',
            help='Equivalent to --pattern=\\d')] = False,
        identifier: Param[bool, Arg.Switch('-i', group='ALPH',
            help='Equivalent to --pattern=\\w')] = False,
        letters: Param[bool, Arg.Switch('-l', group='ALPH',
            help='Equivalent to --pattern=[a-zA-Z]')] = False,
    ):
        # Number of alphabet shortcut switches that were enabled. At most one of them may be
        # set, and none of them may be combined with an explicit pattern.
        options = sum(1 for x in [printable, digits, identifier, letters] if x)

        if options > 1 or options and pattern:
            raise ValueError('Invalid selection.')

        # Each shortcut switch is translated to the pattern named in its help text.
        if printable:
            pattern = b'[\\s\\x20-\\x7E]'
        if digits:
            pattern = b'\\d'
        if identifier:
            pattern = b'\\w'
        if letters:
            pattern = b'[a-zA-Z]'

        super().__init__(
            name=name,
            length=length,
            format=format,
            alphabet=alphabet,
            pattern=pattern,
        )

    def _alphabet(self) -> bytes:
        # Resolution order: an explicit alphabet wins; without a pattern the entire byte range
        # is used; otherwise the alphabet is everything in the byte range that the pattern
        # matches. A pattern that selects nothing is reported as an error.
        if (alphabet := self.args.alphabet):
            return alphabet
        else:
            alphabet = bytes(range(0x100))
        if not (pattern := self.args.pattern):
            return alphabet
        if isinstance((regex := Arg.AsRegExp(self.codec, pattern, flags=re.DOTALL)), re.Pattern):
            if (alphabet := B''.join(regex.findall(alphabet))):
                return alphabet
        raise ValueError(F'Invalid regular expression: {pattern}')

    def process(self, data: bytearray):
        # For every requested length, emit one labelled copy of the input per candidate string.
        format_spec: str = self.args.format
        meta = metavars(data)
        name = self.args.name
        kwargs = {name: None}
        # The alphabet does not depend on the length. It is resolved once, on first use, rather
        # than again for every length. _alphabet never returns an empty value, so None is a
        # safe marker for "not yet computed".
        alphabet = None

        for length in integers_of_slice(self.args.length):
            # validate before reporting progress so that no bogus length is ever logged
            if not isinstance(length, int) or length < 0:
                raise ValueError(F'Unable to brute force {length} characters.')
            self.log_info(F'generating {length} digits')
            if alphabet is None:
                alphabet = self._alphabet()
            for string in itertools.product(alphabet, repeat=length):
                string = bytes(string)
                if format_spec:
                    # "{0}" is the brute force string and "{1}" is the input data
                    string = meta.format_bin(format_spec, self.codec, [string, data])
                kwargs[name] = string
                yield self.labelled(data, **kwargs)

class byteswap (size=4)

This unit is implemented in refinery.units.blockwise.byteswap and has the following commandline Interface:

usage: byteswap [-h] [-L] [-Q] [-0] [-v] [N]

Reverses the order of bytes in each block. Excess bytes that are not an integer multiple of the
block size are discarded.

positional arguments:
  N              the block size in bytes; the default is 4.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class byteswap(UnaryOperation):
    """
    Reverses the order of bytes in each block. Excess bytes that are not an integer multiple of the block
    size are discarded.
    """
    def __init__(self, size: Param[int, Arg.Number(help='the block size in bytes; the default is {default}.')] = 4):
        super().__init__(blocksize=size, _truncate=2)

    def inplace(self, block: ndarray) -> None:
        # the positional True requests an in-place swap
        block.byteswap(True)

    operate = NotImplemented

    def process(self, data):
        try:
            return self._fastblock(data)
        except FastBlockError:
            # Pure Python fallback: reverse every complete block inside the buffer itself.
            size = self.blocksize
            total = len(data)
            usable = total - total % size
            view = memoryview(data)
            if size == 1:
                self.log_warn('running this unit with a block size of 1 does not have any effect')
                return data
            for start in range(0, usable, size):
                stop = start + size
                # a reversed view of the block is copied back over the block
                data[start:stop] = view[start:stop][::-1]
            if usable < total:
                # the view has to be dropped before the buffer can be resized
                del view
                del data[usable:]
            return data

class bz2 (level=9)

This unit is implemented in refinery.units.compression.bz2 and has the following commandline Interface:

usage: bz2 [-h] [-L] [-Q] [-0] [-v] [-R] [-F] [-l N]

BZip2 compression and decompression.

options:
  -l, --level N  compression level preset between 1 and 9

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class bz2(Unit):
    """
    BZip2 compression and decompression.
    """
    def __init__(self, level: Param[int, Arg.Number('-l', bound=(1, 9), help='compression level preset between 1 and 9')] = 9):
        super().__init__(level=level)

    def process(self, data):
        # decompression is delegated to the bz2_ module
        decompressed = bz2_.decompress(data)
        return decompressed

    def reverse(self, data):
        # compression with the configured level preset
        level = self.args.level
        return bz2_.compress(data, level)

    @classmethod
    def handles(cls, data):
        # the input is accepted when it begins with the three bytes 'BZh'
        signature = B'BZh'
        return data[:3] == signature

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    # compression with the configured level preset
    level = self.args.level
    return bz2_.compress(data, level)

class camellia (key, *, iv=b'', padding=None, mode=None, raw=False, little_endian=False, segment_size=0, tag=(), aad=b'')

This unit is implemented in refinery.units.crypto.cipher.camellia and has the following commandline Interface:

usage: camellia [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] key

Camellia encryption and decryption.

positional arguments:
  key                   The encryption key.

options:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB,
                        OFB, PCBC. By default, the CBC mode is used when an IV is provided,
                        and ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of segmentation bits. It must be a multiple of 8.
                        The default of 0 means that the block size will be used as the segment
                        size.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation.

Expand source code Browse git

# The class body is empty apart from the docstring: everything this unit does comes from
# StandardBlockCipherUnit, parameterized with a BlockCipherFactory for Camellia.
class camellia(StandardBlockCipherUnit, cipher=BlockCipherFactory(Camellia)):
    """
    Camellia encryption and decryption.
    """

class carve (format, unique=False, decode=False, single=False, min=1, max=None, len=None, stripspace=False, longest=False, take=None, utf16=True, ascii=True)

This unit is implemented in refinery.units.pattern.carve and has the following commandline Interface:

usage: carve [-h] [-L] [-Q] [-0] [-v] [-q] [-d] [-s] [-n N] [-m N] [-e N] [-x] [-l] [-t K]
             [-a | -u]
             format

Extracts patches of data in particular formats from the input.

positional arguments:
  format            Specify one of the following formats: integer, float, number, string,
                    multiline-string, cmdstr, ps1str, vbastr, vbaint, printable, urlquote,
                    urlquote-coarse, urlquote-narrow, intarray, strarray, numarray, word,
                    letters, wshenc, alphanumeric, b32, b58, b62, b64, b85, a85, z85, b92,
                    b64url, hex, b16, b16s, b64s, b85s, a85s, z85s, utf8, hexdump, hexarray,
                    uuencode

options:
  -q, --unique      Yield every match only once.
  -d, --decode      Automatically decode known patterns.
  -s, --single      Only get the biggest match; equivalent to -qlt1
  -n, --min N       Matches must have length at least N.
  -m, --max N       Matches must have length at most N.
  -e, --len N       Matches must be of length N.
  -x, --stripspace  Strip all whitespace from input data.
  -l, --longest     Pick longer results first. The output will be sorted by length unless the
                    --take option is specified, in which case the longest K results will be
                    returned in order of appearance.
  -t, --take K      Return only the first K occurrences in order of appearance. If --longest is
                    specified, the K longest results will be returned in order of appearance
                    within the input.
  -a, --no-utf16    Search for ASCII encoded patterns only.
  -u, --no-ascii    Search for UTF16 encoded patterns only.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class carve(PatternExtractor):
    """
    Extracts patches of data in particular formats from the input.
    """
    def __init__(
        self, format: Param[str, Arg.String(metavar='format',
            help=F'Specify one of the following formats: {_FORMATS}')],
        unique: Param[bool, Arg.Switch('-q', help='Yield every match only once.')] = False,
        decode: Param[bool, Arg.Switch('-d', help='Automatically decode known patterns.')] = False,
        single: Param[bool, Arg.Switch('-s', help='Only get the biggest match; equivalent to -qlt1')] = False,
        min=1, max=None, len=None,
        stripspace=False, longest=False, take=None, utf16=True, ascii=True
    ):
        if single:
            # shorthand for the combination of --unique, --longest and --take=1
            take, longest, unique = 1, True, True
        try:
            format = formats.from_dashname(format)
        except Exception:
            raise ValueError(F'{format} is not a valid format')
        super().__init__(
            min=min,
            max=max,
            len=len,
            stripspace=stripspace,
            duplicates=not unique,
            longest=longest,
            take=take,
            ascii=ascii,
            utf16=utf16,
            format=format
        )

        def select_decoder(fmt):
            # Maps the chosen format to the decoder applied to each match. The decoder modules
            # are imported only in the branch that needs them. NotImplemented means that no
            # decoder exists for the format.
            if fmt in (formats.multiline_string, formats.string):
                from ..encoding.esc import esc
                return esc(unicode=True, quoted=True)
            if fmt == formats.integer:
                from ..encoding.base import base
                return base()
            if fmt in (formats.b16, formats.b16s, formats.hex):
                from ..encoding.hex import hex
                return hex()
            if fmt == formats.hexdump:
                from ..formats.hexload import hexload
                return hexload()
            if fmt == formats.intarray:
                from ..blockwise.pack import pack
                return pack()
            if fmt == formats.strarray:
                from ..encoding.esc import esc
                def _decoder(data: Chunk): # noqa
                    # every string literal in the array is unescaped; the list is then packed
                    return msgpack.packb([
                        m[0] | esc | bytes for m in formats.string.value.bin.finditer(data)])
                return _decoder
            if fmt in (formats.b64, formats.b64s):
                from ..encoding.b64 import b64
                return b64()
            if fmt in (formats.b85, formats.b85s):
                from ..encoding.b85 import b85
                return b85()
            if fmt == formats.b64url:
                from ..encoding.b64 import b64
                return b64(urlsafe=True)
            if fmt == formats.b32:
                from ..encoding.b32 import b32
                return b32()
            if fmt == formats.ps1str:
                from ..encoding.escps import escps
                return escps()
            if fmt == formats.vbastr:
                from ..encoding.escps import escps
                return escps()
            if fmt == formats.hexarray:
                from ..blockwise.pack import pack
                return pack(0x10)
            if fmt == formats.wshenc:
                from ..encoding.wshenc import wshenc
                return wshenc()
            if fmt == formats.uuencode:
                from ..encoding.uuenc import uuenc
                return uuenc()
            if fmt in (
                formats.urlquote,
                formats.urlquote_coarse,
                formats.urlquote_narrow,
            ):
                from ..encoding.url import url
                return url()
            return NotImplemented

        self.decoder = select_decoder(self.args.format) if decode else NotImplemented

    def process(self, data):
        self.log_info('using pattern:', self.args.format.str.pattern)
        matches = iter(self.matches_filtered(memoryview(data), self.args.format.value.bin))
        if self.decoder is NotImplemented:
            # without a decoder, the matches are passed through unchanged
            yield from matches
            return
        for chunk in matches:
            try:
                yield self.decoder(chunk)
            except Exception as error:
                # a match that cannot be decoded is dropped
                self.log_info(F'decoder failure: {error!s}')

class carve_7z

This unit is implemented in refinery.units.pattern.carve_7z and has the following commandline Interface:

usage: carve-7z [-h] [-L] [-Q] [-0] [-v]

Extracts anything from the input data that looks like a 7zip archive file.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class carve_7z(Unit):
    """
    Extracts anything from the input data that looks like a 7zip archive file.
    """
    @Unit.Requires('py7zr', ['arc', 'default', 'extended'])
    def _py7zr():
        import py7zr
        return py7zr

    HEADER_SIGNATURE = B'7z\xBC\xAF\x27\x1C'

    def process(self, data: bytearray):
        view = memoryview(data)
        signature = self.HEADER_SIGNATURE
        cursor = 0
        # find returns -1 when there is no further signature, which ends the loop
        while (start := data.find(signature, cursor)) >= cursor:
            self.log_debug(F'found header at offset: 0x{start:08X}')
            try:
                # the recorder exposes how far into the data the archive parser has read
                recorder = MemoryFileRecorder(view[start:])
                self.log_debug('attempting to read archive')
                archive = self._py7zr.SevenZipFile(recorder)
                self.log_debug('attempting to test archive')
                success = archive.test() is not False
            except ImportError:
                raise
            except Exception as error:
                self.log_debug('parsing archive failed:', error)
                success = False
            if not success:
                # not an archive; resume the search behind this false positive
                cursor = start + 5
                continue
            size = recorder.max_cursor
            self.log_info(F'identified archive of size 0x{size:08X} at offset 0x{start:08X}')
            cursor = start + size
            yield self.labelled(view[start:cursor], offset=start)

class carve_der

This unit is implemented in refinery.units.pattern.carve_der and has the following commandline Interface:

usage: carve-der [-h] [-L] [-Q] [-0] [-v]

Extracts anything from the input data that looks like a DER sequence. The carving can be very
slow: The unit will attempt to parse an ASN1 sequence at every offset where a byte with value
0x30 is found, since this can indicate the start of an ASN1 SEQUENCE. It will only consider the
next 10KB of data at this offset, but it nevertheless remains a poor heuristic.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class carve_der(Unit):
    """
    Extracts anything from the input data that looks like a DER sequence. The carving can be very
    slow: The unit will attempt to parse an ASN1 sequence at every offset where a byte with value
    0x30 is found, since this can indicate the start of an ASN1 SEQUENCE. It will only consider the
    next 10KB of data at this offset, but it nevertheless remains a poor heuristic.
    """
    @Unit.Requires('pyasn1', ['default', 'extended'])
    def _pyasn1parsers():
        from pyasn1.codec.der.decoder import decode
        from pyasn1.codec.der.encoder import encode
        return encode, decode

    def process(self, data: bytearray):
        # Walk over every 0x30 byte, attempt to decode a DER object there, and emit the
        # re-encoded object labelled with the offset at which it was found.
        cursor = 0
        encode, decode = self._pyasn1parsers
        while True:
            try:
                pos = data.index(0x30, cursor)
            except Exception:
                # no further candidate byte
                break
            # Continue behind this candidate on the next round. Advancing the cursor by a
            # single byte instead would find the same candidate again, and parse it again,
            # once for every byte between the old cursor and the candidate.
            cursor = pos + 1
            # presumably an empty SEQUENCE (length byte of zero), which is of no interest
            if pos + 1 < len(data) and data[pos + 1] == 0:
                continue
            try:
                # only a window of 10,000 bytes is handed to the parser
                sequence = decode(bytes(data[pos:pos + 10_000]))
            except Exception:
                continue
            # the first element of the decoder result is the decoded object
            if not (der := sequence[0]):
                self.log_info(F'0x{pos:08X}: parser returned nothing')
                continue
            if len(der) < 2:
                self.log_info(F'0x{pos:08X}: parser returned empty sequence')
                continue
            # the length of the re-encoded object determines how much input is skipped
            der = encode(der)
            cursor = pos + len(der)
            yield self.labelled(der, offset=pos)

class carve_json (all=False)

This unit is implemented in refinery.units.pattern.carve_json and has the following commandline Interface:

usage: carve-json [-h] [-L] [-Q] [-0] [-v] [-a]

Extracts anything from the input data that looks like JSON.

options:
  -a, --all      By default, only dictionaries are carved. Specify this flag to also carve lists.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class carve_json(Unit):
    """
    Extracts anything from the input data that looks like JSON.
    """
    def __init__(
        self, all: Param[bool, Arg.Switch('-a', help=(
            'By default, only dictionaries are carved. Specify this flag to also carve lists.'
        ))] = False
    ):
        super().__init__(all=all)

    def process(self, data):
        # the carver is limited to dictionaries unless --all was given
        dictonly = not self.args.all
        carver = JSONCarver(data, dictonly=dictonly)
        for offset, document in carver:
            yield self.labelled(document, offset=offset)

class carve_lnk

This unit is implemented in refinery.units.pattern.carve_lnk and has the following commandline Interface:

usage: carve-lnk [-h] [-L] [-Q] [-0] [-v]

Extracts anything from the input data that looks like a Windows shortcut (i.e. an LNK file)

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class carve_lnk(Unit):
    """
    Extracts anything from the input data that looks like a Windows shortcut (i.e. an LNK file)
    """

    @Unit.Requires('LnkParse3>=1.4.0', ['formats', 'extended'])
    def _LnkParse3():
        import LnkParse3
        import LnkParse3.extra_factory
        return LnkParse3

    def process(self, data: bytearray):
        # Search for the LNK signature, parse each candidate, compute where the link file ends
        # by adding up the sizes of its parts, and emit that range labelled with its offset.
        pos = 0
        mem = memoryview(data)
        # per MS-SHLLINK: the header size field (0x4C) and the first four bytes of the CLSID
        sig = B'\x4C\x00\x00\x00\x01\x14\x02\x00'
        lnk = self._LnkParse3

        while True:
            pos = data.find(sig, pos)
            if pos < 0:
                break
            try:
                parsed = lnk.lnk_file(indata=mem[pos:])
            except Exception:
                # not parseable; resume the search one byte further
                pos += 1
                continue
            end = pos + parsed.header.size() + parsed.string_data.size()
            if parsed.has_target_id_list():
                end += parsed.targets.size()
            if parsed.has_link_info() and not parsed.force_no_link_info():
                with suppress(AttributeError):
                    end += parsed.info.size()
            with NoLogging():
                # consume extra data blocks for as long as the factory recognizes them
                while end < len(mem):
                    extra = lnk.extra_factory.ExtraFactory(mem[end:])
                    try:
                        ec = extra.extra_class()
                    except Exception:
                        break
                    if ec is None:
                        break
                    if 'UNKNOWN' in ec().name():
                        break
                    end += extra.item_size()

            # four zero bytes are expected right behind the last recognized part
            terminal_block = mem[end:end + 4]
            if terminal_block != B'\0\0\0\0':
                self.log_warn(F'detected LNK at offset 0x{pos:X}, but size calculation did not end on a terminal block')
                # Step past this candidate. Without advancing, the next search would return
                # the very same offset again and the loop would never terminate.
                pos += 1
                continue
            else:
                end += 4
            yield self.labelled(mem[pos:end], offset=pos)
            pos = end

class carve_pe (*paths, list=False, join_path=False, drop_path=False, path=b'name', recursive=False, keep_root=False, memdump=False, fileinfo=False)

This unit is implemented in refinery.units.pattern.carve_pe and has the following commandline Interface:

usage: carve-pe [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-P NAME] [-r] [-k] [-m] [-f] [path ...]

Extracts anything from the input data that looks like a Portable Executable (PE) file.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; the
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "name".
  -r, --recursive  Extract PE files that are contained in already extracted PEs.
  -k, --keep-root  If the input chunk is itself a PE, include it as an output chunk.
  -m, --memdump    Use the virtual memory layout of a PE file to calculate its size.
  -f, --fileinfo   Use the PE meta information to deduce a file name meta variable.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class carve_pe(PathExtractorUnit):
    """
    Extracts anything from the input data that looks like a Portable
    Executable (PE) file.
    """
    def __init__(
        self, *paths, list=False, join_path=False, drop_path=False, path=b'name',
        recursive: Param[bool, Arg.Switch('-r', help='Extract PE files that are contained in already extracted PEs.')] = False,
        keep_root: Param[bool, Arg.Switch('-k', help='If the input chunk is itself a PE, include it as an output chunk.')] = False,
        memdump: Param[bool, Arg.Switch('-m', help='Use the virtual memory layout of a PE file to calculate its size.')] = False,
        fileinfo: Param[bool, Arg.Switch('-f', help='Use the PE meta information to deduce a file name meta variable.')] = False
    ):
        super().__init__(
            *paths,
            list=list,
            join_path=join_path,
            drop_path=drop_path,
            path=path,
            recursive=recursive,
            keep_root=keep_root,
            memdump=memdump,
            fileinfo=fileinfo,
        )

    def unpack(self, data):
        # Scan for 'MZ' markers, check for the 'PE' signature at the offset stored at 0x3C,
        # parse the candidate, and emit every PE found together with a path derived from its
        # meta information or, failing that, from its offset.
        cursor = 0
        mv = memoryview(data)

        while True:
            offset = data.find(B'MZ', cursor)
            if offset < cursor:
                # find returned -1: no further candidates
                break
            cursor = offset + 2
            ntoffset = mv[offset + 0x3C:offset + 0x3E]
            if len(ntoffset) < 2:
                # too close to the end of the data for any header to fit
                return
            # PE header fields are little-endian regardless of the host. The explicit '<'
            # prevents a misread on big-endian machines, where a bare 'H' would use the
            # native byte order.
            ntoffset, = unpack('<H', ntoffset)
            if mv[offset + ntoffset:offset + ntoffset + 2] != B'PE':
                self.log_debug(F'invalid NT header signature for candidate at 0x{offset:08X}')
                continue
            try:
                pe = lief.load_pe_fast(mv[offset:])
            except Exception as err:
                self.log_debug(F'parsing of PE header at 0x{offset:08X} failed:', err)
                continue

            pesize = get_pe_size(pe, memdump=self.args.memdump)
            pedata = mv[offset:offset + pesize]
            info = {}
            if self.args.fileinfo:
                pe_meta_parser = pemeta()
                try:
                    info = pe_meta_parser.parse_version(pe) or {}
                except Exception as error:
                    self.log_warn(F'Unable to obtain file information: {error!s}')
                try:
                    info.update(pe_meta_parser.parse_header(pe) or {})
                except Exception:
                    # header information is optional
                    pass
            # path preference: original file name, then export name, then a synthetic name
            try:
                path = info['OriginalFilename']
            except KeyError:
                try:
                    path = info['ExportName']
                except KeyError:
                    path = F'carve-0x{offset:08X}.{magic(pedata).extension}'

            if offset > 0 or self.args.keep_root:
                yield UnpackResult(path, pedata, offset=offset)
                self.log_info(F'extracted PE file of size 0x{pesize:08X} from 0x{offset:08X}')
            else:
                # the input itself is a PE and was not requested; keep searching inside it
                self.log_info(F'ignored root file of size 0x{pesize:08X} from 0x{offset:08X}')
                continue

            if not offset or self.args.recursive:
                # only skip the headers so that embedded PE files are still found
                cursor += pe.optional_header.sizeof_headers
            else:
                # skip the entire extracted file; the cursor is already 2 bytes into it
                cursor += pesize - 2

class carve_rtf

This unit is implemented in refinery.units.pattern.carve_rtf and has the following commandline Interface:

usage: carve-rtf [-h] [-L] [-Q] [-0] [-v]

Extracts anything from the input data that looks like an RTF document.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class carve_rtf(Unit):
    """
    Extracts anything from the input data that looks like an RTF document.
    """

    def process(self, data: bytearray):
        # Every candidate begins at a case-insensitive hit of the RTF magic. Starting right
        # behind its opening brace, curly braces are balanced byte by byte until the group
        # that was opened by the magic is closed again.
        view = memoryview(data)
        magic = re.escape(b'{\\rtf')
        cursor = 0

        while True:
            hit = re.search(magic, view[cursor:], flags=re.IGNORECASE)
            if hit is None:
                break
            start = cursor + hit.start()
            nesting = 1
            stop = start + 1
            # The variable stop always points one past the byte that was inspected last.
            for stop, byte in enumerate(view[start + 1:], start + 2):
                if byte == 0x7B:  # {
                    nesting += 1
                elif byte == 0x7D:  # }
                    nesting -= 1
                    if not nesting:
                        break
            if nesting > 0:
                # The input ended before the document was closed; nothing else can follow.
                break
            yield self.labelled(view[start:stop], offset=start)
            cursor = stop

class carve_xml

This unit is implemented in refinery.units.pattern.carve_xml and has the following commandline Interface:

usage: carve-xml [-h] [-L] [-Q] [-0] [-v]

Extracts anything from the input data that looks like XML.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class carve_xml(Unit):
    """
    Extracts anything from the input data that looks like XML.
    """

    def process(self, data):
        # XMLCarver is iterated for (offset, chunk) pairs; each chunk is emitted together
        # with the offset it was reported at.
        yield from (
            self.labelled(document, offset=position)
            for position, document in XMLCarver(data)
        )

class carve_zip

This unit is implemented in refinery.units.pattern.carve_zip and has the following commandline Interface:

usage: carve-zip [-h] [-L] [-Q] [-0] [-v]

Extracts anything from the input data that looks like a zip archive file.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class carve_zip(Unit):
    """
    Extracts anything from the input data that looks like a zip archive file.
    """

    def process(self, data: bytearray):
        # The input is searched backwards for end-of-central-directory (EOCD) signatures.
        # Each record that parses and leads to a plausible archive start contributes one
        # (start, stop) range; the ranges are emitted in ascending order at the very end.
        signature = ZipEndOfCentralDirectory.SIGNATURE
        view = memoryview(data)
        found = []
        cursor = len(data)
        while True:
            cursor = data.rfind(signature, 0, cursor)
            if cursor < 0:
                break
            try:
                eocd = ZipEndOfCentralDirectory(view[cursor:])
            except ValueError as error:
                self.log_info(F'error parsing end of central directory at 0x{cursor:X}: {error!s}')
                continue
            self.log_info(F'successfully parsed end of central directory at 0x{cursor:X}')
            directory_start = cursor - eocd.directory_size
            # Difference between where the directory actually sits in the input and where
            # the EOCD record claims it is; applied to the header offset further below.
            shift = directory_start - eocd.directory_offset
            if directory_start < 0:
                self.log_debug('end of central directory size is invalid')
                continue
            try:
                directory = ZipCentralDirectory(view[directory_start:])
            except ValueError:
                self.log_debug('computed location of central directory is invalid')
                cursor -= len(signature)
                continue
            archive_start = directory.header_offset + shift
            head = view[archive_start:archive_start + 4]
            if head not in (B'PK\x03\x04', B'\0\0\0\0'):
                # SFX payloads seem to have a nulled header, so we permit this.
                self.log_debug('computed start of ZIP archive does not have the correct signature bytes')
                continue
            found.append((archive_start, cursor + len(eocd)))
            # Continue the backwards search in front of the archive that was just located.
            cursor = archive_start
        # Archives were collected back to front; popping from the end restores input order.
        while found:
            lower, upper = found.pop()
            yield self.labelled(view[lower:upper], offset=lower)

class cca (data)

This unit is implemented in refinery.units.strings.cca and has the following commandline Interface:

usage: cca [-h] [-L] [-Q] [-0] [-v] data

Short for ConCatAppend: This unit concatenates the input data with its argument by appending the
latter to the former. See also ccp for the unit that prepends instead.

positional arguments:
  data           Binary string to be appended to the input.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class cca(Unit):
    """
    Short for ConCatAppend: This unit concatenates the input data with its argument by
    appending the latter to the former. See also `refinery.ccp` for the unit that prepends
    instead.
    """

    def __init__(
        self,
        data: Param[buf, Arg(help='Binary string to be appended to the input.')],
    ):
        super().__init__(data=data)

    def process(self, data: bytearray):
        # The input buffer is grown in place and handed back; no new buffer is created.
        suffix = self.args.data
        data.extend(suffix)
        return data

class ccp (data)

This unit is implemented in refinery.units.strings.ccp and has the following commandline Interface:

usage: ccp [-h] [-L] [-Q] [-0] [-v] data

Short for ConCatPrepend: This unit concatenates the input data with its argument by prepending
the latter to the former. See also cca for the unit that appends instead.

positional arguments:
  data           Binary string to be prepended to the input.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class ccp(Unit):
    """
    Short for ConCatPrepend: This unit concatenates the input data with its argument by
    prepending the latter to the former. See also `refinery.cca` for the unit that appends
    instead.
    """

    def __init__(
        self,
        data: Param[buf, Arg(help='Binary string to be prepended to the input.')],
    ):
        super().__init__(data=data)

    def process(self, data: bytearray):
        # Assigning to the empty slice at index zero inserts the prefix in place.
        prefix = self.args.data
        data[0:0] = prefix
        return data

class chacha (key, stateful=False, discard=0, nonce=b'REFINERY', magic=b'', offset=0, rounds=20)

This unit is implemented in refinery.units.crypto.cipher.chacha and has the following commandline Interface:

usage: chacha [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-d N] [-m MAGIC] [-x N] [-r N] key [nonce]

ChaCha encryption and decryption. The nonce must be 8 bytes long as currently, only the original
Bernstein algorithm is implemented. When 64 bytes are provided as the key, this data is
interpreted as the initial state box and all other parameters are ignored.

positional arguments:
  key                The encryption key.
  nonce              The nonce. Default is the string REFINERY.

options:
  -s, --stateful     Do not reset the key stream while processing the chunks of one frame.
  -d, --discard N    Discard the first N bytes of the keystream, 0 by default.
  -m, --magic MAGIC  The magic constant; depends on the key size by default.
  -x, --offset N     Optionally specify the stream index, default is 0.
  -r, --rounds N     The number of rounds. Has to be an even number. Default is 20.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.
  -R, --reverse      Use the reverse operation.

Expand source code Browse git

class chacha(LatinCipherUnit):
    """
    ChaCha encryption and decryption. The nonce must be 8 bytes long as currently, only the
    original Bernstein algorithm is implemented. When 64 bytes are provided as the key, this
    data is interpreted as the initial state box and all other parameters are ignored.
    """
    def keystream(self) -> Iterable[int]:
        # A key of exactly 64 bytes is taken to be a complete initial state; any other key
        # is combined with the nonce, magic, rounds and offset arguments.
        args = self.args
        key = args.key
        if len(key) != 64:
            cipher = ChaChaCipher(key, args.nonce, args.magic, args.rounds, args.offset)
        else:
            cipher = ChaChaCipher.FromState(key)
        yield from cipher

class chacha20 (key, nonce=b'REFINERY')

This unit is implemented in refinery.units.crypto.cipher.chacha and has the following commandline Interface:

usage: chacha20 [-h] [-L] [-Q] [-0] [-v] [-R] key [nonce]

ChaCha20 and XChaCha20 encryption and decryption. For ChaCha20, the IV (nonce) must be 8 or 12
bytes long; for XChaCha20, choose an IV which is 24 bytes long. Invoking this unit for ChaCha20
is functionally equivalent to chacha with 20 rounds, but this unit uses the PyCryptodome library
C implementation rather than the pure Python implementation used by chacha.

positional arguments:
  key            The encryption key.
  nonce          The nonce. Default is the string REFINERY.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class chacha20(LatinCipherStandardUnit, cipher=PyCryptoFactoryWrapper(ChaCha20)):
    """
    ChaCha20 and XChaCha20 encryption and decryption. For ChaCha20, the IV (nonce) must
    be 8 or 12 bytes long; for XChaCha20, choose an IV which is 24 bytes long. Invoking
    this unit for ChaCha20 is functionally equivalent to `refinery.chacha` with 20 rounds,
    but this unit uses the PyCryptodome library C implementation rather than the pure
    Python implementation used by `refinery.chacha`.
    """
    # This class defines no members of its own: the wrapped ChaCha20 factory is handed to the
    # base class through the `cipher` class keyword, and everything else is inherited.

class chacha20poly1305 (key, nonce=b'REFINERY')

This unit is implemented in refinery.units.crypto.cipher.chacha and has the following commandline Interface:

usage: chacha20poly1305 [-h] [-L] [-Q] [-0] [-v] [-R] key [nonce]

ChaCha20-Poly1305 and XChaCha20-Poly1305 encryption and decryption. For the ChaCha20 variant, the
nonce must be 8 or 12 bytes long; for XChaCha20, provide a 24 bytes nonce instead.

positional arguments:
  key            The encryption key.
  nonce          The nonce. Default is the string REFINERY.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class chacha20poly1305(LatinCipherStandardUnit, cipher=PyCryptoFactoryWrapper(ChaCha20_Poly1305)):
    """
    ChaCha20-Poly1305 and XChaCha20-Poly1305 encryption and decryption. For the ChaCha20
    variant, the nonce must be 8 or 12 bytes long; for XChaCha20, provide a 24 bytes nonce
    instead.
    """
    def _get_cipher(self, reset_cache=False):
        # The cipher object is obtained from the base class and its block size is then
        # overridden with 1 before it is returned.
        # NOTE(review): presumably this marks the cipher as a stream cipher for the
        # surrounding framework - confirm against the base class.
        instance = super()._get_cipher(reset_cache)
        instance.block_size = 1
        return instance

class chaskey (key, iv=b'', padding=None, mode=None, raw=False, rounds=12, swap=False, *, aad=b'', tag=(), segment_size=0, little_endian=False)

This unit is implemented in refinery.units.crypto.cipher.chaskey and has the following commandline Interface:

usage: chaskey [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-k N] [-s] [-e] [-S N]
               key

This implements a block cipher based on the Chaskey algorithm. No subkeys are computed and the
default Chaskey operation is performed on all blocks. Notably, the Donut framework uses Chaskey
with 16 rounds and in CTR mode.

positional arguments:
  key                   The encryption key.

options:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB,
                        OFB, PCBC. By default, the CBC mode is used when an IV is provided,
                        and ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -k, --rounds N        Number of rounds to use, the default is 12
  -s, --swap            Use big endian byte order for all blocks.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of segmentation bits. It must be a multiple of 8.
                        The default of 0 means that the block size will be used as the segment
                        size.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation.

Expand source code Browse git

class chaskey(StandardBlockCipherUnit, cipher=BlockCipherFactory(Chaskey)):
    """
    This implements a block cipher based on the Chaskey algorithm. No subkeys are computed and the
    default Chaskey operation is performed on all blocks. Notably, the Donut framework uses Chaskey
    with 16 rounds and in CTR mode.
    """
    def __init__(
        self,
        key,
        iv=b'',
        padding=None,
        mode=None,
        raw=False,
        rounds: Param[int, Arg.Number('-k', help='Number of rounds to use, the default is {default}')] = _R,
        swap: Param[bool, Arg.Switch('-s', help='Use big endian byte order for all blocks.')] = False,
        **more
    ):
        # The two Chaskey-specific options are forwarded next to the standard block cipher
        # arguments so that they become available through self.args.
        super().__init__(
            key,
            iv=iv,
            padding=padding,
            mode=mode,
            raw=raw,
            rounds=rounds,
            swap=swap,
            **more
        )

    def _new_cipher(self, **optionals) -> CipherInterface:
        # Extends the keyword arguments of the base implementation by the byte order switch
        # and the round count of this unit.
        args = self.args
        return super()._new_cipher(swap=args.swap, rounds=args.rounds, **optionals)

class chop (size, step=None, truncate=False)

This unit is implemented in refinery.units.meta.chop and has the following commandline Interface:

usage: chop [-h] [-L] [-Q] [-0] [-v] [-t] N [N]

Reinterprets the input as a sequence of equally sized chunks and outputs this sequence.

positional arguments:
  N               Chop data into chunks of this size
  N               Optionally specify a step size (which is equal to the size by default) which
                  indicates the number of bytes by which the cursor will be increased after
                  extracting a chunk.

options:
  -t, --truncate  Truncate possible excess bytes at the end of the input, by default they are
                  appended as a single chunk.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class chop(Unit):
    """
    Reinterprets the input as a sequence of equally sized chunks and outputs this sequence.
    """

    def __init__(
        self,
        size: Param[int, Arg.Number('size', help='Chop data into chunks of this size')],
        step: Param[int, Arg.Number('step', help=(
            'Optionally specify a step size (which is equal to the size by default) which indicates the number of bytes by '
            'which the cursor will be increased after extracting a chunk.'))] = None,
        truncate: Param[bool, Arg.Switch('-t', help=(
            'Truncate possible excess bytes at the end of the input, by default they are appended as a single chunk.'))] = False,
    ):
        super().__init__(size=size, step=step, truncate=truncate)

    def process(self, data):
        # The chunks are produced by splitchunks from a memory view of the input; only the
        # chunk size is validated here.
        view = memoryview(data)
        args = self.args
        if args.size < 1:
            raise ValueError('The chunk size has to be a positive integer value.')
        yield from splitchunks(view, args.size, args.step, args.truncate)

class clower

This unit is implemented in refinery.units.strings.clower and has the following commandline Interface:

usage: clower [-h] [-L] [-Q] [-0] [-v]

Stands for "Convert to LOWER case"; The unit simply converts all latin alphabet characters in the
input to lowercase.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class clower(Unit):
    """
    Stands for "Convert to LOWER case"; The unit simply converts all latin alphabet characters in the
    input to lowercase.
    """
    def process(self, data):
        # The conversion is delegated to the lower() method of the input buffer.
        return data.lower()

class cm (invert=False, all=False, reset=False, size=False, ext=False, entropy=False, ic=False, magic=False, sha1=False, sha256=False, crc32=False, md5=False, hashes=False, *names)

This unit is implemented in refinery.units.meta.cm and has the following commandline Interface:

usage: cm [-h] [-L] [-Q] [-0] [-v] [-x | -a] [-r] [-S] [-X] [-E] [-C] [-M] [-1] [-2] [-3] [-5]
          [-H]
          [name ...]

The Common Meta variables unit populates the set of meta variables of the current chunk with
commonly used metadata. The unit has no effect outside a frame.

positional arguments:
  name           A variable name that can include the common properties: mime, ext, magic, size,
                 entropy, ic, crc32, sha1, sha256, sha512, md5. If none is given, the size
                 variable is populated. For most of these, an optional argument is available that
                 can be used as a shorthand:

options:
  -x, --invert   populate only options that have not been specified
  -a, --all      populate all options
  -r, --reset    discard all meta variables that were not explicitly specified
  -S, --size     size of the chunk
  -X, --ext      guess file extension
  -E, --entropy  compute data entropy
  -C, --ic       compute the index of coincidence
  -M, --magic    compute file magic
  -1, --sha1     compute hash: SHA-1
  -2, --sha256   compute hash: SHA-256
  -3, --crc32    compute hash: CRC32
  -5, --md5      compute hash: MD5
  -H, --hashes   compute all common hashes

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class cm(Unit):
    """
    The Common Meta variables unit populates the set of meta variables of the current chunk with commonly
    used metadata. The unit has no effect outside a frame.
    """
    def __init__(
        self,
        invert: Param[bool, Arg.Switch('-x', group='ALL', help='populate only options that have not been specified')] = False,
        all: Param[bool, Arg.Switch('-a', group='ALL', help='populate all options')] = False,
        reset: Param[bool, Arg.Switch('-r', help='discard all meta variables that were not explicitly specified')] = False,
        size: Param[bool, Arg.Switch('-S', help='size of the chunk')] = False,
        ext: Param[bool, Arg.Switch('-X', help='guess file extension')] = False,
        entropy: Param[bool, Arg.Switch('-E', help='compute data entropy')] = False,
        ic: Param[bool, Arg.Switch('-C', help='compute the index of coincidence')] = False,
        magic: Param[bool, Arg.Switch('-M', help='compute file magic')] = False,
        sha1: Param[bool, Arg.Switch('-1', help='compute hash: SHA-1')] = False,
        sha256: Param[bool, Arg.Switch('-2', help='compute hash: SHA-256')] = False,
        crc32: Param[bool, Arg.Switch('-3', help='compute hash: CRC32')] = False,
        md5: Param[bool, Arg.Switch('-5', help='compute hash: MD5')] = False,
        hashes: Param[bool, Arg.Switch('-H', help='compute all common hashes')] = False,
        *names: Param[str, Arg.String(metavar='name', help=(
            F'A variable name that can include the common properties: {_COMMON_PROPERTIES_LIST}.'
            R' If none is given, the size variable is populated. For most of these, an optional '
            R'argument is available that can be used as a shorthand:'))]
    ):
        def as_text(item):
            # Variable names are accepted as text or as binary strings.
            if isinstance(item, str):
                return item
            if isinstance(item, (bytes, bytearray)):
                return item.decode(self.codec)
            raise TypeError(F'Invalid type for name: {item!r}')

        selected = set(map(as_text, names))

        # The umbrella switch for hashes enables each of the individual hash switches.
        if hashes:
            md5 = sha256 = sha1 = crc32 = True

        # Each shorthand switch contributes the variable name listed next to it.
        switches = (
            ('size', size),
            ('ext', ext),
            ('entropy', entropy),
            ('ic', ic),
            ('magic', magic),
            ('sha1', sha1),
            ('sha256', sha256),
            ('crc32', crc32),
            ('md5', md5),
        )
        selected.update(label for label, enabled in switches if enabled)

        # Without any selection, the size is populated unless a reset was requested.
        if not (selected or reset):
            selected.add('size')

        if all and invert:
            raise ValueError('invert and all are both enabled, resulting in empty configuration.')
        if all:
            selected = set(LazyMetaOracle.derivations)
        elif invert:
            selected = set(LazyMetaOracle.derivations) - selected

        super().__init__(names=list(selected), reset=reset)

    def process(self, data):
        # The chunk data itself is passed through unchanged; all work happens in filter.
        return data

    def filter(self, chunks):
        wanted = self.args.names
        discard = self.args.reset
        for chunk in chunks:
            if chunk.visible:
                # The metavars object is obtained before the optional reset clears the
                # meta variables of the chunk.
                source = metavars(chunk)
                if discard:
                    chunk.meta.clear()
                for key in wanted:
                    chunk[key] = source[key]
            yield chunk

class codebook (words)

This unit is implemented in refinery.units.crypto.cipher.codebook and has the following commandline Interface:

usage: codebook [-h] [-L] [-Q] [-0] [-v] [-R] words

Given a sequence of words (as a msgpack-encoded list of binary strings) the unit converts the
occurrence of any of these words by a byte value representing the word's index in the sequence.
The first word from the sequence that matches at a given offset will be used to determine this
value. Any substrings that cannot be matched to a word in the sequence are skipped, assuming that
they are separators.

positional arguments:
  words          A list of binary strings in msgpack format.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class codebook(Unit):
    """
    Given a sequence of words (as a msgpack-encoded list of binary strings) the unit converts the
    occurrence of any of these words by a byte value representing the word's index in the sequence.
    The first word from the sequence that matches at a given offset will be used to determine this
    value. Any substrings that cannot be matched to a word in the sequence are skipped, assuming
    that they are separators.
    """
    def __init__(
        self,
        words: Param[buf, Arg.Binary(help='A list of binary strings in msgpack format.')],
    ):
        super().__init__(words=words)

    def _book(self) -> list[bytes]:
        """
        Decode and validate the code book argument. Raises a ValueError when the argument is
        not a msgpack-encoded list of byte strings and a NotImplementedError when the list
        has more than 256 entries.
        """
        try:
            book = msgpack.loads(self.args.words)
        except Exception as error:
            raise ValueError(R'The given words are not a valid msgpack buffer.') from error
        if not isinstance(book, list):
            raise ValueError(F'The given words are not a list, but a {type(book).__name__}.')
        if not all(isinstance(v, bytes) for v in book):
            raise ValueError(R'The given words are not all byte strings.')
        if len(book) > 256:
            raise NotImplementedError(
                R'Only code books up to 256 entries in size are currently supported.')
        return book

    def process(self, data: bytearray):
        """
        Replace every code word found in the input by the index of that word in the book and
        drop everything else.
        """
        book = self._book()
        if not book:
            # An empty book would compile to an empty pattern whose empty matches cannot be
            # looked up; with no words, the entire input consists of separators.
            return bytearray()
        # When a word is listed more than once, its first position in the book must win,
        # which is why later duplicates do not overwrite an existing entry.
        lookup = {}
        for code, word in enumerate(book):
            lookup.setdefault(word, code)
        # Regex alternation tries the alternatives from left to right, so book order decides
        # which word is used when several of them match at the same offset.
        decode = re.compile(B'|'.join(re.escape(word) for word in book))
        return bytearray(lookup[x] for x in decode.findall(data))

    def reverse(self, data: bytearray):
        """
        Replace every input byte by the code word stored at that index of the book.
        """
        words = self._book()
        return B''.join(words[index] for index in data)

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data: bytearray):
    """
    Replace every input byte by the code word stored at that index of the book.
    """
    words = self._book()
    return B''.join(words[index] for index in data)

class cp1252

This unit is implemented in refinery.units.encoding.cp1252 and has the following commandline Interface:

usage: cp1252 [-h] [-L] [-Q] [-0] [-v] [-R]

Encodes and decodes Windows CP 1252 (aka Latin1) encoded string data.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class cp1252(Unit):
    """
    Encodes and decodes Windows CP 1252 (aka Latin1) encoded string data.
    """

    def process(self, data):
        # Input is read with the codec of the unit and written as CP 1252.
        text = data.decode(self.codec)
        return text.encode('cp1252')

    def reverse(self, data):
        # Input is read as CP 1252 and written with the codec of the unit.
        text = data.decode('cp1252')
        return text.encode(self.codec)

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    # Input is read as CP 1252 and written with the codec of the unit.
    text = data.decode('cp1252')
    return text.encode(self.codec)

class crc32 (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.checksums and has the following commandline Interface:

usage: crc32 [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the CRC32 hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class crc32(HashUnit):
    """
    Returns the CRC32 hash of the input data.
    """
    def _algorithm(self, data) -> bytes:
        # The checksum integer is serialized as four bytes in big endian byte order.
        checksum = zlib.crc32(data)
        return checksum.to_bytes(4, 'big')

class csb (format, utf16=True, ascii=True, stripspace=False)

This unit is implemented in refinery.units.pattern.carve and has the following commandline Interface:

usage: csb [-h] [-L] [-Q] [-0] [-v] [-a | -u] [-x] format

Short for carve single buffer; carves the single largest buffer of a given format from the input
data and returns it.

positional arguments:
  format            Specify one of the following formats: integer, float, number, string,
                    multiline-string, cmdstr, ps1str, vbastr, vbaint, printable, urlquote,
                    urlquote-coarse, urlquote-narrow, intarray, strarray, numarray, word,
                    letters, wshenc, alphanumeric, b32, b58, b62, b64, b85, a85, z85, b92,
                    b64url, hex, b16, b16s, b64s, b85s, a85s, z85s, utf8, hexdump, hexarray,
                    uuencode

options:
  -a, --no-utf16    Search for ASCII encoded patterns only.
  -u, --no-ascii    Search for UTF16 encoded patterns only.
  -x, --stripspace  Strip all whitespace from input data.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class csb(carve):
    """
    Short for carve single buffer; carves the single largest buffer of a given format from the
    input data and returns it.
    """
    def __init__(self, format, utf16=True, ascii=True, stripspace=False):
        # Fixed configuration of the parent unit: a single result, no decoding.
        options = dict(
            decode=False,
            single=True,
            utf16=utf16,
            ascii=ascii,
            stripspace=stripspace,
        )
        super().__init__(format, **options)

class csd (format, utf16=True, ascii=True, stripspace=False)

This unit is implemented in refinery.units.pattern.carve and has the following commandline Interface:

usage: csd [-h] [-L] [-Q] [-0] [-v] [-a | -u] [-x] format

Short for carve & decode; carves the single largest buffer of a given format from the input and
decodes it with the appropriate decoder.

positional arguments:
  format            Specify one of the following formats: integer, float, number, string,
                    multiline-string, cmdstr, ps1str, vbastr, vbaint, printable, urlquote,
                    urlquote-coarse, urlquote-narrow, intarray, strarray, numarray, word,
                    letters, wshenc, alphanumeric, b32, b58, b62, b64, b85, a85, z85, b92,
                    b64url, hex, b16, b16s, b64s, b85s, a85s, z85s, utf8, hexdump, hexarray,
                    uuencode

options:
  -a, --no-utf16    Search for ASCII encoded patterns only.
  -u, --no-ascii    Search for UTF16 encoded patterns only.
  -x, --stripspace  Strip all whitespace from input data.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class csd(carve):
    """
    Short for carve & decode; carves the single largest buffer of a given format from the input
    and decodes it with the appropriate decoder.
    """
    def __init__(self, format, utf16=True, ascii=True, stripspace=False):
        # Fixed configuration of the parent unit: a single result, with decoding.
        options = dict(
            decode=True,
            single=True,
            utf16=utf16,
            ascii=ascii,
            stripspace=stripspace,
        )
        super().__init__(format, **options)

class csv (quote=b'"', delim=b',')

This unit is implemented in refinery.units.formats.csv and has the following commandline Interface:

usage: csv [-h] [-L] [-Q] [-0] [-v] [-R] [-q QUOTE] [-d DELIM]

Extracts the rows of a CSV document with header and converts them into JSON chunks.

options:
  -q, --quote QUOTE  Specify the quote character, the default is a double quote.
  -d, --delim DELIM  Specify the delimiter, the default is a single comma.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.
  -R, --reverse      Use the reverse operation.

Expand source code Browse git

class csv(Unit):
    """
    Extracts the rows of a CSV document with header and converts them into JSON chunks.
    """
    def __init__(
        self,
        quote: Param[buf, Arg('-q', help='Specify the quote character, the default is a double quote.')] = B'"',
        delim: Param[buf, Arg('-d', help='Specify the delimiter, the default is a single comma.')] = B','
    ):
        super().__init__(quote=quote, delim=delim)

    def json_to_csv(self, table: list):
        """
        Serialize a decoded JSON list as CSV and return the contents of the output buffer.

        If every entry of the list is itself a list of strings, the entries are written as rows
        verbatim. Otherwise the entries are treated as JSON objects: the header row consists of
        all string keys in order of first appearance, and each object contributes one row where
        values are converted with `str` and missing keys produce empty fields.

        Raises a `ValueError` when the input is not a list.
        """
        quote = self.args.quote.decode(self.codec)
        delim = self.args.delim.decode(self.codec)

        if not isinstance(table, list):
            raise ValueError('Input must be a JSON list.')

        out = MemoryFile()

        with io.TextIOWrapper(out, self.codec, newline='') as stream:
            writer = _csv.writer(stream, quotechar=quote, delimiter=delim, skipinitialspace=True)
            for row in table:
                if not isinstance(row, list):
                    break
                if not all(isinstance(item, str) for item in row):
                    break
                writer.writerow(row)
            else:
                # The text wrapper buffers encoded output internally. Flush it explicitly so
                # that the result does not depend on getvalue() returning a live view of the
                # underlying buffer that is filled when the wrapper is closed.
                stream.flush()
                return out.getvalue()

        keys = {}
        # A dictionary is used here over a set because dictionaries remember insertion order.
        # When feeding the unit a sequence of JSON objects, the user would likely expect the
        # column order in the resulting CSV to derive from the entry order in the JSON data.

        for row in table:
            for key in row:
                if not isinstance(key, str):
                    continue
                keys[key] = None

        keys = list(keys)
        # Start over with a fresh buffer; the first attempt may have written some rows already.
        out = MemoryFile()

        with io.TextIOWrapper(out, self.codec, newline='') as stream:
            writer = _csv.writer(stream, quotechar=quote, delimiter=delim, skipinitialspace=True)
            writer.writerow(keys)
            for row in table:
                writer.writerow([str(row.get(key, '')) for key in keys])
            # See above: make sure all pending output has reached the buffer before reading it.
            stream.flush()
            return out.getvalue()

    def reverse(self, data: bytearray):
        """
        Convert JSON input to CSV. The input is first parsed as a single JSON document; if that
        fails, every non-blank line is parsed as a separate JSON document (JSON lines).
        """
        try:
            table: list[dict[str, Any]] = json.loads(data)
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError both derive from ValueError. Blank
            # lines are skipped because they are not valid JSON documents on their own.
            table = [json.loads(line) for line in data.splitlines() if line.strip()]
        return self.json_to_csv(table)

    def process(self, data):
        """
        Read the input as CSV, use the first row as the header, and yield one JSON object per
        subsequent row. Empty input yields nothing.
        """
        quote = self.args.quote.decode(self.codec)
        delim = self.args.delim.decode(self.codec)

        def convert(field: str):
            # Only decimal digits are accepted by int(); str.isdigit would also admit characters
            # such as superscript digits, which make int() raise. Values with a leading zero are
            # kept as strings.
            if field.isdecimal() and not field.startswith('0'):
                return int(field)
            date = isodate(field)
            if date is not None:
                return date.isoformat(' ', 'seconds')
            return field

        with io.TextIOWrapper(MemoryFile(data), self.codec) as stream:
            rows = _csv.reader(stream, quotechar=quote, delimiter=delim, skipinitialspace=True)
            # A bare next() would raise StopIteration on empty input, which surfaces as a
            # RuntimeError inside a generator.
            keys = next(rows, None)
            if keys is None:
                return
            for row in rows:
                out = {key: convert(value) for key, value in zip(keys, row)}
                yield json.dumps(out, indent=4).encode(self.codec)

Methods

def json_to_csv(self, table)

Expand source code Browse git

def json_to_csv(self, table: list):
    """
    Serialize a decoded JSON list as CSV and return the contents of the output buffer.

    If every entry of the list is itself a list of strings, the entries are written as rows
    verbatim. Otherwise the entries are treated as JSON objects: the header row consists of
    all string keys in order of first appearance, and each object contributes one row where
    values are converted with `str` and missing keys produce empty fields.

    Raises a `ValueError` when the input is not a list.
    """
    quote = self.args.quote.decode(self.codec)
    delim = self.args.delim.decode(self.codec)

    if not isinstance(table, list):
        raise ValueError('Input must be a JSON list.')

    out = MemoryFile()

    with io.TextIOWrapper(out, self.codec, newline='') as stream:
        writer = _csv.writer(stream, quotechar=quote, delimiter=delim, skipinitialspace=True)
        for row in table:
            if not isinstance(row, list):
                break
            if not all(isinstance(item, str) for item in row):
                break
            writer.writerow(row)
        else:
            # The text wrapper buffers encoded output internally. Flush it explicitly so that
            # the result does not depend on getvalue() returning a live view of the underlying
            # buffer that is filled when the wrapper is closed.
            stream.flush()
            return out.getvalue()

    keys = {}
    # A dictionary is used here over a set because dictionaries remember insertion order.
    # When feeding the unit a sequence of JSON objects, the user would likely expect the
    # column order in the resulting CSV to derive from the entry order in the JSON data.

    for row in table:
        for key in row:
            if not isinstance(key, str):
                continue
            keys[key] = None

    keys = list(keys)
    # Start over with a fresh buffer; the first attempt may have written some rows already.
    out = MemoryFile()

    with io.TextIOWrapper(out, self.codec, newline='') as stream:
        writer = _csv.writer(stream, quotechar=quote, delimiter=delim, skipinitialspace=True)
        writer.writerow(keys)
        for row in table:
            writer.writerow([str(row.get(key, '')) for key in keys])
        # See above: make sure all pending output has reached the buffer before reading it.
        stream.flush()
        return out.getvalue()

def reverse(self, data)

Expand source code Browse git

def reverse(self, data: bytearray):
    """
    Convert JSON input to CSV. The input is first parsed as a single JSON document; if that
    fails, every non-blank line is parsed as a separate JSON document (JSON lines).
    """
    try:
        table: list[dict[str, Any]] = json.loads(data)
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError both derive from ValueError. Blank lines
        # are skipped because they are not valid JSON documents on their own.
        table = [json.loads(line) for line in data.splitlines() if line.strip()]
    return self.json_to_csv(table)

class cswap

This unit is implemented in refinery.units.strings.cswap and has the following commandline Interface:

usage: cswap [-h] [-L] [-Q] [-0] [-v]

Swap the case of the input string; all lowercase letters are turned into their uppercase variant
and vice-versa.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class cswap(Unit):
    """
    Swap the case of the input string; all lowercase letters are turned into their uppercase
    variant and vice-versa.
    """
    def process(self, data: bytearray):
        # bytearray.swapcase exchanges the case of ASCII letters only and leaves every other
        # byte untouched, which is what the unit is meant to do. The slice assignment keeps
        # the in-place update of the input buffer, which is also the object that is returned.
        data[:] = data.swapcase()
        return data

class cupper

This unit is implemented in refinery.units.strings.cupper and has the following commandline Interface:

usage: cupper [-h] [-L] [-Q] [-0] [-v]

Stands for "Convert to UPPER case"; The unit simply converts all latin alphabet characters in the
input to uppercase.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class cupper(Unit):
    """
    Stands for "Convert to UPPER case"; The unit simply converts all latin alphabet characters in the
    input to uppercase.
    """
    def process(self, data):
        # The conversion is delegated entirely to the upper() method of the input buffer.
        uppercased = data.upper()
        return uppercased

class d2p (tee=False, stream=False, plain=False, force=False)

This unit is implemented in refinery.units.sinks.dump and has the following commandline Interface:

usage: d2p [-h] [-L] [-Q] [-0] [-v] [-t] [-s] [-p] [-f]

Stands for "dump to path"; this is a shortcut for the dump unit which is equivalent to running:

    dump {path}

This will dump all chunks in the current frame to the path given by the path meta variable, which
is commonly set by units like xt.

options:
  -t, --tee      Forward all inputs to STDOUT.
  -s, --stream   Dump all incoming data to the same file.
  -p, --plain    Never apply any formatting to file names.
  -f, --force    Remove files if necessary to create dump path.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class d2p(dump):
    """
    Stands for "dump to path"; this is a shortcut for the `refinery.dump` unit which is equivalent
    to running:

        dump {path}

    This will dump all chunks in the current frame to the path given by the `path` meta variable,
    which is commonly set by units like `refinery.xt`.
    """
    def __init__(self, tee=False, stream=False, plain=False, force=False):
        # The only positional argument handed to the parent unit is the fixed path template;
        # every switch is passed through exactly as it was received.
        switches = dict(tee=tee, stream=stream, plain=plain, force=force)
        super().__init__('{path}', **switches)

class datefix (format='%Y-%m-%d %H:%M:%S', dos=False)

This unit is implemented in refinery.units.misc.datefix and has the following commandline Interface:

usage: datefix [-h] [-L] [-Q] [-0] [-v] [-d] [format]

Parses all kinds of date formats and unifies them into the same format.

positional arguments:
  format         Specify the output format as a strftime-like string, using ISO by default.

options:
  -d, --dos      Parse timestamps in DOS rather than Unix format.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class datefix(Unit):
    """
    Parses all kinds of date formats and unifies them into the same format.
    """

    def __init__(
        self,
        format: Param[str, Arg(help='Specify the output format as a strftime-like string, using ISO by default.')] = '%Y-%m-%d %H:%M:%S',
        dos: Param[bool, Arg('-d', help='Parse timestamps in DOS rather than Unix format.')] = False
    ):
        super().__init__(format=format, dos=dos)

    @staticmethod
    def dostime(stamp: int) -> datetime:
        """
        Parses a given DOS timestamp into a datetime object.
        """
        # The upper 16 bits hold the date, the lower 16 bits hold the time.
        d, t = stamp >> 16, stamp & 0xFFFF
        # Seconds are stored in units of two in a 5-bit field, so the decoded value ranges from
        # 0 to 62 even though only values up to 58 denote a valid second.
        s = (t & 0x1F) << 1

        return datetime(
            year   = ((d & 0xFE00) >> 0x9) + 1980,  # noqa
            month  = ((d & 0x01E0) >> 0x5),         # noqa
            day    = ((d & 0x001F) >> 0x0),         # noqa
            hour   = ((t & 0xF800) >> 0xB),         # noqa
            minute = ((t & 0x07E0) >> 0x5),         # noqa
            # Clamp both out-of-range values (60 and 62) instead of letting datetime raise.
            second = min(s, 59),                    # noqa
        )

    def _format(self, dt: datetime) -> str:
        """
        Render the given datetime using the output format configured for this unit.
        """
        return dt.strftime(self.args.format)

    def _extract_timezone(self, data: str):
        """
        Remove the timezone offset matched by `_TIMEZONE_PATTERN` from the input string. Returns
        the remaining string with whitespace collapsed, together with the offset as a timedelta,
        or `None` if there was no match. Raises a `ValueError` if the pattern matches more than
        once.
        """
        def extract(match: re.Match[str]):
            nonlocal zone
            if zone is not None:
                # A second match means the input is ambiguous.
                raise ValueError
            # Missing hour or minute groups count as zero.
            h = int(h) if (h := match['h']) else 0
            m = int(m) if (m := match['m']) else 0
            zone = timedelta(hours=h, minutes=m)
            if match['p'] == '-':
                zone = -zone
            return ''
        zone = None
        data = re.sub(_TIMEZONE_PATTERN, extract, data)
        data = re.sub('\\s{2,}', ' ', data).strip()
        return data, zone

    @linewise
    def process(self, data: str) -> str:
        data = data.strip()

        # replace colons (i.e. for exiftool dates: 2017:01:01)
        if len(data) > 10 and data[4] == ':' and data[7] == ':':
            data = F'{data[0:4]}-{data[5:7]}-{data[8:]}'

        # strips Z at end (i.e. 20171022055144Z)
        if data.endswith('Z'):
            data = data[:-1]

        if data.startswith('0x'):
            # Hexadecimal input is normalized to decimal; anything that does not parse is left
            # alone and handled by the format patterns further down.
            try:
                data = str(int(data, 16))
            except ValueError:
                pass

        # parses timestamps and dates without much format
        if data.isdigit():
            time_stamp = int(data)
            if len(data) > 14:
                raise Exception('cannot parse all-numeric string as date: %s' % data)
            elif len(data) == 14:
                # i.e. 20111020193727
                return self._format(datetime.strptime(data, '%Y%m%d%H%M%S'))
            elif len(data) == 13:
                # i.e. 1458016535000; reduce the value to a resolution of seconds
                time_stamp //= 1000
            if self.args.dos:
                return self._format(self.dostime(time_stamp))
            else:
                return self._format(date_from_timestamp(time_stamp))

        try:
            data, time_delta = self._extract_timezone(data)
        except ValueError:
            # More than one timezone offset was found; return the input line as it is.
            return data

        # The first pattern that parses the string wins.
        for f in _DATETIME_PATTERNS:
            try:
                dt = datetime.strptime(data, f)
            except ValueError:
                continue
            # The extracted offset is subtracted from the parsed time.
            return self._format(dt if time_delta is None else dt - time_delta)

        return data

Static methods

def dostime(stamp)

Parses a given DOS timestamp into a datetime object.

Expand source code Browse git

@staticmethod
def dostime(stamp: int) -> datetime:
    """
    Parses a given DOS timestamp into a datetime object.
    """
    # The upper 16 bits hold the date, the lower 16 bits hold the time.
    d, t = stamp >> 16, stamp & 0xFFFF
    # Seconds are stored in units of two in a 5-bit field, so the decoded value ranges from 0 to
    # 62 even though only values up to 58 denote a valid second.
    s = (t & 0x1F) << 1

    return datetime(
        year   = ((d & 0xFE00) >> 0x9) + 1980,  # noqa
        month  = ((d & 0x01E0) >> 0x5),         # noqa
        day    = ((d & 0x001F) >> 0x0),         # noqa
        hour   = ((t & 0xF800) >> 0xB),         # noqa
        minute = ((t & 0x07E0) >> 0x5),         # noqa
        # Clamp both out-of-range values (60 and 62) instead of letting datetime raise.
        second = min(s, 59),                    # noqa
    )

class decompress (prepend=True, tolerance=12, max_ratio=1.0, min_ratio=0.0001, expand_limits=range(0, 257), expand_factor=1.75, strict_limits=False)

This unit is implemented in refinery.units.compression.decompress and has the following commandline Interface:

usage: decompress [-h] [-L] [-Q] [-0] [-v] [-P] [-t N] [-m R] [-n R] [-d a:b] [-k EXPAND_FACTOR]
                  [-l]

Attempts all available decompression units against the input and returns the output of the first
successful one. If none succeeds, the data is returned unaltered. The process is heavily biased
against LZNT1 decompression due to a large tendency for LZNT1 false positives.

options:
  -P, --no-prepend            By default, if decompression fails, the unit attempts to prefix the
                              data with all possible values of a single byte and decompress the
                              result. This behavior can be disabled with this flag.
  -t, --tolerance N           Maximum number of bytes to strip from the beginning of the data;
                              The default value is 12.
  -m, --max-ratio R           To determine whether a decompression algorithm was successful, the
                              ratio of compressed size to decompressed size may at most be as
                              large as this number, a floating point value R; default value is
                              1.0.
  -n, --min-ratio R           Require that compression ratios must be at least as large as R.
                              This is a "too good to be true" heuristic against algorithms like
                              lznt1 that can produce false positives. The default is 0.0001.
  -d, --expand-limits a:b     Ratio limits are expanded for sizes of input data in the given
                              range, the default being 0:0x100. The reason for this is that small
                              buffers can increase in size when compressed under many formats.
                              Set this to :0 or use strict limits to disable this setting.
  -k, --expand-factor EXPAND_FACTOR
                              The number by which the maximum compression ratio is multiplied for
                              small buffers. The default is 1.75.
  -l, --strict-limits         For recognized formats i.e. when a magic signature is present, the
                              above limits are disabled by default. Activate this flag to enforce
                              them in every case.

generic options:
  -h, --help                  Show this help message and exit.
  -L, --lenient               Increase the leniency, allowing partial results and ignoring more
                              errors.
  -Q, --quiet                 Disables all log output.
  -0, --devnull               Do not produce any output.
  -v, --verbose               Specify up to two times to increase log level.

Expand source code Browse git

class decompress(Unit):
    """
    Attempts all available decompression units against the input and returns
    the output of the first successful one. If none succeeds, the data is
    returned unaltered. The process is heavily biased against LZNT1 decompression
    due to a large tendency for LZNT1 false positives.
    """
    def __init__(
        self,
        prepend: Param[bool, Arg.Switch('-P', '--no-prepend', off=True, help=(
            'By default, if decompression fails, the unit attempts to prefix '
            'the data with all possible values of a single byte and decompress '
            'the result. This behavior can be disabled with this flag.')
        )] = True,
        tolerance: Param[int, Arg.Number('-t', help=(
            'Maximum number of bytes to strip from the beginning of the data; '
            'The default value is 12.')
        )] = 12,
        max_ratio: Param[float, Arg.Double('-m', metavar='R', help=(
            'To determine whether a decompression algorithm was successful, the '
            'ratio of compressed size to decompressed size may at most be as large '
            'as this number, a floating point value R; default value is {default}.')
        )] = 1.0,
        min_ratio: Param[float, Arg.Double('-n', metavar='R', help=(
            'Require that compression ratios must be at least as large as R. This '
            'is a "too good to be true" heuristic against algorithms like lznt1 '
            'that can produce false positives. The default is {default}.')
        )] = 0.0001,
        expand_limits: Param[slice, Arg.Bounds('-d', metavar='a:b', help=(
            'Ratio limits are expanded for sizes of input data in the given range, '
            'the default being 0:0x100. The reason for this is that small buffers '
            'can increase in size when compressed under many formats. Set this to :0 '
            'or use strict limits to disable this setting.')
        )] = range(0, 0x101),
        expand_factor: Param[float, Arg.Double('-k', help=(
            'The number by which the maximum compression ratio is multiplied for '
            'small buffers. The default is {default}.'
        ))] = 1.75,
        strict_limits: Param[bool, Arg.Switch('-l', help=(
            'For recognized formats i.e. when a magic signature is present, the '
            'above limits are disabled by default. Activate this flag to enforce '
            'them in every case.')
        )] = False

    ):
        # Zero is rejected as well, hence the value has to be strictly positive.
        if min_ratio <= 0:
            raise ValueError('The compression factor must be positive.')
        super().__init__(
            tolerance=tolerance,
            prepend=prepend,
            min_ratio=min_ratio,
            max_ratio=max_ratio,
            strict_limits=strict_limits,
            expand_limits=expand_limits,
            expand_factor=expand_factor,
        )
        # Maps a display name to the unit instance that is used for the decompression attempt.
        # The insertion order of this dictionary is the order in which engines are attempted.
        self.engines: dict[str, Unit] = {}
        for mode in (
            MSCF_MODE.XPRESS,
            MSCF_MODE.XPRESS_HUFF,
        ):
            mode = normalize_to_display(mode.name).casefold()
            unit = mscf.assemble(mode)
            self.engines[F'{unit.name}[{mode}]'] = unit
        for engine in [
            mscf,
            pkw,
            zstd,
            szdd,
            bz2,
            zl,
            lzf,
            flz,
            lzma,
            lzw,
            jcalg,
            lzo,
            aplib,
            qlz,
            brotli,
            blz,
            lzjb,
            lz4,
            lznt1,
            nrv2e,
            nrv2d,
            nrv2b,
        ]:
            unit: Unit = engine.assemble()
            _, _, name = unit.name.rpartition('auto-decompress-')
            self.engines[name] = unit
        for unit in self.engines.values():
            unit.log_detach()

    def process(self, data):
        """
        Run every engine against the input, stripping between zero and `tolerance` leading bytes
        and optionally prepending each possible single byte. The best candidate per rating is
        recorded and the highest-rated one that produced output is returned, labelled with the
        name of the method. Raises a `ValueError` if no candidate qualifies.
        """

        data = memoryview(data)
        tiny = bounds[self.args.expand_limits]

        class Decompression(NamedTuple):
            # One decompression attempt: the engine used, how the input was modified before it
            # was passed to the engine (cutoff and prefix), and the outcome.
            method: str
            engine: Unit
            rating: _R
            result: buf | None = None
            cutoff: int = 0
            prefix: int | None = None
            magic: str | None = None

            def __str__(self):
                status = self.rating.summary
                method = self.method
                prefix = self.prefix
                if prefix is not None:
                    prefix = F'{_COLOR_WARNING}0x{prefix:02X}{_CR}'
                if cutoff := self.cutoff:
                    cutoff = F'{_COLOR_WARNING}0x{cutoff:02X}{_CR}'
                else:
                    cutoff = R'0x00'
                return F'prefix={prefix}, cutoff={cutoff}, [{status}] method={method}'

            def __len__(self):
                # The length, and thereby the truth value, of an attempt is the size of its
                # output; an attempt without output is falsy.
                if not self.result:
                    return 0
                return len(self.result)

            @property
            def ratio(self):
                # Size of the data that was fed to the engine divided by the size of the output.
                if not self.result:
                    return INF
                # A prepended byte adds one byte to the engine input regardless of its value;
                # the test must therefore not treat a prefix of 0x00 like a missing prefix.
                return (len(data) + int(self.prefix is not None) - self.cutoff) / len(self)

            @property
            def unmodified(self):
                return self.prefix is None and self.cutoff == 0

        if self.args.prepend:
            # Scratch buffer with one spare leading byte which receives the prefix candidates.
            buffer = bytearray(1 + len(data))
            buffer[1:] = data

        best_by_rating: dict[_R, Decompression] = {}

        def best_current_rating():
            return max(best_by_rating, default=_R.InvalidData)

        def decompress(method: str, engine: Unit, cutoff: int = 0, prefix: int | None = None, careful: bool = False):
            # Perform a single attempt and return it together with its rating.
            ingest = data[cutoff:]
            rating = _R.ValidData
            magic = None
            if cutoff == 0 and prefix is None and not careful:
                rating |= _R.NotMangled
            if prefix is not None:
                buffer[0] = prefix
                ingest = buffer
            is_handled = engine.handles(ingest)
            # The result is compared by identity: only an explicit True or False changes the
            # rating or aborts the attempt; any other value leaves the decision to the engine.
            if is_handled is True:
                rating |= _R.KnownFormat
            if is_handled is False:
                return Decompression(method, engine, _R.InvalidData, None, cutoff, prefix)
            try:
                result = next(engine.act(ingest))
            except RefineryPartialResult as pr:
                rating |= _R.HadOutput
                result = pr.partial
            except Exception:
                # Any other failure is a deliberate best-effort miss: the attempt simply has
                # no result and is discarded by the caller.
                result = None
            else:
                rating |= _R.Successful
                magic = get_structured_data_type(result)
                if magic is not None:
                    magic = magic.mnemonic
                    rating |= _R.KnownFormatOut

            return Decompression(method, engine, rating, result, cutoff, prefix, magic)

        def update(new: Decompression, discard_if_too_good=False):
            # Decide whether the new attempt replaces the recorded best for its rating.
            if not new.result:
                return
            ratio = new.ratio
            known = new.rating & _R.KnownFormat
            strict = self.args.strict_limits
            max_ratio = self.args.max_ratio
            min_ratio = self.args.min_ratio
            if not strict and len(data) in tiny:
                max_ratio *= self.args.expand_factor
                min_ratio /= self.args.expand_factor
            if (strict or not known) and not (min_ratio <= ratio <= max_ratio):
                return
            best = best_by_rating.get(new.rating, None)
            # An attempt on the unmodified input replaces a modified one even when both
            # outputs are of equal size; otherwise the new output has to be noticeably larger.
            if new.unmodified and best and not best.unmodified:
                threshold = 1.00
            else:
                threshold = 0.95

            if not best:
                q = 0
            elif (q := len(best) / len(new)) > 1:
                # This is unexpected, but indicates that we may have produced incorrect output
                # before: What seems to work best is to force a reset at this point, although
                # it seems like there should be a better solution than this.
                q = -1
                assert best.result
                vb = memoryview(best.result)
                vn = memoryview(new.result)
                # This looks like we have skipped part of the compressed stream; At this point
                # we can abort and not force an update.
                if new.cutoff and vb[-len(vn):] == vn:
                    return

            if q < threshold:
                if best and discard_if_too_good:
                    if q < 0.5:
                        return
                    if new.rating & _R.Successful != _R.Successful:
                        return
                best_by_rating[new.rating] = new
                _color = _COLOR_SUCCESS
            else:
                _color = _COLOR_FAILURE
            if ratio >= 9:
                rs = 'USELESS'
                rc = _COLOR_FAILURE
            else:
                rs = F'{ratio * 100:6.2f}%'
                if ratio >= 1.1:
                    rc = _COLOR_FAILURE
                elif ratio >= 1.0:
                    rc = _COLOR_WARNING
                else:
                    rc = _COLOR_SUCCESS
            if q < 0:
                qs = 'RESTART'
            else:
                qs = F'{q:07.4f}'
            self.log_info(lambda: (
                F'[{new.rating.brief}] [{rc}{rs}{_CR}] [q={_color}{qs}{_CR}] {new!s}'))

        for method, engine in self.engines.items():
            self.log_debug(F'attempting engine: {method}')
            careful = isinstance(engine, (lznt1, flz, lzjb))
            for t in range(self.args.tolerance + 1):
                # For these engines, stripping bytes is no longer attempted as soon as any
                # candidate has reached a rating of at least Successful.
                if best_current_rating() >= _R.Successful and careful and t > 0:
                    break
                update(decompress(method, engine, t, None, careful), careful)
            if self.args.prepend and method not in _NO_PREFIX and best_current_rating() < _R.Successful:
                for p in range(0x100):
                    update(decompress(method, engine, 0, p, careful), careful)

        for r, u in best_by_rating.items():
            self.log_debug(r, u.method)

        for r in sorted(best_by_rating, reverse=True):
            if dc := best_by_rating[r]:
                if not dc.rating & _R.HadOutput:
                    continue
                self.log_info(F'settling on {dc.method} decompression, cutoff={dc.cutoff} and prefix={dc.prefix}.')
                if dc.rating & _R.NotMangled:
                    self.log_info('supporting evidence: no modifications to the buffer were necessary')
                if dc.rating & _R.KnownFormat:
                    self.log_info('supporting evidence: found a known magic signature')
                if dc.rating & _R.HadNoErrors:
                    self.log_info('supporting evidence: engine produced output without errors')
                elif dc.rating & _R.HadOutput:
                    self.log_info('supporting evidence: there were errors, but the engine produced output')
                if not dc.rating & _R.Successful:
                    self.log_info('the only decompression with result returned only a partial result.')
                if dc.rating & _R.KnownFormatOut and (magic := dc.magic):
                    self.log_info(F'the decompressed result had a known format: {magic}')
                return self.labelled(dc.result, method=dc.method)

        # NOTE(review): the class documentation states that the data is returned unaltered when
        # no engine succeeds, but this code path raises instead; confirm which one is intended
        # and how the calling framework treats this exception.
        raise ValueError('no compression engine worked')

class dedup (key=None, count=False)

This unit is implemented in refinery.units.meta.dedup and has the following commandline Interface:

usage: dedup [-h] [-L] [-Q] [-0] [-v] [-c] [key]

Deduplicates a sequence of multiple inputs. The deduplication is limited to the current frame.

positional arguments:
  key            An optional meta variable expression to deduplicate.

options:
  -c, --count    Store the count of each deduplicated chunk.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class dedup(Unit):
    """
    Deduplicates a sequence of multiple inputs. The deduplication is limited to the current `refinery.lib.frame`.
    """
    def __init__(
        self,
        key: Param[str, Arg.String('key', help='An optional meta variable expression to deduplicate.')] = None,
        count: Param[bool, Arg.Switch('-c', help='Store the count of each deduplicated chunk.')] = False
    ):
        super().__init__(key=key, count=count)

    def filter(self, chunks):
        expression = self.args.key

        def fingerprint(chunk):
            # Without a key expression, the chunk is identified by the MD5 digest of its data.
            if expression is None:
                return md5(chunk).digest()
            # Otherwise the expression is evaluated against the meta variables of the chunk;
            # buffer values are reduced to their MD5 digest, all other values are used as is.
            value = PythonExpression.Evaluate(expression, metavars(chunk))
            return md5(value).digest() if isbuffer(value) else value

        if self.args.count:
            # Counting mode: visible chunks are held back until the input is exhausted so that
            # the first chunk for each identifier can be emitted with the final count.
            tally = {}
            first_seen = {}
            for chunk in chunks:
                if not chunk.visible:
                    # Invisible chunks are passed through right away.
                    yield chunk
                    continue
                uid = fingerprint(chunk)
                tally[uid] = tally.get(uid, 0) + 1
                first_seen.setdefault(uid, chunk)
            for uid, chunk in first_seen.items():
                yield self.labelled(chunk, count=tally[uid])
            return

        # Streaming mode: the first chunk for each identifier is emitted immediately and every
        # later chunk with the same identifier is dropped.
        seen = set()
        for chunk in chunks:
            if not chunk.visible:
                # Invisible chunks are passed through right away.
                yield chunk
                continue
            uid = fingerprint(chunk)
            if uid in seen:
                continue
            seen.add(uid)
            yield chunk

class defang (url_only=False, url_protocol=False, dot_only=False, quote_md=False)

This unit is implemented in refinery.units.pattern.defang and has the following commandline Interface:

usage: defang [-h] [-L] [-Q] [-0] [-v] [-R] [-u] [-p] [-d] [-q]

Defangs all URL, domain and IPv4 address indicators in the input data by replacing the last dot
in the expression by [.]. For example, 127.0.0.1 will be replaced by 127.0.0[.]1. For URL
indicators, the colon after the protocol scheme is also wrapped in brackets.

options:
  -u, --url-only      Only defang URLs, do not look for domains or IPs.
  -p, --url-protocol  Escape the protocol in URLs.
  -d, --dot-only      Do not escape the protocol colon in URLs.
  -q, --quote-md      Wrap all indicators in backticks for markdown code.

generic options:
  -h, --help          Show this help message and exit.
  -L, --lenient       Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet         Disables all log output.
  -0, --devnull       Do not produce any output.
  -v, --verbose       Specify up to two times to increase log level.
  -R, --reverse       Use the reverse operation.

Expand source code Browse git

class defang(Unit):
    """
    Defangs all URL, domain and IPv4 address indicators in the input data by replacing the last dot
    in the expression by `[.]`. For example, `127.0.0.1` will be replaced by `127.0.0[.]1`. For URL
    indicators, the colon after the protocol scheme is also wrapped in brackets.
    """

    # Host names (compared in lower case) that are returned unchanged.
    _WHITELIST = [
        B'wscript.shell',
    ]

    # Scheme substitutions applied when the url_protocol switch is set.
    _PROTOCOL_ESCAPES = {
        B'http': B'hxxp',
        B'https': B'hxxps',
        B'ftp': B'fxp',
        B'ftps': B'fxps',
    }

    def __init__(
        self,
        url_only: Param[bool, Arg.Switch('-u', help='Only defang URLs, do not look for domains or IPs.')] = False,
        url_protocol: Param[bool, Arg.Switch('-p', help='Escape the protocol in URLs.')] = False,
        dot_only: Param[bool, Arg.Switch('-d', help='Do not escape the protocol colon in URLs.')] = False,
        quote_md: Param[bool, Arg.Switch('-q', help='Wrap all indicators in backticks for markdown code.')] = False
    ):
        self.superinit(super(), **vars())

    def _quote(self, word):
        # Wrap the indicator in backticks only when markdown quoting was requested.
        return word if not self.args.quote_md else B'`%s`' % word

    def reverse(self, data: bytearray):
        # Undo the bracketed dots in everything that matches the defanged hostname pattern.
        def refang(hostname):
            return hostname[0].replace(B'[.]', B'.')
        data = defanged.hostname.sub(refang, data)
        # Undo both bracket styles for the scheme separator.
        data = data.replace(B'[:]//', B'://')
        data = data.replace(B'[://]', B'://')
        # Restore escaped schemes: "h" followed by any three characters becomes http(s),
        # and fxp(s) becomes ftp(s).
        data = re.sub(B'h.{3}?(s?)://', B'http\\1://', data)
        data = re.sub(B'fxp(s?)://', B'ftp\\1://', data)
        return data

    def process(self, data):
        def replace_hostname(hostname: bytes, match=True):
            # When used as a regex substitution callback, the argument is a match object:
            # unwrap it, defang the matched text, and apply optional markdown quoting.
            if match:
                return self._quote(replace_hostname(hostname[0], False))
            self.log_info('replace:', hostname)
            host = hostname
            # Separate optional "user@" and ":port" parts from the host name itself.
            user, atsgn, host = host.rpartition(B'@')
            host, colon, port = host.rpartition(B':')
            host = host.lower()
            if not colon:
                # Without a colon, rpartition puts the entire input into the last slot.
                host = port
                port = B''
            if host in self._WHITELIST:
                return hostname
            # Split on plain and on already defanged dots so that the operation is idempotent.
            host = re.split(R'(?:\[\.\]|\.)', host.decode('latin1'))
            if len(host) == 1:
                return hostname
            # Walk the labels from right to left: the last dot is always bracketed, and
            # dots keep being bracketed for as long as the label to their right is in tlds.
            components = iter(reversed(host))
            defanged_parts = [next(components)]
            separator = '[.]'
            for part in components:
                defanged_parts.append(separator)
                defanged_parts.append(part)
                separator = '[.]' if part in tlds else '.'
            defanged_host = ''.join(reversed(defanged_parts)).encode('latin1')
            return user + atsgn + defanged_host + colon + port

        def replace_url(url: bytes):
            if not url:
                return url
            self.log_info('replace:', url)
            # Normalize input that is already (partially) defanged.
            url = url.replace(B'[:]//', B'://', 1)
            url = url.replace(B'[.]', B'.')
            # The URL may lack a scheme; prepend a dummy one for parsing and remember
            # how much of it was missing: 0 = "://host", 1 = "//host", 2 = complete.
            prefix = B'tcp'
            if url.startswith(B'://'):
                scheme = 0
            elif url.startswith(B'//'):
                scheme = 1
                prefix = prefix + B':'
            else:
                scheme = 2
                prefix = B''
            parsed = urlparse(prefix + url)
            # Indicators embedded in the remaining URL components are defanged recursively.
            operations = {
                name: self.process(getattr(parsed, name))
                for name in ('path', 'params', 'query', 'fragment')
            }
            if self.args.url_protocol and parsed.scheme:
                # Schemes without a known escape are kept unchanged. The fallback must be the
                # parsed scheme; the local integer marker would make urlunparse fail.
                operations.update(scheme=self._PROTOCOL_ESCAPES.get(parsed.scheme.lower(), parsed.scheme))
            if scheme < 2:
                # Remove the dummy scheme that was only added for parsing.
                operations.update(scheme=B'')
            operations.update(netloc=replace_hostname(parsed.netloc, False))
            url = urlunparse(parsed._replace(**operations))
            if scheme == 0:
                # Restore the leading colon of an input that started with "://".
                url = B':' + url
            if not self.args.dot_only:
                url = url.replace(B'://', B'[:]//')
            return self._quote(url)

        # Splitting on a pattern with capture groups interleaves the text between matches
        # with the captures of each match, so each match occupies `step` list entries.
        urlsplit = defanged.url.split(data)
        step = defanged.url.value.groups + 1
        # Index 1 of every stride is the first capture group; it is processed as the URL.
        urlsplit[1::step] = [replace_url(t) for t in itertools.islice(iter(urlsplit), 1, None, step)]

        if not self.args.url_only:
            # Host names and IP addresses are only searched in the text between URLs.
            urlsplit[0::step] = [
                indicators.hostname.sub(replace_hostname, t)
                for t in itertools.islice(iter(urlsplit), 0, None, step)
            ]

        def fuse(urlsplit):
            # Alternate between text segments and processed URLs; all other capture
            # groups produced by the split are discarded.
            txt = itertools.islice(iter(urlsplit), 0, None, step)
            url = itertools.islice(iter(urlsplit), 1, None, step)
            while True:
                try:
                    yield next(txt)
                    yield next(url)
                except StopIteration:
                    break

        return B''.join(fuse(urlsplit))

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data: bytearray):
    """
    Refang the input: bracketed dots inside defanged host names are restored, bracketed
    scheme separators are turned back into `://`, and escaped schemes are rewritten to
    `http`, `https`, `ftp` and `ftps`.
    """
    def restore_dots(found):
        return found[0].replace(B'[.]', B'.')

    refanged = defanged.hostname.sub(restore_dots, data)

    # Both bracket styles for the scheme separator are supported.
    for bracketed in (B'[:]//', B'[://]'):
        refanged = refanged.replace(bracketed, B'://')

    # Scheme restoration is applied in this order: first http(s), then ftp(s).
    scheme_rules = (
        (B'h.{3}?(s?)://', B'http\\1://'),
        (B'fxp(s?)://', B'ftp\\1://'),
    )
    for pattern, template in scheme_rules:
        refanged = re.sub(pattern, template, refanged)

    return refanged

class deob_js_arithmetic

This unit is implemented in refinery.units.obfuscation.js.arithmetic and has the following commandline Interface:

usage: deob-js-arithmetic [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_js_arithmetic(Deobfuscator):
    """
    JavaScript deobfuscator that evaluates constant arithmetic expressions outside of string
    literals and replaces them by their result.
    """

    def deobfuscate(self, data):
        strings = StringLiterals(formats.string, data)

        @strings.outside
        def evaluate(match: re.Match[str]):
            expression = match[0]
            expression = expression.strip()
            # Nothing can be computed when the match contains no digit at all.
            if not any(c.isdigit() for c in expression):
                return expression
            brackets = 0
            positions = []
            ok = True
            # head: prefix up to the innermost unclosed opening bracket
            # rest: text following a leading, fully bracketed sub-expression
            # tail: text starting at the first unmatched closing bracket
            head = tail = rest = ''
            for end, character in enumerate(expression):
                if character == '(':
                    brackets += 1
                    positions.append(end)
                    continue
                if character == ')':
                    brackets -= 1
                    if brackets < 0:
                        # Unmatched closing bracket: everything from here on is handled
                        # separately by a recursive call below.
                        expression, tail = expression[:end], expression[end:]
                        break
                    else:
                        positions.pop()
                    if brackets == 0 and expression[0] == '(':
                        # The expression starts with a balanced bracket group; first try to
                        # evaluate everything, and fall back to only this group.
                        expression, rest = expression[:end + 1], expression[end + 1:]
                        break
            if expression.isdigit():
                return match[0]
            if brackets > 0:
                # Skip past the last opening bracket that was never closed.
                pos = positions[~0] + 1
                head = expression[:pos]
                expression = expression[pos:]
            try:
                result = str(cautious_eval(expression + rest))
            except Exception:
                ok = False
            else:
                # The result already accounts for the remainder.
                rest = ''
            if not ok and rest:
                try:
                    result = str(cautious_eval(expression))
                except Exception:
                    # Neither variant could be evaluated; restore the full expression. The
                    # remainder is cleared because it is now a part of the expression and
                    # would otherwise be emitted twice.
                    expression += rest
                    rest = ''
                else:
                    ok = True
            if not ok:
                result = expression
                self.log_info(F'error trying to parse arithmetic expression at offset {match.start()}: ({expression})')
            else:
                # Keep the brackets of a bracketed expression around the computed value.
                if expression.startswith('(') and expression.endswith(')'):
                    result = F'({result})'
            if tail:
                tail = self.deobfuscate(tail)
            return F'{head}{result}{rest}{tail}'

        # A numeric literal, sign, or opening bracket, followed by at least one more
        # literal or operator character; tokens may be separated by up to 20 blanks.
        pattern = re.compile(R'(?:{i}|{f}|[-+(])(?:[^\S\r\n]{{0,20}}(?:{i}|{f}|[-%|&~<>()+/*^]))+'.format(
            i=str(formats.integer), f=str(formats.float)))

        return pattern.sub(evaluate, data)

class deob_js_arrays

This unit is implemented in refinery.units.obfuscation.js.arrays and has the following commandline Interface:

usage: deob-js-arrays [-h] [-L] [-Q] [-0] [-v]

JavaScript deobfuscator to turn ["Z", "t", "s", "e"][0] into "Z".

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_js_arrays(Deobfuscator):
    """
    JavaScript deobfuscator to turn `["Z", "t", "s", "e"][0]` into `"Z"`.
    """

    def deobfuscate(self, data):
        strlit = StringLiterals(formats.string, data)
        literal = re.compile(str(formats.string))

        def split_elements(array: str):
            # Split at commas, but ignore commas that occur inside string literals: the
            # literals are masked with placeholders of equal length so that the comma
            # offsets in the masked text are valid for the original text.
            masked = literal.sub(lambda m: '_' * len(m[0]), array)
            elements = []
            start = 0
            for offset, character in enumerate(masked):
                if character == ',':
                    elements.append(array[start:offset])
                    start = offset + 1
            elements.append(array[start:])
            return elements

        @strlit.outside
        def litpick(match: re.Match[str]):
            try:
                array = match[1]
                try:
                    index = int(match[2])
                except ValueError:
                    # The index may be a prefixed literal such as 0x10.
                    index = int(match[2], 0)
                if index < 0:
                    # A negative index does not address an array element in JavaScript.
                    raise IndexError(index)
                lpick = split_elements(array)[index].strip()
                self.log_debug(lambda: F'{lpick} = {match[0]}')
            except (TypeError, ValueError, IndexError):
                # Leave the expression untouched when it cannot be resolved.
                lpick = match[0]
            return lpick

        # A bracketed, comma-separated list of integer or string literals which is
        # immediately indexed by an integer literal.
        p = R'\s{{0,5}}'.join([
            '\\[', '((?:{i}|{s})', '(?:,', '(?:{i}|{s})', ')*)', '\\]', '\\[', '({i})', '\\]'
        ]).format(i=formats.integer, s=formats.string)
        return re.sub(p, litpick, data)

class deob_js_comments

This unit is implemented in refinery.units.obfuscation.js.comments and has the following commandline Interface:

usage: deob-js-comments [-h] [-L] [-Q] [-0] [-v]

JavaScript deobfuscator that removes comments from the script.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_js_comments(Deobfuscator):
    """
    JavaScript deobfuscator that removes comments from the script.
    """

    def deobfuscate(self, data):
        literals = StringLiterals(formats.string, data)

        @literals.outside
        def erase(_):
            return ''

        # Block comments are removed first, then line comments.
        passes = (
            (R'/\*.*?\*/', re.DOTALL),
            (R'(?m)//.*$', 0),
        )
        for pattern, flags in passes:
            data = re.sub(pattern, erase, data, flags=flags)
        return data

class deob_js_concat (timeout=100)

This unit is implemented in refinery.units.obfuscation.js.concat and has the following commandline Interface:

usage: deob-js-concat [-h] [-L] [-Q] [-0] [-v] [-t TIMEOUT]

options:
  -t, --timeout TIMEOUT  Maximum number of iterations; the default is 100.

generic options:
  -h, --help             Show this help message and exit.
  -L, --lenient          Increase the leniency, allowing partial results and ignoring more
                         errors.
  -Q, --quiet            Disables all log output.
  -0, --devnull          Do not produce any output.
  -v, --verbose          Specify up to two times to increase log level.

Expand source code Browse git

class deob_js_concat(IterativeDeobfuscator):
    """
    JavaScript deobfuscator that merges adjacent string literals which are joined by the
    `+` operator, i.e. `"ab" + "cd"` is transformed to `"abcd"`.
    """

    # The end of one string literal, a plus sign, and the start of the next literal.
    _SENTINEL = re.compile(R'''['"]\s*\+\s*['"]''')

    def deobfuscate(self, data):
        def concat(data):
            strlit = StringLiterals(formats.string, data)
            repeat = True
            while repeat:
                for match in self._SENTINEL.finditer(data):
                    a, b = match.span()
                    # Both ends of the match have to be located in string literals, and
                    # these literals have to be direct neighbours.
                    a = strlit.get_container(a)
                    if a is None:
                        continue
                    b = strlit.get_container(b)
                    if b is None or b != a + 1:
                        continue
                    _, a = strlit.ranges[a]
                    b, c = strlit.ranges[b]
                    quote = data[a - 1]
                    body = data[b + 1:c - 1]
                    if data[b] != quote and quote in body:
                        # The literals use different quotes and the right one contains the
                        # quote character of the left one; merging would break the literal.
                        continue
                    # Drop the closing quote of the left literal and the opening quote of
                    # the right one; the merged literal is closed with the left literal's
                    # quote character so that mixed quotes do not yield a malformed string.
                    yield data[:a - 1] + body + quote
                    # Continue with the remaining text, which requires fresh literal offsets.
                    data = data[c:]
                    strlit.update(data)
                    break
                else:
                    repeat = False
            yield data

        return ''.join(concat(data))

class deob_js_getattr

This unit is implemented in refinery.units.obfuscation.js.getattr and has the following commandline Interface:

usage: deob-js-getattr [-h] [-L] [-Q] [-0] [-v]

JavaScript deobfuscator to turn WScript["CreateObject"] into WScript.CreateObject.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_js_getattr(Deobfuscator):
    """
    JavaScript deobfuscator to turn `WScript["CreateObject"]` into `WScript.CreateObject`.
    """

    def deobfuscate(self, data):
        literals = StringLiterals(formats.string, data)

        @literals.outside
        def to_member_access(match: re.Match[str]):
            # Strip the quotes from the string literal used as subscript.
            attribute = match[2][1:-1]
            if not attribute.isidentifier():
                # The subscript cannot be written as a member name; keep the original.
                return match[0]
            return F'{match[1]}.{attribute}'

        subscript = FR'(\w+)\[({formats.string})\]'
        return re.sub(subscript, to_member_access, data)

class deob_js_tuples

This unit is implemented in refinery.units.obfuscation.js.tuples and has the following commandline Interface:

usage: deob-js-tuples [-h] [-L] [-Q] [-0] [-v]

JavaScript deobfuscator to turn ("Z", "t", "s", "e") into "e".

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_js_tuples(Deobfuscator):
    """
    JavaScript deobfuscator to turn `("Z", "t", "s", "e")` into `"e"`.
    """

    def deobfuscate(self, data):
        literal = re.compile(str(formats.string))

        def split_elements(array: str):
            # Split at commas, but ignore commas that occur inside string literals: the
            # literals are masked with placeholders of equal length so that the comma
            # offsets in the masked text are valid for the original text.
            masked = literal.sub(lambda m: '_' * len(m[0]), array)
            elements = []
            start = 0
            for offset, character in enumerate(masked):
                if character == ',':
                    elements.append(array[start:offset])
                    start = offset + 1
            elements.append(array[start:])
            return elements

        def litpick(match):
            try:
                array = match[1]
                # The comma operator evaluates to its last operand.
                lpick = split_elements(array)[-1].strip()
                self.log_debug(lambda: F'{lpick} = {match[0]}')
            except (TypeError, IndexError):
                # Leave the expression untouched when it cannot be resolved.
                lpick = match[0]
            return lpick

        # A parenthesized, comma-separated list of integer or string literals.
        p = R'\s{{0,5}}'.join([
            '\\(', '((?:{i}|{s})', '(?:,', '(?:{i}|{s})', ')*)', '\\)'
        ]).format(i=formats.integer, s=formats.string)
        return re.sub(p, litpick, data)

class deob_ps1 (timeout=100)

This unit is implemented in refinery.units.obfuscation.ps1.all and has the following commandline Interface:

usage: deob-ps1 [-h] [-L] [-Q] [-0] [-v] [-t TIMEOUT]

options:
  -t, --timeout TIMEOUT  Maximum number of iterations; the default is 100.

generic options:
  -h, --help             Show this help message and exit.
  -L, --lenient          Increase the leniency, allowing partial results and ignoring more
                         errors.
  -Q, --quiet            Disables all log output.
  -0, --devnull          Do not produce any output.
  -v, --verbose          Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1(IterativeDeobfuscator):
    """
    PowerShell deobfuscator that runs a fixed sequence of specialized PowerShell
    deobfuscation units over the input and collapses runs of line breaks afterwards.
    """

    # The deobfuscators are applied in exactly this order.
    _SUBUNITS: list[type[Deobfuscator]] = [
        deob_ps1_escape,
        deob_ps1_cases,
        deob_ps1_brackets,
        deob_ps1_format,
        deob_ps1_typecast,
        deob_ps1_stringreplace,
        deob_ps1_b64convert,
        deob_ps1_encodings,
        deob_ps1_concat,
        deob_ps1_invoke,
        deob_ps1_uncurly
    ]

    def deobfuscate(self, data):
        pipeline = [subunit() for subunit in self._SUBUNITS]

        # All stages log at the same level as this unit.
        for stage in pipeline:
            stage.log_level = self.log_level

        for unit in pipeline:
            self.log_debug(lambda: F'invoking {unit.name}')
            before = hash(data)
            data = unit.deobfuscate(data)
            if before == hash(data):
                continue
            # The info message is only emitted when the debug call returns a false value.
            if not self.log_debug('data has changed.'):
                self.log_info(F'used {unit.name}')

        return re.sub(R'[\r\n]+', '\n', data)

class deob_ps1_b64convert

This unit is implemented in refinery.units.obfuscation.ps1.b64convert and has the following commandline Interface:

usage: deob-ps1-b64convert [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_b64convert(Deobfuscator):
    """
    PowerShell deobfuscator that replaces calls to `[System.Convert]::FromBase64String` on a
    constant string by an array literal `@(0x..,0x..)` that contains the decoded bytes.
    """

    _SENTINEL = re.compile('\\s*'.join(
        (re.escape('[System.Convert]::FromBase64String'), '\\(', '({s})', '\\)')
    ).format(s=formats.ps1str), flags=re.IGNORECASE)

    def deobfuscate(self, data):
        strlit = Ps1StringLiterals(data)

        def replacer(match: re.Match[str]):
            # Do not touch calls that are themselves part of a string literal. The container
            # is an index into the list of literals, so index 0 must not be treated as false.
            if strlit.get_container(match.start()) is not None:
                return match[0]
            try:
                # Only literals that unquote to exactly one part are supported.
                string, = string_unquote(match[1])
            except ValueError:
                return match[0]
            try:
                decoded = base64.b64decode(string)
            except Exception:
                return match[0]
            return '@({})'.format(','.join(F'0x{b:02X}' for b in decoded))

        return self._SENTINEL.sub(replacer, data)

class deob_ps1_brackets

This unit is implemented in refinery.units.obfuscation.ps1.brackets and has the following commandline Interface:

usage: deob-ps1-brackets [-h] [-L] [-Q] [-0] [-v]

PowerShell deobfuscation that removes superfluous brackets around constant literals, i.e.
("{0}{2}{1}") is transformed to "{0}{2}{1}". Currently, only integer and string constants are
supported.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_brackets(Deobfuscator):
    """
    PowerShell deobfuscation that removes superfluous brackets around constant
    literals, i.e. `("{0}{2}{1}")` is transformed to `"{0}{2}{1}"`. Currently,
    only integer and string constants are supported.
    """
    _SENTINEL = re.compile(
        RF'''(?<![\w"']{{2}})'''  # this may be a function call
        RF'''(\-\w+)?'''  # not a function call but an argument
        RF'''\(\s*({formats.integer}|{formats.ps1str})\s*(\S)''',
        flags=re.IGNORECASE
    )

    def deobfuscate(self, data):
        literals = Ps1StringLiterals(data)
        changed = True

        @literals.outside
        def unwrap(match):
            nonlocal changed
            # Only act when the literal is directly followed by the closing bracket; in
            # all other cases, no replacement value is returned.
            if match[3] != ')':
                return None
            changed = True
            argument = match[1] or ''
            return argument + match[2]

        # Removing one pair of brackets can expose another one; run until stable.
        while changed:
            changed = False
            data = self._SENTINEL.sub(unwrap, data)

        return data

class deob_ps1_cases

This unit is implemented in refinery.units.obfuscation.ps1.cases and has the following commandline Interface:

usage: deob-ps1-cases [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_cases(Deobfuscator):
    """
    PowerShell deobfuscator that normalizes the case of a fixed list of well-known names,
    i.e. `iNvOkE-eXpReSsIoN` is transformed to `Invoke-Expression`.
    """

    # Names in their canonical spelling; they are matched without regard to case.
    _NAMES = [
        '-BXor',
        '-Exec Bypass',
        '-NoLogo',
        '-NonInter',
        '-Replace',
        '-Windows Hidden',
        '.Invoke',
        'Assembly',
        'Byte',
        'Char',
        'ChildItem',
        'CreateThread',
        'Get-Variable',
        'GetType',
        'IntPtr',
        'Invoke-Expression',
        'Invoke',
        'Length',
        'Net.WebClient',
        'PowerShell',
        'PSVersionTable',
        'Set-Item',
        'Set-Variable',
        'Start-Sleep',
        'ToString',
        'Type',
        'Value',
        'Void',
    ]

    @outside(formats.ps1str)
    def deobfuscate(self, data):
        for name in self._NAMES:
            # A word boundary in front of a non-word character such as "-" or "." only
            # matches when a word character precedes it; this would prevent a match for
            # " -bxor" after whitespace. The leading boundary is therefore only required
            # for names that begin with a word character.
            head = R'\b' if re.match(R'\w', name) else ''
            data = re.sub(RF'{head}{re.escape(name)}\b', name, data, flags=re.IGNORECASE)
        return data

class deob_ps1_concat (timeout=100)

This unit is implemented in refinery.units.obfuscation.ps1.concat and has the following commandline Interface:

usage: deob-ps1-concat [-h] [-L] [-Q] [-0] [-v] [-t TIMEOUT]

options:
  -t, --timeout TIMEOUT  Maximum number of iterations; the default is 100.

generic options:
  -h, --help             Show this help message and exit.
  -L, --lenient          Increase the leniency, allowing partial results and ignoring more
                         errors.
  -Q, --quiet            Disables all log output.
  -0, --devnull          Do not produce any output.
  -v, --verbose          Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_concat(IterativeDeobfuscator):
    """
    PowerShell deobfuscator that merges adjacent string literals which are joined by one
    of the operators `+` or `&` into a single string literal.
    """

    # The end of one string literal, the operator, and the start of the next literal.
    _SENTINEL = re.compile(R'''['"]\s*[+&]\s*['"]''')

    def deobfuscate(self, data):

        def merged(text):
            literals = Ps1StringLiterals(text)
            while True:
                for joint in self._SENTINEL.finditer(text):
                    start, stop = joint.span()
                    # Both ends of the match have to be located in string literals, and
                    # these literals have to be direct neighbours.
                    left = literals.get_container(start)
                    if left is None:
                        continue
                    right = literals.get_container(stop)
                    if right is None or right != left + 1:
                        continue
                    left_span = literals.ranges[left]
                    right_span = literals.ranges[right]
                    left_text = text[slice(*left_span)]
                    right_text = text[slice(*right_span)]
                    # The first part of the right literal is glued to the last part of the
                    # left literal, all remaining parts are appended.
                    pieces = list(string_unquote(left_text))
                    remainder = iter(string_unquote(right_text))
                    pieces[~0] += next(remainder)
                    pieces.extend(remainder)
                    yield text[:left_span[0]] + string_quote(pieces)
                    # Continue with the remaining text, which requires fresh literal offsets.
                    text = text[right_span[1]:]
                    literals.update(text)
                    break
                else:
                    # No further pair of literals could be merged.
                    break
            yield text

        return ''.join(merged(data))

class deob_ps1_encodings

This unit is implemented in refinery.units.obfuscation.ps1.encodings and has the following commandline Interface:

usage: deob-ps1-encodings [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_encodings(Deobfuscator):
    """
    PowerShell deobfuscator that replaces calls to `[System.Text.Encoding]::X.GetString` on a
    constant integer array by a string literal that contains the decoded text.
    """

    _SENTINEL = re.compile('\\s*'.join(
        (re.escape('[System.Text.Encoding]::') + '(\\w+)\\.GetString', '\\(', '@\\(', '({a})', '\\)', '\\)')
    ).format(a=formats.intarray), flags=re.IGNORECASE)

    # Maps lower-case encoding property names to Python codec names. The keys are lower-case
    # because the sentinel pattern matches the property name without regard to case.
    _CODECS = {
        'ascii': 'ascii',
        'bigendianunicode': 'utf-16be',
        'default': 'latin1',
        'unicode': 'utf-16le',
    }

    def deobfuscate(self, data):
        strlit = Ps1StringLiterals(data)

        def replacer(match: re.Match[str]):
            # Do not touch calls that are themselves part of a string literal. The container
            # is an index into the list of literals, so index 0 must not be treated as false.
            if strlit.get_container(match.start()) is not None:
                return match[0]
            try:
                # Base 0 accepts prefixed literals such as 0x41 in addition to decimals.
                buffer = bytearray(int(x.strip(), 0) for x in match[2].split(','))
            except Exception:
                return match[0]
            # Names without a translation are tried as Python codec names directly.
            encoding = self._CODECS.get(match[1].lower(), match[1])
            try:
                codecs.lookup(encoding)
            except LookupError:
                encoding = 'utf8'
            try:
                string = buffer.decode(encoding)
            except Exception:
                return match[0]
            return string_quote(string)

        return self._SENTINEL.sub(replacer, data)

class deob_ps1_escape

This unit is implemented in refinery.units.obfuscation.ps1.escape and has the following commandline Interface:

usage: deob-ps1-escape [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_escape(Deobfuscator):
    """
    PowerShell deobfuscator that removes backticks outside of string literals when the
    character that follows is not one of ``0abfnrtv`#'"$``.
    """

    def deobfuscate(self, data):
        literals = Ps1StringLiterals(data)

        @literals.outside
        def drop_backtick(match):
            # Keep only the character that followed the backtick.
            return match[1]

        escaped = R'''`([^0abfnrtv`#'"\$])'''
        return re.sub(escaped, drop_backtick, data)

class deob_ps1_format

This unit is implemented in refinery.units.obfuscation.ps1.format and has the following commandline Interface:

usage: deob-ps1-format [-h] [-L] [-Q] [-0] [-v]

PowerShell deobfuscation for the following "format string"-based technique:

- "{0}{2}{1}"-f 'signa','ures','t'
- "{0}na{2}{1}"-f 'sig','ures','t'

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_format(Deobfuscator):
    """
    PowerShell deobfuscation for the following "format string"-based technique:

    - `"{0}{2}{1}"-f 'signa','ures','t'`
    - `"{0}na{2}{1}"-f 'sig','ures','t'`
    """

    def deobfuscate(self, data):

        # Set whenever a replacement was made; the search then restarts on the new data
        # because all offsets of the previous search are invalid.
        repeat = True

        while repeat:

            repeat = False

            for string in re.finditer(str(formats.ps1str), data):
                # The string literal has to be followed directly by the format operator and
                # a comma-separated list of string literals.
                argmatch = re.search(R'^\s*-[fF]\s*((?:{s},\s*)*{s})'.format(s=formats.ps1str), data[string.end():])
                if not argmatch:
                    continue

                def dbgmsg():
                    # Long format strings are truncated in the log message.
                    sample = string[0]
                    if len(sample) > 33:
                        sample = F"{sample[1:30]}...{sample[0]}"
                    return F'found match at {string.start()}: {sample}'

                self.log_debug(dbgmsg)

                # Splitting on a capture group puts the string literals at the odd indices.
                args = re.split(F'({formats.ps1str})', argmatch[1])
                args = [list(string_unquote(a.strip())) for a in args[1::2]]

                def formatter(string):
                    # Generates the parts of the formatted string; an argument that unquotes
                    # to more than one part also splits the output into several parts.
                    buffer = []
                    # Splitting on a capture group puts the placeholders at the odd indices.
                    for k, part in enumerate(re.split(R'(\{\d+\})', string)):
                        if k % 2 == 0:
                            if part:
                                buffer.append(part)
                            continue
                        try:
                            index = int(part[1:-1])
                            arg = args[index]
                        except IndexError as IE:
                            raise IndexError(F'only found {len(args)} arguments and format sequence {index}, aborting.') from IE

                        it = iter(arg)
                        buffer.append(next(it))

                        if len(arg) > 1:
                            # Flush what was collected so far, pass the inner parts of the
                            # argument through, and continue collecting with its last part.
                            yield ''.join(buffer)
                            buffer = []
                            for last, part in lookahead(it):
                                if last:
                                    buffer.append(part)
                                    break
                                yield part

                    yield ''.join(buffer)

                try:
                    result = string_apply(string[0], formatter)
                except IndexError:
                    # A placeholder refers to a missing argument; skip this format string.
                    continue

                # The offsets of argmatch are relative to the end of the format string.
                data = data[:string.start()] + result + data[argmatch.end() + string.end():]
                repeat = True
                break

        return data

class deob_ps1_invoke

This unit is implemented in refinery.units.obfuscation.ps1.invoke and has the following commandline Interface:

usage: deob-ps1-invoke [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_invoke(Deobfuscator):
    """
    PowerShell deobfuscator that simplifies invocations: quotes around member names are
    removed, commands that are invoked via a sourcing operator and a quoted name are
    replaced by the plain name, and `X.Invoke(` is shortened to `X(`.
    """

    def deobfuscate(self, data):
        literals = Ps1StringLiterals(data)

        @literals.outside
        def unquote_member(m):
            # Keep the delimiter and the bare name, drop the quotes.
            return m[1] + m[3]

        quoted_member = (
            R'''(\.|::)'''                    # a dot or namespace delimiter
            R'''(['"])(\w{1,200})\2'''        # a quoted word, presumably a member name
            R'''(?=[\s\(\.\,\;\+\-])'''       # only when followed by one of these characters
        )
        data = re.sub(quoted_member, unquote_member, data)

        @literals.outside
        def direct_call(m):
            return m[1] + '('

        sourced_command = '\\s{0,5}'.join((
            '[.&]',                           # sourcing operator
            '(\\(',                           # optional group: opening bracket and
            '(?:gcm|get-command)',            # a get-command invocation
            ')?',
            # quoted command name, and the closing bracket if there was an opening one
            '([\'"])([-a-z]{1,100})\\2(?(1)\\s{0,5}\\)|)',
        ))
        data = re.sub(sourced_command, '\\3', data, flags=re.IGNORECASE)

        invocation = R'''(\w{1,200})\.Invoke\s*\('''
        data = re.sub(invocation, direct_call, data, flags=re.IGNORECASE)

        return data

class deob_ps1_secstr (*a)

This unit is implemented in refinery.units.obfuscation.ps1.securestring and has the following commandline Interface:

usage: deob-ps1-secstr [-h] [-L] [-Q] [-0] [-v] [a ...]

positional arguments:
  a

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_secstr(Deobfuscator):
    """
    PowerShell deobfuscator that replaces a constant string which is piped to
    `ConvertTo-SecureString` with a constant key by the decrypted text.
    """

    def __init__(self, *a, **kw):
        super().__init__(*a, **kw)

        # Helper units that parse the key array and decrypt the secure string.
        self._pack = pack()
        self._secstr = secstr()

        # Groups: (1) quote, (2) encoded blob, (3) optional quote around the command name,
        # (4) optional opening bracket before the key, (5) key given as an integer array
        # or as a range A..B, (6) closing brackets that follow the key.
        self._pattern = re.compile(
            R'\s{{0,20}}'.join([
                R'''(['"])({b})\1''',
                R'\|', R'\.?', R'&?',
                R'''(['"]?)ConvertTo-SecureString\3''',
                R'-ke?y?',
                R'''(\(?)({a}|{i}\s{{0,20}}\.\.\s{{0,20}}{i})''',
                R'((?:\)\s{{0,20}}){{0,10}})?'
            ]).format(
                b=formats.b64,
                a=formats.intarray,
                i=formats.integer
            ),
            flags=re.IGNORECASE | re.DOTALL
        )

    def _decrypt_block(self, data, match):
        """
        Decrypt the secure string described by the given match. Returns the offset in the
        data where the replacement has to start, and the replacement text.
        """
        if '..' in match[5]:
            # The key is a range expression, which may be ascending or descending.
            a, b = (int(x.strip(), 0) for x in match[5].split('..'))
            key = range(min(a, b), max(a, b) + 1)
            if a > b:
                key = reversed(key)
            self._secstr.args.key = bytes(bytearray(key))
        else:
            self._secstr.args.key = self._pack(match[5].encode(self.codec))
        decoded = self._secstr(match[2].encode(self.codec))
        decoded = decoded.decode(self.codec)
        result = F'\n\n{decoded}\n\n'
        brackets = (match[6] or '').count(')')
        start = match.start()
        if match[4]:
            # One closing bracket belongs to the bracket that was opened before the key.
            brackets -= 1
        if brackets <= 0:
            if brackets < 0:
                result += ')'
            return start, result
        # The match ends with more closing brackets than it has opened: walk backwards to
        # include the corresponding opening brackets in the replaced range. The scan stops
        # at the beginning of the data; for unbalanced input, a negative offset would
        # otherwise wrap around to the end of the text or raise an IndexError.
        while brackets and start > 0:
            start -= 1
            if data[start] == '(':
                brackets -= 1
            if data[start] == ')':
                brackets += 1
        return start, result

    def deobfuscate(self, data):
        # Replace one secure string at a time until the pattern no longer matches.
        while True:
            match = self._pattern.search(data)
            if not match:
                break
            start, result = self._decrypt_block(data, match)
            data = data[:start] + result + data[match.end():]
        return data

class deob_ps1_stringreplace

This unit is implemented in refinery.units.obfuscation.ps1.stringreplace and has the following commandline Interface:

usage: deob-ps1-stringreplace [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_stringreplace(Deobfuscator):
    # Evaluates replace calls (-replace, -ireplace, -creplace and the .Replace method) that
    # are applied directly to a string literal and whose needle and insert are string
    # literals as well. The needle is always treated as literal text, never as a pattern.

    _SENTINEL = re.compile((
        R'(?i)[\'"]\s*'               # end of haystack string
        R'(-c|-i|-|\.)replace'        # the replace call
        R'([\(\s]*)({s})([\)\s]*),'   # needle (with brackets)
        R'([\(\s]*)({s})([\)\s]*)'    # insert (with brackets)
    ).format(s=formats.ps1str), flags=re.IGNORECASE)

    def deobfuscate(self, data):
        """
        Apply one replace call at a time, restarting the search after every successful
        replacement, and return the script once no further call can be resolved.
        """
        repeat = True
        strlit = Ps1StringLiterals(data)

        while repeat:
            repeat = False
            needle = None

            for match in self._SENTINEL.finditer(data):
                k = strlit.get_container(match.start())
                if k is None:
                    continue
                offset, end = strlit.ranges[k]
                # The match has to begin on the closing quote of the haystack literal.
                if match.start() != end - 1:
                    continue
                string = data[offset:end]
                pf, bl1, needle, bl2, br1, insert, br2 = match.groups()
                end = match.end()
                # The prefix is one of '-c', '-i', '-' or '.', so its first character is
                # never 'c'; the last character is what tells -creplace apart. Only the
                # .Replace method and -creplace are case sensitive.
                case = '' if pf[-1].lower() in '.c' else '(?i)'
                bl = bl1.count('(') - bl2.count(')')
                br = br2.count(')') - br1.count('(')
                if pf[0] == '.':
                    # The method call contributes its own pair of brackets.
                    bl -= 1
                    br -= 1
                if bl != 0 or br < 0:
                    continue
                needle = list(string_unquote(needle))
                # Only a needle that unquotes to exactly one part can be handled; this
                # also avoids indexing into an empty list.
                if len(needle) != 1:
                    continue

                needle = needle[0]
                head, *body = string_unquote(insert)

                self.log_info('replacing', needle, 'by', insert)

                if not body:
                    def perform_replacement(string):
                        return re.sub(F'{case}{re.escape(needle)}', lambda _: head, string)
                else:
                    # The insert consists of several parts: head, body..., tail. Every
                    # occurrence of the needle is replaced by this whole sequence.
                    *body, tail = body
                    def perform_replacement(string): # noqa
                        parts = re.split(F'{case}{re.escape(needle)}', string)
                        if len(parts) == 1:
                            yield string
                            return
                        it = iter(parts)
                        yield next(it) + head
                        yield from body
                        for last, part in lookahead(it):
                            if last:
                                yield tail + part
                            else:
                                yield tail + part + head
                                yield from body

                # Surplus closing brackets consumed by the match are appended again.
                replaced = string_apply(string, perform_replacement) + (br * ')')
                strlit.ranges[k] = offset, offset + len(replaced) - br
                # The needle and insert literals no longer exist after the replacement.
                strlit.ranges[k + 1: k + 3] = []
                strlit.shift(len(replaced) + offset - end, k + 1)
                data = data[:offset] + replaced + data[end:]
                repeat = True
                break

        return data

class deob_ps1_typecast

This unit is implemented in refinery.units.obfuscation.ps1.typecast and has the following commandline Interface:

usage: deob-ps1-typecast [-h] [-L] [-Q] [-0] [-v]

Replaces sequences like [Char]120 to their string representation, in this case the string "x".

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_typecast(Deobfuscator):
    """
    Replaces sequences like [Char]120 to their string representation, in this
    case the string "x".
    """

    def deobfuscate(self, data):
        """
        Run three passes over `data`: drop [string] and [char[]] casts in front of string
        literals, turn [char]N into a quoted character, and turn [char[]](a, b, ...) into a
        quoted string when all values decode to printable ASCII. Anything that cannot be
        converted is left untouched.
        """
        strlit = Ps1StringLiterals(data)

        @strlit.outside
        def strip_typecast(m): return m[1]

        data = re.sub(
            FR'\[(?:string|char\[\])\]\s*({formats.ps1str!s})',
            strip_typecast,
            data,
            flags=re.IGNORECASE
        )

        @strlit.outside
        def char_literal(match):
            try:
                c = chr(int(match[1].lower(), 0))
            except (ValueError, OverflowError):
                # The number is not a valid code point; keep the original expression.
                return match[0]
            if c == "'":
                return '''"'"'''
            return F"'{c}'"

        data = re.sub(
            R'\[char\]\s*0*(0x[0-9a-f]+|\d+)',
            char_literal,
            data,
            flags=re.IGNORECASE
        )

        def char_array(match):
            try:
                # Building the byte string belongs inside the try block: values above 255
                # or integers that cannot be parsed must leave the source unchanged rather
                # than abort the whole unit.
                result = bytes(int(x, 0) for x in match[1].split(','))
                result = result.decode('ascii')
                if not all(x in string.printable or x.isspace() for x in result):
                    raise ValueError
            except (ValueError, OverflowError):
                return match[0]
            else:
                return string_quote(result)

        data = re.sub(
            R'\s*'.join([
                R'\[char\[\]\]',
                R'\((',
                R'(?:\s*(?:0x[0-9a-f]+|\d+)\s*,)+',
                R'(?:0x[0-9a-f]+|\d+)',
                R')\)'
            ]),
            char_array,
            data,
            flags=re.IGNORECASE
        )

        return data

class deob_ps1_uncurly

This unit is implemented in refinery.units.obfuscation.ps1.uncurly and has the following commandline Interface:

usage: deob-ps1-uncurly [-h] [-L] [-Q] [-0] [-v]

PowerShell deobfuscation that removes superfluous curly braces around variable names that do not
require it, i.e. ${variable} is transformed to just $variable.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_uncurly(Deobfuscator):
    """
    PowerShell deobfuscation that removes superfluous curly braces around variable
    names that do not require it, i.e. `${variable}` is transformed to just `$variable`.
    """

    _SENTINEL = re.compile(R'\$\{(\w+)\}')

    def deobfuscate(self, data):
        """
        Rewrite every braced variable reference whose name consists of word characters
        only; the callback is wrapped so that it is applied outside of string literals.
        """
        literals = Ps1StringLiterals(data)

        def unwrap(hit):
            # hit[1] is the bare variable name captured between the braces
            return '$' + hit[1]

        return self._SENTINEL.sub(literals.outside(unwrap), data)

class deob_vba (timeout=100)

This unit is implemented in refinery.units.obfuscation.vba.all and has the following commandline Interface:

usage: deob-vba [-h] [-L] [-Q] [-0] [-v] [-t TIMEOUT]

options:
  -t, --timeout TIMEOUT  Maximum number of iterations; the default is 100.

generic options:
  -h, --help             Show this help message and exit.
  -L, --lenient          Increase the leniency, allowing partial results and ignoring more
                         errors.
  -Q, --quiet            Disables all log output.
  -0, --devnull          Do not produce any output.
  -v, --verbose          Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba(IterativeDeobfuscator):
    # Composite VBA deobfuscator: runs each of the units listed in _SUBUNITS once, in
    # order, over the data.

    _SUBUNITS: list[type[Deobfuscator]] = [
        deob_vba_comments,
        deob_vba_brackets,
        deob_vba_char_function,
        deob_vba_concat,
        deob_vba_arithmetic,
        deob_vba_constants,
        deob_vba_dummy_variables,
        deob_vba_stringreplace,
        deob_vba_stringreverse,
    ]

    def deobfuscate(self, data):
        """
        Instantiate all sub-units, give them this unit's log level, feed the data through
        each of them in turn and finally collapse runs of line breaks into single newlines.
        """
        stages = [make() for make in self._SUBUNITS]
        for stage in stages:
            stage.log_level = self.log_level
        for unit in stages:
            self.log_debug(lambda: F'invoking {unit.name}')
            before = hash(data)
            data = unit.deobfuscate(data)
            if before == hash(data):
                continue
            # the info message is only emitted when the debug message was not
            if not self.log_debug('data has changed.'):
                self.log_info(F'used {unit.name}')
        return re.sub(R'[\r\n]+', '\n', data)

class deob_vba_arithmetic

This unit is implemented in refinery.units.obfuscation.vba.arithmetic and has the following commandline Interface:

usage: deob-vba-arithmetic [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba_arithmetic(Deobfuscator):
    # Finds runs of numbers, brackets and operators outside of string literals and
    # replaces them by the value computed through _cautious_vba_eval where that succeeds.
    def deobfuscate(self, data):
        """
        Substitute every candidate arithmetic expression in `data` using the nested
        `evaluate` callback and return the resulting text.
        """
        strings = StringLiterals(formats.vbastr, data)

        def vba_int_eval(match: re.Match[str]) -> str:
            # Converts an integer literal with an &H, &B or &O prefix to decimal notation;
            # literals without a leading ampersand are returned lowercased. Any other
            # prefix character falls through and yields None.
            s = match[0].lower()
            if not s.startswith('&'):
                return s
            # t is the base indicator; a trailing ampersand is stripped from the digits
            t, s = s[1], s[2:].rstrip('&')
            if t == 'h':
                return str(int(s, 16))
            if t == 'b':
                return str(int(s, 2))
            if t == 'o':
                return str(int(s, 8))

        @strings.outside
        def evaluate(match: re.Match[str]):
            # The matched text is split into: head (text up to the last unclosed opening
            # bracket), the expression to evaluate, rest (text after a fully bracketed
            # leading group) and tail (text from a surplus closing bracket onwards).
            expression = match[0]
            expression = expression.strip()
            if not any(c.isdigit() for c in expression):
                return expression
            expression = re.sub(str(formats.vbaint), vba_int_eval, expression)
            brackets = 0
            positions = []
            ok = True
            head = tail = rest = ''
            for end, character in enumerate(expression):
                if character == '(':
                    brackets += 1
                    positions.append(end)
                    continue
                if character == ')':
                    brackets -= 1
                    if brackets < 0:
                        # surplus closing bracket: cut the expression off right here
                        expression, tail = expression[:end], expression[end:]
                        break
                    else:
                        positions.pop()
                    if brackets == 0 and expression[0] == '(':
                        # the leading bracketed group is complete; keep what follows apart
                        expression, rest = expression[:end + 1], expression[end + 1:]
                        break
            if expression.isdigit():
                # a plain number needs no evaluation
                return match[0]
            if brackets > 0:
                # unclosed opening brackets: evaluate only what follows the last of them
                pos = positions[~0] + 1
                head = expression[:pos]
                expression = expression[pos:]
            try:
                # first attempt: evaluate the expression together with the rest
                result = str(_cautious_vba_eval(expression + rest))
            except Exception:
                ok = False
            else:
                rest = ''
            if not ok and rest:
                # second attempt: evaluate the expression without the rest
                try:
                    result = str(_cautious_vba_eval(expression))
                except Exception:
                    expression += rest
                else:
                    ok = True
            if not ok:
                result = expression
                self.log_info(F'error trying to parse arithmetic expression at offset {match.start()}: ({expression})')
            else:
                # keep the surrounding brackets of a bracketed expression
                if expression.startswith('(') and expression.endswith(')'):
                    result = F'({result})'
            if tail:
                # the cut-off remainder is processed by a recursive call
                tail = self.deobfuscate(tail)
            return F'{head}{result}{rest}{tail}'

        # A candidate starts with a number, a sign or an opening bracket and continues
        # with numbers, operators and brackets separated by at most 20 non-newline
        # whitespace characters each.
        pattern = re.compile(R'(?:{i}|{f}|[-+(])(?:[^\S\r\n]{{0,20}}(?:{i}|{f}|[-%|&~<>()+/*^]))+'.format(
            i=str(formats.vbaint), f=str(formats.float)))

        return pattern.sub(evaluate, data)

class deob_vba_brackets

This unit is implemented in refinery.units.obfuscation.vba.brackets and has the following commandline Interface:

usage: deob-vba-brackets [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba_brackets(Deobfuscator):
    # Strips brackets that enclose nothing but a single integer, string or float literal.
    _SENTINEL = re.compile(
        RF'''(?<![\w"']{{2}})'''  # this may be a function call
        RF'''\(\s*({formats.vbaint}|{formats.vbastr}|{formats.float})\s*(\S)''',
        flags=re.IGNORECASE
    )

    def deobfuscate(self, data):
        """
        Substitute with the sentinel pattern until a full pass removes no more brackets,
        then return the data.
        """
        literals = StringLiterals(formats.vbastr, data)
        changed = True

        @literals.outside
        def unwrap(hit):
            nonlocal changed
            if hit[2] != ')':
                # the literal is followed by something other than the closing bracket
                return None
            changed = True
            return hit[1]

        while changed:
            changed = False
            data = self._SENTINEL.sub(unwrap, data)

        return data

class deob_vba_char_function

This unit is implemented in refinery.units.obfuscation.vba.char and has the following commandline Interface:

usage: deob-vba-char-function [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba_char_function(Deobfuscator):
    # Replaces calls such as Chr(65) or ChrW(65) that occur outside of string literals
    # by the corresponding one-character string literal.
    def deobfuscate(self, data):
        """
        Return `data` with every convertible Chr/ChrW call on a decimal literal replaced
        by a quoted character; calls that cannot be converted are kept unchanged.
        """
        strings = StringLiterals(formats.vbastr, data)

        @strings.outside
        def evaluate_char_function(match: re.Match[str]):
            try:
                c = chr(int(match[1]))
            except (ValueError, OverflowError):
                # chr raises ValueError for values outside the Unicode range and
                # OverflowError for integers that do not fit a C integer; either way
                # the call is left as it is.
                return match[0]
            if c == '"':
                # a double quote is escaped by doubling it
                return '""""'
            if c == '\\':
                return '"\\"'
            # characters whose repr needs an escape sequence are not converted
            c = repr(c)[1:-1]
            if len(c) > 1:
                return match[0]
            return f'"{c}"'

        return re.sub(R'(?i)\bchrw?\s*\(\s*(\d+)\s*\)', evaluate_char_function, data)

class deob_vba_chr_literals

This unit is implemented in refinery.units.obfuscation.vba.vba and has the following commandline Interface:

usage: deob-vba-chr-literals [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba_chr_literals(Unit):
    # Replaces Chr(N) calls in binary input by quoted string literals and afterwards
    # joins adjacent literals that are concatenated with an ampersand.
    def process(self, data):
        """
        Return `data` with Chr calls on decimal or 0x-prefixed numbers replaced by string
        literals. Numbers that are not valid code points are left untouched.
        """
        def _chr(m):
            token = m[1]
            # Parse with an explicit base so that leading zeros in decimal numbers are
            # accepted.
            base = 16 if token[:2].lower() == b'0x' else 10
            try:
                code = int(token, base)
                char = chr(code)
            except (ValueError, OverflowError):
                return m[0]
            if code == 34:
                # a double quote is escaped by doubling it
                return B'""""'
            return B'"%s"' % char.encode('unicode_escape')
        # The number is either hexadecimal with a 0x prefix or decimal of any length; this
        # includes single digits such as Chr(9) and hexadecimal digits a-f.
        data = re.sub(BR'Chr\((0x[0-9a-f]+|\d+)\)', _chr, data, flags=re.IGNORECASE)
        data = re.sub(BR'"\s*\&\s*"', B'', data)
        return data

class deob_vba_comments

This unit is implemented in refinery.units.obfuscation.vba.comments and has the following commandline Interface:

usage: deob-vba-comments [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba_comments(Deobfuscator):
    # Deletes whole lines that begin (after at most 20 whitespace characters) with an
    # apostrophe, with the keyword rem, or with the keyword dim.
    def deobfuscate(self, data):
        """
        Return `data` without the lines matched by the comment pattern.
        """
        comment_line = re.compile(R"(?im)^\s{0,20}(?:'|rem\b|dim\b).*(?:\Z|$\n\r?)")
        return comment_line.sub('', data)

class deob_vba_concat (timeout=100)

This unit is implemented in refinery.units.obfuscation.vba.concat and has the following commandline Interface:

usage: deob-vba-concat [-h] [-L] [-Q] [-0] [-v] [-t TIMEOUT]

options:
  -t, --timeout TIMEOUT  Maximum number of iterations; the default is 100.

generic options:
  -h, --help             Show this help message and exit.
  -L, --lenient          Increase the leniency, allowing partial results and ignoring more
                         errors.
  -Q, --quiet            Disables all log output.
  -0, --devnull          Do not produce any output.
  -v, --verbose          Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba_concat(IterativeDeobfuscator):
    # Merges two adjacent string literals that are joined by a plus or an ampersand.
    _SENTINEL = re.compile(R'''"\s*(\++|&)\s*"''')

    def deobfuscate(self, data):
        """
        Walk through the data from left to right, merging pairs of concatenated string
        literals, and return the joined output.
        """

        def pieces(text):
            literals = StringLiterals(formats.vbastr, text)
            searching = True
            while searching:
                searching = False
                for hit in self._SENTINEL.finditer(text):
                    left_pos, right_pos = hit.span()
                    left = literals.get_container(left_pos)
                    if left is None:
                        continue
                    right = literals.get_container(right_pos)
                    # both ends of the match must lie in directly consecutive literals
                    if right is None or right != left + 1:
                        continue
                    _, left_end = literals.ranges[left]
                    right_start, right_end = literals.ranges[right]
                    # drop the closing quote of the left literal and the opening quote
                    # of the right one, along with everything in between
                    yield text[:left_end - 1] + text[right_start + 1:right_end]
                    text = text[right_end:]
                    literals.update(text)
                    searching = True
                    break
            yield text

        return ''.join(pieces(data))

class deob_vba_constants

This unit is implemented in refinery.units.obfuscation.vba.constants and has the following commandline Interface:

usage: deob-vba-constants [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba_constants(Deobfuscator):
    # Inlines names that are only ever assigned one integer or string literal.
    def deobfuscate(self, data):
        """
        Collect assignments of a literal to a name, discard names that are assigned
        differing values or that name a sub or function, remove the remaining assignment
        lines and substitute each constant's value for its name in the rest of the code.
        """
        source_lines = data.splitlines(keepends=True)
        values = {}
        defined_at = {}
        excluded = set()
        routine_header = re.compile(R'(?im)^\s*(?:sub|function)\s*(\w+)')
        assignment = re.compile(
            R'(?im)^(?:\s*const)?\s*(\w+)\s*=\s*({i}|{s})\s*(?:\'|rem|$)'.format(
                s=formats.ps1str,
                i=formats.integer
            ))
        for index, text in enumerate(source_lines):
            header = routine_header.match(text)
            if header:
                excluded.add(header[1])
                continue
            hit = assignment.match(text)
            if hit is None:
                continue
            name, value = hit[1], hit[2]
            if name in excluded:
                continue
            if value != values.get(name, value):
                # a second assignment with a different value: not a constant after all
                self.log_debug(F'del {name}')
                del values[name]
                del defined_at[name]
                excluded.add(name)
            else:
                self.log_debug(F'add {name} = {value}')
                values[name] = value
                defined_at[name] = index
        dropped = set(defined_at.values())
        data = ''.join(
            text for index, text in enumerate(source_lines) if index not in dropped)
        for name, value in values.items():
            data = re.sub(RF'\b{re.escape(name)!s}\b', lambda _: value, data)

        return data

class deob_vba_dummy_variables

This unit is implemented in refinery.units.obfuscation.vba.dummies and has the following commandline Interface:

usage: deob-vba-dummy-variables [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba_dummy_variables(Deobfuscator):
    # Removes assignment lines for names that no other line appears to use.
    def deobfuscate(self, data):
        """
        Record the lines on which each name is assigned, drop every name that some line
        might use, and return the code without the assignment lines of the names left.
        """
        lines = data.splitlines(keepends=False)
        names = collections.defaultdict(list)

        def might_be_used_in(name, line):
            # avoid finding the name within a string literal
            blanked = '""'.join(re.split(str(formats.ps1str), line))
            parts = re.split(RF'\b{name}\b', blanked)
            if len(parts) != 2:
                # the name occurs either not at all or more than once
                return False
            before, after = (part.strip().lower() for part in parts)
            if before.startswith(("'", 'rem')):
                return False
            if after.startswith('=') and 'if' not in before:
                return False
            return not before.startswith('dim')

        pattern = re.compile(
            R'(?i)^\s{0,8}(?:const\s{1,8})?(\w+)\s{1,8}=\s{1,8}.*$'
        )

        for index, line in enumerate(lines):
            hit = pattern.match(line)
            if hit is None:
                continue
            if re.search(r'\w+\(', line):
                # might be a function call
                continue
            names[hit[1]].append(index)

        for line in lines:
            # collect first, delete afterwards: the dictionary must not change size
            # while it is being iterated
            for name in [n for n in names if might_be_used_in(n, line)]:
                del names[name]

        dead_rows = {row for rows in names.values() for row in rows}
        return '\n'.join(
            line for index, line in enumerate(lines) if index not in dead_rows)

class deob_vba_stringreplace

This unit is implemented in refinery.units.obfuscation.vba.stringreplace and has the following commandline Interface:

usage: deob-vba-stringreplace [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba_stringreplace(Deobfuscator):
    # Evaluates Replace calls whose three arguments are all string literals.

    _SENTINEL = re.compile((
        R'(?i)\bReplace\s*\('  # the replace call
        R'\s*({s}),'           # haystack (with brackets)
        R'\s*({s}),'           # needle (with brackets)
        R'\s*({s})\s*\)'       # insert (with brackets)
    ).format(s=formats.vbastr), flags=re.IGNORECASE)

    def deobfuscate(self, data):
        """
        Return `data` with every matching Replace call substituted by the quoted result
        of performing the replacement.
        """
        literals = StringLiterals(formats.vbastr, data)

        def fold(hit: re.Match[str]):
            haystack = string_unquote(hit[1])
            needle = string_unquote(hit[2])
            insert = string_unquote(hit[3])
            return string_quote(haystack.replace(needle, insert))

        return self._SENTINEL.sub(literals.outside(fold), data)

class deob_vba_stringreverse

This unit is implemented in refinery.units.obfuscation.vba.stringreverse and has the following commandline Interface:

usage: deob-vba-stringreverse [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba_stringreverse(Deobfuscator):
    # Evaluates StrReverse calls whose argument is a string literal.

    _SENTINEL = re.compile((
        R'(?i)\bStrReverse\s*\('  # the reverse call
        R'\s*({s})\s*\)'          # string
    ).format(s=formats.vbastr), flags=re.IGNORECASE)

    def deobfuscate(self, data):
        """
        Return `data` with every matching StrReverse call substituted by the quoted,
        reversed string.
        """
        literals = StringLiterals(formats.vbastr, data)

        def flip(hit: re.Match[str]):
            content = string_unquote(hit[1])
            backwards = ''.join(reversed(content))
            return string_quote(backwards)

        return self._SENTINEL.sub(literals.outside(flip), data)

class des (key, *, iv=b'', padding=None, mode=None, raw=False, little_endian=False, segment_size=0, tag=(), aad=b'')

This unit is implemented in refinery.units.crypto.cipher.des and has the following commandline Interface:

usage: des [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-t TAG] [-a AAD]
           key

DES encryption and decryption.

positional arguments:
  key                   The encryption key.

options:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, EAX,
                        ECB, OFB. By default, the CBC mode is used when an IV is provided, and
                        ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of segmentation bits. It must be a multiple of 8.
                        The default of 0 means that the block size will be used as the segment
                        size.
  -t, --tag TAG         Only for EAX, GCM, OCB, and CCM: An authentication tag to verify the
                        message. For encryption, this parameter specifies the tag length, and the
                        tag is provided as a meta variable named "tag".
  -a, --aad AAD         Only for EAX, GCM, OCB, and CCM: Set additional authenticated data.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation.

Expand source code Browse git

# All behaviour comes from StandardBlockCipherUnit; this class only selects the cipher by
# passing the DES module, wrapped in PyCryptoFactoryWrapper, as the `cipher` class keyword.
# The class docstring is the description shown in the unit's command-line help.
class des(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(DES)):
    """
    DES encryption and decryption.
    """

class des3 (key, *, iv=b'', padding=None, mode=None, raw=False, little_endian=False, segment_size=0, tag=(), aad=b'')

This unit is implemented in refinery.units.crypto.cipher.des3 and has the following commandline Interface:

usage: des3 [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-t TAG]
            [-a AAD]
            key

3-DES encryption and decryption.

positional arguments:
  key                   The encryption key.

options:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, EAX,
                        ECB, OFB. By default, the CBC mode is used when an IV is provided, and
                        ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of segmentation bits. It must be a multiple of 8.
                        The default of 0 means that the block size will be used as the segment
                        size.
  -t, --tag TAG         Only for EAX, GCM, OCB, and CCM: An authentication tag to verify the
                        message. For encryption, this parameter specifies the tag length, and the
                        tag is provided as a meta variable named "tag".
  -a, --aad AAD         Only for EAX, GCM, OCB, and CCM: Set additional authenticated data.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation.

Expand source code Browse git

# All behaviour comes from StandardBlockCipherUnit; this class only selects the cipher by
# passing the DES3 module, wrapped in PyCryptoFactoryWrapper, as the `cipher` class keyword.
# The class docstring is the description shown in the unit's command-line help.
class des3(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(DES3)):
    """
    3-DES encryption and decryption.
    """

class deskd (size=8)

This unit is implemented in refinery.units.crypto.keyderive.deskd and has the following commandline Interface:

usage: deskd [-h] [-L] [-Q] [-0] [-v] [N]

Stands for "DES Key Derivation". It implements the same functionality as DES_string_to_key in
OpenSSL. It converts a string to an 8 byte DES key with odd byte parity, per FIPS specification.
This is not a modern key derivation function.

positional arguments:
  N              The number of bytes to generate, default is the maximum of 8.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deskd(KeyDerivation):
    """
    Stands for "DES Key Derivation". It implements the same functionality as `DES_string_to_key` in OpenSSL. It
    converts a string to an 8 byte DES key with odd byte parity, per FIPS specification. This is not a modern
    key derivation function.
    """
    def __init__(self, size: Param[int, Arg(help='The number of bytes to generate, default is the maximum of 8.')] = 8):
        super().__init__(size=size, salt=None)

    def process(self, password):
        """
        Derive the 8 byte key from `password` and return its first `size` bytes. When more
        than 8 bytes are requested, a RefineryPartialResult carrying the full key is raised.
        """
        from Cryptodome.Cipher import DES
        from Cryptodome.Util.strxor import strxor

        secret = bytes(password)
        key = bytearray(8)

        # Fold the password into the key: the first 8 bytes of every 16 byte stretch are
        # shifted left by one bit, the other 8 are bit-reversed and folded in backwards.
        for index, value in enumerate(secret):
            slot = index % 8
            if index % 16 < 8:
                key[slot] ^= (value << 1) & 0xFF
            else:
                key[7 - slot] ^= int(F'{value:08b}'[::-1], 2)

        des_set_odd_parity(key)

        length = len(secret)
        if length:
            # zero-pad the password to a multiple of 8 bytes
            padded = secret.ljust(length + 7 - ((length - 1) % 8), b'\0')
            cipher = DES.new(key, DES.MODE_ECB)
            # chain the blocks: each one is XORed with the running key and encrypted
            for offset in range(0, length, 8):
                block = padded[offset:offset + 8]
                key[:] = cipher.encrypt(strxor(block, key))
            des_set_odd_parity(key)

        if self.args.size > 8:
            raise RefineryPartialResult('can provide at most 8 bytes.', partial=key)

        return key[:self.args.size]

class dexstr

This unit is implemented in refinery.units.formats.dexstr and has the following commandline Interface:

usage: dexstr [-h] [-L] [-Q] [-0] [-v]

Extract strings from DEX (Dalvik Executable) files.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class dexstr(Unit):
    """
    Extract strings from DEX (Dalvik Executable) files.
    """
    def process(self, data):
        """
        Parse `data` as a DexFile and emit each of its strings, encoded with the unit's
        codec, as a separate output.
        """
        parsed = DexFile(data)
        yield from (entry.encode(self.codec) for entry in parsed.read_strings())

class djb2 (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.checksums and has the following commandline Interface:

usage: djb2 [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Computes the DJB2 hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class djb2(HashUnit):
    """
    Computes the DJB2 hash of the input data.
    """
    def _algorithm(self, data) -> bytes:
        # Start at 5381; for each input byte, multiply the state by 33 and add the byte,
        # keeping only the low 32 bits. The result is emitted as 4 big-endian bytes.
        state = 5381
        for byte in data:
            state = (state * 33 + byte) % 0x100000000
        return state.to_bytes(4, 'big')

class dnarrays

This unit is implemented in refinery.units.formats.pe.dotnet.dnarrays and has the following commandline Interface:

usage: dnarrays [-h] [-L] [-Q] [-0] [-v] [-F]

Extracts arrays of strings or integers that are encoded in the .NET binary as IL opcodes. The
data is exported as JSON.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class dnarrays(Unit):
    """
    Extracts arrays of strings or integers that are encoded in the .NET binary as IL opcodes.
    The data is exported as JSON.
    """
    @staticmethod
    def _read_int(reader: StructReader):
        # Decodes one integer push instruction from the reader. The opcodes 0x16 to 0x1E
        # carry the constants 0 to 8 inline, 0x1F is followed by a single operand byte and
        # 0x20 is followed by a 32-bit operand. Any other opcode raises a ValueError.
        value = reader.read_byte() - 0x16
        if value < 0:
            raise ValueError
        elif value <= 8:
            return value
        elif value == 9:
            return reader.read_byte()
        elif value == 10:
            return reader.u32()
        else:
            raise ValueError

    @staticmethod
    def _read_str(reader: StructReader, header: DotNetHeader):
        # Decodes one string load instruction: opcode 0x72, a 24-bit index into the user
        # string stream, and the trailing byte 0x70. Returns the referenced string.
        if reader.read_byte() != 0x72:
            raise ValueError
        token: int = reader.read_integer(24)
        value: str = header.meta.Streams.US[token]
        if reader.read_byte() != 0x70:
            raise ValueError
        return value

    _STACK_ARRAY_PATTERN_STR = re.compile(
        BR'''(?x)
        (?: [\x16-\x1E]|\x1F.|\x20.{4} ) # load array length
        (?:  \x8D...\x01               ) # newarr System.String
        (?:
        (?:  \x25                      ) # dup
        (?: [\x16-\x1E]|\x1F.|\x20.{4} ) # load integer index
        (?:  \x72...\x70               ) # load the string
        (?:  \xA2                      ) # stelem.ref
        ){4,}
        ''', flags=re.DOTALL)

    def _str_arrays(self, data: buf, header: DotNetHeader, tables: NetMetaDataTables):
        # Yields pairs (offset, strings) for each opcode sequence in the data that populates
        # a string array with at least four elements. A candidate is skipped when the array
        # element type is not named String, and discarded when the element indices are not
        # consecutive or fewer elements than the declared size were matched.
        for match in self._STACK_ARRAY_PATTERN_STR.finditer(data):
            reader = StructReader(match[0])
            result: list[str] = []
            size = self._read_int(reader)
            if reader.read_byte() != 0x8D:
                raise RuntimeError
            stt = reader.read_integer(24)
            if reader.read_byte() != 0x01:
                raise RuntimeError
            if stt < 1 or tables.TypeRef[stt - 1].TypeName != 'String':
                continue
            self.log_info(F'str array pattern at 0x{match.start():X}, size {size}')
            for k in range(size):
                if reader.read_byte() != 0x25:
                    raise RuntimeError
                if self._read_int(reader) != k:
                    break
                result.append(self._read_str(reader, header))
                if reader.read_byte() != 0xA2:
                    raise RuntimeError
            else:
                # only reached when all elements were read without an index mismatch
                yield match.start(), result

    _STACK_ARRAY_PATTERN_INT = re.compile(
        BR'''(?x)
        (    \x12.|\xFE\x0D..          ) # load array variable
        (?: [\x16-\x1E]|\x1F.|\x20.{4} ) # push integer value
        (?:  \x52                      ) # store value into array
        (?:
        (?:  \1                        ) # load same array variable
        (?: [\x16-\x1E]|\x1F.|\x20.{4} ) # load integer index
        (?:  \x58                      ) # add; compute offset
        (?: [\x16-\x1E]|\x1F.|\x20.{4} ) # push integer value
        (?:  \x52                      ) # store value into array
        ){4,}
        ''', flags=re.DOTALL)

    def _int_arrays(self, data: buf, header: DotNetHeader, tables: NetMetaDataTables):
        # Yields pairs (offset, integers) for each opcode sequence in the data that stores
        # integer values into consecutive slots of the same array variable. A candidate is
        # discarded when the slot indices are not consecutive.
        for match in self._STACK_ARRAY_PATTERN_INT.finditer(data):
            self.log_info(F'int array pattern at 0x{match.start():X}')
            reader = StructReader(match[0])
            result: list[int] = []
            opc, = reader.peek(1)
            # size of the instruction that loads the array variable: short or long form
            skip = {0x12: 2, 0xFE: 4}[opc]
            reader.seekrel(skip)
            for index in itertools.count(1):
                result.append(self._read_int(reader))
                # These reads advance the reader and must therefore not be wrapped in assert
                # statements: with assertions disabled (python -O), the reads would be skipped
                # and the parser would lose its position in the opcode stream.
                if reader.read_byte() != 0x52:
                    raise RuntimeError
                if reader.eof:
                    yield match.start(), result
                    break
                reader.seekrel(skip)
                if self._read_int(reader) != index:
                    self.log_info('index inconsistency; aborting')
                    break
                if reader.read_byte() != 0x58:
                    raise RuntimeError

    def process(self, data):
        header = DotNetHeader(data)
        tables = header.meta.Streams.Tables
        cp = CodePath(header)

        # map the offset of each detected pattern to the extracted array
        arrays = dict(itertools.chain(
            self._int_arrays(data, header, tables),
            self._str_arrays(data, header, tables),
        ))
        # group the arrays by the method that contains them, ordered by offset
        result = collections.defaultdict(list)
        for offset in sorted(arrays):
            result[cp.method_spec(offset)].append(arrays[offset])

        # within each method, the arrays are named v1, v2, and so on
        result = {m: {F'v{k}': v for k, v in enumerate(t, 1)} for m, t in result.items()}
        return json.dumps(result, indent=4).encode(self.codec)

    @classmethod
    def handles(cls, data):
        from refinery.lib.id import is_likely_pe_dotnet
        return is_likely_pe_dotnet(data)

class dnasm (*, count=None, until=None, no_il_refs=False, no_address=False, no_hexdump=False, no_args=False, description=False)

This unit is implemented in refinery.units.sinks.dnasm and has the following commandline Interface:

usage: dnasm [-h] [-L] [-Q] [-0] [-v] [-c N] [-u STR] [-I] [-A] [-H] [-O] [-d]

Disassembles the input data as MSIL (.NET/C# bytecode) and produces a human-readable disassembly
listing. If you are looking for a more programmatic disassembly, take a look at dnopc.

options:
  -c, --count N      Maximum number of bytes to disassemble, infinite by default.
  -u, --until STR    Disassemble until the given string appears among the disassembly.
  -I, --no-il-refs   Disable reference resolution to IL_*.
  -A, --no-address   Disable address display.
  -H, --no-hexdump   Disable opcodes hexdump.
  -O, --no-args      Disable output of instruction arguments.
  -d, --description  Enable opcodes descriptions in output.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class dnasm(DotnetDisassemblerUnit):
    """
    Disassembles the input data as MSIL (.NET/C# bytecode) and produces a human-readable disassembly listing. If you are
    looking for a more programmatic disassembly, take a look at `refinery.dnopc`.
    """

    def __init__(
        self, *,
        count=None, until=None,
        no_il_refs: Param[bool, Arg.Switch('-I', help='Disable reference resolution to IL_*.')] = False,
        no_address: Param[bool, Arg.Switch('-A', help='Disable address display.')] = False,
        no_hexdump: Param[bool, Arg.Switch('-H', help='Disable opcodes hexdump.')] = False,
        no_args: Param[bool, Arg.Switch('-O', help='Disable output of instruction arguments.')] = False,
        description: Param[bool, Arg.Switch('-d', help='Enable opcodes descriptions in output.')] = False,
    ):
        self._output_factory = OutputFactory(
            il_refs=not no_il_refs,
            address=not no_address,
            hexdump=not no_hexdump,
            arguments=not no_args,
        )
        self._disassembler = Disassembler()
        super().__init__(
            count=count,
            until=until,
            description=description,
        )

    def process(self, data):
        meta = metavars(data)
        # Meta variables whose name is the letter "t" followed by hexadecimal digits are used
        # as labels for the token with that number. The whole name has to match: a prefix
        # match would also accept names like "table" and then fail to convert "able" to an
        # integer below.
        r = re.compile(r't[0-9a-f]+', re.IGNORECASE)
        self._output_factory.extend_token_labels({int(k[1:], 16): v for k, v in meta.items() if r.fullmatch(k)})
        until = str(self.args.until or '').lower()

        max_line_length = 0
        if self.args.description:
            # Descriptions are aligned in a column to the right of the listing, which requires
            # knowing the longest line first: all instructions are collected up front.
            disasm = []
            for ins in self._disassembler.disasm(data, self.args.count):
                disasm.append(ins)
                line = self._output_factory.instruction(ins)
                max_line_length = max(max_line_length, len(line))
        else:
            disasm = self._disassembler.disasm(data, self.args.count)

        for ins in disasm:
            line = self._output_factory.instruction(ins)
            if self.args.description:
                line += ' ' * (max_line_length - len(line) + 2)
                line += f'-- {ins.op.description}'
            yield line.encode("utf-8")

            # the line that contains the search string is still emitted before stopping
            if until and until in line.lower():
                break

class dnblob

This unit is implemented in refinery.units.formats.pe.dotnet.dnblob and has the following commandline Interface:

usage: dnblob [-h] [-L] [-Q] [-0] [-v] [-F]

Extracts all blobs defined in the #Blob stream of .NET executables.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class dnblob(Unit):
    """
    Extracts all blobs defined in the `#Blob` stream of .NET executables.
    """
    def process(self, data):
        # The header is parsed without resources; only the blob stream is read here.
        streams = DotNetHeader(data, parse_resources=False).meta.Streams
        for blob in streams.Blob.values():
            yield blob

    @classmethod
    def handles(cls, data):
        from refinery.lib.id import is_likely_pe_dotnet as looks_like_dotnet
        return looks_like_dotnet(data)

class dnds (dereference=True, encode=None, digest=None, arrays=False)

This unit is implemented in refinery.units.formats.pe.dotnet.dnds and has the following commandline Interface:

usage: dnds [-h] [-L] [-Q] [-0] [-v] [-r] [-e U | -d U | -a]

Stands for "DotNet DeSerialize": Expects data that has been serialized using the .NET class
"BinaryFormatter". The output is a representation of the deserialized data in JSON format.

options:
  -r, --keep-references  Do not resolve Object references in serialized data.
  -e, --encode U         Select an encoder unit used to represent binary data in the JSON output.
                         This unit must be reversible and produce UTF8 encoded string output when
                         operated in reverse. Common examples are hex and b64.
  -d, --digest U         Select a hashing unit to digest all byte strings: Instead of the data,
                         only the hash will be displayed.
  -a, --arrays           Encode all byte strings as integer arrays. These arrays will have
                         unsigned integer entries between 0 and 255.

generic options:
  -h, --help             Show this help message and exit.
  -L, --lenient          Increase the leniency, allowing partial results and ignoring more
                         errors.
  -Q, --quiet            Disables all log output.
  -0, --devnull          Do not produce any output.
  -v, --verbose          Specify up to two times to increase log level.

Expand source code Browse git

class dnds(DotNetJSONEncoderUnit):
    """
    Stands for "DotNet DeSerialize": Expects data that has been serialized using the .NET class
    "BinaryFormatter". The output is a representation of the deserialized data in JSON format.
    """

    def __init__(
        self,
        dereference: Param[bool, Arg.Switch('-r', '--keep-references', off=True,
            help='Do not resolve Object references in serialized data.')] = True,
        encode=None, digest=None, arrays=False
    ):
        super().__init__(
            encode=encode,
            digest=digest,
            arrays=arrays,
            dereference=dereference,
        )

    def process(self, data):
        self.log_debug('initializing parser, will fail on malformed stream')
        dereference = self.args.dereference
        # The result of the argument-free log_debug call decides whether the parser is
        # asked to ignore errors.
        ignore_errors = not self.log_debug()
        parser = BinaryFormatterParser(
            data,
            keep_meta=True,
            dereference=dereference,
            ignore_errors=ignore_errors,
        )
        # Each record is exported together with its textual representation.
        records = []
        for record in parser:
            records.append({'Type': repr(record), 'Data': record})
        return self.to_json(records)

class dnfields (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.pe.dotnet.dnfields and has the following commandline Interface:

usage: dnfields [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path ...]

This unit can extract data from constant field variables in classes of .NET executables. Since
the .NET header stores only the offset and not the size of constant fields, heuristics are used
to search for opcode sequences that load the data and additional heuristics are used to guess the
size of the data type.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class dnfields(PathExtractorUnit):
    """
    This unit can extract data from constant field variables in classes of .NET
    executables. Since the .NET header stores only the offset and not the size of
    constant fields, heuristics are used to search for opcode sequences that load
    the data and additional heuristics are used to guess the size of the data
    type.
    """
    @classmethod
    def handles(cls, data):
        from refinery.lib.id import is_likely_pe_dotnet
        return is_likely_pe_dotnet(data)

    def unpack(self, data):
        # Two passes produce the results: the first pass handles all fields listed in the
        # FieldRVA table, the second pass looks for string assignments to all other fields.
        # The collected results are sorted by path and emitted at the very end.
        header = DotNetHeader(data, parse_resources=False)
        tables = header.meta.Streams.Tables
        fields = tables.FieldRVA
        cpaths = CodePath(header)

        # Without FieldRVA entries, nothing is extracted; the second pass is skipped as well.
        if not fields:
            return

        # cache of previously guessed field information, keyed by field signature
        icache: dict[bytes, FieldInfo] = {}
        memory = memoryview(data)

        # Searches the input for an opcode sequence that creates an array and initializes it
        # from the field with index t. Returns a pair (name, info); both entries are None when
        # no usable match is found. The sizemap default maps regular expressions for type names
        # to the element size in bytes; it is only read, never modified.
        def _guess_field_info(t: int, signature: bytes, field_name: str | None = None, sizemap: dict = {
            '^s?byte$'       : 1,
            '^s?char$'       : 2,
            '^[us]?int.?16$' : 2,
            '^[us]?int.?32$' : 4,
            '^[us]?int.?64$' : 8,
        }) -> tuple[str | None, FieldInfo]:
            try:
                info = icache[signature]
            except KeyError:
                info = None
            else:
                # a cached result is sufficient when the caller already knows the field name
                if field_name is not None:
                    return field_name, info
            # the three low bytes of t are inserted into the ldtoken operand, lowest byte first
            pattern = (
                BR'(\x20....|\x1F.|[\x17-\x1E])'    # ldc.i4  count
                BR'\x8D(...)([\x01\x02])'           # newarr  col|row
                BR'\x25'                            # dup
                BR'\xD0\x%02x\x%02x\x%02x\x04'      # ldtoken t
                BR'(?:.{0,12}?'                     # ...
                BR'\x80(...)\x04)?' % (             # stsfld variable
                    (t >> 0x00) & 0xFF,
                    (t >> 0x08) & 0xFF,
                    (t >> 0x10) & 0xFF
                )
            )
            for match in re.finditer(pattern, memory, flags=re.DOTALL):
                if info is None:
                    count, j, r, name = match.groups()
                    count = integer_from_ldc(count)
                    # j is the 24-bit row index, r is the table index of the element type
                    j, r = struct.unpack('<LB', B'%s\0%s' % (j, r))
                    typename = tables[r][j - 1].TypeName
                else:
                    name = match.group(4)
                    typename = info.type
                # NOTE: this loop variable shadows the search pattern above; the iterator of
                # the enclosing loop was already created and is not affected by this.
                for pattern, size in sizemap.items():
                    if not re.match(pattern, typename, flags=re.IGNORECASE):
                        continue
                    if name:
                        # name holds the 3 operand bytes of the optional stsfld instruction;
                        # they are used as a row index into table 4 to look up a name
                        try:
                            name = struct.unpack('<L', B'%s\0' % name)
                            name = name[0]
                            name = tables[4][name - 1].Name
                        except Exception as E:
                            self.log_info(F'attempt to parse field name failed: {E!s}')
                            name = None
                    if name is None:
                        name = field_name
                    if info is None:
                        info = FieldInfo(typename, count, size, match.start())
                    icache[signature] = info
                    return name, info
            else:
                # reached when no match had a type name that is covered by the sizemap
                return None, None

        iwidth = len(str(len(fields)))
        # NOTE(review): rwidth does not appear to be used anywhere else in this method
        rwidth = max(len(F'{field.RVA:X}') for field in fields)
        rwidth = max(rwidth, 4)
        # zero-based indices of Field table rows that are not referenced by any FieldRVA entry
        remaining_field_indices = set(range(len(tables.Field)))

        unpack = []
        # used below to replace field names that occur more than once by a generic name
        name_count = Counter(tables.Field[rv.Field.Index - 1].Name for rv in fields)
        name_width = len(str(len(fields)))

        # first pass: fields that have an entry in the FieldRVA table
        for k, rv in enumerate(fields):
            _index = rv.Field.Index
            field = tables.Field[_index - 1]
            remaining_field_indices.discard(_index - 1)
            if not field.Flags.HasFieldRVA:
                continue
            fname = field.Name
            type = None
            signature: bytes = field.Signature
            offset = header.pe.rva_to_offset(rv.RVA)

            if len(signature) == 2:
                # Crude signature parser for non-array case. Reference:
                # https://www.codeproject.com/Articles/42649/NET-File-Format-Signatures-Under-the-Hood-Part-1
                # https://www.codeproject.com/Articles/42655/NET-file-format-Signatures-under-the-hood-Part-2
                # NOTE(review): FieldInfo arguments are presumably (type, count, size, offset);
                # Char has size 1 here while the sizemap above uses 2 for char - confirm.
                guess = {
                    0x03: FieldInfo('Char',   1, 1, 0),  # noqa
                    0x04: FieldInfo('SByte',  1, 1, 0),  # noqa
                    0x05: FieldInfo('Byte',   1, 1, 0),  # noqa
                    0x06: FieldInfo('Int16',  1, 2, 0),  # noqa
                    0x07: FieldInfo('UInt16', 1, 2, 0),  # noqa
                    0x08: FieldInfo('Int32',  1, 4, 0),  # noqa
                    0x09: FieldInfo('UInt32', 1, 4, 0),  # noqa
                    0x0A: FieldInfo('Int64',  1, 8, 0),  # noqa
                    0x0B: FieldInfo('UInt64', 1, 8, 0),  # noqa
                    0x0C: FieldInfo('Single', 1, 4, 0),  # noqa
                    0x0D: FieldInfo('Double', 1, 8, 0),  # noqa
                }.get(signature[1], None)
            else:
                fname, guess = _guess_field_info(_index, signature, fname)

            if guess is None:
                self.log_warn(lambda: F'field {k:0{iwidth}d} with signature {field.Signature.hex()}: unable to guess type information')
                continue
            # unprintable and ambiguous names are replaced by a name based on the position
            if not fname.isprintable() or name_count[fname] > 1:
                fname = F'Field{k + 1:0{name_width}d}'
            type = guess.type.lower()
            if guess.count > 1:
                type += F'[{guess.count}]'
            self.log_debug(
                F'field {k:0{iwidth}d}; token 0x{_index:06X}; RVA 0x{rv.RVA:04X}; count {guess.count}; type {guess.type}; name {fname}')
            end = offset + guess.count * guess.size
            # a zero offset means that no opcode sequence was located for this field
            path = cpaths.method_path(guess.offset) if guess.offset else ''
            unpack.append(UnpackResult(F'{path}/{fname}', memory[offset:end], name=fname, type=type))

        # second pass: search for string values that are stored into the remaining fields
        for _index in remaining_field_indices:
            field = tables.Field[_index]
            index = _index + 1
            name = field.Name
            if field.Flags.HasFieldRVA:
                self.log_warn(F'field {name} has RVA flag set, but no RVA was found')
            token = index.to_bytes(3, 'little')
            # maps the offset of each matching opcode sequence to the encoded string
            values = {}
            for match in re.finditer((
                BR'\x72(?P<token>...)\x70'          # ldstr
                BR'(?:\x6F(?P<function>...)\x0A)?'  # call GetBytes
                BR'\x80%s\x04'                      # stsfld
            ) % re.escape(token), data, re.DOTALL):
                md = match.groupdict()
                fn_token = md.get('function')
                # NOTE(review): a function index of zero is turned into None by this expression
                fn_index = fn_token and int.from_bytes(fn_token, 'little') or None
                if fn_index is not None:
                    # NOTE(review): unlike the other table lookups in this method, the index
                    # is not reduced by one here - confirm that this is intended.
                    fn_name = tables.MemberRef[fn_index].Name
                    if fn_name != 'GetBytes':
                        self.log_info(F'skipping string assignment passing through call to {fn_name}')
                        continue
                k = int.from_bytes(md['token'], 'little')
                values[match.start()] = header.meta.Streams.US[k].encode(self.codec)
            if not values:
                continue
            # only fields with exactly one matching assignment produce a result
            if len(values) == 1:
                offset, value = values.popitem()
                path = cpaths.method_path(offset)
                unpack.append(UnpackResult(F'{path}/{name}', value, name=name, type='string'))

        unpack.sort(key=lambda u: u.path)
        yield from unpack

class dnhdr (resources=False, encode=None, digest=None, arrays=False)

This unit is implemented in refinery.units.formats.pe.dotnet.dnhdr and has the following commandline Interface:

usage: dnhdr [-h] [-L] [-Q] [-0] [-v] [-F] [-r] [-e U | -d U | -a]

Parses the .NET header of the input data. The output is a JSON representation of the parsed
header and metadata; optionally, the parsed .NET resources are included as well.

options:
  -r, --resources  Also parse .NET resources.
  -e, --encode U   Select an encoder unit used to represent binary data in the JSON output. This
                   unit must be reversible and produce UTF8 encoded string output when operated
                   in reverse. Common examples are hex and b64.
  -d, --digest U   Select a hashing unit to digest all byte strings: Instead of the data, only
                   the hash will be displayed.
  -a, --arrays     Encode all byte strings as integer arrays. These arrays will have unsigned
                   integer entries between 0 and 255.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class dnhdr(DotNetJSONEncoderUnit):
    """
    Parses the .NET header of the input data. The output is a JSON representation of the
    parsed header and metadata; optionally, the parsed .NET resources are included as well.
    """
    def __init__(
        self,
        resources: Param[bool, Arg.Switch('-r', '--resources', help='Also parse .NET resources.')] = False,
        encode=None, digest=None, arrays=False,
    ):
        super().__init__(encode=encode, digest=digest, arrays=arrays, resources=resources)

    def process(self, data):
        header = DotNetHeader(data, parse_resources=self.args.resources)
        # The parsed header and the output dictionary need separate names: the resources
        # are read from the header object after the dictionary has been created.
        result = {
            'Head': header.head,
            'Meta': header.meta,
        }

        if self.args.resources:
            result['RSRC'] = header.resources

        return self.to_json(result)

    @classmethod
    def handles(cls, data):
        from refinery.lib.id import is_likely_pe_dotnet
        return is_likely_pe_dotnet(data)

class dnmr (*paths, list=False, join_path=False, drop_path=False, exact=False, fuzzy=0, regex=False, path=b'name', raw=False)

This unit is implemented in refinery.units.formats.pe.dotnet.dnmr and has the following commandline Interface:

usage: dnmr [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-e | -z] [-r] [-P NAME] [-w] [path ...]

Extracts subfiles from .NET managed resources.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -e, --exact      Path patterns never match on substrings.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "name".
  -w, --raw        Do not deserialize the managed resource entry data.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class dnmr(PathExtractorUnit):
    """
    Extracts subfiles from .NET managed resources.
    """
    def __init__(
        self, *paths, list=False, join_path=False, drop_path=False, exact=False, fuzzy=0, regex=False, path=b'name',
        raw: Param[bool, Arg.Switch('-w', help='Do not deserialize the managed resource entry data.')] = False
    ):
        super().__init__(
            *paths,
            list=list,
            join_path=join_path,
            drop_path=drop_path,
            exact=exact,
            fuzzy=fuzzy,
            regex=regex,
            path=path,
            raw=raw,
        )

    def unpack(self, data):
        try:
            resources = NetStructuredResources(data)
        except NoManagedResource:
            resources = None
        if not resources:
            raise RefineryPartialResult('no managed resources found', partial=data)
        for entry in resources:
            if entry.Error:
                self.log_warn(F'entry {entry.Name} carried error message: {entry.Error}')
            # The raw entry data is the default output; unless raw output was requested,
            # a string or buffer value of the entry takes precedence over it.
            payload = entry.Data
            if not self.args.raw:
                value = entry.Value
                if isinstance(value, str):
                    payload = value.encode('utf-16le')
                elif isbuffer(value):
                    payload = value
            yield UnpackResult(entry.Name, payload)

    @classmethod
    def handles(cls, data):
        # the input is handled when it starts with these four signature bytes
        signature = data[:4]
        return signature == b'\xCE\xCA\xEF\xBE'

class dnopc (*, count=None, until=None, nvar='name', avar='addr', ovar='arg')

This unit is implemented in refinery.units.formats.pe.dotnet.dnopc and has the following commandline Interface:

usage: dnopc [-h] [-L] [-Q] [-0] [-v] [-c N] [-u STR] [-n STR] [-a STR] [-o STR]

Disassembles the input data as MSIL (.NET/C# bytecode) and generates opcodes with metadata as
output. This is useful for programmatic disassembly, while the dnasm unit outputs a human-
readable representation.

options:
  -c, --count N    Maximum number of bytes to disassemble, infinite by default.
  -u, --until STR  Disassemble until the given string appears among the disassembly.
  -n, --nvar STR   Variable to receive the disassembled mnemonic. Default is "name".
  -a, --avar STR   Variable to receive the address of the instruction. Default is "addr".
  -o, --ovar STR   Variable prefix for instruction operands. Default is "arg". The complete
                   operand string will be in args, the first argument in arg1, the second in
                   arg2, and so on.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class dnopc(DotnetDisassemblerUnit):
    """
    Disassembles the input data as MSIL (.NET/C# bytecode) and generates opcodes with metadata as output. This
    is useful for programmatic disassembly, while the `refinery.dnasm` unit outputs a human-readable
    representation.
    """

    def __init__(
        self,
        *,
        count=None,
        until=None,
        nvar: Param[str, Arg.String(
            '-n',
            help='Variable to receive the disassembled mnemonic. Default is "{default}".',
        )] = 'name',
        avar: Param[str, Arg.String(
            '-a',
            help='Variable to receive the address of the instruction. Default is "{default}".',
        )] = 'addr',
        ovar: Param[str, Arg.String(
            '-o',
            help=('Variable prefix for instruction operands. Default is "{default}". The complete operand '
                  'string will be in {default}s, the first argument in {default}1, the second in {default}2, '
                  'and so on.'),
        )] = 'arg',
        **more
    ):
        super().__init__(
            count=count,
            until=until,
            nvar=nvar,
            avar=avar,
            ovar=ovar,
            **more
        )

    def process(self, data):
        needle = str(self.args.until or '').lower()
        formatter = OutputFactory()
        for ins in Disassembler().disasm(data, self.args.count):
            # Each instruction becomes one chunk, labelled with its offset, its mnemonic,
            # and one numbered variable per operand.
            # NOTE(review): the help text also mentions a variable with the suffix "s" for
            # the complete operand string; it does not appear to be set here - confirm.
            labels = {
                self.args.avar: ins.offset,
                self.args.nvar: ins.op.mnemonic,
            }
            labels.update(
                (F'{self.args.ovar}{position}', operand.value)
                for position, operand in enumerate(ins.arguments, 1)
            )
            yield self.labelled(ins.data, **labels)

            if not needle:
                continue
            # the instruction that contains the search string is emitted before stopping
            if needle in formatter.instruction(ins).lower():
                break

class dnrc (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.pe.dotnet.dnrc and has the following commandline Interface:

usage: dnrc [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path ...]

Extracts all .NET resources whose name matches any of the given patterns and outputs them. Use
the dnmr unit to extract subfiles from managed .NET resources.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; the
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class dnrc(PathExtractorUnit):
    """
    Extracts all .NET resources whose name matches any of the given patterns
    and outputs them. Use the `refinery.units.formats.pe.dotnet.dnmr` unit to
    extract subfiles from managed .NET resources.
    """
    def unpack(self, data):
        """
        Yield one `UnpackResult` per resource of the parsed .NET header, using the resource name as
        the path. A file without resources raises a `ValueError`, except in list mode where nothing
        is produced.
        """
        header = DotNetHeader(data)

        if header.resources:
            for entry in header.resources:
                yield UnpackResult(entry.Name, entry.Data)
            return

        # No resources: listing silently yields nothing, extraction is an error.
        if not self.args.list:
            raise ValueError('This file contains no resources.')

    @classmethod
    def handles(cls, data):
        """
        Report whether the input looks like a .NET PE file.
        """
        from refinery.lib.id import is_likely_pe_dotnet as looks_like_dotnet
        return looks_like_dotnet(data)

class dnsdomain (min=1, max=0, len=0, stripspace=False, duplicates=False, longest=False, take=0)

This unit is implemented in refinery.units.pattern.dnsdomain and has the following commandline Interface:

usage: dnsdomain [-h] [-L] [-Q] [-0] [-v] [-n N] [-m N] [-e N] [-x] [-r] [-l] [-t K]

Extracts domain names in the format as they appear in DNS requests. This can be used as a quick
and dirty way to extract domains from PCAP files, for example.

options:
  -n, --min N       Matches must have length at least N.
  -m, --max N       Matches must have length at most N.
  -e, --len N       Matches must be of length N.
  -x, --stripspace  Strip all whitespace from input data.
  -r, --duplicates  Yield every (transformed) Match, even when it was found before.
  -l, --longest     Pick longer results first. The output will be sorted by length unless the
                    --take option is specified, in which case the longest K results will be
                    returned in order of appearance.
  -t, --take K      Return only the first K occurrences in order of appearance. If --longest is
                    specified, the K longest results will be returned in order of appearance
                    within the input.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class dnsdomain(PatternExtractorBase):
    """
    Extracts domain names in the format as they appear in DNS requests. This
    can be used as a quick and dirty way to extract domains from PCAP files,
    for example.
    """

    _DOMAIN_CHARACTERS = (
        B'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        B'abcdefghijklmnopqrstuvwxyz'
        B'0123456789-_'
    )

    _DOMAIN_PATTERN = BR'(?:%s){1,20}(?:%s)\b' % (_lps(0xFF), _lps(25))

    def process(self, data):
        """
        Search the input for sequences of length-prefixed labels and yield each one converted to
        dotted notation. Candidates that are malformed or contain characters outside of
        `_DOMAIN_CHARACTERS` are discarded.
        """

        def transform(match):
            """
            Convert a regex match of length-prefixed labels into a dotted name. Returns `None` when
            a label would extend past the end of the match or contains a disallowed character.
            """
            name = bytearray(match[0])
            size = len(name)
            pos = 0
            while pos < size:
                length = name[pos]
                # The label occupies the `length` bytes that follow its length byte.
                end = pos + 1 + length
                if end > size:
                    return None
                # Validate the entire label including its final byte.
                if any(x not in self._DOMAIN_CHARACTERS for x in name[pos + 1:end]):
                    return None
                # Replace the length byte with a dot separator.
                name[pos] = 0x2E
                pos = end
            # Strip the dot that replaced the very first length byte.
            return name[1:]

        yield from self.matches_filtered(memoryview(data), self._DOMAIN_PATTERN, transform)

class dnsfx (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.pe.dotnet.dnsfx and has the following commandline Interface:

usage: dnsfx [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path ...]

Extracts files from .NET single file applications.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; the
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class dnsfx(PathExtractorUnit):
    """
    Extracts files from .NET single file applications.
    """
    _SIGNATURE = bytes([
        # 32 bytes represent the bundle signature: SHA-256 for '.net core bundle'
        0x8b, 0x12, 0x02, 0xb9, 0x6a, 0x61, 0x20, 0x38,
        0x72, 0x7b, 0x93, 0x02, 0x14, 0xd7, 0xa0, 0x32,
        0x13, 0xf5, 0xb9, 0xe6, 0xef, 0xae, 0x33, 0x18,
        0xee, 0x3b, 0x2d, 0xce, 0x24, 0xb3, 0x6a, 0xae
    ])

    def unpack(self, data):
        """
        Parse the bundle manifest and yield one `UnpackResult` for each file entry. Entries that
        declare a nonzero compressed size are read with that size and passed through the `zl` unit;
        all other entries are read verbatim. Running out of data while parsing an entry logs a
        warning and ends the extraction.

        Raises a `ValueError` when no valid manifest offset can be located.
        """
        reader = StreamReader(data)
        reader.seek(self._find_bundle_manifest_offset(data))

        major_version = reader.expect(UInt32)
        minor_version = reader.expect(UInt32)
        self.log_info(F'version {major_version}.{minor_version}')

        count = reader.expect(UInt32)
        bhash = reader.expect(StringPrimitive)
        self.log_info(F'bundle {bhash} contains {count} files')

        if major_version >= 2:
            # These header fields are not used; they are read only to advance the reader.
            reader.expect(UInt64) # depsOffset
            reader.expect(UInt64) # depsSize
            reader.expect(UInt64) # runtimeConfigOffset
            reader.expect(UInt64) # runtimeConfigSize
            reader.expect(UInt64) # flags

        for _ in range(count):
            try:
                offset = reader.expect(UInt64)
                size = reader.expect(UInt64)
                compressed_size = 0
                if major_version >= 6:
                    compressed_size = reader.expect(UInt64)
                item_type = reader.expect(Byte)
                path = reader.expect(StringPrimitive)

                def _logmsg():
                    _log = F'read item at offset 0x{offset:08X}, type 0x{item_type:02X}, size {SizeInt(size)!r}'
                    if compressed_size:
                        return F'{_log}, compressed to size {SizeInt(compressed_size)!r}'
                    return F'{_log}, uncompressed'

                self.log_debug(_logmsg)

                # Read the item contents inside a checkpoint while the manifest entries are parsed
                # sequentially.
                with reader.checkpoint():
                    reader.seek(offset)
                    if compressed_size:
                        item_data = reader.read(compressed_size) | zl | bytearray
                    else:
                        item_data = reader.read(size)

                yield UnpackResult(path, item_data)
            except ParserEOF:
                self.log_warn('unexpected EOF while parsing bundle, terminating')
                break

    def _find_bundle_manifest_offset(self, data: bytearray) -> int:
        """
        Locate the bundle signature and return the little-endian 64-bit integer stored in the eight
        bytes directly in front of it, which is used as the manifest offset.

        Raises a `ValueError` when the signature is absent or when fewer than eight bytes precede it.
        """
        position = data.find(self._SIGNATURE, 0)
        # A negative position means the signature was not found. A position below 8 leaves no room
        # for the offset field; the slice below would then start at a negative index and wrap
        # around to the end of the buffer, producing a bogus offset.
        if position < 8:
            raise ValueError('Cannot find valid Bundle Manifest offset. Is this a .NET Bundle?')
        return int.from_bytes(data[position - 8:position], 'little')

    @classmethod
    def handles(cls, data):
        """
        Report whether the input contains the bundle signature.
        """
        return buffer_contains(data, cls._SIGNATURE)

class dnstr (user=True, meta=True)

This unit is implemented in refinery.units.formats.pe.dotnet.dnstr and has the following commandline Interface:

usage: dnstr [-h] [-L] [-Q] [-0] [-v] [-F] [-m | -u]

Extracts all strings defined in the #Strings and #US streams of .NET executables.

options:
  -m, --meta     Only extract from #Strings.
  -u, --user     Only extract from #US.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class dnstr(Unit):
    """
    Extracts all strings defined in the `#Strings` and `#US` streams of .NET executables.
    """

    def __init__(
        self,
        user: Param[bool, Arg.Switch('-m', '--meta', off=True, group='HEAP', help='Only extract from #Strings.')] = True,
        meta: Param[bool, Arg.Switch('-u', '--user', off=True, group='HEAP', help='Only extract from #US.')] = True,
    ):
        # Both heaps are enabled by default. Each switch is declared with off=True on the *other*
        # heap's flag, so presumably passing --meta clears `user` and passing --user clears `meta`.
        if not (meta or user):
            raise ValueError('Either ascii or utf16 strings must be enabled.')
        super().__init__(meta=meta, user=user)

    def process(self, data):
        """
        Yield every string of the enabled heaps, encoded with the unit codec: first the entries of
        `#Strings`, then the entries of `#US`.
        """
        header = DotNetHeader(data, parse_resources=False)
        if self.args.meta:
            yield from (
                entry.encode(self.codec)
                for entry in header.meta.Streams.Strings.values()
            )
        if self.args.user:
            yield from (
                entry.encode(self.codec)
                for entry in header.meta.Streams.US.values()
            )

    @classmethod
    def handles(cls, data):
        """
        Report whether the input looks like a .NET PE file.
        """
        from refinery.lib.id import is_likely_pe_dotnet as looks_like_dotnet
        return looks_like_dotnet(data)

class docmeta (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.office.docmeta and has the following commandline Interface:

usage: docmeta [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path ...]

Extract metadata from Word Documents such as custom document properties.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; the
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class docmeta(PathExtractorUnit):
    """
    Extract metadata from Word Documents such as custom document properties.
    """
    @PathExtractorUnit.Requires('olefile', ['formats', 'office'])
    def _olefile():
        import olefile
        return olefile

    def unpack(self, data: bytearray):
        """
        Read `docProps/custom.xml` from the document and yield one `UnpackResult` per property
        node, using the `name` attribute as the path and the encoded content of its single child
        as the data. Nothing is produced when the part is missing or empty.
        """
        document = data | xtdoc('docProps/custom.xml') | str
        if not document:
            return
        root = xml.parse(document)
        # Descend along first children until the <properties> element is reached.
        while root.tag.lower() != 'properties':
            root = root.children[0]
        for entry in root:
            assert entry.tag.lower() == 'property'
            assert len(entry.children) == 1
            value = entry.children[0].content
            assert value is not None
            yield UnpackResult(entry.attributes['name'], value.encode(self.codec))

class doctxt

This unit is implemented in refinery.units.formats.office.doctxt and has the following commandline Interface:

usage: doctxt [-h] [-L] [-Q] [-0] [-v]

Extracts the text body from Word documents.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class doctxt(Unit):
    """
    Extracts the text body from Word documents.
    """

    @Unit.Requires('olefile', ['formats', 'office', 'extended'])
    def _olefile():
        import olefile
        return olefile

    def process(self, data: bytearray):
        """
        Try the available extractors one after another and return the encoded text produced by the
        first one that succeeds. The default order is doc, docx, odt. For input that starts with a
        ZIP signature, the doc extractor is tried last, and the odt extractor is tried first when
        the archive's `mimetype` entry mentions "opendocument".

        An `ImportError` is always propagated; any other exception raised by an extractor is logged
        and the next extractor is tried. A `ValueError` is raised when all extractors fail.
        """
        # The insertion order of this mapping is the order in which extractors are attempted.
        extractors: dict[str, Callable[[bytearray], str]] = OrderedDict(
            doc=self._extract_ole,
            docx=self._extract_docx,
            odt=self._extract_odt,
        )
        if data.startswith(B'PK'):
            self.log_debug('document contains zip file signature, likely a odt or docx file')
            extractors.move_to_end('doc')
            if 'opendocument' in str(data | xtzip('mimetype')):
                self.log_debug('odt signature detected')
                extractors.move_to_end('odt', last=False)
        for filetype, extractor in extractors.items():
            self.log_debug(F'trying to extract as {filetype}')
            try:
                result = extractor(data)
            except ImportError:
                # A missing dependency must not be mistaken for an unsupported format.
                raise
            except Exception as error:
                self.log_info(F'failed extractring as {filetype}: {error!s}')
            else:
                return result.encode(self.codec)
        raise ValueError('All extractors failed, the input data is not recognized as any known document format.')

    def _extract_docx(self, data: Chunk) -> str:
        """
        Extract the text from `word/document.xml` inside the archive. The text nodes of each
        paragraph element are concatenated and paragraphs are separated by a single line break.
        Raises a `ValueError` when the XML part is missing or empty.
        """
        NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
        PARAGRAPH = F'{NAMESPACE}p'
        TEXT = F'{NAMESPACE}t'
        chunk = data | xtzip('word/document.xml') | bytearray
        if not chunk:
            raise ValueError('No document.xml file found.')
        root: Element = XML(chunk)
        with StringIO() as output:
            for index, paragraph in enumerate(root.iter(PARAGRAPH)):
                # Separator goes between paragraphs only, not after the last one.
                if index > 0:
                    output.write('\n')
                for node in paragraph.iter(TEXT):
                    if node.text:
                        output.write(node.text)
            return output.getvalue()

    def _extract_odt(self, data: bytes):
        """
        Extract the text from the `content.xml` entry of the archive by recursively collecting the
        text below the office `body`/`text` element. Raises a `ValueError` when the archive has no
        `content.xml` entry.
        """
        def _extract_text(node: Element):
            """
            Recursively collect the text of all children of `node`: paragraph and span elements
            contribute their text, a space element contributes one blank, and any other tag is
            logged. The tail text of every child is appended after its own children, and each
            paragraph is terminated with a line break.
            """
            NAMESPACE = '{urn:oasis:names:tc:opendocument:xmlns:text:1.0}'
            PARAGRAPH = F'{NAMESPACE}p'
            SPAN = F'{NAMESPACE}span'
            SPACE = F'{NAMESPACE}s'
            with StringIO() as res:
                for element in node:
                    tag = element.tag
                    text = element.text or ''
                    tail = element.tail or ''
                    if tag in [PARAGRAPH, SPAN]:
                        res.write(text)
                    elif tag == SPACE:
                        res.write(' ')
                    else:
                        self.log_debug(F'unknown tag: {tag}')
                    # Children of unknown tags are still traversed.
                    res.write(_extract_text(element))
                    res.write(tail)
                    if tag == PARAGRAPH:
                        res.write('\n')
                return res.getvalue()

        NAMESPACE = '{urn:oasis:names:tc:opendocument:xmlns:office:1.0}'
        BODY = F'{NAMESPACE}body'
        TEXT = F'{NAMESPACE}text'
        for part in xtzip().unpack(data):
            if part.path != 'content.xml':
                continue
            xml_content: bytes = part.get_data()
            root: Element = XML(xml_content)
            body: Element = root.find(BODY)
            text: Element = body.find(TEXT)
            return _extract_text(text)
        else:
            # Reached only when the loop finishes without having found content.xml.
            raise ValueError('found no text')

    def _extract_ole(self, data: bytearray) -> str:
        """
        Extract the text from an OLE document: read the `WordDocument` stream, locate the piece
        table inside the `0Table` or `1Table` stream, and decode the text pieces it describes.
        """
        stream = MemoryFile(data)
        with self._olefile.OleFileIO(stream) as ole:
            doc = ole.openstream('WordDocument').read()
            with StructReader(doc) as reader:
                # Bit 1 of byte 11 selects between the '0Table' and the '1Table' stream.
                # NOTE(review): presumably the fWhichTblStm flag of the FIB - confirm against MS-DOC.
                table_name = F'{(doc[11] >> 1) & 1}Table'
                # Two consecutive 32-bit values at 0x1A2 give offset and length of the data to read
                # from the table stream. NOTE(review): presumably fcClx and lcbClx - confirm.
                reader.seek(0x1A2)
                offset = reader.u32()
                length = reader.u32()
            with StructReader(ole.openstream(table_name).read()) as reader:
                reader.seek(offset)
                table = reader.read(length)
            piece_table = self._load_piece_table(table)
            return self._get_text(doc, piece_table)

    def _load_piece_table(self, table: bytes) -> bytes:
        """
        Scan the typed entries in `table` and return the payload of the first entry of type 2.
        Entries of type 1 are skipped. Any other entry type raises `NotImplementedError`. When the
        data is exhausted without a type 2 entry, the method falls through and returns `None`.
        """
        with StructReader(table) as reader:
            while not reader.eof:
                entry_type = reader.read_byte()
                if entry_type == 1:
                    # Type 1: a single size byte follows; skip over that many bytes.
                    reader.seekrel(reader.read_byte())
                    continue
                if entry_type == 2:
                    # Type 2: a 32-bit size follows; the next `length` bytes are the piece table.
                    length = reader.u32()
                    return reader.read(length)
                raise NotImplementedError(F'Unsupported table entry type value 0x{entry_type:X}.')

    def _get_text(self, doc: bytes, piece_table: bytes) -> str:
        """
        Decode and concatenate all text pieces described by `piece_table` from the `doc` stream.
        Carriage returns in the decoded text are replaced by line feeds.
        """
        # The table holds one more 4-byte character position than it holds 8-byte descriptors,
        # i.e. its size is 4 + 12 * pieces; piece_count is the number of character positions.
        piece_count: int = 1 + (len(piece_table) - 4) // 12
        with StringIO() as text:
            with StructReader(piece_table) as reader:
                character_positions = [reader.u32() for _ in range(piece_count)]
                for i in range(piece_count - 1):
                    cp_start = character_positions[i]
                    cp_end = character_positions[i + 1]
                    # Each descriptor is 8 bytes; only the 32-bit value in the middle is used.
                    fc_value = reader.read_one_struct('xxLxx')
                    # Bit 30 marks a single-byte encoded piece; the remaining bits are the offset.
                    is_ansi = bool((fc_value >> 30) & 1)
                    fc = fc_value & 0xBFFFFFFF
                    cb = cp_end - cp_start
                    if is_ansi:
                        # One byte per character; the stored offset is twice the byte offset.
                        encoding = 'cp1252'
                        fc = fc // 2
                    else:
                        # Two bytes per character.
                        encoding = 'utf16'
                        cb *= 2
                    raw = doc[fc : fc + cb]
                    text.write(raw.decode(encoding).replace('\r', '\n'))
            return text.getvalue()

class drp (consecutive=False, align=False, min=1, max=0, len=0, all=False, threshold=20, weight=0, buffer=1024, chug=False)

This unit is implemented in refinery.units.misc.drp and has the following commandline Interface:

usage: drp [-h] [-L] [-Q] [-0] [-v] [-c] [-d] [-n N] [-N N] [-l N] [-a] [-t N] [-w N] [-b N | -g]

Detect Repeating Patterns - detects the most prevalent repeating byte pattern in a chunk of data.
The unit computes a suffix tree which may require a lot of memory for large buffers.

options:
  -c, --consecutive  Assume that the repeating pattern is consecutive when observable.
  -d, --align        Assume that the pattern occurs at offsets that are multiples of its length.
  -n, --min N        Minimum size of the pattern to search for. Default is 1.
  -N, --max N        Maximum size of the pattern to search for. Default is infinite.
  -l, --len N        Set the exact size of the pattern. This is equivalent to --min=N --max=N.
  -a, --all          Produce one output for each repeating pattern that was detected.
  -t, --threshold N  Patterns must match this performance threshold in percent, lest they be
                     discarded.
  -w, --weight N     Specifies how much longer patterns are favored over small ones. Default is
                     0.
  -b, --buffer N     Maximum number of bytes to inspect at once. The default is 1024.
  -g, --chug         Compute the prefix tree for the entire buffer instead of chunking it.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class drp(Unit):
    """
    Detect Repeating Patterns - detects the most prevalent repeating byte pattern
    in a chunk of data. The unit computes a suffix tree which may require a lot of
    memory for large buffers.
    """
    def __init__(
        self,
        consecutive: Param[bool, Arg.Switch('-c',
            help='Assume that the repeating pattern is consecutive when observable.')] = False,
        align: Param[bool, Arg.Switch('-d',
            help='Assume that the pattern occurs at offsets that are multiples of its length.')] = False,
        min: Param[int, Arg.Number('-n',
            help='Minimum size of the pattern to search for. Default is {default}.')] = 1,
        max: Param[int, Arg.Number('-N',
            help='Maximum size of the pattern to search for. Default is infinite.')] = 0,
        len: Param[int, Arg.Number('-l',
            help='Set the exact size of the pattern. This is equivalent to --min=N --max=N.')] = 0,
        all: Param[bool, Arg.Switch('-a',
            help='Produce one output for each repeating pattern that was detected.')] = False,
        threshold: Param[int, Arg.Number('-t',
            help='Patterns must match this performance threshold in percent, lest they be discarded.')] = 20,
        weight: Param[int, Arg.Number('-w',
            help='Specifies how much longer patterns are favored over small ones. Default is {default}.')] = 0,
        buffer: Param[int, Arg.Number('-b', group='BFR',
            help='Maximum number of bytes to inspect at once. The default is {default}.')] = 1024,
        chug: Param[bool, Arg.Switch('-g', group='BFR',
            help='Compute the prefix tree for the entire buffer instead of chunking it.')] = False
    ):
        # Note that the parameters min, max, len and all shadow the builtins inside this method.
        # A positive exact length overrides both bounds.
        if len >= 1:
            min = max = len
        super().__init__(
            min=min,
            # A maximum of zero means that there is no upper bound.
            max=max or INF,
            all=all,
            consecutive=consecutive,
            align=align,
            weight=weight,
            buffer=buffer,
            chug=chug,
            threshold=threshold
        )

    def _get_patterns(self, data):
        """
        Build a suffix tree for `data` and return the set of candidate patterns (as bytes) found by
        walking the buffer from front to back. Only candidates whose length lies within the
        configured minimum and maximum are kept. `data` has to support `tobytes()` on its slices;
        `process` passes memoryview objects.
        """
        with stackdepth(len(data)):
            tree = SuffixTree(data)
        min_size = self.args.min
        max_size = self.args.max
        patterns = set()
        cursor = 0
        while cursor < len(data):
            node = tree.root
            rest = data[cursor:]
            remaining = len(rest)
            length = 0
            offset = None
            # Descend from the root along the children whose first byte matches the next byte of
            # the remaining data.
            while node.children and length < remaining:
                for child in node.children.values():
                    if tree.data[child.start] == rest[length]:
                        node = child
                        break
                # Stop as soon as the node does not start before the cursor.
                # NOTE(review): presumably this restricts matches to data that already occurred
                # earlier in the buffer - confirm against the SuffixTree implementation.
                if node.start >= cursor:
                    break
                offset = node.start - length
                length = node.end + 1 - offset
            if offset is None:
                # Nothing was matched at this position; advance by a single byte.
                cursor += 1
                continue
            length = min(remaining, length)
            if max_size >= length >= min_size:
                pattern = rest[:length].tobytes()
                patterns.add(pattern)
            # Continue the search behind the matched data.
            cursor += length
        del tree
        return patterns

    @staticmethod
    def _consecutive_count(data, pattern):
        """
        For patterns of length 1, return the plain number of occurrences. Otherwise, for every start
        offset below the pattern length, count the positions on the grid start, start + length,
        start + 2 * length, ... where the data equals the pattern, and return the largest count.
        """
        length = len(pattern)
        if length == 1:
            return data.count(pattern)
        view = memoryview(data)
        return max(sum(1 for i in range(k, len(view), length) if view[i:i + length] == pattern)
            for k in range(len(pattern)))

    @staticmethod
    def _truncate_pattern(pattern):
        """
        Remove a trailing part of the pattern that repeats its own beginning; for example, the
        pattern b'abcab' is reduced to b'abc'. Patterns without such a tail are returned unchanged.
        """
        # After the loop, offset is the length of the run of bytes at the end of the pattern that
        # matched the start of the pattern.
        offset = 0
        for byte in pattern[1:]:
            if byte == pattern[offset]:
                offset += 1
            else:
                offset = 0
        if offset > 0:
            pattern = pattern[:-offset]
        return pattern

    def process(self, data: bytearray):
        """
        Detect repeating patterns in `data` and yield either the best one, or all patterns above
        the performance threshold when the `all` option is set; in the latter case each output is
        labelled with its occurrence count. Input of at most one byte is returned unchanged.

        Raises `RefineryPartialResult` when no repeating sequence is found.
        """
        if len(data) <= 1:
            yield data
            return

        memview = memoryview(data)
        # Exponent applied to the pattern length when scoring; a weight of 0 gives exponent 1.
        weight = 1 + (self.args.weight / 10)

        if self.args.chug:
            patterns = self._get_patterns(memview)
        else:
            # Collect candidates from consecutive slices of at most `buffer` bytes each.
            patterns = set()
            chunksize = self.args.buffer
            for k in range(0, len(memview), chunksize):
                patterns |= self._get_patterns(memview[k:k + chunksize])
        if not patterns:
            raise RefineryPartialResult('no repeating sequences found', data)

        self.log_debug('removing duplicate pattern detections')
        # Discard every candidate that is a k-fold repetition (k >= 2) of another candidate.
        duplicates = set()
        maxlen = max(len(p) for p in patterns)
        for pattern in sorted(patterns, key=len):
            for k in range(2, maxlen // len(pattern) + 1):
                repeated = pattern * k
                if repeated in patterns:
                    duplicates.add(repeated)
        patterns -= duplicates

        self.log_debug(F'counting coverage of {len(patterns)} patterns')
        pattern_count = {p: data.count(p) for p in patterns}
        pattern_performance = dict(pattern_count)

        # The first pass scores patterns by their plain occurrence count. The second pass runs only
        # when the consecutive option is set and re-scores the truncated patterns by their
        # consecutive count.
        for consecutive in (False, True):
            if consecutive:
                self.log_debug(F're-counting coverage of {len(patterns)} patterns')
                patterns = {self._truncate_pattern(p) for p in patterns}
                pattern_performance = {p: self._consecutive_count(data, p) for p in patterns}

            self.log_debug('evaluating pattern performance')
            # Score each pattern as count * length ** weight, then normalize so that the best
            # pattern has a performance of exactly 1.
            for pattern, count in pattern_performance.items():
                pattern_performance[pattern] = count * (len(pattern) ** weight)
            best_performance = max(pattern_performance.values())
            for pattern, performance in pattern_performance.items():
                pattern_performance[pattern] = performance / best_performance

            self.log_debug('removing patterns below performance threshold')
            # The threshold is given in percent of the best performance.
            threshold = self.args.threshold
            patterns = {p for p in patterns if pattern_performance[p] * 100 >= threshold}
            pattern_count = {p: data.count(p) for p in patterns}

            if not self.args.consecutive:
                break

        if self.args.all:
            # Emit all remaining patterns, best performance first.
            for pattern in sorted(patterns, key=pattern_performance.get, reverse=True):
                yield self.labelled(pattern, count=pattern_count[pattern])
            return

        best_patterns = [p for p in patterns if pattern_performance[p] == 1.0]

        if len(best_patterns) > 1:
            self.log_warn('could not determine unique best repeating pattern, returning the first of these:')
            for k, pattern in enumerate(best_patterns):
                self.log_warn(F'{k:02d}.: {pattern.hex()}')

        result = best_patterns[0]

        if self.args.align:
            def rotated(pattern):
                # Generate every cyclic rotation of the pattern, starting with the pattern itself.
                for k in range(len(pattern)):
                    yield pattern[k:] + pattern[:k]
            # Map (first offset in data) modulo (pattern length) to the rotation found at that
            # offset and select the rotation with the smallest such value. Rotations that do not
            # occur in the data are ignored.
            rotations = {k % len(result): r for k, r in (
                (data.find(r), r) for r in rotated(result)) if k >= 0}
            result = rotations[min(rotations)]

        yield result

class dsjava

This unit is implemented in refinery.units.formats.java.deserialize and has the following commandline Interface:

usage: dsjava [-h] [-L] [-Q] [-0] [-v]

Deserialize Java serialized data and re-serialize as JSON.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class dsjava(Unit):
    """
    Deserialize Java serialized data and re-serialize as JSON.
    """
    @Unit.Requires('javaobj-py3>=0.4.0.1', ['formats'])
    def _javaobj():
        import javaobj.v2
        return javaobj.v2

    def process(self, data):
        """
        Load the input with `javaobj.v2` and return the result dumped by `JavaEncoder`, encoded with
        the unit codec.
        """
        with JavaEncoder as json_encoder:
            dump = json_encoder.dumps
            document = dump(self._javaobj.loads(data))
            return document.encode(self.codec)

class dsphp

This unit is implemented in refinery.units.formats.deserialize_php and has the following commandline Interface:

usage: dsphp [-h] [-L] [-Q] [-0] [-v] [-R]

Deserialize PHP serialized data and re-serialize as JSON.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class dsphp(Unit):
    """
    Deserialize PHP serialized data and re-serialize as JSON.
    """
    @Unit.Requires('phpserialize', ['formats'])
    def _php():
        import phpserialize
        return phpserialize

    def reverse(self, data):
        """
        Parse the input as JSON and return it in PHP serialized form.
        """
        serialize = self._php.dumps
        return serialize(json.loads(data))

    def process(self, data):
        """
        Deserialize the PHP input and return it as JSON with an indentation of four, encoded with
        the unit codec.
        """
        phpobject = self._php.phpobject

        class encoder(json.JSONEncoder):
            """
            JSON encoder that additionally handles byte strings and PHP objects.
            """
            def default(self, obj):
                try:
                    return super().default(obj)
                except TypeError:
                    pass
                # Reached only when the base encoder rejected the object.
                if isinstance(obj, (bytes, bytearray)):
                    return obj.decode('utf8')
                if isinstance(obj, phpobject):
                    return obj._asdict()
                # Any other unsupported object is serialized as null.
                return None

        to_json = json.dumps
        deserialized = self._php.loads(data, object_hook=phpobject, decode_strings=True)
        return to_json(deserialized, indent=4, cls=encoder).encode(self.codec)

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Parse the input as JSON and return the PHP serialization of the parsed object.
    """
    parsed = json.loads(data)
    return self._php.dumps(parsed)

class dump (*files, tee=False, stream=False, plain=False, force=False)

This unit is implemented in refinery.units.sinks.dump and has the following command-line interface:

usage: dump [-h] [-L] [-Q] [-0] [-v] [-t] [-s] [-p] [-f] [file ...]

Dump incoming data to files on disk. It is possible to specify filenames with format fields. Any
metadata field on an incoming chunk is available. Additionally, any field that can be populated
by the cm unit is also available. These include the following:

    {ext}    : Automatically guessed file extension
    {crc32}  : CRC32 checksum of the data
    {index}  : Index of the data in the input stream, starting at 0
    {size}   : Size of the data in bytes
    {md5}    : MD5 hash of the data
    {sha1}   : SHA1 hash of the data
    {sha256} : SHA-256 hash of the data
    {path}   : Associated path; defaults to {sha256} if none is given.

When not using formatted file names, the unit ingests as many incoming inputs as filenames were
specified on the command line. Unless connected to a terminal, the remaining inputs will be
forwarded on STDOUT. The -t or --tee switch can be used to forward all inputs, under all
circumstances, regardless of whether or not they have been processed.

If the data cannot be written to the specified path because a file already exists in place of a
directory that would have to be created, the unit renames the directory until dumping is
possible.

If no file is specified, all ingested inputs are concatenated and written to the clipboard. This
will only succeed when the data can successfully be encoded.

positional arguments:
  file           Optionally formatted filename.

options:
  -t, --tee      Forward all inputs to STDOUT.
  -s, --stream   Dump all incoming data to the same file.
  -p, --plain    Never apply any formatting to file names.
  -f, --force    Remove files if necessary to create dump path.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class dump(Unit):
    """
    Dump incoming data to files on disk. It is possible to specify filenames with format fields.
    Any metadata field on an incoming chunk is available. Additionally, any field that can be
    populated by the `refinery.cm` unit is also available. These include the following:

        {ext}    : Automatically guessed file extension
        {crc32}  : CRC32 checksum of the data
        {index}  : Index of the data in the input stream, starting at 0
        {size}   : Size of the data in bytes
        {md5}    : MD5 hash of the data
        {sha1}   : SHA1 hash of the data
        {sha256} : SHA-256 hash of the data
        {path}   : Associated path; defaults to {sha256} if none is given.

    When not using formatted file names, the unit ingests as many incoming inputs as filenames were
    specified on the command line. Unless connected to a terminal, the remaining inputs will be
    forwarded on STDOUT. The `-t` or `--tee` switch can be used to forward all inputs, under all
    circumstances, regardless of whether or not they have been processed.

    If the data cannot be written to the specified path because a file already exists in place of a
    directory that would have to be created, the unit renames the directory until dumping is possible.

    If no file is specified, all ingested inputs are concatenated and written to the clipboard. This
    will only succeed when the data can successfully be encoded.
    """

    def __init__(
        self, *files: Param[str, Arg.String(metavar='file', help='Optionally formatted filename.')],
        tee: Param[bool, Arg.Switch('-t', help='Forward all inputs to STDOUT.')] = False,
        stream: Param[bool, Arg.Switch('-s', help='Dump all incoming data to the same file.')] = False,
        plain: Param[bool, Arg.Switch('-p', help='Never apply any formatting to file names.')] = False,
        force: Param[bool, Arg.Switch('-f', help='Remove files if necessary to create dump path.')] = False,
    ):
        if stream and len(files) != 1:
            raise ValueError('Can only use exactly one file in stream mode.')
        super().__init__(files=files, tee=tee, stream=stream, force=force)
        # The currently open output handle (a file or an in-memory clipboard buffer), if any.
        self.stream = None
        # Formatting is only considered when not disabled and when a file name requires it.
        self._formatted = not plain and any(_has_format(f) for f in files)
        self._reset()

    def _reset(self):
        """
        Restart the iteration over the file names and close any stream that is still open.
        Formatted file names are cycled indefinitely, plain ones are each used once.
        """
        self.exhausted = False
        self.paths = cycle(self.args.files) if self._formatted else iter(self.args.files)
        self._close()

    @property
    def _clipcopy(self):
        """
        Indicates clipboard mode, which is active when no file names were given.
        """
        return not self.args.files

    # The cache is keyed on (self, base), so each path prefix is only resolved once per unit.
    @lru_cache(maxsize=None)
    def _fix_path_part(self, base: Path) -> Path:
        """
        Return a usable replacement for the directory path `base`. If `base` is not reported as
        an obstruction by `_is_path_obstruction`, it is returned unchanged. With the force option,
        the obstruction is deleted; a failure to do so raises `RefineryCriticalException`.
        Otherwise, the suffix of `base` is replaced by a counter until the path is unobstructed.
        """
        if not _is_path_obstruction(base):
            return base
        if self.args.force:
            try:
                os.unlink(base)
            except Exception:
                raise RefineryCriticalException(F'Unable to remove path obstruction: {base}.')
            else:
                self.log_info(F'removed path obstruction: {base}')
                return base
        else:
            stem = base = base.with_suffix('')
            counter = 0
            while _is_path_obstruction(base):
                base = stem.with_suffix(F'.{counter}')
                counter += 1
            return base

    def _fix_path(self, path: Path) -> Path:
        """
        Rebuild the parent directory of `path` one component at a time, passing each prefix
        through `_fix_path_part`, and return the fixed parent joined with the original file name.
        """
        fixed = Path()
        for p in path.parent.parts:
            fixed = self._fix_path_part(fixed / p)
        return fixed / path.name

    def _open(self, path, unc=False):
        """
        Return a writable binary handle for `path`. Objects that already have a `close` attribute
        are returned as they are. Otherwise, the parent directory is created and the file is opened
        in append mode when streaming, and in write mode otherwise. The `unc` flag marks the retry
        with a `\\\\?\\` prefix that is attempted on Windows when the directory creation fails with
        a `FileNotFoundError`. Other operating system errors during directory creation are logged
        and a handle to the null device is returned instead.
        """
        if hasattr(path, 'close'):
            return path
        path = self._fix_path(Path(path).absolute())
        base = path.parent
        try:
            os.makedirs(base, exist_ok=True)
        except FileNotFoundError:
            if unc or os.name != 'nt':
                raise
            return self._open(F'\\\\?\\{path}', unc=True)
        except FileExistsError:
            raise RefineryCriticalException(
                F'Unknown error while attempting to create parent directory: {base}')
        except OSError as e:
            # NOTE(review): log_info() without arguments presumably reports whether info-level
            # logging is active; the path is then repeated as a warning if it is not. Confirm.
            if not self.log_info():
                self.log_warn('opening:', path)
            self.log_warn('errored:', e.args[1])
            return open(os.devnull, 'wb')
        else:
            info = str(path)
            # For the UNC retry, the 4-character prefix is hidden in the log output.
            self.log_info('opening:', info[4:] if unc else info)
            mode = 'ab' if self.args.stream else 'wb'
            return path.open(mode)

    def _close(self, final=False):
        """
        Flush and close the current stream. In stream mode, the stream is only flushed unless
        `final` is set. In clipboard mode, the buffered data is copied to the clipboard before
        the buffer is closed: On Windows, data that Pillow can open as an image is copied as a
        bitmap and everything else as text; elsewhere, the decoded text is passed to pyperclip.
        """
        if not self.stream:
            return
        self.stream.flush()
        if self.args.stream and not final:
            return
        if self._clipcopy:
            if os.name == 'nt':
                from refinery.lib.winclip import CF, ClipBoard
                try:
                    img = self._image.open(self.stream)
                    with io.BytesIO() as out:
                        img.save(out, 'BMP')
                        # The payload has to be extracted while the buffer is still open, any
                        # access after the with-block raises a ValueError. The first 14 bytes
                        # are the BMP file header, which is not part of the DIB clipboard data.
                        dib = out.getvalue()[14:]
                except Exception:
                    with ClipBoard(CF.TEXT) as cpb:
                        cpb.copy(self.stream.getvalue())
                else:
                    with ClipBoard(CF.DIB) as cpb:
                        cpb.copy(dib)
            else:
                data = self.stream.getvalue()
                data = data.decode(self.codec, errors='backslashreplace')
                self._pyperclip.copy(data)
        self.stream.close()
        self.stream = None

    @Unit.Requires('pyperclip')
    def _pyperclip():
        import pyperclip
        return pyperclip

    @Unit.Requires('Pillow', ['formats'])
    def _image():
        from PIL import Image
        return Image

    def process(self, data: bytearray):
        """
        Write one chunk to the current output. In clipboard mode, the chunk is appended to the
        in-memory buffer. Otherwise, it is written to the stream opened by `filter`, or to the
        next file name if no stream is open. Once the file names are exhausted, chunks are
        forwarded unless the output is a terminal, in which case they are discarded with a
        warning. With the tee option, every chunk is forwarded.
        """
        forward_input_data = self.args.tee
        if self._clipcopy:
            if stream := self.stream:
                stream.write(data)
        elif not self.exhausted:
            if not self.stream:
                # This should happen only when the unit is called from Python code
                # rather than via the command line.
                try:
                    path = next(self.paths)
                except StopIteration:
                    raise RefineryCriticalException('the list of filenames was exhausted.')
                else:
                    with self._open(path) as stream:
                        stream.write(data)
            else:
                self.stream.write(data)
                self.log_debug(F'wrote 0x{len(data):08X} bytes')
                self._close()
        else:
            forward_input_data = forward_input_data or not self.isatty()
            if not forward_input_data:
                size = metavars(data).size
                self.log_warn(F'discarding unprocessed chunk of size {size!s}.')
        if forward_input_data:
            yield data

    def filter(self, chunks):
        """
        Prepare the output stream for each visible chunk before it is passed on. In clipboard
        mode, a single in-memory buffer collects all chunks. Otherwise, the next file name is
        taken, format fields are resolved against the chunk metadata, and the file is opened.
        An unresolved format expression raises a `ValueError` unless the leniency is at least 1.
        After the last chunk, the stream is closed for good and the unit is marked as exhausted.
        """
        if self.exhausted:
            self._reset()

        nostream = not self.args.stream
        clipcopy = self._clipcopy

        if clipcopy:
            self.stream = io.BytesIO()

        for index, chunk in enumerate(chunks, 0):
            if not chunk.visible:
                continue
            # In stream mode, the file is opened only once and then reused for all chunks.
            if not clipcopy and not self.exhausted and (nostream or not self.stream):
                try:
                    path = next(self.paths)
                except StopIteration:
                    self.exhausted = True
                else:
                    if _has_format(path):
                        meta = metavars(chunk)
                        meta.ghost = True
                        meta.index = index
                        new_path = meta.format_str(path, self.codec, [chunk])
                        if new_path != path:
                            path = new_path
                        elif self.leniency < 1:
                            raise ValueError(
                                F'Could not resolve formatting in path "{path}"; '
                                R'increase leniency to ignore this.')
                    self.stream = self._open(path)
            yield chunk

        self._close(final=True)
        self.exhausted = True

class eat (name)

This unit is implemented in refinery.units.meta.eat and has the following command-line interface:

usage: eat [-h] [-L] [-Q] [-0] [-v] name

Consume a meta variable and replace the contents of the current chunk with it. If the variable
contains a string, it is encoded with the default codec. If the variable cannot be converted to a
byte string, the data is lost and an empty chunk is returned.

positional arguments:
  name           The name of the variable to be used.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class eat(Unit):
    """
    Consume a meta variable and replace the contents of the current chunk with it. If the variable
    contains a string, it is encoded with the default codec. If the variable cannot be converted to
    a byte string, the data is lost and an empty chunk is returned.
    """
    def __init__(
        self,
        name: Param[str, Arg.String(help='The name of the variable to be used.')],
    ):
        super().__init__(name=check_variable_name(name))

    def process(self, data: Chunk):
        """
        Remove the configured variable from the chunk metadata and return its value as a byte
        string. Returns None when the value cannot be converted.
        """
        name = self.args.name
        value = metavars(data).pop(name)
        # Remember the original type name for the warning before any conversion takes place.
        typename = value.__class__.__name__
        if isinstance(value, int):
            self.log_info(F'variable {name} is an integer, converting to string.')
            value = str(value).encode(self.codec)
        if isinstance(value, str):
            self.log_info(F'variable {name} is a string, encoding as {self.codec}')
            return value.encode(self.codec)
        if isbuffer(value):
            return value
        # Last resort: attempt to build a byte array from whatever the variable contains.
        try:
            return bytearray(value)
        except Exception:
            self.log_warn(
                F'variable {name} is of type "{typename}", unable to convert to byte string - data is lost')
            return None

class ef (*filenames, list=False, meta=False, size=None, read=0, wild=False, tame=False, symlinks=False, linewise=False)

This unit is implemented in refinery.units.meta.ef and has the following command-line interface:

usage: ef [-h] [-L] [-Q] [-0] [-v] [-l] [-m] [-s start:end:step] [-r N] [-w | -t] [-y] [-i]
          FILEMASK [FILEMASK ...]

Short for "emit file". The unit reads files from disk and outputs them individually. Has the
ability to read large files in chunks.

positional arguments:
  FILEMASK                   A list of file masks. Each matching file will be read from disk and
                             emitted. The file masks can include format string expressions which
                             will be substituted from the current meta variables. The masks can
                             use wild-card expressions, but this feature is disabled by default
                             on Posix platforms, where it has to be enabled explicitly using the
                             -w switch. On Windows, the feature is enabled by default and can be
                             disabled using the -t switch.

options:
  -l, --list                 Only lists files with metadata.
  -m, --meta                 Adds the atime, mtime, ctime, and size metadata variables.
  -s, --size start:end:step  If specified, only files are read whose size is in the given range.
  -r, --read N               If specified, files will be read in chunks of size N and each chunk
                             is emitted as one element in the output list.
  -w, --wild                 Force use of wildcard patterns in file masks.
  -t, --tame                 Disable wildcard patterns in file masks.
  -y, --symlinks             Follow symbolic links and junctions, these are ignored by default.
  -i, --linewise             Read the file linewise. By default, one line is read at a time. In
                             line mode, the --read argument can be used to read the given number
                             of lines in each chunk.

generic options:
  -h, --help                 Show this help message and exit.
  -L, --lenient              Increase the leniency, allowing partial results and ignoring more
                             errors.
  -Q, --quiet                Disables all log output.
  -0, --devnull              Do not produce any output.
  -v, --verbose              Specify up to two times to increase log level.

Expand source code Browse git

class ef(Unit):
    """
    Short for "emit file". The unit reads files from disk and outputs them individually. Has the ability to
    read large files in chunks.
    """

    def __init__(self,
        *filenames: Param[str, Arg.String(metavar='FILEMASK', nargs='+', help=(
            'A list of file masks. Each matching file will be read from disk and '
            'emitted. The file masks can include format string expressions which '
            'will be substituted from the current meta variables. The masks can '
            'use wild-card expressions, but this feature is disabled by default on '
            'Posix platforms, where it has to be enabled explicitly using the -w '
            'switch. On Windows, the feature is enabled by default and can be '
            'disabled using the -t switch.'
        ))],
        list: Param[bool, Arg.Switch('-l', help='Only lists files with metadata.')] = False,
        meta: Param[bool, Arg.Switch('-m', help=(
            'Adds the atime, mtime, ctime, and size metadata variables.'
        ))] = False,
        size: Param[slice, Arg.Bounds('-s', help=(
            'If specified, only files are read whose size is in the given range.'))] = None,
        read: Param[int, Arg.Number('-r', help=(
            'If specified, files will be read in chunks of size N and each '
            'chunk is emitted as one element in the output list.'
        ))] = 0,
        wild: Param[bool, Arg.Switch('-w', group='W', help='Force use of wildcard patterns in file masks.')] = False,
        tame: Param[bool, Arg.Switch('-t', group='W', help='Disable wildcard patterns in file masks.')] = False,
        symlinks: Param[bool, Arg.Switch('-y', help='Follow symbolic links and junctions, these are ignored by default.')] = False,
        linewise: Param[bool, Arg.Switch('-i', help=(
            'Read the file linewise. By default, one line is read at a time. '
            'In line mode, the --read argument can be used to read the given '
            'number of lines in each chunk.'
        ))] = False
    ):
        if wild and tame:
            raise ValueError('Cannot be both wild and tame!')
        super().__init__(
            size=size,
            read=read,
            list=list,
            meta=meta,
            wild=wild,
            tame=tame,
            symlinks=symlinks,
            linewise=linewise,
            filenames=filenames
        )

    def _read_chunks(self, fd):
        """
        Yield the contents of the open file `fd` in pieces of at most `read` bytes.
        """
        while True:
            buffer = fd.read(self.args.read)
            if not buffer:
                break
            yield buffer

    def _read_lines(self, fd):
        """
        Yield the contents of the open file `fd` in groups of `read` lines each; a single
        line per group is used when `read` is zero.
        """
        count = self.args.read or 1
        if count == 1:
            while True:
                buffer = fd.readline()
                if not buffer:
                    break
                yield buffer
            return
        with MemoryFile() as out:
            while True:
                for _ in range(count):
                    buffer = fd.readline()
                    if not buffer:
                        break
                    out.write(buffer)
                # Nothing was written in this round, so the end of the file was reached.
                if not out.tell():
                    break
                yield out.getvalue()
                # Reuse the same buffer for the next group of lines.
                out.seek(0)
                out.truncate()

    def _absolute_path(self, path_string: str):
        """
        Return the resolved, absolute path for `path_string`. On Windows, the result is
        prefixed with `\\\\?\\` unless it already starts with this prefix.
        """
        path = Path(path_string).resolve().absolute()
        if os.name == 'nt' and not path.parts[0].startswith('\\\\?\\'):
            # The pathlib glob method will simply fail mid-traversal if it attempts to descend into
            # a folder or to a file whose path exceeds MAX_PATH on Windows. As a workaround, we use
            # UNC paths throughout and truncate to relative paths after enumeration.
            path = Path(F'\\\\?\\{path!s}')
        return path

    def _glob(self, pattern: str) -> Iterable[Path]:
        """
        Yield all paths that match the wildcard `pattern`. A pattern without any wildcard
        character yields exactly one absolute path. While the generator is active, `os.scandir`
        is replaced by a wrapper that handles symbolic links according to the `symlinks` option,
        detects file system loops, and logs scan errors instead of raising them. The original
        function is restored when the generator is finished or closed.
        """
        if pattern.endswith('**'):
            pattern += '/*'
        wildcard = re.search(R'[\[\?\*]', pattern)
        if wildcard is None:
            yield self._absolute_path(pattern)
            return
        # Split the pattern into the literal base directory and the actual wildcard portion.
        k = wildcard.start()
        base, pattern = pattern[:k], pattern[k:]
        path = self._absolute_path(base or '.')
        last = path.parts[-1]
        if base.endswith(last):
            # /base/something.*
            pattern = F'{last}{pattern}'
            path = path.parent

        scandir = os.scandir

        class EmptyIterator:
            # Stand-in for a scandir result without any entries.
            def __enter__(self): return self
            def __exit__(self, *_, **__): pass
            def __next__(self): raise StopIteration
            def __iter__(self): return self

        if sys.version_info >= (3, 12):
            def islink(path):
                return os.path.islink(path) or os.path.isjunction(path)
        else:
            def islink(path):
                try:
                    return bool(os.readlink(path))
                except OSError:
                    return False

        # Resolved link targets that were already scanned, used to detect loops.
        paths_scanned = set()

        def _patched_scandir(path):
            if islink(path):
                if not self.args.symlinks:
                    return EmptyIterator()
                try:
                    rp = os.path.realpath(path, strict=True)
                except OSError:
                    return EmptyIterator()
                if rp in paths_scanned:
                    self.log_warn(F'file system loop at: {path!s}')
                    return EmptyIterator()
                paths_scanned.add(rp)
                path = rp
            try:
                return scandir(path)
            except Exception as e:
                # Errors are not reported for paths that contain a component from the
                # per-platform ignore list.
                ignore = _ERROR_IGNORES.get(os.name, set())
                if not any(p.lower() in ignore for p in Path(path).parts):
                    self.log_warn(F'error calling scandir, {exception_to_string(e)}: {path}')
                return EmptyIterator()

        try:
            os.scandir = _patched_scandir
            yield from path.glob(pattern)
        finally:
            os.scandir = scandir

    def process(self, data):
        """
        Format each file mask with the metadata of the input chunk, enumerate the matching files,
        and emit either the file path (list mode) or the file contents. Files outside the optional
        size bounds are skipped. Errors for individual files are logged and skipped.
        """
        meta = metavars(data)
        size = self.args.size
        size = size and bounds[size]
        meta.ghost = True
        wild = (os.name == 'nt' or self.args.wild) and not self.args.tame
        root = self._absolute_path('.')
        paths = self._glob if wild else lambda mask: [self._absolute_path(mask)]
        do_meta = self.args.meta
        do_stat = size or do_meta

        class SkipErrors:
            # Context manager that swallows exceptions raised while handling one path and
            # logs one message per exception category and path.
            unit = self

            def __init__(self):
                self._history: set[type] = set()
                # The first matching entry wins, so subclasses have to be listed before their
                # base classes: PermissionError and FileNotFoundError both derive from OSError.
                self._message: dict[type, str | None] = {
                    ValueError: (
                        None
                    ), PermissionError: (
                        'access error while scanning: {}'
                    ), FileNotFoundError: (
                        'file unexpectedly not found: {}'
                    ), OSError: (
                        'system error while scanning: {}'
                    ), Exception: (
                        'unknown error while reading: {}'
                    ),
                }
                self.path = None

            def reset(self, path):
                self._history.clear()
                self.path = path
                return self

            def __enter__(self):
                return self

            def __exit__(self, et, ev, trace):
                if et is None:
                    return False
                for t, msg in self._message.items():
                    if issubclass(et, t):
                        if t not in self._history:
                            self._history.add(t)
                            if msg is not None:
                                self.unit.log_info(msg.format(self.path))
                        return True
                else:
                    # Not derived from Exception: let it propagate.
                    return False

        for mask in self.args.filenames:
            mask = meta.format_str(mask, self.codec, [data])
            self.log_debug('scanning for mask:', mask)
            skip_errors = SkipErrors()
            for path in paths(mask):
                skip_errors.reset(path)
                filesize = None
                # Start without metadata for every file so that a failed stat call cannot
                # attach the metadata of a previously processed file to this one.
                kwargs = {}
                with skip_errors:
                    path = path.relative_to(root)
                with skip_errors:
                    if wild and not path.is_file():
                        continue
                with skip_errors:
                    if do_stat:
                        stat = path.stat()
                        filesize = stat.st_size
                    if do_meta:
                        kwargs.update(
                            fsize=filesize,
                            atime=datetime.fromtimestamp(stat.st_atime).isoformat(' ', 'seconds'),
                            ctime=datetime.fromtimestamp(stat.st_ctime).isoformat(' ', 'seconds'),
                            mtime=datetime.fromtimestamp(stat.st_mtime).isoformat(' ', 'seconds')
                        )
                if size is not None and filesize not in size:
                    continue
                with skip_errors:
                    if self.args.list:
                        yield self.labelled(str(path).encode(self.codec), **kwargs)
                        continue
                    with path.open('rb') as stream:
                        if self.args.linewise:
                            yield from self._read_lines(stream)
                        elif self.args.read:
                            yield from self._read_chunks(stream)
                        else:
                            contents = stream.read()
                            self.log_info(lambda: F'reading: {path!s} ({len(contents)} bytes)')
                            yield self.labelled(contents, path=path.as_posix(), **kwargs)

class emit (*data)

This unit is implemented in refinery.units.meta.emit and has the following command-line interface:

usage: emit [-h] [-L] [-Q] [-0] [-v] [data ...]

positional arguments:
  data           Data to be emitted. If no argument is specified, data is retrieved from the
                 clipboard. Multiple arguments are output in framed format.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class emit(Unit):

    # Emits the data given as arguments; without arguments, the clipboard contents are emitted.

    def __init__(self, *data: Param[buf, Arg(help=(
        'Data to be emitted. If no argument is specified, data is retrieved from '
        'the clipboard. Multiple arguments are output in framed format.'
    ))]):
        super().__init__(data=data)

    @Unit.Requires('pyperclip')
    def _pyperclip():
        import pyperclip
        return pyperclip

    def process(self, data):
        """
        Yield every configured argument as one chunk. When there are none, yield the clipboard
        contents instead: On Windows, these are obtained from the winclip library; on other
        systems, the text returned by pyperclip is encoded, and nothing is emitted if it is empty.
        """
        if items := self.args.data:
            yield from items
        elif os.name != 'nt':
            text = self._pyperclip.paste()
            if text:
                yield text.encode(self.codec, 'replace')
        else:
            from refinery.lib.winclip import get_any_data
            mode, contents = get_any_data()
            if mode is not None:
                self.log_info(F'retrieved clipboard data in {mode.name} format')
            yield contents

class esc (hex=False, unicode=False, greedy=False, unquoted=False, quoted=False, bare=False)

This unit is implemented in refinery.units.encoding.esc and has the following command-line interface:

usage: esc [-h] [-L] [-Q] [-0] [-v] [-R] [-x] [-u] [-g] [-p | -q] [-b]

Encodes and decodes common ASCII escape sequences.

options:
  -x, --hex       Hex encode everything, do not use C escape sequences.
  -u, --unicode   Use unicode escape sequences and UTF-8 encoding.
  -g, --greedy    Replace \x by x and \u by u when not followed by two or four hex digits,
                  respectively.
  -p, --unquoted  Never remove enclosing quotes.
  -q, --quoted    Remove enclosing quotes while decoding and add them for encoding.
  -b, --bare      Do not escape quote characters.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.
  -R, --reverse   Use the reverse operation.

Expand source code Browse git

class esc(Unit):
    """
    Encodes and decodes common ASCII escape sequences.
    """
    # Maps byte values to their escape sequence, used for encoding.
    _ESCAPE = {
        0x00: BR'\0',
        0x07: BR'\a',
        0x08: BR'\b',
        0x0C: BR'\f',
        0x0A: BR'\n',
        0x0D: BR'\r',
        0x09: BR'\t',
        0x0B: BR'\v',
        0x5C: BR'\\',
        0x27: BR'\'',
        0x22: BR'\"'
    }
    # Maps the character that follows a backslash to the decoded byte.
    _UNESCAPE = {
        BR'0': B'\x00',
        BR'a': B'\x07',
        BR'b': B'\x08',
        BR'f': B'\x0C',
        BR'n': B'\x0A',
        BR'r': B'\x0D',
        BR't': B'\x09',
        BR'v': B'\x0B',
        B'\\': B'\x5C',
        BR"'": B'\x27',
        BR'"': B'\x22'
    }

    def __init__(self,
        hex: Param[bool, Arg.Switch('-x',
            help='Hex encode everything, do not use C escape sequences.')] = False,
        unicode: Param[bool, Arg.Switch('-u',
            help='Use unicode escape sequences and UTF-8 encoding.')] = False,
        greedy: Param[bool, Arg.Switch('-g',
            help='Replace \\x by x and \\u by u when not followed by two or four hex digits, respectively.')] = False,
        unquoted: Param[bool, Arg.Switch('-p', group='Q',
            help='Never remove enclosing quotes.')] = False,
        quoted: Param[bool, Arg.Switch('-q', group='Q',
            help='Remove enclosing quotes while decoding and add them for encoding.')] = False,
        bare: Param[bool, Arg.Switch('-b',
            help='Do not escape quote characters.')] = False,
    ): pass # noqa

    def process(self, data):
        """
        Decode all escape sequences in the input and return the result as a byte string.
        Enclosing quotes are removed first: With the quoted option, the first and last byte are
        stripped when they are equal. Without the unquoted option, a pair of enclosing single or
        double quotes is stripped automatically when the same quote character does not occur
        unescaped between them.
        """
        data = memoryview(data)

        if self.args.quoted:
            # At least two bytes are required for a pair of quotes; this also prevents an
            # index error for empty input.
            if len(data) < 2 or data[-1] != data[0]:
                self.log_info('string is not correctly quoted')
            else:
                data = data[1:-1]
        elif not self.args.unquoted and len(data) >= 2 and data[0] in B'"\'':
            # Only actual quote characters are considered: input that merely starts and ends
            # with the same byte must not be truncated.
            quote = bytes(data[:1])
            strip = data[1:-1]
            if data[-1:] == quote and not re.search(br'(?<!\\)' + re.escape(quote), strip):
                self.log_info('removing automatically detected quotes')
                data = strip

        def unescape(match: re.Match[bytes]):
            # The first group contains everything that follows the backslash.
            c = match[1]
            if len(c) > 1:
                if c[0] == 0x75:
                    # unicode
                    upper = int(c[1:3], 16)
                    lower = int(c[3:5], 16)
                    if self.args.unicode:
                        return bytes((lower, upper)).decode('utf-16le').encode(self.codec)
                    # Without the unicode option, only the low byte of the code unit is kept.
                    return bytes((lower,))
                elif c[0] == 0x78:
                    # hexadecimal
                    return bytes((int(c[1:3], 16),))
                else:
                    # octal escape sequence
                    return bytes((int(c, 8) & 0xFF,))
            elif c in B'ux':
                # An incomplete \x or \u sequence is left untouched unless greedy is set.
                return c if self.args.greedy else match[0]
            # Unknown escape sequences decode to the escaped character itself.
            return self._UNESCAPE.get(c, c)
        data = re.sub(
            RB'\\(u[a-fA-F0-9]{4}|x[a-fA-F0-9]{1,2}|[0-7]{3}|.)', unescape, data)
        return data

    def reverse(self, data):
        """
        Encode the input using escape sequences. With the unicode option, the input is decoded
        with the codec of this unit and encoded using unicode escapes. With the hex option, every
        byte becomes a hexadecimal escape. Otherwise, control characters, the backslash, bytes
        from 0x7F upwards, and (unless the bare option is set) quote characters are escaped.
        With the quoted option, the result is enclosed in double quotes.
        """
        if self.args.unicode:
            string = data.decode(self.codec).encode('UNICODE_ESCAPE')
        else:
            if not self.args.hex:
                def escape(match: re.Match[bytes]):
                    c = match[0][0]
                    return self._ESCAPE.get(c, RB'\x%02x' % c)
                pattern = RB'[\x00-\x1F\x22\x27\x5C\x7F-\xFF]'
                if self.args.bare:
                    pattern = RB'[\x00-\x1F\x5C\x7F-\xFF]'
                string = re.sub(pattern, escape, data)
            else:
                # Every input byte expands to exactly four output bytes.
                string = bytearray(4 * len(data))
                for k in range(len(data)):
                    a = k * 4
                    b = k * 4 + 4
                    string[a:b] = RB'\x%02x' % data[k]
        if self.args.quoted:
            string = B'"%s"' % string
        return string

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Escape the input. Depending on the unit arguments this either uses the
    UNICODE_ESCAPE codec, hex-encodes every single byte, or escapes only the
    bytes matched by the escape pattern. The result is optionally wrapped in
    double quotes.
    """
    opts = self.args
    if opts.unicode:
        # decode with the unit codec and let the codec produce the escapes
        escaped = data.decode(self.codec).encode('UNICODE_ESCAPE')
    elif opts.hex:
        # every input byte turns into exactly one 4-character \xNN sequence
        escaped = bytearray().join(RB'\x%02x' % byte for byte in data)
    else:
        def escape(match: re.Match[bytes]):
            c = match[0][0]
            # prefer the short escape from the table, fall back to \xNN
            return self._ESCAPE.get(c, RB'\x%02x' % c)
        if opts.bare:
            # bare mode leaves single and double quotes untouched
            pattern = RB'[\x00-\x1F\x5C\x7F-\xFF]'
        else:
            pattern = RB'[\x00-\x1F\x22\x27\x5C\x7F-\xFF]'
        escaped = re.sub(pattern, escape, data)
    if opts.quoted:
        return B'"%s"' % escaped
    return escaped

class escps

This unit is implemented in refinery.units.encoding.escps and has the following commandline Interface:

usage: escps [-h] [-L] [-Q] [-0] [-v] [-R]

Escapes and unescapes PowerShell strings.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class escps(Unit):
    """
    Escapes and unescapes PowerShell strings.
    """
    # Backtick escape sequences and the character each one stands for.
    UNESCAPE = {
        '`0': '\0',
        '`a': '\a',
        '`b': '\b',
        '`f': '\f',
        '`n': '\n',
        '`r': '\r',
        '`t': '\t',
        '`v': '\v',
        '``': '`',
        "`'": '\'',
        '`"': '\"',
    }
    # Characters that get replaced when producing a double-quoted string.
    ESCAPE = {
        '`' : '``',
        '$' : '`$',
        '\0': '`0',
        '\a': '`a',
        '\b': '`b',
        '\f': '`f',
        '\n': '`n',
        '\r': '`r',
        '\t': '`t',
        '\v': '`v',
        '\'': "`'",
        '\"': '""',
    }

    def __init__(self): pass

    @unicoded
    def process(self, data):
        # Here-strings (@" ... "@ or @' ... '@) are returned verbatim without their delimiters.
        match = re.fullmatch(R'''@(['"])\s*?[\r\n](.*?)[\r\n]\1@''', data, flags=re.DOTALL)
        if match:
            return match.group(2)
        # The length check rejects empty input (which would otherwise fail with an IndexError)
        # and a lone quote character, neither of which is a quoted string.
        if len(data) < 2 or data[0] not in '\'\"' or data[-1] != data[0]:
            raise ValueError(
                'No quotes found at beginning of input. To unescape a PowerShell string, the '
                'quotes must be included because quote escaping depends on whether a single '
                'or double quote was used.')

        quote, data = data[0], data[1:-1]

        if quote != '"':
            # Single-quoted strings only know one escape: a doubled single quote.
            return data.replace(quote + quote, quote)

        if re.search(R'(?<!`)\$(?=[\w\(\{\$\?\^:])', data):
            self.log_warn('Loss of information: double quoted string contains variable substitutions.')

        def unescape(match):
            string = match[0]
            if string == '""':
                return '"'
            # Unknown backtick sequences evaluate to the character after the backtick.
            return self.UNESCAPE.get(string, string[1:])

        # Backtick sequences and doubled quotes are resolved in a single left-to-right pass.
        # Doing this in two passes would merge quotes that were produced by the backtick pass:
        # the input `"`" must decode to two quotes, not to one.
        return re.sub('`.|""', unescape, data)

    @unicoded
    def reverse(self, data):
        # The output is always a double-quoted string; see ESCAPE for the replacements.
        def escaper(match):
            char = match[0]
            return escps.ESCAPE.get(char, char)
        return '"{}"'.format(re.sub(R'''[\x00\x07-\x0D`$'"]''', escaper, data))

Methods

def reverse(self, data)

Expand source code Browse git

@unicoded
def reverse(self, data):
    """
    Return the input as a double-quoted PowerShell string with all special
    characters replaced according to the escape table of the unit.
    """
    table = escps.ESCAPE
    # characters without a table entry are emitted unchanged
    body = re.sub(R'''[\x00\x07-\x0D`$'"]''', lambda hit: table.get(hit[0], hit[0]), data)
    return F'"{body}"'

class escvb

This unit is implemented in refinery.units.encoding.escvb and has the following commandline Interface:

usage: escvb [-h] [-L] [-Q] [-0] [-v] [-R]

Escapes and unescapes Visual Basic strings.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class escvb(Unit):
    """
    Escapes and unescapes Visual Basic strings.
    """
    def process(self, data):
        # Strip one pair of enclosing double quotes if present, then collapse doubled quotes.
        quote = B'"'
        if data.startswith(quote) and data.endswith(quote):
            data = data[1:-1]
        return data.replace(quote + quote, quote)

    def reverse(self, data):
        # Double every embedded quote and wrap the result in double quotes.
        doubled = data.replace(B'"', B'""')
        return B'"%s"' % doubled

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """Wrap the input in double quotes and double every quote it contains."""
    doubled = data.replace(B'"', B'""')
    return B'"%s"' % doubled

class evtx (raw=False)

This unit is implemented in refinery.units.formats.evtx and has the following commandline Interface:

usage: evtx [-h] [-L] [-Q] [-0] [-v] [-r]

Extracts data from Windows Event Log files (EVTX). Each extracted log entry is returned as a
single output chunk in XML format.

options:
  -r, --raw      Extract raw event data rather than XML.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class evtx(Unit):
    """
    Extracts data from Windows Event Log files (EVTX). Each extracted log entry is returned as a single
    output chunk in XML format.
    """

    def __init__(self, raw: Param[bool, Arg.Switch('-r', help='Extract raw event data rather than XML.')] = False):
        super().__init__(raw=raw)

    @Unit.Requires('python-evtx', ['formats'])
    def _evtx():
        from Evtx.Evtx import Evtx
        return Evtx

    def process(self, data):
        # One output chunk per log record: the raw record data if requested,
        # otherwise the XML rendering encoded with the unit codec.
        raw = self.args.raw
        with VirtualFileSystem() as vfs, self._evtx(vfs.new(data)) as log:
            for record in log.records():
                if raw:
                    yield record.data()
                else:
                    yield record.xml().encode(self.codec)

class fernet (key)

This unit is implemented in refinery.units.crypto.cipher.fernet and has the following commandline Interface:

usage: fernet [-h] [-L] [-Q] [-0] [-v] key

Decrypt Fernet messages.

positional arguments:
  key            A fernet key, either in base64 or raw binary.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class fernet(Unit):
    """
    Decrypt Fernet messages.
    """
    def __init__(self, key: Param[buf, Arg(help='A fernet key, either in base64 or raw binary.')]):
        super().__init__(key=key)

    def _b64(self, data):
        # Best effort: anything that fails to decode as URL-safe base64 is used verbatim.
        try:
            decoded = data | b64(urlsafe=True) | bytearray
        except Exception:
            return data
        else:
            return decoded

    def process(self, data):
        key = self._b64(self.args.key)
        if (size := len(key)) != 32:
            raise ValueError(F'The given Fernet key has length {size}, expected 32 bytes.')
        # first half of the key authenticates, second half encrypts
        signing_key, encryption_key = key[:16], key[16:]
        token = StructReader(memoryview(self._b64(data)), bigendian=True)
        # everything except the trailing 32 signature bytes is covered by the HMAC
        signed_data = token.peek(token.remaining_bytes - 32)
        version = token.u8()
        timestamp = datetime.fromtimestamp(token.u64())
        iv = token.read(16)
        if version != 0x80:
            self.log_warn(F'The Fernet version is 0x{version:02X}, the only documented one is 0x80.')
        ciphertext = token.read(token.remaining_bytes - 32)
        if len(ciphertext) % 16:
            raise ValueError('The encoded ciphertext is not 16-byte block aligned.')
        signature = token.read(32)
        mac = HMAC.new(signing_key, digestmod=SHA256)
        mac.update(signed_data)
        if mac.digest() != signature:
            # a signature mismatch is only reported, decryption is attempted regardless
            self.log_warn('HMAC verification failed; the message has been tampered with.')
            self.log_info(F'computed signature: {mac.hexdigest().upper()}')
            self.log_info(F'provided signature: {signature.hex().upper()}')
        plaintext = ciphertext | aes(mode='cbc', iv=iv, key=encryption_key) | bytearray
        return self.labelled(plaintext, timestamp=timestamp.isoformat(' ', 'seconds'))

class flz (level=0)

This unit is implemented in refinery.units.compression.flz and has the following commandline Interface:

usage: flz [-h] [-L] [-Q] [-0] [-v] [-R] [-F] [-l N]

FastLZ (or FLZ for short) compression and decompression. This implementation was ported to pure
Python from the C reference and is therefore much slower.

options:
  -l, --level N  Specify a FastLZ level (either 1 or 2). By default, compression will select a
                 level based on buffer length like the reference implementation. Decompression
                 reads level information from the header by default.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class flz(Unit):
    """
    FastLZ (or FLZ for short) compression and decompression. This implementation was ported to
    pure Python from the C reference and is therefore much slower.
    """
    def __init__(
        self,
        # The accepted explicit values are 1 and 2 (see the bound); the default 0 means "auto".
        # The help text previously advertised 0 and 1, which contradicted the bound.
        level: Param[int, Arg.Number('-l', bound=(1, 2), help=(
            'Specify a FastLZ level (either 1 or 2). By default, compression will select a level '
            'based on buffer length like the reference implementation. Decompression reads level '
            'information from the header by default.'))] = 0
    ):
        super().__init__(level=level)

    def reverse(self, data):
        # Compression. Empty input is passed through unchanged.
        if not data:
            return data
        if (level := self.args.level) <= 0:
            # auto-select: level 2 for buffers of at least 0x10000 bytes, level 1 otherwise
            level = 1 + int(len(data) >= 0x10000)
        output = bytearray()
        # the compressor is called with the zero-based level
        _flz_compress(memoryview(data), output, level - 1)
        return output

    def process(self, data):
        # Decompression. The upper three bits of the first byte hold the zero-based level.
        try:
            hl = 1 + (data[0] >> 5)
        except IndexError:
            # empty input: nothing to decompress
            return None
        if (level := self.args.level) == 0:
            level = hl
        if level != hl:
            self.log_info(F'Using level {level} despite header-defined level {hl}.')
        return _flz_decompress(memoryview(data), level - 1)

    @classmethod
    def handles(cls, data):
        # Rejects data whose header encodes a level above 2; in every other case the
        # method falls through and returns None.
        if data and (data[0] >> 5) > 1:
            return False

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Compress the input with FastLZ. Empty input is returned unchanged. When no
    level was given, level 2 is used for inputs of at least 0x10000 bytes and
    level 1 otherwise.
    """
    if not data:
        return data
    level = self.args.level
    if level <= 0:
        level = 2 if len(data) >= 0x10000 else 1
    compressed = bytearray()
    # the compressor is called with the zero-based level
    _flz_compress(memoryview(data), compressed, level - 1)
    return compressed

class gost (key, iv=b'', padding=None, mode=None, raw=False, swap=False, sbox=SBOX.R34, *, aad=b'', tag=(), segment_size=0, little_endian=False)

This unit is implemented in refinery.units.crypto.cipher.gost and has the following commandline Interface:

usage: gost [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-s] [-x {cbr,r34}] [-e]
            [-S N]
            key

GOST encryption and decryption.

positional arguments:
  key                   The encryption key.

options:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB,
                        OFB, PCBC. By default, the CBC mode is used when an IV is provided,
                        and ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -s, --swap            Decode blocks as big endian rather than little endian.
  -x, --sbox {cbr,r34}  Choose an SBOX. The default is R34, which corresponds to the R-34.12.2015
                        standard. The other option is CBR, which is the SBOX used by the Central
                        Bank of Russia.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of segmentation bits. It must be a multiple of 8.
                        The default of 0 means that the block size will be used as the segment
                        size.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation.

Expand source code Browse git

class gost(StandardBlockCipherUnit, cipher=BlockCipherFactory(GOST)):
    """
    GOST encryption and decryption.
    """
    # Only swap and sbox are declared here; all other parameters are handed to the base class.
    def __init__(
        self, key, iv=B'', padding=None, mode=None, raw=False,
        swap: Param[bool, Arg.Switch('-s', help='Decode blocks as big endian rather than little endian.')] = False,
        sbox: Param[str, Arg.Option('-x', choices=SBOX, help=(
            'Choose an SBOX. The default is {default}, which corresponds to the R-34.12.2015 standard. '
            'The other option is CBR, which is the SBOX used by the Central Bank of Russia.'
        ))] = SBOX.R34, **more
    ):
        # presumably normalizes the given value (name or member) to a member of SBOX - TODO confirm
        sbox = Arg.AsOption(sbox, SBOX)
        super().__init__(key, iv=iv, padding=padding, mode=mode, raw=raw, swap=swap, sbox=sbox, **more)

    def _new_cipher(self, **optionals) -> CipherInterface:
        # Forward the two GOST-specific arguments in addition to whatever the caller passes.
        return super()._new_cipher(
            swap=self.args.swap,
            sbox=self.args.sbox,
            **optionals
        )

class group (size)

This unit is implemented in refinery.units.meta.group and has the following commandline Interface:

usage: group [-h] [-L] [-Q] [-0] [-v] N

Group incoming chunks into frames of the given size.

positional arguments:
  N              Size of each group; must be at least 2.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class group(Unit):
    """
    Group incoming chunks into frames of the given size.
    """
    def __init__(self, size: Param[int, Arg.Number(help='Size of each group; must be at least 2.', bound=(2, None))]):
        super().__init__(size=size)

    def process(self, data: Chunk):
        # The filter stores the shared input iterator on each head chunk. The head
        # itself is emitted first, followed by up to size - 1 chunks from that iterator.
        if not data.temp:
            return
        yield data
        yield from islice(data.temp, self.args.size - 1)

    def filter(self, chunks):
        # Every chunk that is still left in the stream when the loop resumes becomes
        # the head of the next group; the remaining members are pulled in process.
        stream = iter(chunks)
        for head in stream:
            head.temp = stream
            yield head

class groupby (name)

This unit is implemented in refinery.units.meta.groupby and has the following commandline Interface:

usage: groupby [-h] [-L] [-Q] [-0] [-v] name

Group incoming chunks by the contents of a meta variable. Note that the unit blocks and cannot
stream any output until the input frame is consumed: It has to read every input chunk to make
sure that all groupings are complete.

positional arguments:
  name           name of the meta variable

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class groupby(Unit):
    """
    Group incoming chunks by the contents of a meta variable. Note that the unit
    blocks and cannot stream any output until the input frame is consumed: It has
    to read every input chunk to make sure that all groupings are complete.
    """
    def __init__(self, name: Param[str, Arg.String(help='name of the meta variable')]):
        super().__init__(name=check_variable_name(name))

    def process(self, data):
        # the filter attached the complete group to its first member
        yield from data.temp

    def filter(self, chunks: Iterable[Chunk]) -> Generator[Chunk]:
        key = self.args.name
        buckets = defaultdict(list)
        for chunk in chunks:
            try:
                value = chunk.meta[key]
            except KeyError:
                # chunks without the variable are collected under None
                value = None
            buckets[value].append(chunk)
        # one representative per group, carrying all members of the group
        for bucket in buckets.values():
            leader = bucket[0]
            leader.temp = bucket
            yield leader

class hc128 (key, discard=0, stateful=False)

This unit is implemented in refinery.units.crypto.cipher.hc128 and has the following commandline Interface:

usage: hc128 [-h] [-L] [-Q] [-0] [-v] [-R] [-d N] [-s] key

HC-128 encryption and decryption.

positional arguments:
  key              The encryption key.

options:
  -d, --discard N  Discard the first N bytes of the keystream, 0 by default.
  -s, --stateful   Do not reset the key stream while processing the chunks of one frame.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation.

Expand source code Browse git

class hc128(StreamCipherUnit):
    """
    HC-128 encryption and decryption.
    """
    # presumably the set of accepted key lengths in bytes, checked by the base class - TODO confirm
    key_size = {32}

    def keystream(self) -> Iterable[int]:
        # The keystream is the HC128 object constructed from the key argument.
        return HC128(self.args.key)

class hc256 (key, iv=b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', discard=0, stateful=False)

This unit is implemented in refinery.units.crypto.cipher.hc256 and has the following commandline Interface:

usage: hc256 [-h] [-L] [-Q] [-0] [-v] [-R] [-d N] [-s] key [iv]

HC-256 encryption and decryption.

positional arguments:
  key              The encryption key.
  iv               An initialization vector; the default is a sequence of 32 zero bytes.

options:
  -d, --discard N  Discard the first N bytes of the keystream, 0 by default.
  -s, --stateful   Do not reset the key stream while processing the chunks of one frame.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation.

Expand source code Browse git

class hc256(StreamCipherUnit):
    """
    HC-256 encryption and decryption.
    """
    # presumably the set of accepted key lengths in bytes, checked by the base class - TODO confirm
    key_size = {32}

    def __init__(
        self, key,
        iv: Param[buf, Arg(help='An initialization vector; the default is a sequence of 32 zero bytes.')] = bytes(32),
        discard=0, stateful=False,
    ):
        super().__init__(key=key, iv=iv, stateful=stateful, discard=discard)
        # NOTE(review): not referenced elsewhere in this class; confirm whether the base class uses it
        self._keystream = None

    def keystream(self) -> Iterable[int]:
        # HC256 yields integers; each one contributes 4 keystream bytes in little-endian order.
        for num in HC256(self.args.key, self.args.iv):
            yield from num.to_bytes(4, 'little')

class hex

This unit is implemented in refinery.units.encoding.hex and has the following commandline Interface:

usage: hex [-h] [-L] [-Q] [-0] [-v] [-R] [-F]

Hex-decodes and encodes binary data. Non-hex characters are removed from the input. For decoding,
any odd trailing hex digits are stripped as two hex digits are required to represent a byte.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class hex(Unit):
    """
    Hex-decodes and encodes binary data. Non-hex characters are removed from
    the input. For decoding, any odd trailing hex digits are stripped as two
    hex digits are required to represent a byte.
    """

    def reverse(self, data):
        # encoding direction: upper-case base16
        from base64 import b16encode
        return b16encode(data)

    def process(self, data):
        import base64
        import re
        # drop everything that is not a hex digit
        digits = re.sub(B'[^A-Fa-f0-9]+', B'', data)
        # an odd trailing digit cannot form a byte and is cut off
        usable = len(digits) - len(digits) % 2
        return base64.b16decode(digits[:usable], casefold=True)

    @classmethod
    def handles(cls, data):
        from refinery.lib.patterns import formats
        # returns True on a full match and None otherwise
        matched = formats.b16s.value.bin.fullmatch(data)
        if matched:
            return True

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """Return the upper-case hexadecimal (base16) encoding of the input."""
    from binascii import hexlify
    return hexlify(data).upper()

class hexload (blocks=1, dense=False, expand=False, narrow=False, width=0)

This unit is implemented in refinery.units.formats.hexload and has the following commandline Interface:

usage: hexload [-h] [-L] [-Q] [-0] [-v] [-R] [-B N] [-D] [-E] [-N] [-W N]

Convert hex dumps back to the original data and vice versa. All options of this unit apply to its
reverse operation where binary data is converted to a readable hexdump format. The default mode
of the unit expects the input data to contain a readable hexdump and converts it back to binary.

options:
  -B, --blocks N  Group hexadecimal bytes in blocks of the given size; default is 1.
  -D, --dense     Do not insert spaces in hexdump.
  -E, --expand    Do not compress sequences of identical lines in hexdump
  -N, --narrow    Do not show addresses in hexdump
  -W, --width N   Specify the number of hexadecimal characters to use in preview.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.
  -R, --reverse   Use the reverse operation.

Expand source code Browse git

class hexload(HexViewer):
    """
    Convert hex dumps back to the original data and vice versa. All options of this unit apply
    to its reverse operation where binary data is converted to a readable hexdump format.
    The default mode of the unit expects the input data to contain a readable hexdump and
    converts it back to binary.
    """
    @regex
    class _ENCODED_BYTES:
        R"""
        (?ix)(?:^|(?<=\s))                      # encoded byte patches must be prefixed by white space
        (?:
            (?:                                 # separated chunks of hex data
                [a-f0-9]{2}                     # hexadecimal chunk; single byte (two hexadecimal letters)
                \s{1,2}                         # encoded byte followed by whitespace
                (?:                             # at least one more encoded byte
                    [a-f0-9]{2}                 # followed by more encoded bytes
                    (?:\s{1,2}[a-f0-9]{2})*     # unless it was just a single byte
                )?
            )
            | (?:[a-f0-9]{4}\s{1,2}             # 2-byte chunks
              (?:[a-f0-9]{4}
              (?:\s{1,2}[a-f0-9]{4})*)?)
            | (?:[a-f0-9]{8}\s{1,2}             # 4-byte chunks
              (?:[a-f0-9]{8}
              (?:\s{1,2}[a-f0-9]{8})*)?)
            | (?:(?:[a-f0-9]{2})+)              # continuous line of hexadecimal characters
        )(?=\s|$)                               # terminated by a whitespace or line end
        """

    def __init__(self, blocks=1, dense=False, expand=False, narrow=False, width=0):
        super().__init__(blocks=blocks, dense=dense, expand=expand, narrow=narrow, width=width)
        self._hexline_pattern = re.compile(F'{make_hexline_pattern(1)}(?:[\r\n]|$)', flags=re.MULTILINE)

    def process(self, data: bytearray):
        # Only lines with visible content take part in the analysis; all line numbers
        # used below are 1-based positions within this filtered list.
        if not (lines := [
            line for line in data.decode(self.codec).splitlines(keepends=False)
            if line.strip()
        ]):
            return None

        result = bytearray()
        # For each line: start column of every run of encoded bytes, mapped to its length.
        encoded_byte_matches: list[dict[int, int]] = []

        for check in lines:
            matches: dict[int, int] = {}
            encoded_byte_matches.append(matches)
            for match in self._ENCODED_BYTES.finditer(check):
                a, b = match.span()
                matches[a] = b - a

        # The hex column has to start at the same offset in every single line.
        it = iter(encoded_byte_matches)
        offsets = set(next(it).keys())
        for matches in it:
            offsets.intersection_update(matches.keys())
        if not offsets:
            raise ValueError('unable to determine the position of the hex bytes in this dump')
        lengths: dict[int, list[int]] = {offset: [] for offset in offsets}
        del offsets
        for matches in encoded_byte_matches:
            for offset in lengths:
                lengths[offset].append(matches[offset])
        for offset in lengths:
            lengths[offset].sort()
        # Among the common offsets, pick the one whose median run length is the largest.
        midpoint = len(encoded_byte_matches) // 2
        offset, length = max(((offset, lengths[offset][midpoint]) for offset in lengths),
            key=operator.itemgetter(1))
        end = offset + length
        del lengths

        line_checks: list[HexLineCheck] = []
        # Position in `lines` of each entry in line_checks. Lines skipped below have no
        # entry, so the index into line_checks alone does not identify the source line.
        line_numbers: list[int] = []

        for k, check in enumerate(lines, 1):
            encoded = check[offset:end]
            onlyhex = re.search(r'^[\sA-Fa-f0-9]+', encoded)
            if not onlyhex:
                self.log_warn(F'ignoring line without hexadecimal data: {check}')
                continue
            if onlyhex.group(0) != encoded:
                # Only the last line is allowed to contain fewer encoded bytes than the others.
                if k != len(lines):
                    self.log_warn(F'ignoring line with mismatching hex data length: {check}')
                    continue
                encoded = onlyhex.group(0)
            self.log_debug(F'decoding: {encoded.strip()}')
            decoded = bytes.fromhex(encoded)
            result.extend(decoded)
            matched = True
            if preview := check[end:]:
                # Printable runs of the decoded bytes must show up in the text preview,
                # separated by exactly one arbitrary character per non-printable byte.
                pattern = re.compile(
                    '.'.join(re.escape(x.decode('ascii')) for x in re.split(b'[^!-~]', decoded)))
                matched = pattern.search(preview) is not None
            line_checks.append(HexLineCheck(len(decoded), len(preview), matched))
            line_numbers.append(k)

        # All lines except the last one have to decode to the same number of bytes.
        decoded_sizes: set[int] = set()
        for last, hl in lookahead(line_checks):
            if not last:
                decoded_sizes.add(hl.decoded_length)
                if len(decoded_sizes) > 1:
                    raise RefineryPartialResult('inconsistent text preview sizes', result)

        for k, check in zip(line_numbers, line_checks):
            if check.preview_length and not check.matched_binary:
                self.log_warn(F'preview mismatch in line {k}: {lines[k - 1]}', result)

        if result:
            yield result

    def reverse(self, data):
        # Produce a hexdump, one output chunk per line.
        metrics = self._get_metrics(len(data))
        if not self.args.width:
            metrics.fit_to_width(allow_increase=True)
        for line in self.hexdump(data, metrics):
            yield line.encode(self.codec)

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Convert binary data to a hexdump and emit every line of the dump as one
    chunk, encoded with the unit codec. Unless a width was specified, the
    layout metrics are fitted to the available width first.
    """
    layout = self._get_metrics(len(data))
    if not self.args.width:
        layout.fit_to_width(allow_increase=True)
    yield from (row.encode(self.codec) for row in self.hexdump(data, layout))

class hkdf (size, salt, hash='SHA512')

This unit is implemented in refinery.units.crypto.keyderive.hkdf and has the following commandline Interface:

usage: hkdf [-h] [-L] [-Q] [-0] [-v] N salt [hash]

HKDF Key derivation

positional arguments:
  N              The number of bytes to generate.
  salt           Salt for the derivation.
  hash           Specify one of these algorithms (default is SHA512): md2, md4, md5, sha1,
                 sha256, sha512, sha224, sha384

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class hkdf(KeyDerivation):
    """HKDF Key derivation"""

    def __init__(self, size, salt, hash='SHA512'):
        super().__init__(size=size, salt=salt, hash=hash)

    def process(self, data):
        # Function-local import: Cryptodome is only loaded once the unit processes data.
        from Cryptodome.Protocol.KDF import HKDF
        # The input chunk is the master secret; output length, salt and hash come from the unit.
        return HKDF(data, self.args.size, self.args.salt, self.hash)

class hmac (salt, hash='SHA1', size=None)

This unit is implemented in refinery.units.crypto.keyderive.hmac and has the following commandline Interface:

usage: hmac [-h] [-L] [-Q] [-0] [-v] salt [hash] [N]

HMAC based key derivation

positional arguments:
  salt           Salt for the derivation.
  hash           Specify one of these algorithms (default is SHA1): md2, md4, md5, sha1, sha256,
                 sha512, sha224, sha384
  N              The number of bytes to generate.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class hmac(KeyDerivation):
    """
    HMAC based key derivation
    """

    def __init__(self, salt, hash='SHA1', size=None):
        super().__init__(salt=salt, size=size, hash=hash)

    def process(self, data):
        # Function-local import: Cryptodome is only loaded once the unit processes data.
        from Cryptodome.Hash import HMAC
        # The input chunk is passed as the HMAC key and the salt argument as the message.
        # NOTE(review): the size argument is not used here - confirm whether the base class applies it.
        return HMAC.new(data, self.args.salt, digestmod=self.hash).digest()

class htmlesc

This unit is implemented in refinery.units.encoding.htmlesc and has the following commandline Interface:

usage: htmlesc [-h] [-L] [-Q] [-0] [-v] [-R]

Encodes and decodes HTML entities.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class htmlesc(Unit):
    """
    Encodes and decodes HTML entities.
    """

    @unicoded
    def process(self, data: str) -> str:
        # Decoding direction: delegates to html_entities.unescape.
        return html_entities.unescape(data)

    @unicoded
    def reverse(self, data: str) -> str:
        # Encoding direction: delegates to html_entities.escape.
        return html_entities.escape(data)

Methods

def reverse(self, data)

Expand source code Browse git

@unicoded
def reverse(self, data: str) -> str:
    # Encoding direction: delegates to html_entities.escape.
    return html_entities.escape(data)

class httprequest

This unit is implemented in refinery.units.formats.httprequest and has the following commandline Interface:

usage: httprequest [-h] [-L] [-Q] [-0] [-v] [-F]

Parses HTTP request data, as you would obtain from a packet dump. The unit extracts POST data in
any format; each uploaded file is emitted as a separate chunk.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class httprequest(Unit):
    """
    Parses HTTP request data, as you would obtain from a packet dump. The unit extracts
    POST data in any format; each uploaded file is emitted as a separate chunk.
    """
    def process(self, data: Chunk):
        def header(line: bytes):
            # Yields at most one (lowercased name, value) pair; lines that contain
            # no colon are silently skipped.
            name, colon, data = line.decode('utf8').partition(':')
            if colon:
                yield (name.strip().lower(), data.strip())

        # Everything before the first empty line is the request line plus headers.
        head, _, body = data.partition(b'\r\n\r\n')
        request, *headers = head.splitlines(False)
        headers = dict(t for line in headers for t in header(line))
        # The request line is expected to be "METHOD PATH VERSION"; any additional
        # tokens are reported but otherwise ignored.
        method, path, _, *rest = request.split()

        mode = _Fmt.RawBody

        if rest:
            self.log_warn('unexpected rest data while parsing HTTP request:', rest)

        if method == b'GET' and not body:
            mode = _Fmt.UrlEncode
            # partition returns (head, separator, tail): the query string is the tail
            # at index 2. Index 1 is only ever the literal question mark, which made
            # the unit drop all GET parameters.
            body = path.partition(B'?')[2]
        if method == b'POST' and (ct := headers.get('content-type', None)):
            ct, _ = _parse_header(ct)
            try:
                mode = _Fmt(ct)
            except ValueError:
                # Unknown content types are passed through as a raw body.
                mode = _Fmt.RawBody

        def chunks(upload: dict[bytes, list[bytes]]):
            # Emit one chunk per value, labelled with the name of its form field.
            for key, values in upload.items():
                for value in values:
                    yield self.labelled(value, name=key.decode('utf8'))

        if mode is _Fmt.RawBody:
            yield body
            return
        if mode is _Fmt.Multipart:
            # Drop the request line so that the remaining headers and body can be
            # handed to the email message parser.
            _, _, message_data = data.partition(b'\n')
            msg = BytesParser().parsebytes(message_data)
            for part in msg.walk():
                payloads = part.get_payload(decode=True)
                if not isinstance(payloads, list):
                    payloads = [payloads]
                for payload in payloads:
                    if buffer := asbuffer(payload):
                        if name := part.get_filename():
                            buffer = self.labelled(buffer, name=name)
                        yield buffer

        if mode is _Fmt.UrlEncode:
            yield from chunks(parse_qs(body, keep_blank_values=True))

    @classmethod
    def handles(cls, data) -> bool | None:
        # Only requests that start with a POST or GET verb are recognized.
        return data[:5] == B'POST ' or data[:4] == B'GET '

class httpresponse

This unit is implemented in refinery.units.formats.httpresponse and has the following commandline Interface:

usage: httpresponse [-h] [-L] [-Q] [-0] [-v] [-F]

Parses HTTP response text, as you would obtain from a packet dump. This can be useful if chunked
or compressed transfer encoding was used.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class httpresponse(Unit):
    """
    Parses HTTP response text, as you would obtain from a packet dump. This can be
    useful if chunked or compressed transfer encoding was used.
    """
    def process(self, data):
        with SockWrapper(data) as mock:
            mock.seek(0)
            parser = HTTPResponse(mock) # type:ignore
            parser.begin()
            try:
                payload = parser.read()
            except IncompleteRead as incomplete:
                # Surface whatever was read so far as a partial result.
                partial = incomplete.partial
                msg = F'incomplete read: {len(partial)} bytes processed, {incomplete.expected} more expected'
                raise RefineryPartialResult(msg, partial) from incomplete
            try:
                date = parser.headers['date'] | datefix | str
            except Exception:
                # No usable date header: emit the payload without a label.
                return payload
            # Only attach the date when the normalized string has the expected length.
            if len(date) == 19:
                payload = self.labelled(payload, date=date)
            return payload

    @classmethod
    def handles(cls, data) -> bool | None:
        magic = B'HTTP/1'
        return data[:len(magic)] == magic

class iemap (legend=False, bgfill=False, fgchar='#', *label)

This unit is implemented in refinery.units.sinks.iemap and has the following commandline Interface:

usage: iemap [-h] [-L] [-Q] [-0] [-v] [-l] [-b] [-c C] [label-part ...]

The information entropy map displays a colored bar on the terminal visualizing the file's local
entropy from beginning to end.

positional arguments:
  label-part          The remaining command line specifies a format string expression that will
                      be printed over the heat map display of each processed chunk.

options:
  -l, --legend        Show entropy color legend.
  -b, --bgfill        Generate the bar by coloring the bgfill.
  -c, --block-char C  Character used for filling the bar, default is #

generic options:
  -h, --help          Show this help message and exit.
  -L, --lenient       Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet         Disables all log output.
  -0, --devnull       Do not produce any output.
  -v, --verbose       Specify up to two times to increase log level.

Expand source code Browse git

class iemap(Unit):
    """
    The information entropy map displays a colored bar on the terminal visualizing the file's local
    entropy from beginning to end.
    """
    def __init__(
        self,
        legend: Param[bool, Arg.Switch('-l', help='Show entropy color legend.')] = False,
        bgfill: Param[bool, Arg.Switch('-b', help='Generate the bar by coloring the bgfill.')] = False,
        fgchar: Param[str, Arg.String('-c', '--block-char', metavar='C',
            help='Character used for filling the bar, default is {default}')] = '#',
        *label: Param[str, Arg.String(metavar='label-part', help=(
            'The remaining command line specifies a format string expression that will be printed '
            'over the heat map display of each processed chunk.'
        ))]
    ):
        # All label parts are joined with spaces into a single format string.
        super().__init__(label=' '.join(label), bgfill=bgfill, legend=legend, fgchar=fgchar)

    # The colorama module is imported inside this accessor rather than at module level.
    @Unit.Requires('colorama', ['display', 'default', 'extended'])
    def _colorama():
        import colorama
        return colorama

    def process(self, data):
        # Draws the entropy bar for one chunk to stderr. The chunk itself is only
        # yielded when self.isatty() is false.
        from os import name as os_name
        from sys import stderr
        colorama = self._colorama
        colorama.init(autoreset=False, convert=(os_name == 'nt'))

        nobg = not self.args.bgfill
        meta = metavars(data)

        # Pad a non-empty label with one space on either side.
        label = meta.format_str(self.args.label, self.codec, [data])
        if label:
            if not label.endswith(' '):
                label = F'{label} '
            if not label.startswith(' '):
                label = F' {label}'

        # Two parallel eight-step color scales; which one is used depends on bgfill.
        bgmap = [
            colorama.Back.BLACK,
            colorama.Back.WHITE,
            colorama.Back.YELLOW,
            colorama.Back.CYAN,
            colorama.Back.BLUE,
            colorama.Back.GREEN,
            colorama.Back.LIGHTRED_EX,
            colorama.Back.MAGENTA,
        ]
        fgmap = [
            colorama.Fore.LIGHTBLACK_EX,
            colorama.Fore.LIGHTWHITE_EX,
            colorama.Fore.LIGHTYELLOW_EX,
            colorama.Fore.LIGHTCYAN_EX,
            colorama.Fore.LIGHTBLUE_EX,
            colorama.Fore.LIGHTGREEN_EX,
            colorama.Fore.LIGHTRED_EX,
            colorama.Fore.LIGHTMAGENTA_EX,
        ]

        _reset = colorama.Back.BLACK + colorama.Fore.WHITE + colorama.Style.RESET_ALL

        clrmap = fgmap if nobg else bgmap
        header = '['
        header_length = 1
        # The footer '] [' + 7 character entropy field + ']' occupies 4 + 7 columns.
        footer_length = 4 + 7

        if self.args.legend:
            # The legend adds '[', one digit per color, and '] ' in front of the bar.
            header = '[{1}{0}] {2}'.format(_reset, ''.join(F'{bg}{k}' for k, bg in enumerate(clrmap, 1)), header)
            header_length += 3 + len(clrmap)

        _tw = get_terminal_size()
        width = _tw - header_length - footer_length
        if width < 16:
            raise RuntimeError(F'computed terminal width {_tw} is too small for heatmap')

        def entropy_select(value, map):
            # Scale the value to an index into the map, clamped to the last entry.
            # NOTE(review): presumably value is an entropy normalized to [0, 1]; confirm.
            index = min(len(map) - 1, math.floor(value * len(map)))
            return map[index]

        view = memoryview(data)
        size = len(data)
        chunk_size = 0

        # Grow the width of one block (in terminal columns) until each block covers
        # more than 1024 bytes of input, or until a single block spans the full width.
        for block_size in range(1, width + 1):
            block_count = width // block_size
            chunk_size = size // block_count
            if chunk_size > 1024:
                break

        q, remainder = divmod(width, block_size)
        assert q == block_count
        indices = list(range(q))
        # Seeding from the data makes the shuffle reproducible for the same input.
        random.seed(sum(view[:1024]))
        random.shuffle(indices)

        # Spread the columns left over by the integer division across the blocks so
        # that the block widths add up to the full bar width.
        block_sizes = [block_size] * q
        q, r = divmod(remainder, block_count)
        for i in indices:
            block_sizes[i] += q
        for i in indices[:r]:
            block_sizes[i] += 1
        assert sum(block_sizes) == width

        # Likewise spread the left-over input bytes across the chunks.
        q, remainder = divmod(size, block_count)
        assert q == chunk_size
        chunk_sizes = [chunk_size] * block_count
        for i in indices[:remainder]:
            chunk_sizes[i] += 1
        assert sum(chunk_sizes) == size

        stream = MemoryFile(view)
        filler = self.args.fgchar if nobg else ' '

        try:
            stderr.write(header)
            if label is not None:
                stderr.write(colorama.Fore.WHITE)
                stderr.flush()
            # Character source for the bar: three filler characters, then the label,
            # then filler characters indefinitely.
            it = itertools.chain(itertools.repeat(filler, 3), label, itertools.cycle(filler))
            cp = None
            for chunk_size, block_size in zip(chunk_sizes, block_sizes):
                chunk = stream.read(chunk_size)
                chunk_entropy = entropy(chunk)
                pp = entropy_select(chunk_entropy, clrmap)
                string = ''.join(itertools.islice(it, block_size))
                # Only emit a color code when the color differs from the previous block.
                if pp != cp:
                    string = F'{pp}{string}'
                cp = pp
                stderr.write(string)
                stderr.flush()
        except BaseException:
            # Wipe the partially drawn bar before propagating the error.
            eraser = ' ' * width
            stderr.write(F'\r{_reset}{eraser}\r')
            raise
        else:
            # Write a placeholder footer, then backspace over it and print the footer
            # again with the entropy value taken from the chunk metadata.
            stderr.write(F'{_reset}] [---.--%]')
            te = meta['entropy']
            stderr.write('\b' * footer_length)
            stderr.write(F'] [{te!r:>7}]\n')
            stderr.flush()
        if not self.isatty():
            yield data

class iff (*expression, ge=None, gt=None, le=None, lt=None, ct=None, ne=None, iN=None, eq=None, retain=False)

This unit is implemented in refinery.units.meta.iff and has the following commandline Interface:

usage: iff [-h] [-L] [-Q] [-0] [-v] [-R]
           [-ge RHS | -gt RHS | -le RHS | -lt RHS | -ct RHS | -ne RHS | -in RHS | -eq RHS] [-r]
           [token ...]

Filter incoming chunks depending on whether a given Python expression evaluates to true. If no
expression is given, the unit filters out empty chunks.

Note: The reverse operation of a conditional unit uses the logical negation of its condition.

positional arguments:
  token          All "token" arguments to this unit are joined with spaces to produce the
                 expression to be evaluated. This is done so that unnecessary shell quoting is
                 avoided.

options:
  -ge RHS        check that the expression is greater or equal to RHS
  -gt RHS        check that the expression is greater than RHS
  -le RHS        check that the expression is less or equal to RHS
  -lt RHS        check that the expression is less than RHS
  -ct RHS        check that the expression contains RHS
  -ne RHS        check that the expression is not equal to RHS
  -in RHS        check that the expression is contained in RHS
  -eq RHS        check that the expression is equal to RHS
  -r, --retain   Move non-matching chunks out of scope rather than discarding them.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class iff(ConditionalUnit, docs='{0}{p}{1}'):
    """
    Filter incoming chunks depending on whether a given Python expression evaluates to true. If no
    expression is given, the unit filters out empty chunks.
    """
    def __init__(
        self,
        *expression: Param[str, Arg.String(metavar='token', help=(
            'All "token" arguments to this unit are joined with spaces to produce the expression '
            'to be evaluated. This is done so that unnecessary shell quoting is avoided.'))],
        ge: Param[str, Arg.String('-ge', metavar='RHS', group='OP',
            help='check that the expression is greater or equal to {varname}')] = None,
        gt: Param[str, Arg.String('-gt', metavar='RHS', group='OP',
            help='check that the expression is greater than {varname}')] = None,
        le: Param[str, Arg.String('-le', metavar='RHS', group='OP',
            help='check that the expression is less or equal to {varname}')] = None,
        lt: Param[str, Arg.String('-lt', metavar='RHS', group='OP',
            help='check that the expression is less than {varname}')] = None,
        ct: Param[str, Arg.String('-ct', metavar='RHS', group='OP',
            help='check that the expression contains {varname}')] = None,
        # The help text previously claimed "is equal to", contradicting the operator
        # that is actually bound to this option (operator.__ne__).
        ne: Param[str, Arg.String('-ne', metavar='RHS', group='OP',
            help='check that the expression is not equal to {varname}')] = None,
        iN: Param[str, Arg.String('-in', metavar='RHS', group='OP',
            help='check that the expression is contained in {varname}')] = None,
        eq: Param[str, Arg.String('-eq', metavar='RHS', group='OP',
            help='check that the expression is equal to {varname}')] = None,
        retain=False,
    ):
        def encodings(v: str):
            # Yields the candidate byte encodings of a string; yields nothing for
            # values that are not strings.
            if not isinstance(v, str):
                return
            for codec in [self.codec, 'latin1', 'utf-16le']:
                yield v.encode(codec)

        def __br_contains__(container, value):
            # Containment test that additionally tries the encoded forms of strings
            # on either side of the comparison.
            if value in container:
                return True
            if isinstance(value, str):
                return any(b in container for b in encodings(value))
            else:
                return any(value == b for v in container for b in encodings(v))

        operators = [
            (ge, operator.__ge__),
            (gt, operator.__gt__),
            (le, operator.__le__),
            (lt, operator.__lt__),
            (eq, operator.__eq__),
            (ne, operator.__ne__),
            (ct, __br_contains__),
            (iN, lambda a, b: __br_contains__(b, a)),
        ]

        # Keep only the operators for which a right hand side was actually given.
        operators = [
            (rhs, cmp)
            for (rhs, cmp) in operators
            if rhs is not None
        ]

        rhs, cmp, lhs = None, None, '\x20'.join(expression) or None

        if len(operators) > 0:
            if not lhs:
                raise ValueError('Comparison operator with empty left hand side.')
            if len(operators) > 1:
                raise ValueError('Only one comparison operation can be specified.')
            rhs, cmp = operators[0]

        super().__init__(
            lhs=lhs,
            rhs=rhs,
            cmp=cmp,
            retain=retain,
        )

    def match(self, chunk):
        # Decide whether the chunk passes the filter:
        # - without an expression, the chunk passes when it is not empty;
        # - without a comparison, the truth value of the evaluated expression decides;
        # - otherwise the selected comparison is applied to both evaluated sides.
        meta = metavars(chunk)
        lhs: str | None = self.args.lhs
        rhs: Any | None = self.args.rhs
        cmp: Callable[[Any, Any], bool] | None = self.args.cmp

        if cmp is None and rhs is not None:
            raise ValueError('right hand side defined but no operator')

        if lhs is not None:
            if rhs is not None:
                # With a comparison, both sides go through the same argument parser.
                lhs = DelayedNumSeqArgument(lhs, additional_types=(float, str))(chunk)
            else:
                lhs = PythonExpression.Evaluate(lhs, meta)

        rhs = rhs and DelayedNumSeqArgument(rhs, additional_types=(float, str))(chunk)

        self.log_info(F'lhs: type={lhs.__class__.__name__}; value={lhs!r}')
        self.log_info(F'rhs: type={rhs.__class__.__name__}; value={rhs!r}')

        if lhs is None:
            return bool(chunk)
        if rhs is None:
            return bool(lhs)

        return cmp(lhs, rhs)

class iffc (*bounds, retain=False)

This unit is implemented in refinery.units.meta.iffc and has the following commandline Interface:

usage: iffc [-h] [-L] [-Q] [-0] [-v] [-R] [-r] [start:end:step ...]

Filter incoming chunks depending on whether their size is within any of the given bounds.

Note: The reverse operation of a conditional unit uses the logical negation of its condition.

positional arguments:
  start:end:step  Specifies an (inclusive) range to check for.

options:
  -r, --retain    Move non-matching chunks out of scope rather than discarding them.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.
  -R, --reverse   Use the reverse operation.

Expand source code Browse git

class iffc(ConditionalUnit, docs='{0}{p}{1}'):
    """
    Filter incoming chunks depending on whether their size is within any of the given bounds.
    """
    def __init__(
        self,
        *bounds: Param[slice, Arg.Bounds(help='Specifies an (inclusive) range to check for.', intok=True)],
        retain=False,
    ):
        if not bounds:
            raise ValueError('cannot filter for size without specifying any bounds')
        super().__init__(
            bounds=bounds,
            retain=retain,
        )

    def match(self, chunk):
        size: int = len(chunk)
        for bound in self.args.bounds:
            # A plain integer bound must equal the chunk size exactly.
            if isinstance(bound, int):
                if size == bound:
                    return True
            if isinstance(bound, slice):
                # Missing slice components default to 0, INF and 1, respectively.
                lower = bound.start or 0
                upper = bound.stop or INF
                stride = bound.step or 1
                if size < lower or size > upper:
                    continue
                # Both ends are inclusive; the size also has to lie on the step grid.
                if (size - lower) % stride == 0:
                    return True
        return False

class iffp (*patterns, partial=False, retain=False)

This unit is implemented in refinery.units.meta.iffp and has the following commandline Interface:

usage: iffp [-h] [-L] [-Q] [-0] [-v] [-R] [-p] [-r] [pattern ...]

Filter incoming chunks depending on whether it matches any of a given set of patterns. The
available patterns are the following: integer, float, number, string, multiline_string, cmdstr,
ps1str, vbastr, vbaint, printable, urlquote, urlquote_coarse, urlquote_narrow, intarray,
strarray, numarray, word, letters, wshenc, alphanumeric, b32, b58, b62, b64, b85, a85, z85, b92,
b64url, hex, b16, b16s, b64s, b85s, a85s, z85s, utf8, hexdump, hexarray, uuencode, domain, email,
guid, date, ipv4, ipv6, md5, sha1, sha256, hostname, socket, subdomain, url, btc, pem, xmr,
path_terse, path, winpath, nixpath, environment_variable.

Note: The reverse operation of a conditional unit uses the logical negation of its condition.

positional arguments:
  pattern

options:
  -p, --partial  Allow partial matches on the data.
  -r, --retain   Move non-matching chunks out of scope rather than discarding them.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class iffp(ConditionalUnit, docs='{0}{p}{1}'):
    """
    Filter incoming chunks depending on whether it matches any of a given set of patterns. The
    available patterns are the following: {}.
    """

    def __init__(
        self,
        *patterns: Param[str, Arg.Choice(metavar='pattern', choices=list(_PATTERNS))],
        partial: Param[bool, Arg.Switch('-p', help='Allow partial matches on the data.')] = False,
        retain=False
    ):
        super().__init__(
            retain=retain,
            patterns=patterns,
            partial=partial
        )

    def match(self, chunk):
        def accepts(name):
            # Partial mode uses the pattern's match method, otherwise fullmatch is used.
            compiled = _PATTERNS[name]
            check = compiled.match if self.args.partial else compiled.fullmatch
            return bool(check(chunk))

        # The chunk passes as soon as any one of the selected patterns accepts it.
        return any(accepts(name) for name in self.args.patterns)

class iffs (needle, retain=False)

This unit is implemented in refinery.units.meta.iffs and has the following commandline Interface:

usage: iffs [-h] [-L] [-Q] [-0] [-v] [-R] [-r] needle

Filter incoming chunks depending on whether they contain a given binary substring.

Note: The reverse operation of a conditional unit uses the logical negation of its condition.

positional arguments:
  needle         the string to search for

options:
  -r, --retain   Move non-matching chunks out of scope rather than discarding them.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class iffs(ConditionalUnit, docs='{0}{p}{1}'):
    """
    Filter incoming chunks depending on whether they contain a given binary substring.
    """
    def __init__(
        self,
        needle: Param[buf, Arg(help='the string to search for')],
        retain=False,
    ):
        super().__init__(needle=needle, retain=retain)

    def match(self, chunk):
        # A chunk passes when the needle occurs anywhere inside of it.
        needle = self.args.needle
        return needle in chunk

class iffx (regex, count=0, fullmatch=False, multiline=False, ignorecase=False, retain=False)

This unit is implemented in refinery.units.meta.iffx and has the following commandline Interface:

usage: iffx [-h] [-L] [-Q] [-0] [-v] [-R] [-c N] [-U] [-M] [-I] [-r] regex

Filter incoming chunks by discarding those that do not match the given regular expression.

Note: The reverse operation of a conditional unit uses the logical negation of its condition.

positional arguments:
  regex             Regular expression to match.

options:
  -c, --count N     Specify the maximum number of operations to perform.
  -U, --fullmatch   Regular expressions are matched against the full input, not substrings of it.
  -M, --multiline   Caret and dollar in regular expressions match the beginning and end of a line
                    and a dot does not match line breaks.
  -I, --ignorecase  Ignore capitalization for alphabetic characters in regular expressions.
  -r, --retain      Move non-matching chunks out of scope rather than discarding them.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.
  -R, --reverse     Use the reverse operation.

Expand source code Browse git

class iffx(SingleRegexUnit, ConditionalUnit, docs='{0}{p}{1}'):
    """
    Filter incoming chunks by discarding those that do not match the given
    regular expression.
    """
    def __init__(self, regex, count=0, fullmatch=False, multiline=False, ignorecase=False, retain=False):
        pass

    def match(self, chunk):
        matcher = self._make_matcher(self.args.regex)
        # When no matcher could be built, every chunk passes the filter.
        if not matcher:
            return True
        return bool(matcher(chunk))

class ifps (bytes, codec='cp1252')

This unit is implemented in refinery.units.formats.ifps and has the following commandline Interface:

usage: ifps [-h] [-L] [-Q] [-0] [-v] [-F] [-b] [codec]

Disassembles compiled Pascal script files that start with the magic sequence "IFPS". These
scripts can be found, for example, when unpacking InnoSetup installers using innounp.

positional arguments:
  codec          Optionally specify the string encoding. The default is "cp1252".

options:
  -b, --bytes    Print opcode bytes in the disassembly.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class ifps(IFPSBase):
    """
    Disassembles compiled Pascal script files that start with the magic sequence "IFPS". These
    scripts can be found, for example, when unpacking InnoSetup installers using innounp.
    """
    def __init__(
        self,
        # The switch now defaults to False like every other switch argument, so that
        # the unit can be constructed from Python without passing it explicitly.
        bytes: Param[bool, Arg.Switch('-b', help='Print opcode bytes in the disassembly.')] = False,
        codec='cp1252'
    ):
        super().__init__(codec=codec, bytes=bytes)

    def process(self, data):
        # The script is parsed with the user-specified string codec, while the
        # resulting disassembly text is encoded with the codec of the unit.
        return IFPSFile(data, self.args.codec).disassembly(self.args.bytes).encode(self.codec)

    @classmethod
    def handles(cls, data) -> bool:
        # The input is recognized by its leading magic sequence.
        return data[:len(IFPSFile.Magic)] == IFPSFile.Magic

class ifpsstr (codec='cp1252')

This unit is implemented in refinery.units.formats.ifpsstr and has the following commandline Interface:

usage: ifpsstr [-h] [-L] [-Q] [-0] [-v] [-F] [codec]

Extracts strings from compiled Pascal script files that start with the magic sequence "IFPS".
These scripts can be found, for example, when unpacking InnoSetup installers using innounp.

positional arguments:
  codec          Optionally specify the string encoding. The default is "cp1252".

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class ifpsstr(IFPSBase):
    """
    Extracts strings from compiled Pascal script files that start with the magic sequence "IFPS".
    These scripts can be found, for example, when unpacking InnoSetup installers using innounp.
    """
    def process(self, data):
        parsed = IFPSFile(data, self.args.codec)
        # Each string of the script is emitted as a separate chunk.
        yield from (text.encode(self.codec) for text in parsed.strings)

    @classmethod
    def handles(cls, data) -> bool:
        magic = IFPSFile.Magic
        return data[:len(magic)] == magic

class imgdb

This unit is implemented in refinery.units.formats.imgdb and has the following commandline Interface:

usage: imgdb [-h] [-L] [-Q] [-0] [-v] [-F]

Provides access to the direct bytes of an image file. Each row of pixels is emitted as an
individual chunk.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class imgdb(Unit):
    """
    Provides access to the direct bytes of an image file. Each row of pixels is emitted as an
    individual chunk.
    """
    @Unit.Requires('Pillow', ['formats'])
    def _image():
        from PIL import Image
        return Image

    def _get_rows(self, image: Image):
        # Slice the flat pixel sequence into consecutive rows of image.width pixels.
        width = image.width
        pixels = iter(image.getdata())
        while row := list(islice(pixels, 0, width)):
            yield row

    def process(self, data):
        try:
            image = self._image.open(MemoryFile(data, output=bytes))
        except ImportError:
            # A missing Pillow installation must not be reported as a parsing problem.
            raise
        except Exception:
            raise ValueError('input could not be parsed as an image')
        # The type of the first pixel decides how every row is serialized.
        test = image.getpixel((0, 0))
        if isinstance(test, int):
            self.log_info('reading each pixel as an integer')
            for row in self._get_rows(image):
                yield bytearray(row)
        else:
            self.log_info('reading each pixel as a color value tuple')
            count = len(test)
            total = count * image.width
            for row in self._get_rows(image):
                # Use a fresh buffer per row: every row is emitted as its own chunk and
                # must not be overwritten when the next row is serialized.
                out = bytearray(total)
                for pixel, offset in zip(row, range(0, total, count)):
                    out[offset:offset + count] = pixel
                yield out

    @classmethod
    def handles(cls, data) -> bool | None:
        # Returns True for recognized image formats and None otherwise.
        if get_image_format(data) is not None:
            return True

class imgto (format='png')

This unit is implemented in refinery.units.formats.imgto and has the following commandline Interface:

usage: imgto [-h] [-L] [-Q] [-0] [-v] [-F] [format]

Convert an image to a given format.

positional arguments:
  format         An image file format like png, jpg, or bmp. The default is png.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class imgto(Unit):
    """
    Convert an image to a given format.
    """
    def __init__(
        self,
        format: Param[str, Arg.String(
            help='An image file format like png, jpg, or bmp. The default is {default}.')] = 'png'
    ):
        super().__init__(format=format)

    @Unit.Requires('Pillow', ['formats'])
    def _image():
        from PIL import Image
        return Image

    def process(self, data):
        try:
            image = self._image.open(MemoryFile(data, output=bytes))
        except ImportError:
            # Import errors propagate unchanged instead of becoming a ValueError.
            raise
        except Exception:
            raise ValueError('input could not be parsed as an image')
        # Serialize the image into an in-memory buffer using the requested format.
        with io.BytesIO() as out:
            image.save(out, self.args.format)
            converted = out.getvalue()
        return converted

    @classmethod
    def handles(cls, data) -> bool | None:
        # Returns True for recognized image formats and None otherwise.
        return True if get_image_format(data) is not None else None

class imgtp (transformation='R')

This unit is implemented in refinery.units.formats.imgtp and has the following commandline Interface:

usage: imgtp [-h] [-L] [-Q] [-0] [-v] [-F] [transformation]

Perform a number of transpositions on an input image. The transformation string must be a
sequence composed of the letters H, V, and R. Each letter represents an operation:

- R rotates the image to the left by 90 degrees.
- V flips the image top to bottom (vertically).
- H flips the image left to right (horizontally).

These transpositions are performed in the order in which they are specified.

positional arguments:
  transformation  The transformation sequence; default is R.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.
  -F, --iff       Only apply unit if it can handle the input format. Specify twice to drop all
                  other chunks.

Expand source code Browse git

class imgtp(Unit):
    """
    Perform a number of transpositions on an input image. The transformation string must be a
    sequence composed of the letters H, V, and R. Each letter represents an operation:

    - R rotates the image to the left by 90 degrees.
    - V flips the image top to bottom (vertically).
    - H flips the image left to right (horizontally).

    These transpositions are performed in the order in which they are specified.
    """
    def __init__(
        self,
        transformation: Param[str, Arg.String(help='The transformation sequence; default is {default}.')] = 'R'
    ):
        # Translate every letter of the sequence into the matching option of T.
        steps = []
        for letter in transformation:
            steps.append(Arg.AsOption(letter, T))
        super().__init__(transformation=steps)

    @Unit.Requires('Pillow', ['formats'])
    def _image():
        # The import is local to this accessor; it returns Pillow's Image module.
        from PIL import Image
        return Image

    def process(self, data):
        """
        Apply the configured transpositions in order and return the image, saved in the
        same format that was detected for the input. Raises a `ValueError` when the input
        cannot be opened as an image.
        """
        pillow = self._image

        try:
            picture = pillow.open(MemoryFile(data, output=bytes))
        except Exception:
            raise ValueError('input could not be parsed as an image')

        # Remember the format of the opened image; it is used for saving the result.
        original_format = picture.format
        transpose = pillow.Transpose
        operations = {
            T.V: transpose.FLIP_TOP_BOTTOM,
            T.H: transpose.FLIP_LEFT_RIGHT,
            T.R: transpose.ROTATE_90,
        }
        for step in self.args.transformation:
            picture = picture.transpose(operations[step])

        result = io.BytesIO()
        with result:
            picture.save(result, original_format)
            return result.getvalue()

    @classmethod
    def handles(cls, data) -> bool | None:
        # Only a positive verdict is ever given; unknown data yields None.
        detected = get_image_format(data)
        return None if detected is None else True

class imphash (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.imphash and has the following commandline Interface:

usage: imphash [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Implements the import hash for PE files.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class imphash(HashUnit):
    """
    Implements the import hash for PE files.
    """

    def _algorithm(self, data):
        # The hash is computed in PEFILE mode, which yields a hexadecimal string; the unit
        # returns the corresponding raw bytes.
        parsed = lief.load_pe(data)
        mode = lief.PE.IMPHASH_MODE.PEFILE
        hexdigest = lief.PE.get_imphash(parsed, mode)
        return bytes.fromhex(hexdigest)

class innopwd

This unit is implemented in refinery.units.formats.archive.innopwd and has the following commandline Interface:

usage: innopwd [-h] [-L] [-Q] [-0] [-v] [-F]

This unit emulates an InnoSetup installer in an attempt to determine the installer password. This
works only when the password is contained within the script, but several malware samples are
known to use this technique.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class innopwd(Unit):
    """
    This unit emulates an InnoSetup installer in an attempt to determine the installer password.
    This works only when the password is contained within the script, but several malware samples
    are known to use this technique.
    """
    def process(self, data: bytearray):
        if not data.startswith(IFPSFile.Magic):
            # Installer input: fetch an encrypted sample that candidates can be checked against.
            inno = InnoArchive(data, self)
            file = inno.get_encrypted_sample()
            if file is None:
                self.log_info('the archive is not password-protected, password is empty')
                return None
            self.log_info('password type:', file.password_type.name)
            self.log_info('password hash:', file.password_hash)
            self.log_info('password salt:', file.password_salt)
        else:
            # Script input: there is no sample, so every candidate is emitted unchecked.
            inno = IFPSFile(data)
            self.log_info('running in script-only mode; cannot check passwords')
            file = None

        for candidate in InnoSetupEmulator(inno).emulate_installation():
            # The emulation yields other events as well; only new passwords are of interest.
            if isinstance(candidate, NewPassword):
                if file and not inno.check_password(file, candidate):
                    self.log_info('discarding password:', candidate)
                else:
                    yield candidate.encode(self.codec)
                    if file is not None:
                        # A candidate that passed the check ends the search.
                        self.log_info('aborting emulation after validating password')
                        return

    @classmethod
    def handles(cls, data):
        # Delegates format detection to the InnoSetup probe.
        verdict = is_inno_setup(data)
        return verdict

class isaac (key, discard=0, stateful=False)

This unit is implemented in refinery.units.crypto.cipher.isaac and has the following commandline Interface:

usage: isaac [-h] [-L] [-Q] [-0] [-v] [-R] [-d N] [-s] key

The ISAAC (Indirection, Shift, Accumulate, Add, Count) cipher.

positional arguments:
  key              The encryption key.

options:
  -d, --discard N  Discard the first N bytes of the keystream, 0 by default.
  -s, --stateful   Do not reset the key stream while processing the chunks of one frame.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation.

Expand source code Browse git

class isaac(StreamCipherUnit):
    """
    The ISAAC (Indirection, Shift, Accumulate, Add, Count) cipher.
    """

    def keystream(self) -> Iterable[int]:
        """
        Generate the keystream for the configured key. The generator never terminates on its
        own: every round of the main loop computes 0x100 result words and emits them through
        `chunks.pack`.
        """
        key = self.args.key

        # A is the accumulator, B carries the most recent result, C counts the rounds.
        A: int = 0
        B: int = 0
        C: int = 0
        # S is the 8-word mixing state, T the 0x100-word internal table (filled below).
        S: list[int] = [0x9E3779B9] * 8
        T: list[int] = []
        # The key is padded with zero bytes to 0x400 bytes and split into 4-byte words. A key
        # longer than 0x400 bytes makes bytearray() raise because of the negative count.
        # K is later reused as the buffer for the result words.
        K = list(chunks.unpack(key + bytearray(0x400 - len(key)), 4, bigendian=False))
        # Mask that truncates every intermediate value to 32 bits.
        U = 0xFFFFFFFF

        def _mix_state():
            # One mixing pass over the eight state words; S is updated in place and returned.
            a, b, c, d, e, f, g, h = S
            a ^= (b << 0x0B) & U; d = d + a & U; b = b + c & U # noqa
            b ^= (c >> 0x02) & U; e = e + b & U; c = c + d & U # noqa
            c ^= (d << 0x08) & U; f = f + c & U; d = d + e & U # noqa
            d ^= (e >> 0x10) & U; g = g + d & U; e = e + f & U # noqa
            e ^= (f << 0x0A) & U; h = h + e & U; f = f + g & U # noqa
            f ^= (g >> 0x04) & U; a = a + f & U; g = g + h & U # noqa
            g ^= (h << 0x08) & U; b = b + g & U; h = h + a & U # noqa
            h ^= (a >> 0x09) & U; c = c + h & U; a = a + b & U # noqa
            S[:] = a, b, c, d, e, f, g, h
            return S

        def _initialize_with(R: list[int]):
            # For each group of eight words: add R[i:i+8] to the state, mix, and store the
            # mixed state as T[i:i+8]. During the first pass T is still empty, so the slice
            # assignment appends to it.
            for i in range(0, 0x100, 8):
                S[:] = (x + R[j] & U for j, x in enumerate(S, i))
                T[i:i + 8] = _mix_state()

        # Scramble the initial constant before any key material is absorbed.
        for _ in range(4):
            _mix_state()

        # First pass absorbs the key words, second pass feeds the table back into itself.
        _initialize_with(K)
        _initialize_with(T)

        # Shift applied to the accumulator; the entry used is selected by i % 4.
        operations = [
            (__lshift__, 0x0D),
            (__rshift__, 0x06),
            (__lshift__, 0x02),
            (__rshift__, 0x10),
        ]

        while True:
            C = (C + 1) & U
            B = (B + C) & U
            for i in range(0x100):
                X = T[i]
                shift, k = operations[i % 4]
                A = (A ^ shift(A, k)) & U
                # i ^ 0x80 addresses the word half a table away from position i.
                A = (A + T[i ^ 0x80]) & U
                # T[+i] is simply T[i]; the new table word is looked up indirectly via X.
                Y = T[+i] = T[X // 4 & 0xFF] + A + B & U
                # K[~i] is K[-(i + 1)], so the result words are stored back to front.
                B = K[~i] = X + T[Y // 1024 & 0xFF] & U
            # NOTE(review): the third positional argument presumably selects big-endian
            # packing, mirroring the bigendian keyword of chunks.unpack; confirm.
            yield from chunks.pack(K, 4, True)

class jamv (name, data=None)

This unit is implemented in refinery.units.meta.jamv and has the following commandline Interface:

usage: jamv [-h] [-L] [-Q] [-0] [-v] format [data]

Short for "Join as Meta Variables": It joins all chunks in the current frame into a single one by
storing the contents of each chunk as the contents of a meta variable in the output.

positional arguments:
  format         A format string that specifies the variable name for storing the chunk.
  data           Optionally specify the body of the fused output chunk; empty by default.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class jamv(Unit):
    """
    Short for "Join as Meta Variables": It joins all chunks in the current frame into a single one
    by storing the contents of each chunk as the contents of a meta variable in the output.
    """
    def __init__(
        self,
        name: Param[str, Arg.String(metavar='format', help=(
            'A format string that specifies the variable name for storing the chunk.'))],
        data: Param[buf, Arg.Binary(metavar='data', help=(
            'Optionally specify the body of the fused output chunk; empty by default.'))] = None,
    ):
        super().__init__(name=name, data=data)

    def process(self, data: Chunk):
        """
        Finalize the fused chunk prepared by `filter`: the collected variables are merged into
        its metadata and its body is replaced by the configured data (empty by default).
        Raises a `RuntimeError` when the chunk does not carry the dictionary that `filter`
        attaches, which is the case when the unit is used outside of a frame.
        """
        try:
            meta = data.temp
        except Exception:
            meta = None
        if not isinstance(meta, dict):
            raise RuntimeError('this unit can only be used inside a frame')
        data.meta.update(meta)
        data[:] = self.args.data or B''
        return data

    def filter(self, inputs):
        """
        Collect all visible chunks of the frame into a dictionary keyed by the formatted
        variable name and emit only the first visible chunk, with that dictionary attached as
        its `temp` attribute. Invisible chunks are passed through unchanged.
        """
        head = None
        spec = self.args.name
        meta = {}
        for chunk in inputs:
            if not chunk.visible:
                yield chunk
                continue
            used = set()
            name = chunk.meta.format_str(spec, self.codec, [chunk], used=used)
            if head is None:
                # The first visible chunk becomes the output; the variables that were
                # consumed to build its name are removed from its metadata.
                for u in used:
                    chunk.meta.discard(u)
                head = chunk
            if name in meta:
                self.log_warn('overwriting duplicate variable:', name, clip=True)
            meta[name] = chunk
        # Compare against None rather than relying on truthiness: a first chunk with an empty
        # body must not cause the fused output to be dropped.
        if head is not None:
            head.temp = meta
            yield head

class jcalg (ignore_header=False)

This unit is implemented in refinery.units.compression.jcalg and has the following commandline Interface:

usage: jcalg [-h] [-L] [-Q] [-0] [-v] [-F] [-g]

JCALG decompression.

options:
  -g, --ignore-header  Keep decompressing even after the output has reached the final size as
                       given by the header value.

generic options:
  -h, --help           Show this help message and exit.
  -L, --lenient        Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet          Disables all log output.
  -0, --devnull        Do not produce any output.
  -v, --verbose        Specify up to two times to increase log level.
  -F, --iff            Only apply unit if it can handle the input format. Specify twice to drop
                       all other chunks.

Expand source code Browse git

class jcalg(Unit):
    """
    JCALG decompression.
    """
    def __init__(
        self,
        ignore_header: Param[bool, Arg('-g', help=(
            'Keep decompressing even after the output has reached the final size as given by the header value.'))] = False,
    ):
        super().__init__(ignore_header=ignore_header)

    def process(self, data: bytearray):
        """
        Decompress the input. When the data starts with the magic bytes `JC`, the two header
        values that follow (output size and checksum) are used to bound and verify the
        output; otherwise the header is assumed to be missing and decompression starts at
        offset zero. Size and checksum mismatches are logged, never raised.
        """
        with MemoryFile() as output, StructReader(data) as reader:
            if reader.read(2) != B'JC':
                self.log_warn('data does not begin with magic sequence, assuming that header is missing')
                reader.seek(0)
                size = checksum = None
            else:
                size = reader.u32()
                checksum = reader.u32()
            if self.args.ignore_header:
                size = None
            self._decompress(output, reader, size)
            if size is not None:
                # Report the number of bytes actually produced, not the length of the
                # compressed input.
                produced = len(output)
                if produced > size:
                    self.log_info(F'truncating to size {size}')
                    output.truncate(size)
                elif produced < size:
                    self.log_warn(F'header size was {size}, but only {produced} bytes were decompressed')
            data = output.getvalue()
            # A missing header and a header checksum of zero both skip verification.
            if checksum:
                c = self._checksum(data)
                if c != checksum:
                    self.log_warn(F'header checksum was {checksum:08X}, computed value is {c:08X}')
            return data

    @classmethod
    def handles(cls, data):
        # Only a positive verdict is given; data without the magic bytes yields None.
        if data[:2] == B'JC':
            return True

    def _checksum(self, data):
        """
        Compute the 32-bit checksum of the decompressed data over its 4-byte words. If the
        length is not a multiple of four, the little-endian value of the last four bytes is
        processed as one additional word.
        """
        from refinery.lib import chunks
        checksum = 0
        it = chunks.unpack(data, 4)
        if len(data) % 4:
            import itertools
            it = itertools.chain(it, (int.from_bytes(data[-4:], 'little'),))
        for chunk in it:
            checksum += chunk
            # Adds the word rotated left by one bit (within 32 bits), plus one.
            checksum ^= ((chunk & 0x7FFFFFFF) << 1) + (chunk >> 31) + 1
            checksum &= 0xFFFFFFFF
        return checksum

    def _decompress(self, writer: MemoryFile, reader_: StructReader[bytearray], size: int | None = None):
        """
        Decode the compressed stream from `reader_` into `writer`. Decoding stops at the
        end-of-stream code or, when `size` is given and nonzero, as soon as the output has
        reached that many bytes. The operation is selected by up to three flag bits:

        - `1`: a literal byte
        - `01`: a match with variable-length encoded index and length
        - `000`: a short match, a change of the index base, or the end of the stream
        - `001`: a one-byte phrase, a change of the literal encoding, or raw byte blocks
        """
        index = 1
        base = 8
        # The literal encoding is undefined until the stream sets it via a `001` operation.
        literal_bits = None
        literal_offset = None
        flags = BitBufferedReader(reader_, 32)

        while True:
            if size and len(writer) >= size:
                break
            if flags.next():
                # Literal: read with the current bit width, shifted by the current offset.
                b = flags.read(literal_bits) + literal_offset
                b = b & 0xFF
                writer.write_byte(b)
                continue
            if flags.next():
                high = flags.variable_length_integer()
                if high == 2:
                    # Reuse the index of the previous match.
                    match_length = flags.variable_length_integer()
                else:
                    index = ((high - 3) << base) + flags.read(base)
                    match_length = flags.variable_length_integer()
                    # The length is adjusted depending on the magnitude of the index.
                    if index >= 0x10000:
                        match_length += 3
                    elif index >= 0x37FF:
                        match_length += 2
                    elif index >= 0x27F:
                        match_length += 1
                    elif index <= 127:
                        match_length += 4
                writer.replay(index, match_length)
                continue
            if not flags.next():
                new_index = flags.read(7)
                match_length = 2 + flags.read(2)
                if new_index == 0:
                    # Index zero is a control code: length 2 ends the stream, any other
                    # length gives the number of bits (plus one) to read for the new base.
                    if match_length == 2:
                        break
                    base = flags.read(match_length + 1)
                else:
                    index = new_index
                    writer.replay(index, match_length)
                continue
            one_byte_phrase_value = flags.read(4) - 1
            if one_byte_phrase_value == 0:
                writer.write_byte(0)
            elif one_byte_phrase_value > 0:
                # Repeat the byte found that many positions before the end of the output.
                b = writer.getvalue()[-one_byte_phrase_value]
                writer.write_byte(b)
            else:
                if not flags.next():
                    # Update the literal encoding: 7 or 8 bits, with an explicit offset
                    # unless the width is 8.
                    literal_bits = 7 + flags.next()
                    literal_offset = 0
                    if literal_bits != 8:
                        literal_offset = flags.read(8)
                    continue
                # Raw data: blocks of 0x100 bytes, each followed by a continuation flag.
                while True:
                    for _ in range(0x100):
                        b = flags.read(8)
                        writer.write_byte(b)
                    if not flags.next():
                        break

class jvdasm (*paths, gray=False, path=b'path', regex=False, exact=False, fuzzy=0, drop_path=False, join_path=False, list=False)

This unit is implemented in refinery.units.formats.java.jvdasm and has the following commandline Interface:

usage: jvdasm [-h] [-L] [-Q] [-0] [-v] [-F] [-g] [-l] [-j | -d] [-z | -e] [-r] [-P NAME]
              [path ...]

Disassembles the JVM bytecode instructions of methods of classes defined in Java class files. The
unit is implemented as a path extractor and each path name corresponds to the name of one method
defined in the class file.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; the
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -g, --gray       Disable colored output.
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class jvdasm(PathExtractorUnit):
    """
    Disassembles the JVM bytecode instructions of methods of classes defined in Java class
    files. The unit is implemented as a path extractor and each path name corresponds to the
    name of one method defined in the class file.
    """
    # Width of the longest opcode name; used to align the argument column of the listing.
    _OPC_STRLEN = max(len(op.name) for op in opc)

    def _hex(self, bytestring, sep=''):
        """
        Return the two-digit lowercase hex representation of each byte, joined by `sep`.
        """
        return sep.join(F'{x:02x}' for x in bytestring)

    def __init__(
        self, *paths,
        gray: Param[bool, Arg.Switch('-g', help='Disable colored output.')] = False,
        **keywords
    ):
        super().__init__(*paths, gray=gray, **keywords)

    def unpack(self, data):
        """
        Parse the class file and yield one `UnpackResult` per method that has a `Code`
        attribute; the body of each result is the textual disassembly of that method.
        """
        def _name(method: JvClassMember):
            # Constructors are displayed under the unqualified name of the class.
            name = method.name
            if name == '<init>':
                _, _, name = str(jc.this).rpartition('/')
            elif m := re.fullmatch('<(.*?)>', name):
                # NOTE(review): m[0] is the entire match, so the angle brackets are kept
                # (for example '<clinit>' becomes '.<clinit>'); confirm m[1] was not intended.
                name = F'.{m[0]}'
            return name

        def _path(method: JvClassMember):
            return F'{jc.this!s}/{_name(method)}'
        try:
            # Raising ImportError here selects the colorless branch when colors are disabled
            # or the output is not a terminal.
            if self.args.gray or not self.isatty():
                raise ImportError
            import colorama
        except ImportError:
            # Stand-in for colorama.Fore: every color attribute is the empty string.
            class _FG():
                def __getattr__(self, _):
                    return ''
            FG = _FG()
            RS = ''
        else:
            FG = colorama.Fore
            RS = colorama.Style.RESET_ALL
        finally:
            # Color assignments for the various syntax elements; runs for either branch.
            c_none = RS
            c_space = FG.LIGHTCYAN_EX
            c_types = FG.LIGHTCYAN_EX
            c_member = FG.LIGHTYELLOW_EX
            c_kwd = FG.LIGHTYELLOW_EX
            c_const = FG.LIGHTRED_EX
            c_string = FG.LIGHTRED_EX
            c_address = FG.LIGHTBLACK_EX
            c_label = RS

        def _color(arg, offset):
            # Format one instruction argument. `offset` is the address of the instruction; an
            # integer argument that lands on a known label when added to it is shown as an
            # absolute address.
            if isinstance(arg, (str, JvString)):
                color = c_string
            elif isinstance(arg, (JvClassProperty, JvTypePath)):
                ns, dd, prop = str(arg).partition('::')
                if not dd:
                    return repr(arg)
                ns = ns.split('.')
                ns = '.'.join(F'{c_space}{p}{c_none}' for p in ns)
                return F'{ns}{dd}{c_member}{prop}{c_none}'
            elif isinstance(arg, int) and arg + offset in labels:
                return F'{c_label}0x{arg + offset:08X}{c_none}'
            elif isinstance(arg, (bool, int, float)):
                color = c_const
            elif isinstance(arg, JvBaseType):
                color = c_kwd
            else:
                return repr(arg)
            return F'{color}{arg!r}{c_none}'

        jc = JvClassFile(data)
        tab = ' '
        namespace = '.'.join(str(jc.this).split('/'))
        opcw = self._OPC_STRLEN
        # path_counter holds how often each path occurs; path_index numbers the duplicates.
        path_counter = collections.defaultdict(int)
        path_index = collections.defaultdict(int)

        for method in jc.methods:
            path_counter[_path(method)] += 1
        for method in jc.methods:
            # The for/else skips methods that have no attribute named 'Code'.
            for attribute in method.attributes:
                if attribute.name == 'Code':
                    break
            else:
                self.log_warn(F'no code found for method: {method.name}')
                continue
            code: JvCode = attribute.parse(JvCode)
            with io.StringIO() as display:
                # First line of the listing: return type, qualified method name, arguments.
                rv, args = _parse_descriptor(method.descriptor, c_none, c_space, c_types, c_kwd)
                args = ', '.join(args)
                print(
                    F'{c_types}{rv}{c_none} {c_space}{namespace}{c_none}'
                    F'::{c_member}{_name(method)}{c_none}({args})', file=display)
                offset = 0
                labels = set()
                addresses = set()

                # First pass: collect the addresses of all instructions and the targets of
                # switch tables and goto instructions.
                for op in code.disassembly:
                    addresses.add(offset)
                    if op.table:
                        labels.update(offset + jmp for jmp in op.table.values())
                    elif op.code in (opc.goto, opc.goto_w):
                        labels.update(offset + arg for arg in op.arguments if isinstance(arg, int))
                    offset += len(op.raw)

                offset = 0
                # Only targets that coincide with the start of an instruction are labels.
                labels = labels & addresses

                # Second pass: print one line per instruction.
                for op in code.disassembly:
                    if offset in labels:
                        label = F'{c_label}{offset:08X}{c_none}:'
                    else:
                        label = F'{c_address}{offset:08X}{c_none}:'
                    addr = offset
                    olen = len(op.raw)
                    offset += olen
                    if op.table is None:
                        args = ', '.join(_color(a, addr) for a in op.arguments)
                    else:
                        # ow is the raw size of one table entry. The entries other than the
                        # default one are cut off the instruction's own hex dump and printed
                        # on separate lines below it.
                        ow = 4 if op.code is opc.tableswitch else 8
                        olen = olen - (len(op.table) - 1) * ow
                        args = F'___default => {c_label}{op.table[None] + addr:#010x}{c_none}'
                        jmps = []
                        for k, (key, jmp) in enumerate(op.table.items()):
                            if key is None:
                                continue
                            raw = self._hex(op.raw[olen + k * ow: olen + k * ow + ow], ' ')
                            jmps.append(
                                F'{label}{tab}'
                                F'{raw!s:<{opcw + 15}} '
                                F'{c_const}{key:#010x}{c_none} => '
                                F'{c_label}{jmp + addr:#010x}{c_none}')
                        args = '\n'.join((args, *jmps))
                    opch = self._hex(op.raw[:olen], ' ')
                    # A hex dump wider than the column is followed by a line break and an
                    # empty, padded hex column so the mnemonic stays aligned.
                    if len(opch) > 14:
                        opch += F'\n{label}{tab}{tab:<15}'
                    print(
                        F'{label}{tab}'
                        F'{opch:<15}'
                        F'{c_kwd}{op.code!r:<{opcw}}{c_none} {args}', file=display)
                path = _path(method)
                # Paths that occur more than once receive an index suffix.
                if path_counter[path] > 1:
                    k = path_index[path]
                    path_index[path] = k + 1
                    path = F'{path}[{k}]'
                yield UnpackResult(path, display.getvalue().encode(self.codec))

    @classmethod
    def handles(cls, data):
        # Checks for the four magic bytes CA FE BA BE at the start of the data.
        return data.startswith(B'\xCA\xFE\xBA\xBE')

class jvstr

This unit is implemented in refinery.units.formats.java.jvstr and has the following commandline Interface:

usage: jvstr [-h] [-L] [-Q] [-0] [-v] [-F]

Extract string constants from Java class files.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class jvstr(Unit):
    """
    Extract string constants from Java class files.
    """
    def process(self, data):
        # Every string exposed by the parsed class file becomes one output chunk, encoded
        # with the codec of this unit.
        yield from (text.encode(self.codec) for text in JvClassFile(data).strings)

    @classmethod
    def handles(cls, data):
        # The input is accepted when it starts with the four magic bytes CA FE BA BE.
        magic = B'\xCA\xFE\xBA\xBE'
        return data.startswith(magic)

class kblob

This unit is implemented in refinery.units.crypto.keyderive.kblob and has the following commandline Interface:

usage: kblob [-h] [-L] [-Q] [-0] [-v]

Extracts a key from a Microsoft Crypto API BLOB structure.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class kblob(Unit):
    """
    Extracts a key from a Microsoft Crypto API BLOB structure.
    """

    def process(self, data):
        parsed = CRYPTOKEY(data)
        try:
            # The key bytes are returned with the blob type and algorithm names attached
            # as labels.
            key = bytes(parsed.key)
            labels = dict(
                type=parsed.header.type.name,
                algorithm=parsed.header.algorithm.name,
            )
            return self.labelled(key, **labels)
        except AttributeError as error:
            # A missing attribute on the parsed structure means no key can be obtained.
            raise ValueError(F'unable to derive key from {parsed.header.type!s}') from error

class keccak (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: keccak [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the KECCAK hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

class kramer

This unit is implemented in refinery.units.malware.kramer and has the following commandline Interface:

usage: kramer [-h] [-L] [-Q] [-0] [-v]

Deobfuscate Python samples obfuscated with Kramer.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class kramer(Unit):
    """
    Deobfuscate Python samples obfuscated with Kramer.
    """

    # Encrypted value that is decoded as a line feed when it is smaller than the key.
    _LINEBREAK_MAGIC = 950

    def process(self, data):
        """
        Locate the encoded string and candidate keys in the code extracted from the input,
        then return the first decryption that consists only of whitespace and printable
        ASCII. Raises a `ValueError` when no encoded string or separator is found and a
        `RuntimeError` when no key produces a valid result.
        """
        # kramer: longest string constant seen so far; secret: all integer constants.
        kramer = ''
        secret = set()
        _pyver = None

        def crawl(code: CodeType, depth=1):
            # Walk the instructions of a code object and collect string and integer
            # arguments. Any other argument that is neither None nor a tuple is crawled
            # recursively; failures to do so are only logged.
            nonlocal kramer
            for instruction in disassemble_code(code, _pyver):
                arg = instruction.argval
                if arg is None:
                    continue
                if isinstance(arg, tuple):
                    continue
                if isinstance(arg, str):
                    if len(arg) > len(kramer):
                        kramer = arg
                    continue
                if isinstance(arg, int):
                    secret.add(arg)
                    continue
                try:
                    crawl(arg, depth + 1)
                except Exception as E:
                    self.log_info(F'error crawling arg of type {type(arg).__name__} at depth {depth}: {E}')

        for code in extract_code_from_buffer(bytes(data)):
            # crawl reads _pyver through its closure, so it must be set before each call.
            _pyver = code.version
            crawl(code.container)

        if not kramer:
            raise ValueError('could not find the encoded string')

        # The first run of non-hex characters is taken to be the separator between values.
        separator = re.search('[^a-fA-F0-9]+', kramer)

        if not separator:
            raise ValueError('no separator detected; encoding method may have changed')

        def rotchar(c: int):
            # Advance by one position within the cycle a..z, 0..9; 'z' wraps to '0' and '9'
            # wraps to 'a'. Every other value is returned unchanged.
            if c in range(0x61, 0x7a) or c in range(0x30, 0x39):
                return c + 1
            if c == 0x7a:
                return 0x30
            if c == 0x39:
                return 0x61
            return c

        def decrypt(c: int, k: int):
            # Subtract the key and rotate; a result outside the byte range, or a value below
            # the key that is not the line break marker, means the key is wrong.
            if c >= k:
                out = rotchar(c - k)
                if out not in range(0x100):
                    raise _WrongKey
                return out
            if c == self._LINEBREAK_MAGIC:
                return 0x0A
            raise _WrongKey

        def decrypt_with_key(key: int):
            # `encrypted` is assigned further below, before this function is first called.
            decrypted = bytearray(decrypt(c, key) for c in encrypted)
            # Accept only output made up of whitespace and printable ASCII characters.
            if not re.fullmatch(B'[\\s!-~]+', decrypted):
                raise _WrongKey
            return decrypted

        separator = separator.group(0)
        # Each piece is the hex encoding of a single encoded character; its code point is
        # the encrypted value.
        encrypted = [ord(bytes.fromhex(e).decode()) for e in kramer.split(separator)]

        # The key search is limited to the 0xFF values up to the smallest encrypted value
        # that is not the line break marker.
        ubound = min(x for x in encrypted if x != self._LINEBREAK_MAGIC)
        lbound = ubound - 0xFF

        secret = {k for k in secret if k > lbound and k < ubound}
        self.log_debug('potential secrets from code:', secret)

        # Integer constants from the code are tried first, largest first.
        for key in sorted(secret, reverse=True):
            try:
                return decrypt_with_key(key)
            except _WrongKey:
                pass

        self.log_info(F'all candidates failed, searching [{lbound}, {ubound}]')

        # Fall back to trying every key in the interval, from the upper bound downwards.
        for key in range(ubound, lbound - 1, -1):
            try:
                self.log_debug('attempting key:', key)
                return decrypt_with_key(key)
            except _WrongKey:
                pass

        raise RuntimeError('could not find decryption key')

class lnk (tabular=False, details=False)

This unit is implemented in refinery.units.formats.lnk and has the following commandline Interface:

usage: lnk [-h] [-L] [-Q] [-0] [-v] [-F] [-t] [-d]

Parse Windows Shortcuts (LNK files) and return the parsed information in JSON format. This unit
is a thin wrapper around the LnkParse3 library.

options:
  -t, --tabular  Print information in a table rather than as JSON.
  -d, --details  Print all details; some properties are hidden by default.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class lnk(Unit):
    """
    Parse Windows Shortcuts (LNK files) and return the parsed information in JSON format. This
    unit is a thin wrapper around the LnkParse3 library.
    """

    @Unit.Requires('LnkParse3>=1.4.0', ['formats', 'default', 'extended'])
    def _LnkParse3():
        import LnkParse3
        return LnkParse3

    # Top-level sections that remain in the output when details are not requested. An ellipsis
    # keeps the whole section; a set keeps only the listed properties of that section.
    _PATHS = {
        'data': ...,
        'header': {'creation_time', 'accessed_time', 'modified_time', 'windowstyle'},
        'link_info': {'local_base_path', 'location'},
    }

    def __init__(
        self,
        tabular: Param[bool, Arg('-t', help='Print information in a table rather than as JSON.')] = False,
        details: Param[bool, Arg('-d', help='Print all details; some properties are hidden by default.')] = False,
    ):
        super().__init__(tabular=tabular, details=details)

    def process(self, data):
        # Parse the shortcut while log output is suppressed.
        with NoLogging():
            document = self._LnkParse3.lnk_file(MemoryFile(data)).get_json()
        if not self.args.details:
            visible = self._PATHS
            # Remove every top-level section that is not listed in _PATHS. The names are
            # collected first because the mapping cannot be modified while iterating it.
            for name in [name for name in document if name not in visible]:
                del document[name]
            # Reduce the remaining sections to their allowed properties.
            for name, allowed in visible.items():
                if allowed is Ellipsis or name not in document:
                    continue
                section = document[name]
                for prop in [prop for prop in section if prop not in allowed]:
                    del section[prop]
        with JSONEncoderEx as encoder:
            printer = ppjson(tabular=self.args.tabular)
            yield from printer._pretty_output(
                document, indent=4, cls=encoder, ensure_ascii=False)

    @classmethod
    def handles(cls, data):
        # The input is accepted when its first 20 bytes equal this fixed byte sequence.
        signature = B'L\0\0\0\01\x14\02\0\0\0\0\0\xC0\0\0\0\0\0\0F'
        return data[:20] == signature

class loop (iterations, statements, do_while, do_until, fullmatch=False, multiline=False, ignorecase=False)

This unit is implemented in refinery.units.meta.loop and has the following commandline Interface:

usage: loop [-h] [-L] [-Q] [-0] [-v] [-w RE] [-u RE] [-U] [-M] [-I] iterations statements

Applies a given multibin suffix to the input chunk repeatedly. For example, the following command
would carve the largest base64-encoded buffer from the input, decode it, and then decompress the
result 20 times:

    emit data | loop 20 csd[b64]:zl

Notably, the argument after the iterations is a suffix, which means that handlers are applied
from left to right (not from right to left). The loop is aborted and the previous result returned
if the newly computed result is empty. If an error occurs while computing the statements and
the unit is lenient (i.e. the -L switch is set), the last known result is returned.

positional arguments:
  iterations        The number of repeated applications of the statements.
  statements        A multibin expression suffix representing the loop statements.

options:
  -w, --while RE    Halt when the given regular expression does not match the data.
  -u, --until RE    Halt when the given regular expression matches the data.
  -U, --fullmatch   Regular expressions are matched against the full input, not substrings of it.
  -M, --multiline   Caret and dollar in regular expressions match the beginning and end of a line
                    and a dot does not match line breaks.
  -I, --ignorecase  Ignore capitalization for alphabetic characters in regular expressions.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class loop(RegexUnit):
    """
    Applies a given multibin suffix to the input chunk repeatedly. For example, the following
    command would carve the largest base64-encoded buffer from the input, decode it, and then
    decompress the result 20 times:

        emit data | loop 20 csd[b64]:zl

    Notably, the argument after the iterations is a suffix, which means that handlers are applied
    from left to right (not from right to left). The loop is aborted and the previous result
    returned if the newly computed result is empty. If an error occurs while computing
    the statements and the unit is lenient (i.e. the `-L` switch is set), the last known result
    is returned.
    """

    def __init__(
        self,
        iterations: Param[int, Arg.Number(metavar='iterations',
            help='The number of repeated applications of the statements.')],
        statements: Param[str, Arg.String(metavar='statements',
            help='A multibin expression suffix representing the loop statements.')],
        do_while: Param[str, Arg.RegExp('-w', '--while', metavar='RE',
            help='Halt when the given regular expression does not match the data.')],
        do_until: Param[str, Arg.RegExp('-u', '--until', metavar='RE',
            help='Halt when the given regular expression matches the data.')],
        fullmatch=False, multiline=False, ignorecase=False,
    ):
        super().__init__(
            iterations=iterations,
            statements=statements,
            do_while=do_while,
            do_until=do_until,
            fullmatch=fullmatch,
            multiline=multiline,
            ignorecase=ignorecase,
        )

    def process(self, data):
        limit = self.args.iterations
        # Step numbers in log messages are zero-padded to the width of the iteration count.
        pad = len(str(limit))
        proceed = self._while
        finished = self._until

        for step in range(limit):
            prefix = F'step {step:0{pad}}'
            if proceed and not proceed(data):
                self.log_info(F'{prefix}: stopping, while-condition violated')
                break
            if finished and finished(data):
                self.log_info(F'{prefix}: stopping, until-condition satisfied')
                break
            try:
                handler = DelayedBinaryArgument(
                    self.args.statements, reverse=True, seed=data)
                result = handler(data)
            except Exception as error:
                self.log_info(F'{prefix}: error;', exception_to_string(error))
                msg = F'Stopped after {step} steps, increase verbosity for additional details.'
                # The data from the last successful step is attached to the exception.
                raise RefineryPartialResult(msg, data) from error
            if not result:
                self.log_info(F'{prefix}: stopping after empty result')
                break
            # The chunk is updated in place with the result of this step.
            data[:] = result
            self.log_debug(F'{prefix}: data =', data, clip=True)

        return data

    @property
    def _while(self):
        # Matcher built from the --while expression.
        return self._make_matcher(self.args.do_while)

    @property
    def _until(self):
        # Matcher built from the --until expression.
        return self._make_matcher(self.args.do_until)

class lz4

This unit is implemented in refinery.units.compression.lz4 and has the following commandline Interface:

usage: lz4 [-h] [-L] [-Q] [-0] [-v]

LZ4 block decompression. See also:
https://github.com/lz4/lz4/blob/master/doc/lz4_Block_format.md#compressed-block-format

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class lz4(Unit):
    """
    LZ4 block decompression. See also:
    https://github.com/lz4/lz4/blob/master/doc/lz4_Block_format.md#compressed-block-format
    """
    def _read_block(self, reader: StructReader, output: io.BytesIO, ubound=None):
        """
        Decode LZ4 sequences from `reader` and append the decompressed bytes to `output`. If
        `ubound` is given, it is the number of input bytes that make up the block: decoding stops
        when exactly that many bytes have been consumed after a literal. A `ValueError` is raised
        when more than `ubound` bytes are consumed or when the bound is reached after a match
        length was read. A `RefineryPartialResult` that carries the output decoded so far is
        raised when an invalid match offset is encountered.
        """
        entry = reader.tell()
        # Output position recorded after the previous match; used for the end-of-stream check.
        lastend = 0

        def ubound_check():
            # Returns True when the block is consumed exactly; raises when it was overrun.
            if ubound is None:
                return False
            consumed = reader.tell() - entry
            if consumed > ubound:
                raise ValueError(F'upper bound {ubound} exceeded by {consumed - ubound} in LZ4 block')
            return consumed == ubound

        while not reader.eof:
            # Each sequence starts with two nibbles: the match length, then the literal length.
            reflen = reader.read_nibble()
            litlen = reader.read_nibble()
            litlen = reader.read_size(litlen)
            literal = reader.read(litlen)
            output.write(literal)
            if ubound_check():
                break
            try:
                refpos = reader.u16()
            except EOFError:
                break
            # A valid match offset is at least 1 and at most the number of bytes written so far.
            if refpos - 1 not in range(output.tell()):
                with StreamDetour(output, lastend):
                    if output.read(len(literal)) == literal:
                        # This literal could have been encoded in the last match, but it wasn't.
                        # Therefore, it is very likely that we have reached the end of the stream.
                        break
                position = reader.tell()
                # Measure the input that is left by temporarily moving to the end of the stream.
                with StreamDetour(reader, 0, io.SEEK_END):
                    remaining = reader.tell() - position
                raise RefineryPartialResult(
                    F'encountered invalid match offset value {refpos} at position {position} with {remaining} bytes remaining',
                    partial=output.getvalue())
            reflen = reader.read_size(reflen)
            if ubound_check():
                raise ValueError('last sequence in block contained a match')
            reflen += 4
            # The match may be longer than the distance to its source; in that case, the
            # available bytes are repeated until the requested length is reached.
            available_bytes = min(refpos, reflen)
            q, r = divmod(reflen, available_bytes)
            with StreamDetour(output, -refpos, io.SEEK_CUR):
                match = output.read(available_bytes)
                match = q * match + match[:r]
                assert len(match) == reflen
                lastend = output.tell() - available_bytes + r
            output.write(match)

    def process(self, data):
        output = io.BytesIO()
        reader = LZ4Reader(memoryview(data))
        try:
            magic = reader.u32() == 0x184D2204
        except EOFError:
            magic = False
        if not magic:
            # Without the frame magic, the entire input is decoded as a single raw block.
            reader.seek(0)
            self._read_block(reader, output)
            return output.getvalue()

        # Frame header: eight single-bit fields, followed by the block size descriptor.
        (v1, v2, blocks_independent, blocks_checksummed,
            content_size, content_checksummed, rsrv1, dict_id) = reader.read_bits(8)
        rsrv2 = reader.read_nibble()
        try:
            block_maximum = {
                7: 0x400000,
                6: 0x100000,
                5: 0x040000,
                4: 0x010000,
            }[reader.read_integer(3)]
        except KeyError:
            raise ValueError('unknown maximum block size value in LZ4 frame header')
        rsrv3 = reader.read_bit()
        if any((rsrv1, rsrv2, rsrv3)):
            self.log_warn('nonzero reserved value in LZ4 frame header')
        if (v1, v2) != (0, 1):
            self.log_warn(F'invalid version ({v1},{v2}) in LZ4 frame header')
        # The optional fields are only present when their flag is set. A conditional expression
        # is used so that a stored value of zero is kept rather than being turned into None.
        content_size = reader.u64() if content_size else None
        dict_id = reader.u32() if dict_id else None
        # Header Checksum
        xxh = xxhash(data[4:reader.tell()]).intdigest() >> 8 & 0xFF
        chk = reader.read_byte()
        if chk != xxh:
            self.log_warn(F'header checksum {chk:02X} does not match computed value {xxh:02X}')

        self.log_debug(lambda: F'dictionary id: {dict_id}')
        self.log_debug(lambda: F'block max: 0x{block_maximum:X}')
        if content_size is not None:
            self.log_debug(lambda: F'chunk max: 0x{content_size:X}')
        self.log_debug(lambda: F'blocks independent: {bool(blocks_independent)}')
        self.log_debug(lambda: F'blocks checksummed: {bool(blocks_checksummed)}')

        blockindex = 0

        while True:
            blockindex += 1
            # Block header: a 31-bit size followed by a flag bit for uncompressed blocks.
            size = reader.read_integer(31)
            uncompressed = reader.read_bit()
            if not size:
                # A block size of zero terminates the sequence of blocks.
                assert not uncompressed
                break
            self.log_info(F'reading block of size 0x{size:06X}')
            assert reader.byte_aligned
            assert size <= block_maximum, 'block size exceeds maximum size'
            if uncompressed:
                output.write(reader.read_exactly(size))
            else:
                self._read_block(reader, output, size)
            if blocks_checksummed:
                # Step back over the block that was just consumed to hash its raw bytes.
                with StreamDetour(reader, -size, io.SEEK_CUR):
                    xxh = xxhash(reader.read_exactly(size)).intdigest()
                chk = reader.u32()
                if chk != xxh:
                    self.log_warn(F'block {blockindex} had checksum {chk:08X} which did not match computed value {xxh:08X}')
        value = output.getvalue()
        if content_checksummed:
            self.log_info('computing checksum')
            xxh = xxhash(value).intdigest()
            chk = reader.u32()
            if chk != xxh:
                self.log_warn(F'the given checksum {chk:08X} did not match the computed checksum {xxh:08X}')
        if not reader.eof:
            pos = reader.tell()
            self.log_warn(F'found {len(data) - pos} additional bytes starting at position 0x{pos:X} after compressed data')
        return value

class lzf (fast=False)

This unit is implemented in refinery.units.compression.lzf and has the following commandline Interface:

usage: lzf [-h] [-L] [-Q] [-0] [-v] [-R] [-F] [-x]

This unit implements LZF compression and decompression.

options:
  -x, --fast     Enable fast compression mode.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class lzf(Unit):
    """
    This unit implements LZF compression and decompression.
    """

    def __init__(self, fast: Param[bool, Arg.Switch('-x', help='Enable fast compression mode.')] = False):
        super().__init__(fast=fast)

    def reverse(self, data):
        """
        Compress `data` and return the result as a bytearray. Empty input is returned unchanged
        and a single byte is emitted as a literal run of length one.
        """
        def FRST(p: memoryview) -> int:
            # Seed the rolling hash value with the first two bytes at p.
            return ((p[0]) << 8) | p[1]

        def NEXT(v: int, p: memoryview) -> int:
            # Shift the third byte at p into the hash value, truncated to 32 bits.
            return ((v << 8) | p[2]) & 0xFFFFFFFF

        def DELTA(p: memoryview):
            # Difference in size between the whole input view and p; for a suffix of the
            # input, this is the offset of p from the start.
            return view.nbytes - p.nbytes

        # Map a hash value to a hash table index; fast mode omits the additional mixing step.
        if self.args.fast:
            def HIDX(h: int) -> int:
                return (((h >> (3 * 8 - _HSLOG)) - h * 5) & (_HSIZE - 1))
        else:
            def HIDX(h: int) -> int:
                q = (h ^ (h << 5))
                return (((q >> (3 * 8 - _HSLOG)) - h * 5) & (_HSIZE - 1))

        if not data:
            return data

        ip = view = memoryview(data)
        op = bytearray()

        if len(data) == 1:
            # Control byte 0 followed by the single input byte.
            op.append(0)
            op.extend(data)
            return op

        hval = FRST(ip)
        htab = [0] * _HSIZE
        fast = 1 if self.args.fast else 0

        # Number of bytes in the literal run that is currently being written.
        lit = 0

        def begin_literal():
            # Reserve a placeholder control byte for a new literal run.
            nonlocal lit
            op.append(0)
            lit = 0

        def advance_literal():
            # Copy one input byte into the current literal run. A run that reaches _MAX_LIT
            # bytes is finalized by patching its control byte, and a new run is started.
            nonlocal lit, ip
            lit += 1
            op.append(ip[0])
            ip = ip[1:]
            if lit == _MAX_LIT:
                op[-lit - 1] = lit - 1
                begin_literal()

        def commit_literal():
            # Patch the control byte of the current run, or drop the placeholder if it is empty.
            if lit > 0:
                op[-lit - 1] = lit - 1
            else:
                op.pop()

        begin_literal()

        while ip.nbytes > 2:
            hval = NEXT(hval, ip)
            hpos = HIDX(hval)
            ipos = DELTA(ip)
            length = 2
            # Fetch the previous position stored for this hash and store the current one.
            r, htab[hpos] = htab[hpos], ipos
            off = ipos - r - 1
            ref = view[r:]

            # Emit a literal unless the candidate is in range and its first 3 bytes match.
            if off >= _MAX_OFF or r <= 0 or ref[:3] != ip[:3]:
                advance_literal()
                continue
            else:
                commit_literal()

            maxlen = min(_MAX_REF, ip.nbytes - length)

            # Extend the match; the value left in length is what gets encoded below.
            while True:
                length += 1
                if length >= maxlen or ref[length] != ip[length]:
                    length -= 2
                    break

            # Lengths below 7 share the control byte with the high offset bits; longer
            # matches use the marker 7 and store the remainder in an extra byte.
            if length < 7:
                op.append((off >> 8) + (length << 5))
            else:
                op.append((off >> 8) + (7 << 5))
                op.append(length - 7)

            op.append(off & 0xFF)
            begin_literal()

            if ip.nbytes <= length + 3:
                ip = ip[length + 2:]
                break
            if fast:
                # Skip most of the match and only hash the last two positions.
                ip = ip[length:]
                hval = FRST(ip)
                for _ in range(2):
                    hval = NEXT(hval, ip)
                    htab[HIDX(hval)] = DELTA(ip)
                    ip = ip[1:]
            else:
                # Hash every position that is covered by the match.
                ip = ip[1:]
                for _ in range(length + 1):
                    hval = NEXT(hval, ip)
                    htab[HIDX(hval)] = DELTA(ip)
                    ip = ip[1:]
        # Whatever input is left is emitted as literals.
        while ip.nbytes:
            advance_literal()
        commit_literal()
        return op

    def _decompress_chunk(self, data: memoryview, out: MemoryFile):
        """
        Decode one LZF stream from `data` and write the decompressed bytes to `out`.
        """
        ip = StructReader(data)
        while not ip.eof:
            ctrl = ip.u8()
            if ctrl < 0B100000:
                # Literal run: the control byte stores the run length minus one.
                ctrl += 1
                out.write(ip.read_exactly(ctrl))
            else:
                # Back reference: the upper 3 bits hold the length, the lower 5 bits hold
                # the high bits of the offset.
                length = ctrl >> 5
                offset = 1 + ((ctrl & 0B11111) << 8)
                if length == 7:
                    # A length of 7 is extended by the following byte.
                    length += ip.u8()
                offset += ip.u8()
                length += 2
                out.replay(offset, length)

    def process(self, data):
        mem = memoryview(data)
        out = MemoryFile()

        try:
            reader = StructReader(mem)
            header = LZFHeader(reader)
        except Exception:
            self.log_info('no header detected, decompressing as raw stream')
            self._decompress_chunk(mem, out)
            return out.getvalue()

        for k in itertools.count(1):
            self.log_info(F'chunk: e=0x{header.encoded_size:04X} d=0x{header.decoded_size:04X}')
            chunk = reader.read(header.encoded_size)
            if header.compressed:
                self._decompress_chunk(chunk, out)
            else:
                out.write(chunk)
            if reader.eof:
                break
            try:
                header = LZFHeader(reader)
            except Exception as E:
                msg = F'failed parsing next header after {k} chunks: {E!s}'
                # Chain the parsing error explicitly; the output so far is attached.
                raise RefineryPartialResult(msg, out.getvalue()) from E

        return out.getvalue()

    @classmethod
    def handles(cls, data):
        # Returns True when the data starts with the LZF magic and None otherwise.
        if data[:2] == LZFHeader.MAGIC:
            return True

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Compress `data` and return the result as a bytearray. Empty input is returned unchanged
    and a single byte is emitted as a literal run of length one.
    """
    def FRST(p: memoryview) -> int:
        # Seed the rolling hash value with the first two bytes at p.
        return ((p[0]) << 8) | p[1]

    def NEXT(v: int, p: memoryview) -> int:
        # Shift the third byte at p into the hash value, truncated to 32 bits.
        return ((v << 8) | p[2]) & 0xFFFFFFFF

    def DELTA(p: memoryview):
        # Difference in size between the whole input view and p; for a suffix of the
        # input, this is the offset of p from the start.
        return view.nbytes - p.nbytes

    # Map a hash value to a hash table index; fast mode omits the additional mixing step.
    if self.args.fast:
        def HIDX(h: int) -> int:
            return (((h >> (3 * 8 - _HSLOG)) - h * 5) & (_HSIZE - 1))
    else:
        def HIDX(h: int) -> int:
            q = (h ^ (h << 5))
            return (((q >> (3 * 8 - _HSLOG)) - h * 5) & (_HSIZE - 1))

    if not data:
        return data

    ip = view = memoryview(data)
    op = bytearray()

    if len(data) == 1:
        # Control byte 0 followed by the single input byte.
        op.append(0)
        op.extend(data)
        return op

    hval = FRST(ip)
    htab = [0] * _HSIZE
    fast = 1 if self.args.fast else 0

    # Number of bytes in the literal run that is currently being written.
    lit = 0

    def begin_literal():
        # Reserve a placeholder control byte for a new literal run.
        nonlocal lit
        op.append(0)
        lit = 0

    def advance_literal():
        # Copy one input byte into the current literal run. A run that reaches _MAX_LIT
        # bytes is finalized by patching its control byte, and a new run is started.
        nonlocal lit, ip
        lit += 1
        op.append(ip[0])
        ip = ip[1:]
        if lit == _MAX_LIT:
            op[-lit - 1] = lit - 1
            begin_literal()

    def commit_literal():
        # Patch the control byte of the current run, or drop the placeholder if it is empty.
        if lit > 0:
            op[-lit - 1] = lit - 1
        else:
            op.pop()

    begin_literal()

    while ip.nbytes > 2:
        hval = NEXT(hval, ip)
        hpos = HIDX(hval)
        ipos = DELTA(ip)
        length = 2
        # Fetch the previous position stored for this hash and store the current one.
        r, htab[hpos] = htab[hpos], ipos
        off = ipos - r - 1
        ref = view[r:]

        # Emit a literal unless the candidate is in range and its first 3 bytes match.
        if off >= _MAX_OFF or r <= 0 or ref[:3] != ip[:3]:
            advance_literal()
            continue
        else:
            commit_literal()

        maxlen = min(_MAX_REF, ip.nbytes - length)

        # Extend the match; the value left in length is what gets encoded below.
        while True:
            length += 1
            if length >= maxlen or ref[length] != ip[length]:
                length -= 2
                break

        # Lengths below 7 share the control byte with the high offset bits; longer
        # matches use the marker 7 and store the remainder in an extra byte.
        if length < 7:
            op.append((off >> 8) + (length << 5))
        else:
            op.append((off >> 8) + (7 << 5))
            op.append(length - 7)

        op.append(off & 0xFF)
        begin_literal()

        if ip.nbytes <= length + 3:
            ip = ip[length + 2:]
            break
        if fast:
            # Skip most of the match and only hash the last two positions.
            ip = ip[length:]
            hval = FRST(ip)
            for _ in range(2):
                hval = NEXT(hval, ip)
                htab[HIDX(hval)] = DELTA(ip)
                ip = ip[1:]
        else:
            # Hash every position that is covered by the match.
            ip = ip[1:]
            for _ in range(length + 1):
                hval = NEXT(hval, ip)
                htab[HIDX(hval)] = DELTA(ip)
                ip = ip[1:]
    # Whatever input is left is emitted as literals.
    while ip.nbytes:
        advance_literal()
    commit_literal()
    return op

class lzg

This unit is implemented in refinery.units.compression.lzg and has the following commandline Interface:

usage: lzg [-h] [-L] [-Q] [-0] [-v] [-F]

LZG decompression.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class lzg(Unit):
    """
    LZG decompression.
    """
    def process(self, data: bytearray):
        lzg_stream = LZGStream(data)
        result = lzg_stream.decompress()
        # The output is only returned directly if its size matches the announced size.
        if len(result) == lzg_stream.decoded_size:
            return result
        msg = F'LZG header announced {lzg_stream.decoded_size} bytes, but decompressed buffer had size {len(result)}.'
        raise RefineryPartialResult(msg, result)

    @classmethod
    def handles(cls, data):
        # Returns True when the data starts with the LZG magic and None otherwise.
        return True if data[:3] == B'LZG' else None

class lzip

This unit is implemented in refinery.units.compression.lzip and has the following commandline Interface:

usage: lzip [-h] [-L] [-Q] [-0] [-v] [-F]

LZIP decompression

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class lzip(Unit):
    """
    LZIP decompression
    """
    def process(self, data: bytearray):
        view = memoryview(data)
        with MemoryFile() as output, StructReader(view) as reader:
            # Members are decoded one after another until the input is exhausted.
            for k in count(1):
                if reader.eof:
                    break
                # Number of bytes left before this member; reported if they turn out to be
                # trailing data rather than another member.
                trailing_size = len(data) - reader.tell()
                try:
                    # Member header: magic, version number, and coded dictionary size.
                    ID, VN, DS = reader.read_struct('4sBB')
                    if ID != B'LZIP':
                        if k > 1:
                            # After the first member, a wrong signature is treated like the
                            # end of the input and handled below as trailing data.
                            raise EOF
                        else:
                            self.log_warn(F'ignoring invalid LZIP signature: {ID.hex()}')
                    if VN != 1:
                        self.log_warn(F'ignoring invalid LZIP version: {VN}')
                    # The low 5 bits are a power-of-two exponent; the top 3 bits give the
                    # number of sixteenths of that base size to subtract.
                    dict_size = 1 << (DS & 0x1F)
                    dict_size -= (dict_size // 16) * ((DS >> 5) & 7)
                    if dict_size not in range(_MIN_DICT_SIZE, _MAX_DICT_SIZE + 1):
                        raise ValueError(
                            F'The dictionary size {dict_size} is out of the valid range '
                            F'[{_MIN_DICT_SIZE}, {_MAX_DICT_SIZE}]; unable to proceed.'
                        )
                    decoder = MemberDecoder(dict_size, reader, output)
                    if not decoder():
                        raise ValueError(F'Data error in stream {k}.')
                    # Member trailer: checksum, decompressed size, and member size.
                    crc32, data_size, member_size = reader.read_struct('<LQQ')
                    if crc32 != decoder.crc32:
                        # Report the same decoder attribute that was used in the comparison.
                        self.log_warn(F'checksum in stream {k} was {decoder.crc32:08X}, should have been {crc32:08X}.')
                    if member_size - 20 != decoder.member_position:
                        self.log_warn(F'member size in stream {k} was {decoder.member_position}, should have been {member_size}.')
                    if data_size != decoder.data_position:
                        self.log_warn(F'data size in stream {k} was {decoder.data_position}, should have been {data_size}.')
                except EOFError:
                    # Running out of data is only an error for the very first member.
                    if k <= 1:
                        raise
                    self.log_info(F'silently ignoring {trailing_size} bytes of trailing data')
                    break

            return output.getvalue()

    @classmethod
    def handles(cls, data):
        return data[:4] == B'LZIP'

class lzjb

This unit is implemented in refinery.units.compression.lzjb and has the following commandline Interface:

usage: lzjb [-h] [-L] [-Q] [-0] [-v] [-R]

LZJB compression and decompression. This LZ-type compression is used in the ZFS file system.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class lzjb(Unit):
    """
    LZJB compression and decompression. This LZ-type compression is used in the ZFS file system.
    """
    def reverse(self, src):
        """
        Compress `src` and return the result as a bytearray.
        """
        # https://web.archive.org/web/20100807223517/ ..
        # .. http://cvs.opensolaris.org/source/xref/onnv/onnv-gate/usr/src/uts/common/fs/zfs/lzjb.c
        output = bytearray()
        # Hash table that stores the last input position seen for each hash value.
        lempel = [0] * _LEMPEL_SIZE
        # Starting at 0x80 makes the first iteration emit a fresh copy map byte.
        copymask = 0x80
        position = 0
        while position < len(src):
            copymask <<= 1
            if copymask >= 0x100:
                # All 8 bits of the current copy map are used: start a new one.
                copymask = 1
                copymap = len(output)
                output.append(0)
            if position > len(src) - _MATCH_MAX:
                # Close to the end of the input, bytes are only emitted as literals.
                output.append(src[position])
                position += 1
                continue
            # Hash the next three input bytes into an index of the table.
            hsh = (src[position] << 16) + (src[position + 1] << 8) + src[position + 2]
            hsh += hsh >> 9
            hsh += hsh >> 5
            hsh %= len(lempel)
            offset = (position - lempel[hsh]) & _OFFSET_MASK
            lempel[hsh] = position
            cpy = position - offset
            # The candidate is only used if it is a valid, different position whose first
            # three bytes actually match.
            if cpy >= 0 and cpy != position and src[position:position + 3] == src[cpy:cpy + 3]:
                output[copymap] |= copymask
                # Find the first mismatch; mlen keeps its last value when there is none.
                for mlen in range(_MATCH_MIN, min(len(src) - position, _MATCH_MAX)):
                    if src[position + mlen] != src[cpy + mlen]:
                        break
                # Two bytes: the match length and the high offset bits, then the low offset byte.
                output.append(((mlen - _MATCH_MIN) << (8 - _MATCH_LEN)) | (offset >> 8))
                output.append(offset & 255)
                position += mlen
            else:
                output.append(src[position])
                position += 1
        return output

    def process(self, data):
        """
        Decompress `data` and return the result as a bytearray.
        """
        dst = bytearray()
        src = StructReader(data)
        while not src.eof:
            # Each copy map byte describes up to eight following items, one bit per item.
            copy = src.read_byte()
            for mask in (0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80):
                if src.eof:
                    break
                if not copy & mask:
                    # A cleared bit marks a literal byte.
                    dst.append(src.read_byte())
                    continue
                elif not dst:
                    raise ValueError('copy requested against empty buffer')
                # A set bit marks a match: 6 bits of length followed by 10 bits of offset.
                with src.be:
                    match_len = src.read_integer(6) + _MATCH_MIN
                    match_pos = src.read_integer(10)
                if not match_pos or match_pos > len(dst):
                    raise RuntimeError(F'invalid match offset at position {src.tell()}')
                match_pos = len(dst) - match_pos
                # Copy in pieces so that a match can overlap the bytes it produces.
                while match_len > 0:
                    match = dst[match_pos:match_pos + match_len]
                    dst.extend(match)
                    match_pos += len(match)
                    match_len -= len(match)
        return dst

Methods

def reverse(self, src)

Expand source code Browse git

def reverse(self, src):
    """
    Compress `src` and return the result as a bytearray.
    """
    # https://web.archive.org/web/20100807223517/ ..
    # .. http://cvs.opensolaris.org/source/xref/onnv/onnv-gate/usr/src/uts/common/fs/zfs/lzjb.c
    output = bytearray()
    # Hash table that stores the last input position seen for each hash value.
    lempel = [0] * _LEMPEL_SIZE
    # Starting at 0x80 makes the first iteration emit a fresh copy map byte.
    copymask = 0x80
    position = 0
    while position < len(src):
        copymask <<= 1
        if copymask >= 0x100:
            # All 8 bits of the current copy map are used: start a new one.
            copymask = 1
            copymap = len(output)
            output.append(0)
        if position > len(src) - _MATCH_MAX:
            # Close to the end of the input, bytes are only emitted as literals.
            output.append(src[position])
            position += 1
            continue
        # Hash the next three input bytes into an index of the table.
        hsh = (src[position] << 16) + (src[position + 1] << 8) + src[position + 2]
        hsh += hsh >> 9
        hsh += hsh >> 5
        hsh %= len(lempel)
        offset = (position - lempel[hsh]) & _OFFSET_MASK
        lempel[hsh] = position
        cpy = position - offset
        # The candidate is only used if it is a valid, different position whose first
        # three bytes actually match.
        if cpy >= 0 and cpy != position and src[position:position + 3] == src[cpy:cpy + 3]:
            output[copymap] |= copymask
            # Find the first mismatch; mlen keeps its last value when there is none.
            for mlen in range(_MATCH_MIN, min(len(src) - position, _MATCH_MAX)):
                if src[position + mlen] != src[cpy + mlen]:
                    break
            # Two bytes: the match length and the high offset bits, then the low offset byte.
            output.append(((mlen - _MATCH_MIN) << (8 - _MATCH_LEN)) | (offset >> 8))
            output.append(offset & 255)
            position += mlen
        else:
            output.append(src[position])
            position += 1
    return output

class lzma (raw=False, alone=False, xz=False, level=9, delta=0)

This unit is implemented in refinery.units.compression.lz and has the following commandline Interface:

usage: lzma [-h] [-L] [-Q] [-0] [-v] [-R] [-F] [-r | -a | -x] [-l N] [-d N]

LZMA compression and decompression.

options:
  -r, --raw      Use raw (no container) format.
  -a, --alone    Use the lzma container format.
  -x, --xz       Use the default xz format.
  -l, --level N  The compression level preset; between 0 and 9.
  -d, --delta N  Add a delta filter when compressing.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class lzma(Unit):
    """
    LZMA compression and decompression.
    """

    # Bounds for the dictionary size that is accepted while brute-forcing LZMA properties.
    _SEARCH_MIN_DICT = 0x1_0000
    _SEARCH_MAX_DICT = 0x1000_0000
    # A brute-forced result is rejected when the input is more than this factor larger than it.
    _SEARCH_MAX_BLOW = 1.2
    # Largest offset that is probed for the properties, and largest gap that is probed between
    # the end of the properties and the start of the raw stream.
    _SEARCH_SKIP1 = 0x08
    _SEARCH_SKIP2 = 0x10
    # When set, an unsuccessful regular pass returns nothing instead of raising, which makes
    # `process` run a second pass in partial mode.
    _ATTEMPT_PARTIAL = True

    def __init__(
        self,
        raw: Param[bool, Arg.Switch('-r', group='MODE', help='Use raw (no container) format.')] = False,
        alone: Param[bool, Arg.Switch('-a', group='MODE', help='Use the lzma container format.')] = False,
        xz: Param[bool, Arg.Switch('-x', group='MODE', help='Use the default xz format.')] = False,
        level: Param[int, Arg.Number('-l', bound=(0, 9), help='The compression level preset; between 0 and 9.')] = 9,
        delta: Param[int, Arg.Number('-d', help='Add a delta filter when compressing.')] = 0,
    ):
        if (raw, alone, xz).count(True) > 1:
            raise ValueError('Only one container format can be enabled.')
        if level not in range(10):
            raise ValueError('Compression level must be a number between 0 and 9.')
        # Only the declared parameters are forwarded. This method has no `filter` parameter, so
        # forwarding `filter=filter` would store the Python builtin of that name as an argument.
        super().__init__(raw=raw, alone=alone, xz=xz, delta=delta,
            level=level | PRESET_EXTREME)

    def reverse(self, data):
        """
        Compress `data`. The container is selected by the `alone` and `raw` arguments; when
        neither is set, the xz container is used. A delta filter is prepended to the filter
        chain when the `delta` argument is positive.
        """
        filters = []
        if self.args.delta > 0:
            self.log_debug('adding delta filter')
            filters.append({'id': FILTER_DELTA, 'dist': self.args.delta})
        if self.args.alone:
            self.log_debug('setting alone format')
            mode = FORMAT_ALONE
            filters.append({'id': FILTER_LZMA1, 'preset': self.args.level})
        elif self.args.raw:
            self.log_debug('setting raw format')
            mode = FORMAT_RAW
            filters.append({'id': FILTER_LZMA2, 'preset': self.args.level})
        else:
            if not self.args.xz:
                self.log_info('choosing default .xz container format for compression')
            mode = FORMAT_XZ
            filters.append({'id': FILTER_LZMA2, 'preset': self.args.level})
        lz = LZMACompressor(mode, filters=filters)
        output = lz.compress(data)
        output += lz.flush()
        return output

    def _decompress(self, data: bytearray, lz: LZMADecompressor, partial: bool = False):
        """
        Feed `data` to the decompressor `lz` and return the collected output. The input is
        passed in one piece, unless `partial` is set: in that case it is fed one byte at a
        time so that all output produced before a failure is retained.

        Raises `RefineryPartialResult`, carrying the output collected so far, when the
        decompressor fails or when it reports unused input data.
        """
        temp = bytearray()
        sizes = repeat(1) if partial else [len(data)]
        with MemoryFile(temp) as output:
            with MemoryFile(data) as stream:
                for size in sizes:
                    if stream.eof or stream.closed:
                        break
                    try:
                        offset = stream.tell()
                        output.write(lz.decompress(stream.read(size)))
                    except (EOFError, LZMAError):
                        raise RefineryPartialResult(
                            F'decompression failed at offset {offset}', temp)
        if n := len(lz.unused_data):
            raise RefineryPartialResult(F'Data stream is truncated, {n} bytes unused.', temp)
        return temp

    def _process(self, data: bytearray, partial=False):
        """
        Decompress `data`, first with a default decompressor and then by brute-forcing the
        position of LZMA properties and of the raw stream within the first bytes of the input.

        Returns the decompressed data on success. When nothing could be decompressed, the best
        partial result or a `ValueError` is raised if `partial` is set or `_ATTEMPT_PARTIAL`
        is disabled; otherwise, `None` is returned so that the caller can retry in partial mode.
        """
        try:
            dc = LZMADecompressor()
            return self._decompress(data, dc, partial)
        except RefineryPartialResult as pe:
            best = pe
        except Exception:
            best = None
            self.log_info('default LZMA decompressor failed, brute-forcing custom header')
        view = memoryview(data)
        min_original_size = {
            # https://sourceforge.net/p/sevenzip/discussion/45797/thread/b6bd62f8/
            1: int((len(data) - 64_000) / 1.100), # noqa
            2: int((len(data) -  1_000) / 1.001), # noqa
        }
        # Each pair is (LZMA version, number of property bytes parsed for that version).
        for (version, p), offset_prop, to_data in product(
            ((1, 5),
             (2, 1)),
            range(self._SEARCH_SKIP1 + 1),
            range(self._SEARCH_SKIP2 + 1),
        ):
            if offset_prop + to_data > p + 20:
                # expect no more than a 20 byte header on top of the properties
                # that would be enough for, e.g. compressed & uncompressed size
                # each filling a full 64bit integer and 4 additional bytes.
                continue
            try:
                filter = parse_lzma_properties(
                    view[offset_prop:offset_prop + p],
                    version,
                    min_dict=self._SEARCH_MIN_DICT,
                    max_dict=self._SEARCH_MAX_DICT,
                )
                self.log_debug(F'attempt LZMA{version} at {offset_prop:02d}, skipping {to_data:02d}, filter: {filter!r}')
                engine = LZMADecompressor(FORMAT_RAW, filters=[filter])
                result = self._decompress(view[offset_prop + p + to_data:], engine, partial)
            except RefineryPartialResult as pe:
                # remember the attempt that produced the most output
                if best is None:
                    best = pe
                elif len(best.partial) < len(pe.partial):
                    best = pe
                continue
            except Exception:
                continue
            # discard results that are implausibly small for the given input size
            if len(result) < min_original_size[version]:
                continue
            if len(result) * self._SEARCH_MAX_BLOW < len(data):
                continue
            self.log_info(
                F'success with LZMA{version} properties at {offset_prop} and raw stream starting at {to_data + offset_prop + p}')
            return result
        if partial or not self._ATTEMPT_PARTIAL:
            if best and len(best.partial) > 0:
                raise best
            raise ValueError('unable to find an LZMA stream')
        return None

    def process(self, data: bytearray):
        """
        Decompress `data`; when the regular pass yields nothing, run a second pass in partial
        mode.
        """
        if out := self._process(data):
            return out
        return self._process(data, partial=True)

    @classmethod
    def handles(cls, data):
        """
        Return `True` if `data` starts with one of two known byte sequences: `5D 00 00 00` or
        the `FD 37 7A 58 5A` signature. Nothing is returned otherwise.
        """
        if data[:4] == B'\x5D\0\0\0':
            return True
        if data[:5] == B'\xFD7zXZ':
            return True

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Compress `data` with LZMA.

    The `alone` argument selects the legacy container with an LZMA1 filter, the `raw` argument
    selects a raw LZMA2 stream, and in every other case the xz container with an LZMA2 filter
    is used. When the `delta` argument is positive, a delta filter with that distance precedes
    the compression filter.
    """
    args = self.args
    chain = []
    if args.delta > 0:
        self.log_debug('adding delta filter')
        chain.append({'id': FILTER_DELTA, 'dist': args.delta})
    if args.alone:
        self.log_debug('setting alone format')
        container, codec = FORMAT_ALONE, FILTER_LZMA1
    elif args.raw:
        self.log_debug('setting raw format')
        container, codec = FORMAT_RAW, FILTER_LZMA2
    else:
        if not args.xz:
            self.log_info('choosing default .xz container format for compression')
        container, codec = FORMAT_XZ, FILTER_LZMA2
    # the compression filter always comes last in the chain
    chain.append({'id': codec, 'preset': args.level})
    compressor = LZMACompressor(container, filters=chain)
    return compressor.compress(data) + compressor.flush()

class lznt1 (chunk_size=4096)

This unit is implemented in refinery.units.compression.lznt1 and has the following commandline Interface:

usage: lznt1 [-h] [-L] [-Q] [-0] [-v] [-R] [-c N]

LZNT1 compression and decompression. This compression algorithm is expected by the Win32 API
routine RtlDecompressBuffer, for example.

options:
  -c, --chunk-size N  Optionally specify the chunk size for compression, default is 0x1000.

generic options:
  -h, --help          Show this help message and exit.
  -L, --lenient       Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet         Disables all log output.
  -0, --devnull       Do not produce any output.
  -v, --verbose       Specify up to two times to increase log level.
  -R, --reverse       Use the reverse operation.

Expand source code Browse git

class lznt1(Unit):
    """
    LZNT1 compression and decompression. This compression algorithm is expected
    by the Win32 API routine `RtlDecompressBuffer`, for example.
    """

    def _decompress_chunk(self, chunk):
        """
        Decompress the body of a single compressed chunk. The body is a sequence of groups,
        each consisting of a flag byte followed by up to 8 items; bit `i` of the flag byte
        tells whether item `i` is a literal byte (0) or a 2-byte back-reference (1).
        """
        out = B''
        while chunk:
            flags = chunk[0]
            chunk = chunk[1:]
            for i in range(8):
                if not (flags >> i & 1):
                    out += chunk[:1]
                    chunk = chunk[1:]
                else:
                    flag = struct.unpack('<H', chunk[:2])[0]
                    # The split of the 16 bits into offset and length depends on the amount
                    # of output produced so far: one bit moves from the length field to the
                    # offset field each time the position doubles beyond 0x10.
                    pos = len(out) - 1
                    l_mask = 0xFFF
                    o_shift = 12
                    while pos >= 0x10:
                        l_mask >>= 1
                        o_shift -= 1
                        pos >>= 1
                    length = (flag & l_mask) + 3
                    offset = (flag >> o_shift) + 1
                    if length >= offset:
                        # the copy overlaps its own output: repeat the referenced tail
                        tmp = out[-offset:] * (0xFFF // len(out[-offset:]) + 1)
                        out += tmp[:length]
                    else:
                        out += out[-offset:length - offset]
                    chunk = chunk[2:]
                if len(chunk) == 0:
                    break
        return out

    def _find(self, src, target, max_len):
        """
        Search the already processed data `src` for the longest prefix of `target` that can be
        encoded as a back-reference. Returns the pair `(offset, length)` where the offset is
        counted backwards from the end of `src`, or `(0, 0)` when no match of at least 3 bytes
        exists.
        """
        result_offset = 0
        result_length = 0
        for i in range(1, max_len):
            offset = src.rfind(target[:i])
            if offset == -1:
                break
            tmp_offset = len(src) - offset
            tmp_length = i
            if tmp_offset == tmp_length:
                # The match ends exactly at the end of src, so it may continue into the data
                # that the copy itself produces; test against a repetition of the match.
                tmp = src[offset:] * (0xFFF // len(src[offset:]) + 1)
                for j in range(i, max_len + 1):
                    offset = tmp.rfind(target[:j])
                    if offset == -1:
                        break
                    tmp_length = j
            if tmp_length > result_length:
                result_offset = tmp_offset
                result_length = tmp_length
        if result_length < 3:
            return 0, 0
        return result_offset, result_length

    def _compress_chunk(self, chunk):
        """
        Compress a single chunk and return the compressed body without the chunk header.
        """
        blob = copy.copy(chunk)
        out = B''
        # pow2, l_mask3 and o_shift track the position-dependent bit split between the offset
        # and the length field; l_mask3 is the largest encodable match length.
        pow2 = 0x10
        l_mask3 = 0x1002
        o_shift = 12
        while len(blob) > 0:
            bits = 0
            tmp = B''
            for i in range(8):
                bits >>= 1
                while pow2 < (len(chunk) - len(blob)):
                    pow2 <<= 1
                    l_mask3 = (l_mask3 >> 1) + 1
                    o_shift -= 1
                if len(blob) < l_mask3:
                    max_len = len(blob)
                else:
                    max_len = l_mask3
                offset1, length1 = self._find(
                    chunk[:len(chunk) - len(blob)], blob, max_len)
                # try to find more compressed pattern
                offset2, length2 = self._find(
                    chunk[:len(chunk) - len(blob) + 1], blob[1:], max_len)
                if length1 < length2:
                    # a longer match starts one byte later: emit a literal now
                    length1 = 0
                if length1 > 0:
                    symbol = ((offset1 - 1) << o_shift) | (length1 - 3)
                    tmp += struct.pack('<H', symbol)
                    bits |= 0x80  # set the highest bit
                    blob = blob[length1:]
                else:
                    tmp += blob[:1]
                    blob = blob[1:]
                if len(blob) == 0:
                    break
            # align the flag bits of a group that holds fewer than 8 items
            out += struct.pack('B', bits >> (7 - i))
            out += tmp
        return out

    def reverse(self, buf):
        """
        Compress `buf` chunk by chunk. Each chunk is stored compressed when that is smaller
        and verbatim otherwise.

        Raises `ValueError` when the configured chunk size cannot be represented: the chunk
        header stores the size minus one in 12 bits, which limits chunks to 1 through 0x1000
        bytes.
        """
        chunk_size = self.args.chunk_size
        if not 1 <= chunk_size <= 0x1000:
            raise ValueError(
                F'invalid chunk size {chunk_size}; the chunk header can only encode sizes from 1 to 4096.')
        parts = []
        for start in range(0, len(buf), chunk_size):
            chunk = buf[start:start + chunk_size]
            compressed = self._compress_chunk(chunk)
            if len(compressed) < len(chunk):  # chunk is compressed
                flags = 0xB000
                body = compressed
            else:
                flags = 0x3000
                body = chunk
            parts.append(struct.pack('<H', flags | (len(body) - 1)))
            parts.append(body)
        return B''.join(parts)

    def process(self, data):
        """
        Decompress a sequence of LZNT1 chunks. Every chunk starts with a 16-bit little-endian
        header: the low 12 bits hold the chunk size minus one and bit 15 marks a compressed
        chunk. A header of zero is treated as the end-of-buffer marker.

        Raises `RefineryPartialResult`, carrying all chunks decoded so far, when a header is
        incomplete or announces more data than is left in the input.
        """
        out = io.BytesIO()
        offset = 0
        while offset < len(data):
            try:
                header, = struct.unpack('<H', data[offset:offset + 2])
            except struct.error as err:
                raise RefineryPartialResult(str(err), partial=out.getvalue())
            offset += 2
            if header == 0:
                # not a valid chunk header; this is the optional two-byte terminator
                break
            size = (header & 0xFFF) + 1
            # compare against what is left after this header, not against the total size
            remaining = len(data) - offset
            if size > remaining:
                raise RefineryPartialResult(
                    F'chunk header indicates size {size}, but only {remaining} bytes remain.',
                    partial=out.getvalue()
                )
            chunk = data[offset:offset + size]
            offset += size
            if header & 0x8000:
                chunk = self._decompress_chunk(chunk)
            out.write(chunk)
        return out.getvalue()

    def __init__(
        self,
        chunk_size: Param[int, Arg.Number('-c', help=(
            'Optionally specify the chunk size for compression, default is 0x1000.')
        )] = 0x1000
    ):
        super().__init__(chunk_size=chunk_size)

Methods

def reverse(self, buf)

Expand source code Browse git

def reverse(self, buf):
    """
    Compress `buf` chunk by chunk and return the concatenated chunks as bytes. Each chunk is
    preceded by a 16-bit little-endian header whose low 12 bits hold the size of the chunk
    body minus one. A chunk is stored compressed (flags 0xB000) when that is smaller and
    verbatim (flags 0x3000) otherwise.

    Raises `ValueError` when the configured chunk size is outside the range 1 through 0x1000,
    because larger sizes do not fit into the 12-bit size field and would corrupt the header.
    """
    chunk_size = self.args.chunk_size
    if not 1 <= chunk_size <= 0x1000:
        raise ValueError(
            F'invalid chunk size {chunk_size}; the chunk header can only encode sizes from 1 to 4096.')
    parts = []
    # walk the input by offset instead of repeatedly re-slicing the remaining buffer
    for start in range(0, len(buf), chunk_size):
        chunk = buf[start:start + chunk_size]
        compressed = self._compress_chunk(chunk)
        if len(compressed) < len(chunk):  # chunk is compressed
            flags = 0xB000
            body = compressed
        else:
            flags = 0x3000
            body = chunk
        parts.append(struct.pack('<H', flags | (len(body) - 1)))
        parts.append(body)
    return B''.join(parts)

class lzo

This unit is implemented in refinery.units.compression.lzo and has the following commandline Interface:

usage: lzo [-h] [-L] [-Q] [-0] [-v] [-F]

LZO decompression. The code works against simple test cases, but it is known to fail for certain
outputs produced by the lzop command-line tool when high compression ratio is favoured (i.e. when
the -9 switch is used).

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class lzo(Unit):
    """
    LZO decompression. The code works against simple test cases, but it is known to fail for
    certain outputs produced by the lzop command-line tool when high compression ratio is
    favoured (i.e. when the -9 switch is used).
    """
    def decompress_stream(self, data: buf, LZOv1: bool = False) -> bytearray:
        """
        An implementation of LZO decompression. We use the article
        "[LZO stream format as understood by Linux's LZO decompressor](https://www.kernel.org/doc/html/latest/staging/lzo.html)"
        as a reference since no proper specification is available.

        The decompressed stream is returned when the end-of-stream instruction is reached.
        An `LZOError` is raised for invalid encodings; the `LZOv1` flag enables additional
        checks and a zero-run instruction.
        """
        def integer() -> int:
            # Variable-length integer: every zero byte adds 0xFF, the first nonzero byte ends it.
            length = 0
            while True:
                byte = src.read_byte()
                if byte:
                    return length + byte
                length += 0xFF
                if length > 0x100000:
                    raise LZOError('Too many zeros in integer encoding.')

        def literal(count):
            # Copy `count` bytes verbatim from the input to the output.
            dst.write(src.read_bytes(count))

        def copy(distance: int, length: int):
            # Append `length` bytes that start `distance` bytes before the end of the output.
            if distance > len(dst):
                raise LZOError(F'Distance {distance} > bufsize {len(dst)}')
            buffer = dst.getvalue()
            if distance > length:
                start = len(buffer) - distance
                end = start + length
                dst.write(buffer[start:end])
            else:
                # The copied range overlaps the data that the copy produces; extend the
                # referenced block with itself until it is long enough, then trim it.
                block = buffer[-distance:]
                while len(block) < length:
                    block += block[:length - len(block)]
                if len(block) > length:
                    block[length:] = ()
                dst.write(block)

        src = StructReader(memoryview(data))
        dst = MemoryFile()

        # `state` holds the number of literals that the previous instruction copied.
        state = 0
        first = src.read_byte()

        if first == 0x10:
            raise LZOError('Invalid first stream byte 0x10.')
        elif first <= 0x12:
            # regular instruction: put the byte back
            src.seekrel(-1)
        elif first <= 0x15:
            state = first - 0x11
            literal(state)
        else:
            state = 4
            literal(first - 0x11)

        while True:
            instruction = src.read_byte()
            if instruction < 0x10:
                if state == 0:
                    length = instruction or integer() + 15
                    state = length + 3
                    if state < 4:
                        raise LZOError('Literal encoding is too short.')
                else:
                    # Which copy this instruction encodes depends on the number of literals
                    # copied by the PREVIOUS instruction; it must therefore be remembered
                    # before the state is replaced by the trailing literal count.
                    previous = state
                    state = instruction & 0b0011
                    D = (instruction & 0b1100) >> 2
                    H = src.read_byte()
                    distance = (H << 2) + D + 1
                    if previous >= 4:
                        distance += 0x800
                        length = 3
                    else:
                        length = 2
                    copy(distance, length)
            elif instruction < 0x20:
                L = instruction & 0b0111
                H = instruction & 0b1000
                length = L or integer() + 7
                argument = src.u16()
                state = argument & 3
                distance = (H << 11) + (argument >> 2)
                if not distance:
                    # end of stream
                    return dst.getvalue()
                if LZOv1 and distance & 0x803F == 0x803F and length in range(261, 265):
                    raise LZOError('Compressed data contains sequence that is banned in LZOv1.')
                if LZOv1 and distance == 0xBFFF:
                    X = src.read_byte()
                    count = ((X << 3) | L) + 4
                    self.log_debug(F'Writing run of {X} zero bytes according to LZOv1.')
                    dst.write(B'\0' * count)
                else:
                    copy(distance + 0x4000, length + 2)
            elif instruction < 0x40:
                L = instruction & 0b11111
                length = L or integer() + 31
                argument = src.u16()
                state = argument & 3
                distance = (argument >> 2) + 1
                copy(distance, length + 2)
            else:
                if instruction < 0x80:
                    length = 3 + ((instruction >> 5) & 1)
                else:
                    length = 5 + ((instruction >> 5) & 3)
                H = src.read_byte()
                D = (instruction & 0b11100) >> 2
                state = instruction & 3
                distance = (H << 3) + D + 1
                copy(distance, length)
            if state:
                # the literals that trail the instruction
                literal(state)

    def process(self, data):
        """
        Decompress `data`. The input is first parsed as an LZO archive, whose chunks are
        decompressed and concatenated; the result is labelled with the name and modification
        time from the archive. When parsing fails with an `LZOError`, the input is treated as
        a raw LZO stream.
        """
        try:
            lzo = LZO(data)
        except LZOError:
            self.log_info('Not an LZO archive, processing raw stream.')
            return self.decompress_stream(data)
        with MemoryFile() as output:
            for k, chunk in enumerate(lzo, 1):
                self.log_debug(F'decompressing chunk {k}')
                output.write(self.decompress_stream(chunk.data))
            return self.labelled(
                output.getvalue(),
                path=lzo.name,
                date=date_from_timestamp(lzo.mtime)
            )

    @classmethod
    def handles(cls, data) -> bool | None:
        """
        Return `True` if `data` starts with the LZO archive signature; nothing is returned
        otherwise.
        """
        if data[:len(LZO.SIGNATURE)] == LZO.SIGNATURE:
            return True

Methods

def decompress_stream(self, data, LZOv1=False)

An implementation of LZO decompression. We use the article "LZO stream format as understood by Linux's LZO decompressor" as a reference since no proper specification is available.

Expand source code Browse git

def decompress_stream(self, data: buf, LZOv1: bool = False) -> bytearray:
    """
    An implementation of LZO decompression. We use the article
    "[LZO stream format as understood by Linux's LZO decompressor](https://www.kernel.org/doc/html/latest/staging/lzo.html)"
    as a reference since no proper specification is available.

    The decompressed stream is returned when the end-of-stream instruction is reached. An
    `LZOError` is raised for invalid encodings; the `LZOv1` flag enables additional checks and
    a zero-run instruction.
    """
    def integer() -> int:
        # Variable-length integer: every zero byte adds 0xFF, the first nonzero byte ends it.
        length = 0
        while True:
            byte = src.read_byte()
            if byte:
                return length + byte
            length += 0xFF
            if length > 0x100000:
                raise LZOError('Too many zeros in integer encoding.')

    def literal(count):
        # Copy `count` bytes verbatim from the input to the output.
        dst.write(src.read_bytes(count))

    def copy(distance: int, length: int):
        # Append `length` bytes that start `distance` bytes before the end of the output.
        if distance > len(dst):
            raise LZOError(F'Distance {distance} > bufsize {len(dst)}')
        buffer = dst.getvalue()
        if distance > length:
            start = len(buffer) - distance
            end = start + length
            dst.write(buffer[start:end])
        else:
            # The copied range overlaps the data that the copy produces; extend the referenced
            # block with itself until it is long enough, then trim it.
            block = buffer[-distance:]
            while len(block) < length:
                block += block[:length - len(block)]
            if len(block) > length:
                block[length:] = ()
            dst.write(block)

    src = StructReader(memoryview(data))
    dst = MemoryFile()

    # `state` holds the number of literals that the previous instruction copied.
    state = 0
    first = src.read_byte()

    if first == 0x10:
        raise LZOError('Invalid first stream byte 0x10.')
    elif first <= 0x12:
        # regular instruction: put the byte back
        src.seekrel(-1)
    elif first <= 0x15:
        state = first - 0x11
        literal(state)
    else:
        state = 4
        literal(first - 0x11)

    while True:
        instruction = src.read_byte()
        if instruction < 0x10:
            if state == 0:
                length = instruction or integer() + 15
                state = length + 3
                if state < 4:
                    raise LZOError('Literal encoding is too short.')
            else:
                # Which copy this instruction encodes depends on the number of literals copied
                # by the PREVIOUS instruction; it must therefore be remembered before the
                # state is replaced by the trailing literal count.
                previous = state
                state = instruction & 0b0011
                D = (instruction & 0b1100) >> 2
                H = src.read_byte()
                distance = (H << 2) + D + 1
                if previous >= 4:
                    distance += 0x800
                    length = 3
                else:
                    length = 2
                copy(distance, length)
        elif instruction < 0x20:
            L = instruction & 0b0111
            H = instruction & 0b1000
            length = L or integer() + 7
            argument = src.u16()
            state = argument & 3
            distance = (H << 11) + (argument >> 2)
            if not distance:
                # end of stream
                return dst.getvalue()
            if LZOv1 and distance & 0x803F == 0x803F and length in range(261, 265):
                raise LZOError('Compressed data contains sequence that is banned in LZOv1.')
            if LZOv1 and distance == 0xBFFF:
                X = src.read_byte()
                count = ((X << 3) | L) + 4
                self.log_debug(F'Writing run of {X} zero bytes according to LZOv1.')
                dst.write(B'\0' * count)
            else:
                copy(distance + 0x4000, length + 2)
        elif instruction < 0x40:
            L = instruction & 0b11111
            length = L or integer() + 31
            argument = src.u16()
            state = argument & 3
            distance = (argument >> 2) + 1
            copy(distance, length + 2)
        else:
            if instruction < 0x80:
                length = 3 + ((instruction >> 5) & 1)
            else:
                length = 5 + ((instruction >> 5) & 3)
            H = src.read_byte()
            D = (instruction & 0b11100) >> 2
            state = instruction & 3
            distance = (H << 3) + D + 1
            copy(distance, length)
        if state:
            # the literals that trail the instruction
            literal(state)

class lzw

This unit is implemented in refinery.units.compression.lzw and has the following commandline Interface:

usage: lzw [-h] [-L] [-Q] [-0] [-v] [-F]

LZW decompression based on ancient Unix sources.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class lzw(Unit):
    '''
    LZW decompression based on ancient Unix sources.
    '''

    # Two-byte signature that is expected at the start of a stream with a header.
    _MAGIC = B'\x1F\x9D'

    def process(self, data: bytearray):
        """
        Decompress an LZW stream. When the input starts with the signature, the following
        header bits provide the maximum code width and the block mode flag; otherwise, the
        input is treated as a raw stream with `LZW.BITS` as maximum width and block mode
        enabled.

        Raises `ValueError` when the maximum code width is not supported or when the first
        code is not a literal, and `RefineryPartialResult` with the output produced so far
        when a code is encountered that cannot be in the table yet.
        """
        out = MemoryFile()
        inf = StructReader(memoryview(data))

        if inf.peek(2) != self._MAGIC:
            self.log_info('No LZW signature found, assuming raw stream.')
            maxbits = LZW.BITS
            block_mode = True
        else:
            # header after the signature: 5 bits maximum code width, 2 bits that are
            # expected to be zero, and 1 bit for the block mode
            inf.seekrel(2)
            maxbits = inf.read_integer(5)
            if inf.read_integer(2) != 0:
                self.log_info('reserved bits were set in LZW header')
            block_mode = bool(inf.read_bit())

        if maxbits > LZW.BITS:
            raise ValueError(F'Compressed with {maxbits} bits; cannot handle file.')

        # upper bound for table entries
        maxmaxcode = 1 << maxbits

        ibuf = inf.read()

        # The code table: entry `code` represents the string of entry tab_prefix[code],
        # followed by the byte tab_suffix[code].
        tab_suffix = bytearray(LZW.WSIZE * 2)
        tab_prefix = array('H', itertools.repeat(0, 1 << LZW.BITS))

        n_bits = LZW.INIT_BITS
        maxcode = (1 << n_bits) - 1
        bitmask = (1 << n_bits) - 1
        # -1 marks that no code has been read yet
        oldcode = ~0
        finchar = +0
        posbits = +0

        free_entry = LZW.FIRST if block_mode else 0x100
        # the first 0x100 entries are the single-byte strings
        tab_suffix[:0x100] = range(0x100)
        resetbuf = True

        # The outer loop restarts whenever the code width changes: the consumed input is
        # dropped and the bit position is counted from zero again.
        while resetbuf:
            resetbuf = False

            ibuf = ibuf[posbits >> 3:]
            insize = len(ibuf)
            posbits = 0
            # number of bits up to which a complete code of the current width can be read
            inbits = (insize << 3) - (n_bits - 1)

            while inbits > posbits:
                if free_entry > maxcode:
                    # The table is full for the current code width: round the bit position
                    # up to the next multiple of (8 * n_bits) bits, then widen the codes.
                    n = n_bits << 3
                    p = posbits - 1
                    posbits = p + (n - (p + n) % n)
                    n_bits += 1
                    if (n_bits == maxbits):
                        maxcode = maxmaxcode
                    else:
                        maxcode = (1 << n_bits) - 1
                    bitmask = (1 << n_bits) - 1
                    resetbuf = True
                    break

                # extract the next n_bits wide code from the little-endian bit stream
                p = ibuf[posbits >> 3:]
                code = int.from_bytes(p[:3], 'little') >> (posbits & 7) & bitmask
                posbits += n_bits

                if oldcode == -1:
                    # the very first code has to be a literal byte
                    if code >= 256:
                        raise ValueError('corrupt input.')
                    oldcode = code
                    finchar = oldcode
                    out.write_byte(finchar)
                    continue

                if code == LZW.CLEAR and block_mode:
                    # Clear code: zero the first 0x100 prefix entries, reset the next free
                    # table slot, skip to the next aligned position and go back to the
                    # initial code width.
                    tab_prefix[:0x100] = array('H', itertools.repeat(0, 0x100))
                    free_entry = LZW.FIRST - 1
                    n = n_bits << 3
                    p = posbits - 1
                    posbits = p + (n - (p + n) % n)
                    n_bits = LZW.INIT_BITS
                    maxcode = (1 << n_bits) - 1
                    bitmask = (1 << n_bits) - 1
                    resetbuf = True
                    break

                incode = code
                # collects the decoded string in reverse order
                stack = bytearray()

                if code >= free_entry:
                    # The code is not in the table yet. This is only valid for the entry that
                    # is about to be created, which decodes to the previous string followed
                    # by the first byte of the previous string.
                    if code > free_entry:
                        raise RefineryPartialResult('corrupt input.', out.getvalue())
                    stack.append(finchar)
                    code = oldcode
                # walk the prefix chain down to a literal byte
                while code >= 256:
                    stack.append(tab_suffix[code])
                    code = tab_prefix[code]

                finchar = tab_suffix[code]
                stack.append(finchar)
                stack.reverse()
                out.write(stack)
                code = free_entry

                if code < maxmaxcode:
                    # add a new entry: the previous string plus the first byte of this one
                    tab_prefix[code] = oldcode & 0xFFFF
                    tab_suffix[code] = finchar & 0x00FF
                    free_entry = code + 1

                oldcode = incode

        return out.getvalue()

    @classmethod
    def handles(cls, data) -> bool | None:
        """
        Return `True` if `data` starts with the LZW signature; nothing is returned otherwise.
        """
        if data[:len(cls._MAGIC)] == cls._MAGIC:
            return True

class lzx (window=15, wim=False)

This unit is implemented in refinery.units.compression.lzx and has the following commandline Interface:

usage: lzx [-h] [-L] [-Q] [-0] [-v] [-w] [window]

positional arguments:
  window         Optionally specify the window size; the default is 15.

options:
  -w, --wim      Use the WIM flavor of LZX.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class lzx(Unit):

    # Both arguments are handed to the LzxDecoder without modification: `wim` goes to its
    # constructor and `window` to its parameter setup.
    def __init__(
        self,
        window: Param[int, Arg(help='Optionally specify the window size; the default is {default}.')] = 15,
        wim: Param[bool, Arg('-w', help='Use the WIM flavor of LZX.')] = False,
    ):
        super().__init__(window=window, wim=wim)

    def process(self, data):
        """
        Decompress `data` with an `LzxDecoder`. If decoding fails after the decoder already
        produced output, that output is attached to a `RefineryPartialResult`; a failure
        without any output is re-raised unchanged.
        """
        decoder = LzxDecoder(self.args.wim)
        decoder.set_params_and_alloc(self.args.window)

        try:
            decoded = decoder.decompress(memoryview(data))
        except Exception as error:
            salvaged = decoder.get_output_data()
            if not salvaged:
                raise
            raise RefineryPartialResult(str(error), salvaged) from error
        else:
            return decoded

class m2h (seed=0, reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.murmur and has the following commandline Interface:

usage: m2h [-h] [-L] [-Q] [-0] [-v] [-r N] [-t] [N]

Returns the 32bit Murmur Hash, Version 2.

positional arguments:
  N              Optional seed value, defaults to 0.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class m2h(MurMurHash):
    """
    Returns the 32bit Murmur Hash, Version 2.
    """
    def _algorithm(self, data) -> bytes:
        # digest the input with the seed given as unit argument
        seed = self.args.seed
        return v2_mmh32digest(data, seed)

class m2h64a (seed=0, reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.murmur and has the following commandline Interface:

usage: m2h64a [-h] [-L] [-Q] [-0] [-v] [-r N] [-t] [N]

Returns the 64bit Murmur Hash, Version 2, Variant A.

positional arguments:
  N              Optional seed value, defaults to 0.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class m2h64a(MurMurHash):
    """
    Returns the 64bit Murmur Hash, Version 2, Variant A.
    """
    def _algorithm(self, data) -> bytes:
        # digest the input with the seed given as unit argument
        seed = self.args.seed
        return v2_mmh64digestA(data, seed)

class m2h64b (seed=0, reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.murmur and has the following commandline Interface:

usage: m2h64b [-h] [-L] [-Q] [-0] [-v] [-r N] [-t] [N]

Returns the 64bit Murmur Hash, Version 2, Variant B.

positional arguments:
  N              Optional seed value, defaults to 0.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class m2h64b(MurMurHash):
    """
    Returns the 64bit Murmur Hash, Version 2, Variant B.
    """
    def _algorithm(self, data) -> bytes:
        # The seed is read from the parsed unit arguments.
        seed = self.args.seed
        digest = v2_mmh64digestB(data, seed)
        return digest

class m2ha (seed=0, reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.murmur and has the following commandline Interface:

usage: m2ha [-h] [-L] [-Q] [-0] [-v] [-r N] [-t] [N]

Returns the 32bit Murmur Hash, Version 2, Variant A.

positional arguments:
  N              Optional seed value, defaults to 0.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class m2ha(MurMurHash):
    """
    Returns the 32bit Murmur Hash, Version 2, Variant A.
    """
    def _algorithm(self, data) -> bytes:
        # The seed is read from the parsed unit arguments.
        seed = self.args.seed
        digest = v2_mmh32digestA(data, seed)
        return digest

class m3h (seed=0, reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.murmur and has the following commandline Interface:

usage: m3h [-h] [-L] [-Q] [-0] [-v] [-r N] [-t] [N]

Returns the 32bit Murmur Hash, Version 3.

positional arguments:
  N              Optional seed value, defaults to 0.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class m3h(MurMurHash):
    """
    Returns the 32bit Murmur Hash, Version 3.
    """
    def _algorithm(self, data) -> bytes:
        # The seed is read from the parsed unit arguments.
        seed = self.args.seed
        digest = v3_mmh32digest(data, seed)
        return digest

class m3h32 (seed=0, reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.murmur and has the following commandline Interface:

usage: m3h32 [-h] [-L] [-Q] [-0] [-v] [-r N] [-t] [N]

Returns the 128bit Murmur Hash, Version 3, 32bit digest size.

positional arguments:
  N              Optional seed value, defaults to 0.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class m3h32(MurMurHash):
    """
    Returns the 128bit Murmur Hash, Version 3, 32bit digest size.
    """
    def _algorithm(self, data) -> bytes:
        # The seed is read from the parsed unit arguments.
        seed = self.args.seed
        digest = v3_mmh128digest32(data, seed)
        return digest

class m3h64 (seed=0, reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.murmur and has the following commandline Interface:

usage: m3h64 [-h] [-L] [-Q] [-0] [-v] [-r N] [-t] [N]

Returns the 128bit Murmur Hash, Version 3, 64bit digest size.

positional arguments:
  N              Optional seed value, defaults to 0.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class m3h64(MurMurHash):
    """
    Returns the 128bit Murmur Hash, Version 3, 64bit digest size.
    """
    def _algorithm(self, data) -> bytes:
        # The seed is read from the parsed unit arguments.
        seed = self.args.seed
        digest = v3_mmh128digest64(data, seed)
        return digest

class machometa (all=True, header=False, linked_images=False, signatures=False, version=False, load_commands=False, exports=False, imports=False, tabular=False)

This unit is implemented in refinery.units.formats.macho.machometa and has the following commandline Interface:

usage: machometa [-h] [-L] [-Q] [-0] [-v] [-c] [-H] [-K] [-S] [-V] [-D] [-E] [-I] [-t]

Extract metadata from Mach-O files.

options:
  -c, --custom         Unless enabled, all default categories will be extracted.
  -H, --header         Parse basic data from the Mach-O header.
  -K, --linked-images  Parse all library images linked by the Mach-O.
  -S, --signatures     Parse signature and entitlement information.
  -V, --version        Parse version information from the Mach-O load commands.
  -D, --load-commands  Parse load commands from the Mach-O header.
  -E, --exports        List all exported functions.
  -I, --imports        List all imported functions.
  -t, --tabular        Print information in a table rather than as JSON

generic options:
  -h, --help           Show this help message and exit.
  -L, --lenient        Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet          Disables all log output.
  -0, --devnull        Do not produce any output.
  -v, --verbose        Specify up to two times to increase log level.

Expand source code Browse git

class machometa(Unit):
    """
    Extract metadata from Mach-O files.
    """
    def __init__(
        self, all: Param[bool, Arg('-c', '--custom',
            help='Unless enabled, all default categories will be extracted.')] = True,
        header: Param[bool, Arg('-H', help='Parse basic data from the Mach-O header.')] = False,
        linked_images: Param[bool, Arg('-K', help='Parse all library images linked by the Mach-O.')] = False,
        signatures: Param[bool, Arg('-S', help='Parse signature and entitlement information.')] = False,
        version: Param[bool, Arg('-V', help='Parse version information from the Mach-O load commands.')] = False,
        load_commands: Param[bool, Arg('-D', help='Parse load commands from the Mach-O header.')] = False,
        exports: Param[bool, Arg('-E', help='List all exported functions.')] = False,
        imports: Param[bool, Arg('-I', help='List all imported functions.')] = False,
        tabular: Param[bool, Arg('-t', help='Print information in a table rather than as JSON')] = False,
    ):
        # Header, linked images, version and signatures are the categories that are switched on
        # by `all`; load commands, imports and exports are only extracted on explicit request.
        super().__init__(
            header=all or header,
            linked_images=all or linked_images,
            version=all or version,
            signatures=all or signatures,
            load_commands=load_commands,
            imports=imports,
            exports=exports,
            tabular=tabular,
        )

    def compute_symhash(self, macho: lief.MachO.Binary) -> str:
        """
        Return the MD5 hex digest of the sorted, de-duplicated and comma-separated names of all
        symbols in the given image whose category is UNDEFINED.
        """
        undefined = lief.MachO.Symbol.CATEGORY.UNDEFINED
        names = {
            lief.string(sym.name)
            for sym in macho.symbols
            if sym.category == undefined
        }
        joined = ','.join(sorted(names))
        return md5(joined.encode('utf8')).hexdigest()

    def parse_macho_header(self, macho: lief.MachO.Binary, data=None) -> dict:
        """
        Collect the fields of the Mach-O header into a dictionary. The result is empty when the
        image has no (truthy) header. The `data` argument is not used.
        """
        info = {}
        header = macho.header
        if not header:
            return info
        # The top bit of the CPU subtype is masked out before the lookup.
        st = header.cpu_subtype & 0x7FFFFFFF
        magics_64 = {
            lief.MachO.MACHO_TYPES.CIGAM_64,
            lief.MachO.MACHO_TYPES.MAGIC_64,
        }
        ht = 'mach_header_64' if header.magic in magics_64 else 'mach_header'
        info['Type'] = ht
        info['Magic'] = header.magic.value
        info['CPUType'] = header.cpu_type.__name__.upper()
        # Fall back to the numeric subtype when no symbolic name is known.
        info['CPUSubType'] = _CPU_SUBTYPES.get(header.cpu_type, {}).get(st, st)
        info['FileType'] = header.file_type.__name__
        info['LoadCount'] = header.nb_cmds
        info['LoadSize'] = header.sizeof_cmds
        info['Flags'] = sorted(flag.__name__ for flag in header.flags_list)
        info['Reserved'] = header.reserved
        return info

    def parse_linked_images(self, macho: lief.MachO.Binary, data=None) -> dict:
        """
        Group the names of all dylib load commands by the name of their load command type.
        The `data` argument is not used.
        """
        load_command_images = {}
        load_commands: Iterable[lief.MachO.LoadCommand] = macho.commands
        for load_command in load_commands:
            if isinstance(load_command, lief.MachO.DylibCommand):
                key = load_command.command.__name__
                images: list[str] = load_command_images.setdefault(key, [])
                images.append(load_command.name)
        return load_command_images

    def parse_signature(self, macho_image: lief.MachO.Binary, data=None) -> dict:
        """
        Parse the code signature super blob of the image, if it has one. Depending on the blobs
        that are present, the result may contain the keys `AdHocSigned`, `SignatureIdentifier`,
        `Signature`, `Requirements`, and `Entitlements`. The `data` argument is not used.
        """
        info = {}

        if not macho_image.has_code_signature:
            return info

        reader = StructReader(macho_image.code_signature.content)
        super_blob = SuperBlob(reader)

        for blob in super_blob.blobs:

            if blob.type == BlobType.CODEDIRECTORY:
                codedirectory_blob = CodeDirectoryBlob(blob.data)
                info['AdHocSigned'] = codedirectory_blob.flags & CS_ADHOC != 0
                # The identifier is a C string located relative to the start of this blob.
                reader.seekset(codedirectory_blob.identOffset + blob.offset)
                info['SignatureIdentifier'] = reader.read_c_string('utf8')

            elif blob.type == BlobType.CMS_SIGNATURE:
                reader.seekset(blob.offset)
                cms_signature = blob.data
                if not cms_signature:
                    continue
                try:
                    info['Signature'] = pemeta.parse_signature(bytearray(cms_signature))
                except ValueError as pkcs7_parse_error:
                    self.log_warn(F'Could not parse the data in CSSLOT_CMS_SIGNATURE as valid PKCS7 data: {pkcs7_parse_error!s}')

            elif blob.type == BlobType.REQUIREMENTS:
                # TODO: Parse the requirements blob,
                # which is encoded according to the code signing requirements language:
                # https://developer.apple.com/library/archive/documentation/Security
                #        /Conceptual/CodeSigningGuide/RequirementLang/RequirementLang.html
                info['Requirements'] = blob.data.hex()

            elif blob.type == BlobType.XML_ENTITLEMENTS:
                raw_entitlements = bytes(blob.data)
                if not raw_entitlements:
                    continue
                try:
                    entitlements = plistlib.loads(raw_entitlements)
                except Exception as error:
                    # Best effort: a broken property list must not abort signature parsing.
                    self.log_warn(F'failed to parse entitlements: {error!s}')
                else:
                    info['Entitlements'] = entitlements

        return info

    def parse_version(self, macho: lief.MachO.Binary, data=None) -> dict:
        """
        Extract information from the first SOURCE_VERSION and the first BUILD_VERSION load
        command. A warning is logged for every additional command of either type. The `data`
        argument is not used.
        """
        info = {}
        load_commands: Iterable[lief.MachO.LoadCommand] = macho.commands
        for load_command in load_commands:
            if load_command.command == lief.MachO.LoadCommand.TYPE.SOURCE_VERSION:
                if 'SourceVersion' in info:
                    self.log_warn('More than one load command of type SOURCE_VERSION found; the MachO file is possibly malformed')
                else:
                    cmd: lief.MachO.SourceVersion = load_command
                    info['SourceVersion'] = cmd.version[0]
            elif load_command.command == lief.MachO.LoadCommand.TYPE.BUILD_VERSION:
                if 'BuildVersion' in info:
                    self.log_warn('More than one load command of type BUILD_VERSION found; the MachO file is possibly malformed')
                else:
                    cmd: lief.MachO.BuildVersion = load_command
                    info['BuildVersion'] = {
                        'Platform': cmd.platform.__name__,
                        'MinOS': '.'.join(str(v) for v in cmd.minos),
                        'SDK': '.'.join(str(v) for v in cmd.sdk),
                        'Ntools': len(cmd.tools),
                    }
        return info

    def parse_load_commands(self, macho: lief.MachO.Binary, data=None) -> list:
        """
        List type name, size, and hex-encoded data of every load command. The `data` argument
        is not used.
        """
        load_commands: Iterable[lief.MachO.LoadCommand] = macho.commands
        return [
            dict(
                Type=load_command.command.__name__,
                Size=load_command.size,
                Data=load_command.data.hex(),
            )
            for load_command in load_commands
        ]

    def parse_imports(self, macho: lief.MachO.Binary, data=None) -> list:
        """
        List the names of all imported symbols. The `data` argument is not used.
        """
        imports: Iterable[lief.MachO.Symbol] = macho.imported_symbols
        return [lief.string(imp.name) for imp in imports]

    def parse_exports(self, macho: lief.MachO.Binary, data=None) -> list:
        """
        List the names of all exported symbols. The `data` argument is not used.
        """
        exports: Iterable[lief.MachO.Symbol] = macho.exported_symbols
        return [lief.string(exp.name) for exp in exports]

    def process(self, data: bytearray):
        """
        Load the input as a Mach-O, run every enabled parser on each contained image, and emit
        the collected metadata through `ppjson`. Nothing is emitted if no image was found.
        """
        macho = lief.load_macho(data)
        macho_slices: list[lief.MachO.Binary] = []

        # Collect images by index until the loader returns a falsy value.
        for k in itertools.count():
            ms = macho.at(k)
            if not ms:
                break
            macho_slices.append(ms)

        result = {}
        result['FileType'] = 'FAT' if len(macho_slices) > 1 else 'THIN'

        # The switches do not change between images, so the parser table is built only once.
        parsers = [
            (self.args.header, self.parse_macho_header, 'Header'),
            (self.args.linked_images, self.parse_linked_images, 'LinkedImages'),
            (self.args.signatures, self.parse_signature, 'Signatures'),
            (self.args.version, self.parse_version, 'Version'),
            (self.args.load_commands, self.parse_load_commands, 'LoadCommands'),
            (self.args.imports, self.parse_imports, 'Imports'),
            (self.args.exports, self.parse_exports, 'Exports'),
        ]

        slices = []

        for image in macho_slices:
            slice_result = {}

            for switch, resolver, name in parsers:
                if not switch:
                    continue
                self.log_debug(F'parsing: {name}')
                try:
                    info = resolver(image, data)
                except Exception as E:
                    # Best effort: a failing category is reported and skipped.
                    self.log_info(F'failed to obtain {name}: {E!s}')
                    continue
                if info:
                    slice_result[name] = info

            if image.uuid is not None:
                uuid = bytes(image.uuid.uuid)
                slice_result['UUID'] = uuid.hex()
            slice_result['SymHash'] = self.compute_symhash(image)
            if fileset_name := image.fileset_name:
                slice_result['FilesetName'] = fileset_name
            slices.append(slice_result)

        if slices:
            result['Slices'] = slices
            yield from ppjson(
                tabular=self.args.tabular
            )._pretty_output(result, indent=4, ensure_ascii=False)

Methods

def compute_symhash(self, macho)

Expand source code Browse git

def compute_symhash(self, macho: lief.MachO.Binary) -> str:
    """
    Return the MD5 hex digest of the sorted, de-duplicated and comma-separated names of all
    symbols in the given image whose category is UNDEFINED.
    """
    undefined = lief.MachO.Symbol.CATEGORY.UNDEFINED
    names = {
        lief.string(sym.name)
        for sym in macho.symbols
        if sym.category == undefined
    }
    joined = ','.join(sorted(names))
    return md5(joined.encode('utf8')).hexdigest()

def parse_macho_header(self, macho, data=None)

Expand source code Browse git

def parse_macho_header(self, macho: lief.MachO.Binary, data=None) -> dict:
    """
    Collect the fields of the Mach-O header into a dictionary. The result is empty when the
    image has no (truthy) header. The `data` argument is not used.
    """
    info = {}
    header = macho.header
    if not header:
        return info
    # The top bit of the CPU subtype is masked out before the lookup.
    st = header.cpu_subtype & 0x7FFFFFFF
    magics_64 = {
        lief.MachO.MACHO_TYPES.CIGAM_64,
        lief.MachO.MACHO_TYPES.MAGIC_64,
    }
    if header.magic in magics_64:
        ht = 'mach_header_64'
    else:
        ht = 'mach_header'
    info['Type'] = ht
    info['Magic'] = header.magic.value
    info['CPUType'] = header.cpu_type.__name__.upper()
    # Fall back to the numeric subtype when no symbolic name is known.
    subtype_names = _CPU_SUBTYPES.get(header.cpu_type, {})
    info['CPUSubType'] = subtype_names.get(st, st)
    info['FileType'] = header.file_type.__name__
    info['LoadCount'] = header.nb_cmds
    info['LoadSize'] = header.sizeof_cmds
    info['Flags'] = sorted(flag.__name__ for flag in header.flags_list)
    info['Reserved'] = header.reserved
    return info

def parse_linked_images(self, macho, data=None)

Expand source code Browse git

def parse_linked_images(self, macho: lief.MachO.Binary, data=None) -> dict:
    """
    Group the names of all dylib load commands by the name of their load command type.
    The `data` argument is not used.
    """
    grouped = {}
    commands: Iterable[lief.MachO.LoadCommand] = macho.commands
    for command in commands:
        if isinstance(command, lief.MachO.DylibCommand):
            key = command.command.__name__
            names: list[str] = grouped.setdefault(key, [])
            names.append(command.name)
    return grouped

def parse_signature(self, macho_image, data=None)

Expand source code Browse git

def parse_signature(self, macho_image: lief.MachO.Binary, data=None) -> dict:
    """
    Parse the code signature super blob of the image, if it has one. Depending on the blobs
    that are present, the result may contain the keys `AdHocSigned`, `SignatureIdentifier`,
    `Signature`, `Requirements`, and `Entitlements`. The `data` argument is not used.
    """
    info = {}

    if not macho_image.has_code_signature:
        return info

    reader = StructReader(macho_image.code_signature.content)
    super_blob = SuperBlob(reader)

    for blob in super_blob.blobs:

        if blob.type == BlobType.CODEDIRECTORY:
            codedirectory_blob = CodeDirectoryBlob(blob.data)
            info['AdHocSigned'] = codedirectory_blob.flags & CS_ADHOC != 0
            # The identifier is a C string located relative to the start of this blob.
            reader.seekset(codedirectory_blob.identOffset + blob.offset)
            info['SignatureIdentifier'] = reader.read_c_string('utf8')

        elif blob.type == BlobType.CMS_SIGNATURE:
            reader.seekset(blob.offset)
            cms_signature = blob.data
            if not cms_signature:
                continue
            try:
                info['Signature'] = pemeta.parse_signature(bytearray(cms_signature))
            except ValueError as pkcs7_parse_error:
                self.log_warn(F'Could not parse the data in CSSLOT_CMS_SIGNATURE as valid PKCS7 data: {pkcs7_parse_error!s}')

        elif blob.type == BlobType.REQUIREMENTS:
            # TODO: Parse the requirements blob,
            # which is encoded according to the code signing requirements language:
            # https://developer.apple.com/library/archive/documentation/Security
            #        /Conceptual/CodeSigningGuide/RequirementLang/RequirementLang.html
            info['Requirements'] = blob.data.hex()

        elif blob.type == BlobType.XML_ENTITLEMENTS:
            raw_entitlements = bytes(blob.data)
            if not raw_entitlements:
                continue
            try:
                entitlements = plistlib.loads(raw_entitlements)
            except Exception as error:
                self.log_warn(F'failed to parse entitlements: {error!s}')
            else:
                info['Entitlements'] = entitlements

    return info

def parse_version(self, macho, data=None)

Expand source code Browse git

def parse_version(self, macho: lief.MachO.Binary, data=None) -> dict:
    """
    Extract information from the first SOURCE_VERSION and the first BUILD_VERSION load
    command. A warning is logged for every additional command of either type. The `data`
    argument is not used.
    """
    info = {}
    load_commands: Iterable[lief.MachO.LoadCommand] = macho.commands
    for load_command in load_commands:
        if load_command.command == lief.MachO.LoadCommand.TYPE.SOURCE_VERSION:
            if 'SourceVersion' in info:
                self.log_warn('More than one load command of type SOURCE_VERSION found; the MachO file is possibly malformed')
            else:
                cmd: lief.MachO.SourceVersion = load_command
                info['SourceVersion'] = cmd.version[0]
        elif load_command.command == lief.MachO.LoadCommand.TYPE.BUILD_VERSION:
            if 'BuildVersion' in info:
                self.log_warn('More than one load command of type BUILD_VERSION found; the MachO file is possibly malformed')
            else:
                cmd: lief.MachO.BuildVersion = load_command
                # Register the entry first, then fill it field by field.
                build = info['BuildVersion'] = {}
                build['Platform'] = cmd.platform.__name__
                build['MinOS'] = '.'.join(str(v) for v in cmd.minos)
                build['SDK'] = '.'.join(str(v) for v in cmd.sdk)
                build['Ntools'] = len(cmd.tools)
    return info

def parse_load_commands(self, macho, data=None)

Expand source code Browse git

def parse_load_commands(self, macho: lief.MachO.Binary, data=None) -> list:
    """
    List type name, size, and hex-encoded data of every load command. The `data` argument
    is not used.
    """
    commands: Iterable[lief.MachO.LoadCommand] = macho.commands
    return [
        {
            'Type': command.command.__name__,
            'Size': command.size,
            'Data': command.data.hex(),
        }
        for command in commands
    ]

def parse_imports(self, macho, data=None)

Expand source code Browse git

def parse_imports(self, macho: lief.MachO.Binary, data=None) -> list:
    """
    List the names of all imported symbols. The `data` argument is not used.
    """
    symbols: Iterable[lief.MachO.Symbol] = macho.imported_symbols
    return [lief.string(symbol.name) for symbol in symbols]

def parse_exports(self, macho, data=None)

Expand source code Browse git

def parse_exports(self, macho: lief.MachO.Binary, data=None) -> list:
    """
    List the names of all exported symbols. The `data` argument is not used.
    """
    symbols: Iterable[lief.MachO.Symbol] = macho.exported_symbols
    return [lief.string(symbol.name) for symbol in symbols]

class map (index, image, default=(), blocksize=1)

This unit is implemented in refinery.units.blockwise.map and has the following commandline Interface:

usage: map [-h] [-L] [-Q] [-0] [-v] [-R] [-B N] index image [default]

Each block of the input data which occurs as a block of the index argument is replaced by the
corresponding block of the image argument. If a block size is specified, and if the index or
image argument are byte sequences, they are unpacked into chunks of that size, and excess bytes
that are not an integer multiple of the block size are discarded. To prevent any automatic
chunking, the btoi handler can be used. An optional default value can be provided to serve as
inserts for any blocks in the input that do not occur in the index sequence. If this argument is
not specified, such blocks are left unchanged.

positional arguments:
  index              index characters
  image              image characters
  default            default value

options:
  -B, --blocksize N  The size of each block in bytes. The default is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.
  -R, --reverse      Use the reverse operation.

Expand source code Browse git

class map(BlockTransformation):
    """
    Each block of the input data which occurs as a block of the index argument is replaced by the
    corresponding block of the image argument. If a block size is specified, and if the index or
    image argument are byte sequences, they are unpacked into chunks of that size, and excess bytes
    that are not an integer multiple of the block size are discarded. To prevent any automatic
    chunking, the `refinery.lib.argformats.DelayedArgument.btoi` handler can be used.
    An optional default value can be provided to serve as inserts for any blocks in the input that
    do not occur in the index sequence. If this argument is not specified, such blocks are left
    unchanged.
    """
    _map: dict[int, int]

    def __init__(
        self,
        index   : Param[isq, Arg.NumSeq(help='index characters')],
        image   : Param[isq, Arg.NumSeq(help='image characters')],
        default : Param[isq, Arg.NumSeq(help='default value')] = (),
        blocksize=1
    ):
        super().__init__(blocksize=blocksize, index=index, image=image, default=default, _truncate=2)
        self._map = {}
        # Iterator over default values for the block-wise code path; None means that blocks
        # without a mapping are left unchanged.
        self._def = None

    def reverse(self, data):
        # The reverse operation swaps the roles of the index and image sequences.
        return self._process(data, self.args.image, self.args.index, self.args.default)

    def process(self, data):
        return self._process(data, self.args.index, self.args.image, self.args.default)

    def _process(self, data: bytearray, index: Sequence[int], image: Sequence[int], default: Sequence[int]):
        """
        Replace every block of `data` that occurs in `index` by the block at the same position
        in `image`. If `default` is not empty, its values are cycled once per input block and
        the current value is used for blocks that do not occur in `index`.

        Raises a `ValueError` if `index` contains duplicates or is longer than `image`.
        """
        if not self.bytestream:
            if isbuffer(index):
                self.log_info(F'chunking index sequence into blocks of size {self.blocksize}')
                index = list(self.chunk(index))
                self.log_debug(F'index sequence: {index}')
            if isbuffer(image):
                self.log_info(F'chunking image sequence into blocks of size {self.blocksize}')
                image = list(self.chunk(image))
                self.log_debug(F'image sequence: {image}')
            if isbuffer(default):
                self.log_info(F'chunking default sequence into blocks of size {self.blocksize}')
                default = list(self.chunk(default))
                self.log_debug(F'default sequence: {default}')
        if len(set(index)) != len(index):
            raise ValueError('The index sequence contains duplicates.')
        if len(index) > len(image):
            raise ValueError('The index sequence is longer than the image sequence.')

        if self.bytestream:
            mapping = dict(zip(index, image))
            if not isinstance(data, bytearray):
                data = bytearray(data)
            if default:
                # Draw one default value per input byte, exactly like process_block does for
                # blocks. Passing the cycle object itself as the fallback to dict.get would
                # hand an iterator to bytes() and fail with a TypeError.
                fill = cycle(default)
                data[:] = bytes(mapping.get(b, f) for b, f in zip(data, fill))
            else:
                # Without defaults, the mapping is a plain 256-entry translation table.
                table = bytes(mapping.get(c, c) for c in range(0x100))
                data[:] = data.translate(table)
            return data
        try:
            self.log_info(default)
            self._def = cycle(default) if default else None
            self._map = dict(zip(index, image))
            return super().process(data)
        finally:
            # Do not keep per-call state around after processing.
            self._map = {}
            self._def = None

    def process_block(self, block):
        # One default value is consumed for every block, whether or not the block is mapped.
        default = next(it) if (it := self._def) else block
        return self._map.get(block, default)

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    # The reverse operation swaps the roles of the index and image sequences.
    args = self.args
    return self._process(data, args.image, args.index, args.default)

class maru (seed=0, reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.maru and has the following commandline Interface:

usage: maru [-h] [-L] [-Q] [-0] [-v] [-r N] [-t] [N]

Returns the 64bit maru hash of the input data.

positional arguments:
  N              optional seed value

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class maru(HashUnit):
    """
    Returns the 64bit maru hash of the input data.
    """
    def __init__(
        self,
        seed: Param[int, Arg.Number(help='optional seed value')] = 0,
        reps=1,
        text=False,
    ):
        # All three options are forwarded unchanged to the base unit.
        options = dict(seed=seed, text=text, reps=reps)
        super().__init__(**options)

    def _algorithm(self, data) -> bytes:
        # The seed is read from the parsed unit arguments.
        seed = self.args.seed
        digest = maru32digest(data, seed)
        return digest

class max_ (key=None)

This unit is implemented in refinery.units.meta.max and has the following commandline Interface:

usage: max [-h] [-L] [-Q] [-0] [-v] [key]

Picks the maximum of all elements in the current frame.

positional arguments:
  key            A meta variable expression to sort by instead of sorting the content.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class max_(Unit):
    """
    Picks the maximum of all elements in the current `refinery.lib.frame`.
    """

    def __init__(
        self,
        key: Param[str, Arg.String('key', help='A meta variable expression to sort by instead of sorting the content.')] = None,
    ):
        super().__init__(key=key)

    def filter(self, chunks: Iterable[Chunk]):
        """
        Yield every invisible chunk as it is encountered, and after the input is exhausted,
        yield the visible chunk with the largest value. The value of a chunk is the chunk
        itself, or the meta variable named by the key argument if one was given. Nothing
        besides invisible chunks is yielded if there is no visible chunk.
        """
        key = self.args.key

        def get_value(chunk: Chunk):
            if key is None:
                return chunk
            return metavars(chunk).get(key)

        it = iter(chunks)

        # Pass invisible chunks through until the first visible one, which is the initial maximum.
        for max_chunk in it:
            if max_chunk.visible:
                max_index = 0
                max_value = get_value(max_chunk)
                break
            yield max_chunk
        else:
            return

        # Continue on the same iterator: iterating `chunks` again would restart a re-iterable
        # input such as a list from its beginning and emit the leading chunks a second time.
        for index, chunk in enumerate(it, 1):
            if not chunk.visible:
                yield chunk
                continue
            value = get_value(chunk)
            try:
                is_max = value > max_value
            except TypeError:
                if max_value is None:
                    self.log_info(
                        F'Discarding chunk {max_index} in favor of {index} because {key} was not '
                        F'set on the former; new maximum is {value!r}.')
                    is_max = True
                else:
                    self.log_info(
                        F'Discarding chunk {index} because {key} had value {value!r}; it could not '
                        F'be compared to the current maximum {max_value!r} on chunk {max_index}.')
                    is_max = False
            if is_max:
                max_value = value
                max_chunk = chunk
                max_index = index

        yield max_chunk

class md2 (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: md2 [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the MD2 hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

class md4 (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: md4 [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the MD4 hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

class md5 (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: md5 [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the MD5 hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

class mimewords

This unit is implemented in refinery.units.pattern.mimewords and has the following commandline Interface:

usage: mimewords [-h] [-L] [-Q] [-0] [-v]

Implements the decoding of MIME encoded-word syntax from RFC-2047.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class mimewords(Unit):
    """
    Implements the decoding of MIME encoded-word syntax from RFC-2047.
    """
    @classmethod
    def convert(cls, word: str) -> str:
        """
        Return the given string with every RFC-2047 encoded word replaced by its decoded text.
        Text outside of encoded words is left untouched.
        """
        def _decode_word(hit):
            # One encoded word has to decode to exactly one (payload, charset) pair; the
            # unpacking below raises a ValueError otherwise.
            (payload, charset), = decode_header(hit[0])
            if isinstance(charset, str):
                encoding = charset
            else:
                # no usable charset was reported, fall back to the codec of the unit
                encoding = cls.codec
            return codecs.decode(payload, encoding, errors='surrogateescape')

        # matches "=?charset?encoding?payload?="
        pattern = R"=(?:\?[^\?]*){3}\?="
        return re.sub(pattern, _decode_word, word)

    @unicoded
    def process(self, data):
        # NOTE(review): the unicoded decorator presumably converts the chunk to str before the
        # call and encodes the result afterwards; it is defined outside this listing.
        return self.convert(data)

Static methods

def convert(word): Converts the MIME word.

class min_ (key=None)

This unit is implemented in refinery.units.meta.min and has the following commandline Interface:

usage: min [-h] [-L] [-Q] [-0] [-v] [key]

Picks the minimum of all elements in the current frame.

positional arguments:
  key            A meta variable expression to sort by instead of sorting the content.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class min_(Unit):
    """
    Picks the minimum of all elements in the current `refinery.lib.frame`.
    """

    def __init__(
        self,
        key: Param[str, Arg.String('key', help='A meta variable expression to sort by instead of sorting the content.')] = None,
    ):
        super().__init__(key=key)

    def filter(self, chunks: Iterable[Chunk]):
        """
        Forward all invisible chunks unchanged and emit, after the input has been exhausted, the
        single visible chunk with the smallest value. The value of a chunk is the chunk itself,
        or the result of looking up the `key` argument in its meta variables if a key was given.
        When two values cannot be compared, a chunk whose value is `None` loses against the
        other one; otherwise the current minimum is kept.
        """
        def get_value(chunk: Chunk):
            if key is None:
                return chunk
            return metavars(chunk).get(key)

        key = self.args.key
        it = iter(chunks)

        # Skip ahead to the first visible chunk; it is the initial minimum.
        for min_chunk in it:
            if not min_chunk.visible:
                yield min_chunk
            else:
                min_index = 0
                min_value = get_value(min_chunk)
                break
        else:
            # there was no visible chunk at all
            return

        # Continue on the same iterator: iterating over `chunks` again would restart from the
        # beginning whenever the input is a re-iterable container such as a list, which would
        # emit leading invisible chunks a second time.
        for index, chunk in enumerate(it, 1):
            if not chunk.visible:
                yield chunk
                continue
            value = get_value(chunk)
            try:
                is_min = value < min_value
            except TypeError:
                if min_value is None:
                    self.log_info(
                        F'Discarding chunk {min_index} in favor of {index} because {key} was not '
                        F'set on the former; new minimum is {value!r}.')
                    is_min = True
                else:
                    self.log_info(
                        F'Discarding chunk {index} because {key} had value {value!r}; it could not '
                        F'be compared to the current minimum {min_value!r} on chunk {min_index}.')
                    is_min = False
            if is_min:
                min_value = value
                min_chunk = chunk
                min_index = index

        yield min_chunk

class morse (language=None)

This unit is implemented in refinery.units.encoding.morse and has the following commandline Interface:

usage: morse [-h] [-L] [-Q] [-0] [-v] [-R] [-F] [{ar,de,en,es,fr,he,ru,ua}]

Morse encoding and decoding. All tokens in the input data which consist of dashes and dots are
replaced by their Morse decoding.

positional arguments:
  {ar,de,en,es,fr,he,ru,ua}  Optionally choose a language. If none is specified, the unit will
                             attempt to detect the language automatically. Options are: ar, de,
                             en, es, fr, he, ru, ua

generic options:
  -h, --help                 Show this help message and exit.
  -L, --lenient              Increase the leniency, allowing partial results and ignoring more
                             errors.
  -Q, --quiet                Disables all log output.
  -0, --devnull              Do not produce any output.
  -v, --verbose              Specify up to two times to increase log level.
  -R, --reverse              Use the reverse operation.
  -F, --iff                  Only apply unit if it can handle the input format. Specify twice to
                             drop all other chunks.

Expand source code Browse git

class morse(Unit):
    """
    Morse encoding and decoding. All tokens in the input data which consist of dashes and dots are
    replaced by their Morse decoding.
    """
    def __init__(
        self,
        language: Param[str, Arg.Option(choices=MorseLanguage, help=(
            'Optionally choose a language. If none is specified, the unit will attempt to detect '
            'the language automatically. Options are: {choices}'))] = None,
    ):
        super().__init__(language=Arg.AsOption(language, MorseLanguage))

    @classmethod
    def handles(cls, data):
        """
        Return `True` if the input consists exclusively of dashes, dots, and whitespace. In all
        other cases, the method returns `None`.
        """
        if re.fullmatch(BR'[-.\s]+', data, re.DOTALL):
            return True

    @unicoded
    def process(self, data: str):
        """
        Decode Morse code to text. The input is split at whitespace; every token is looked up in
        the symbol table, the digit table, and the table of the selected language, in this order.
        If no language was specified, the first language whose table (together with digits and
        symbols) covers all tokens is chosen, preferring English and otherwise the language with
        the smallest fraction of unused codes.

        Raises a `LookupError` when no language covers all tokens and a `ValueError` when a token
        cannot be decoded.
        """
        language: MorseLanguage = self.args.language
        # The capture group makes re.split keep the separators: even indices of the result are
        # tokens, odd indices are the whitespace runs between them.
        parsed = re.split('(\\s+)', data)
        tokens = {t for t in parsed[::2] if t}
        tables = [
            self._DECODE_SYMBOL,
            self._DECODE_DIGITS,
        ]

        if language is not None:
            tables.append(self._DECODE[language])
        else:
            special = set(self._DECODE_SYMBOL) | set(self._DECODE_DIGITS)
            best_ratio = 1  # fraction of codes in the language table that the input does not use
            best_table = None
            for candidate in MorseLanguage:
                table = self._DECODE[candidate]
                codes = set(table)
                if not tokens <= codes | special:
                    continue
                if candidate == MorseLanguage.EN:
                    # English is always preferred when it is able to decode the input.
                    best_table = table
                    break
                ratio = len(codes - tokens) / len(codes)
                if ratio < best_ratio:
                    best_ratio = ratio
                    best_table = table
            if best_table is None:
                raise LookupError('Unable to determine language, please specify it manually.')
            tables.append(best_table)

        with io.StringIO() as out:
            for k, string in enumerate(parsed):
                if k % 2 == 1:
                    # Whitespace: the first character only separates two letters and is dropped.
                    # If more than one character remains, the last one is dropped as well; this
                    # undoes the padding that the reverse operation adds around word separators.
                    string = string[1:]
                    if len(string) > 1:
                        string = string[:-1]
                    out.write(string)
                    continue
                if not string:
                    continue
                for table in tables:
                    try:
                        out.write(table[string])
                        break
                    except KeyError:
                        continue
                else:
                    raise ValueError(F'invalid token: {string}')
            return out.getvalue()

    @unicoded
    def reverse(self, data: str):
        """
        Encode text as Morse code. Letters are converted to lowercase and looked up in the symbol
        table, the digit table, and then either the table of the selected language or, if none
        was selected, the tables of all languages. The codes of a word are separated by single
        spaces and each whitespace run of the input is emitted with one space added on each side.

        Raises a `ValueError` for characters that none of the tables contain.
        """
        language: MorseLanguage = self.args.language
        tables = [
            self._ENCODE_SYMBOL,
            self._ENCODE_DIGITS,
        ]
        if language is not None:
            tables.append(self._ENCODE[language])
        else:
            tables.extend(self._ENCODE.values())

        def _encode(letter):
            for table in tables:
                try:
                    return table[letter]
                except KeyError:
                    continue
            else:
                raise ValueError(F'cannot encode letter "{letter}"')

        with io.StringIO() as out:
            for k, word in enumerate(re.split('(\\s+)', data)):
                if k % 2 == 1:
                    out.write(F' {word} ')
                    continue
                out.write(' '.join(_encode(letter) for letter in word.lower()))
            return out.getvalue()

    # Encoding tables (lowercase letter to Morse code), one for each supported language.
    _ENCODE = {
        MorseLanguage.EN: {
            'a': '.-',
            'b': '-...',
            'c': '-.-.',
            'd': '-..',
            'e': '.',
            'f': '..-.',
            'g': '--.',
            'h': '....',
            'i': '..',
            'j': '.---',
            'k': '-.-',
            'l': '.-..',
            'm': '--',
            'n': '-.',
            'o': '---',
            'p': '.--.',
            'q': '--.-',
            'r': '.-.',
            's': '...',
            't': '-',
            'u': '..-',
            'v': '...-',
            'w': '.--',
            'x': '-..-',
            'y': '-.--',
            'z': '--..',
        }
    }
    _ENCODE[MorseLanguage.ES] = _extend_dictionary(_ENCODE[MorseLanguage.EN], {
        'á': '.--.-',
        'é': '..-..',
        'í': '..',
        'ñ': '--.--',
        'ó': '---.',
        'ú': '..-',
        'ü': '..--',
        '¿': '..-.-',
        '¡': '--...-',
    })
    _ENCODE[MorseLanguage.DE] = _extend_dictionary(_ENCODE[MorseLanguage.EN], {
        'ä': '.-.-',
        'ö': '---.',
        'ü': '..--',
        'ß': '...--..',
    })
    _ENCODE[MorseLanguage.FR] = _extend_dictionary(_ENCODE[MorseLanguage.EN], {
        'à': '.--.-',
        'â': '.--.-',
        'ç': '-.-..',
        'è': '.-..-',
        'é': '..-..',
        'ê': '-..-.',
        'ë': '..-..',
        'î': '..',
        'ï': '-..--',
        'ô': '---',
        'ù': '..-',
        'ü': '..--',
    })
    _ENCODE[MorseLanguage.RU] = {
        'а': '.-',
        'б': '-...',
        'в': '.--',
        'г': '--.',
        'д': '-..',
        'е': '.',
        'ё': '.',
        'ж': '...-',
        'з': '--..',
        'и': '..',
        'й': '.---',
        'к': '-.-',
        'л': '.-..',
        'м': '--',
        'н': '-.',
        'о': '---',
        'п': '.--.',
        'р': '.-.',
        'с': '...',
        'т': '-',
        'у': '..-',
        'ф': '..-.',
        'х': '....',
        'ц': '-.-.',
        'ч': '---.',
        'ш': '----',
        'щ': '--.-',
        'ъ': '--.--',
        'ы': '-.--',
        'ь': '-..-',
        'э': '..-..',
        'ю': '..--',
        'я': '.-.-',
    }
    # The Ukrainian table is derived from the Russian one: the codes of the Russian letters
    # э, и, and ы are taken over by the Ukrainian letters є, і, and и, respectively. The pops
    # have to happen in this order because и is first released and then assigned again.
    _ENCODE[MorseLanguage.UA] = _extend_dictionary(_ENCODE[MorseLanguage.RU], {
        'ґ': '--.',
        'ї': '.---.',
    })
    _ENCODE[MorseLanguage.UA]['є'] = _ENCODE[MorseLanguage.UA].pop('э')
    _ENCODE[MorseLanguage.UA]['і'] = _ENCODE[MorseLanguage.UA].pop('и')
    _ENCODE[MorseLanguage.UA]['и'] = _ENCODE[MorseLanguage.UA].pop('ы')

    # Every Hebrew letter must have its own code, otherwise decoding becomes ambiguous.
    _ENCODE[MorseLanguage.HE] = {
        'א': '.-',
        'ב': '-...',
        'ג': '--.',
        'ד': '-..',
        'ה': '---',
        'ו': '.',
        'ז': '--..',
        'ח': '....',
        'ט': '..-',
        'י': '..',
        'כ': '-.-',
        'ל': '.-..',
        'מ': '--',
        'נ': '-.',
        'ס': '-.-.',
        'ע': '.---',
        'פ': '.--.',
        'צ': '.--',
        'ק': '--.-',
        'ר': '.-.',
        'ש': '...',
        'ת': '-',
    }

    _ENCODE[MorseLanguage.AR] = {
        'ا': '.-',
        'ب': '-...',
        'ت': '-',
        'ث': '-.-.',
        'ج': '.---',
        'ح': '....',
        'خ': '---',
        'د': '-..',
        'ذ': '--..',
        'ر': '.-.',
        'ز': '---.',
        'س': '...',
        'ش': '----',
        'ص': '-..-',
        'ض': '...-',
        'ط': '..-',
        'ظ': '-.--',
        'ع': '.-.-',
        'غ': '--.',
        'ف': '..-.',
        'ق': '--.-',
        'ك': '-.-',
        'ل': '.-..',
        'م': '--',
        'ن': '-.',
        'ه': '..-..',
        'و': '.--',
        'ي': '..',
        'ﺀ': '.',
    }

    # Digits and punctuation are shared by all languages.
    _ENCODE_DIGITS = {
        '0': '-----',
        '1': '.----',
        '2': '..---',
        '3': '...--',
        '4': '....-',
        '5': '.....',
        '6': '-....',
        '7': '--...',
        '8': '---..',
        '9': '----.'
    }

    _ENCODE_SYMBOL = {
        '_': '..--.-',
        '-': '-....-',
        ',': '--..--',
        ';': '-.-.-.',
        ':': '---...',
        '!': '-.-.--',
        '?': '..--..',
        '.': '.-.-.-',
        '"': '.-..-.',
        '(': '-.--.',
        ')': '-.--.-',
        '@': '.--.-.',
        '/': '-..-.',
        '\\': '-..-.',
        '&': '.-...',
        '+': '.-.-.',
        '=': '-...-',
        '$': '...-..-',
        "'": '.----.',
    }

    # Decoding tables (Morse code to letter), derived from the encoding tables above.
    # NOTE(review): several tables assign one code to more than one character (for example
    # 'е' and 'ё'); which of them _reverse_dictionary keeps is decided outside this listing.
    _DECODE = {
        lng: _reverse_dictionary(tbl) for lng, tbl in _ENCODE.items()}
    _DECODE_SYMBOL = _reverse_dictionary(_ENCODE_SYMBOL)
    _DECODE_DIGITS = _reverse_dictionary(_ENCODE_DIGITS)

Methods

def reverse(self, data)

Expand source code Browse git

@unicoded
def reverse(self, data: str):
    """
    Encode text as Morse code. Each character is converted to lowercase and looked up in the
    symbol table, the digit table, and then the selected language table, or all language tables
    if no language was selected. A character that no table contains raises a `ValueError`.
    """
    chosen: MorseLanguage = self.args.language
    lookup_order = [
        self._ENCODE_SYMBOL,
        self._ENCODE_DIGITS,
    ]
    if chosen is None:
        lookup_order.extend(self._ENCODE.values())
    else:
        lookup_order.append(self._ENCODE[chosen])

    def _encode(character):
        for mapping in lookup_order:
            if character in mapping:
                return mapping[character]
        raise ValueError(F'cannot encode letter "{character}"')

    pieces = []
    # The capture group keeps the separators: odd positions are whitespace runs.
    for position, part in enumerate(re.split('(\\s+)', data)):
        if position % 2 == 1:
            # whitespace is padded with one space on each side
            pieces.append(F' {part} ')
        else:
            pieces.append(' '.join(_encode(character) for character in part.lower()))
    return ''.join(pieces)

class mscdk (size, hash='MD5')

This unit is implemented in refinery.units.crypto.keyderive.mscdk and has the following commandline Interface:

usage: mscdk [-h] [-L] [-Q] [-0] [-v] N [hash]

An implementation of the CryptDeriveKey routine available from the Win32 API.

positional arguments:
  N              The number of bytes to generate.
  hash           Specify one of these algorithms (default is MD5): md2, md4, md5, sha1, sha256,
                 sha512, sha224, sha384

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class mscdk(KeyDerivation):
    """
    An implementation of the CryptDeriveKey routine available from the Win32 API.
    """

    def __init__(self, size, hash='MD5'):
        super().__init__(size=size, salt=None, hash=hash)

    def process(self, data):
        """
        Derive the requested number of key bytes from the input. For the SHA-2 family, the key
        material is the plain digest of the input. For all other hashes, the digest is XORed
        into two 64-byte blocks filled with 0x36 and 0x5C, respectively, and the key material
        is the concatenation of the digests of these two blocks. Requesting more bytes than
        are available raises a `RefineryPartialResult` that carries the full key material.
        """
        def _hash(blob):
            return self.hash.new(blob).digest()

        def _masked_block(fill, mask):
            # a 64-byte block of the fill value with the mask XORed into its leading bytes
            block = bytearray([fill] * 64)
            for position, byte in enumerate(mask):
                block[position] ^= byte
            return block

        requested = self.args.size
        sha2_family = (HASH.SHA224, HASH.SHA256, HASH.SHA384, HASH.SHA512)

        if self.args.hash not in sha2_family:
            available = 2 * self.hash.digest_size
            seed = _hash(data)
            del data
            material = _hash(_masked_block(0x36, seed)) + _hash(_masked_block(0x5C, seed))
        else:
            material = _hash(data)
            available = len(material)

        if requested > available:
            raise RefineryPartialResult(F'too many bytes requested, can only provide {available}', partial=material)
        return material[:requested]

class mscf (mode=None)

This unit is implemented in refinery.units.compression.mscf and has the following commandline Interface:

usage: mscf [-h] [-L] [-Q] [-0] [-v] [-F] [{mszip,xpress,xpress-huff,lzms}]

The Microsoft Compression Format unit implements the format and algorithms used by the Microsoft
Compression API. The implementation for LZMS is currently missing, but MSZIP and XPRESS (both
with and without Huffman table) are supported. This pure Python implementation is very slow when
compared to native code, so decompressing very large inputs can take several minutes.

positional arguments:
  {mszip,xpress,xpress-huff,lzms}
                              Manually select decompression mode (mszip, xpress, xpress-huff,
                              lzms); by default the unit attempts to derive the mode from the
                              header, but this will fail for raw streams. However, even if a
                              header is found, a manually specified mode will take precedence.

generic options:
  -h, --help                  Show this help message and exit.
  -L, --lenient               Increase the leniency, allowing partial results and ignoring more
                              errors.
  -Q, --quiet                 Disables all log output.
  -0, --devnull               Do not produce any output.
  -v, --verbose               Specify up to two times to increase log level.
  -F, --iff                   Only apply unit if it can handle the input format. Specify twice to
                              drop all other chunks.

Expand source code Browse git

class mscf(Unit):
    """
    The Microsoft Compression Format unit implements the format and algorithms used by the Microsoft
    Compression API. The implementation for LZMS is currently missing, but MSZIP and XPRESS (both
    with and without Huffman table) are supported. This pure Python implementation is very slow when
    compared to native code, so decompressing very large inputs can take several minutes.
    """

    # magic bytes at the start of a compressed buffer that has a header
    _SIGNATURE = B'\x0A\x51\xE5\xC0'

    def __init__(
        self,
        mode: Param[str | None, Arg.Option(choices=MODE, help=(
            'Manually select decompression mode ({choices}); by default the unit attempts to derive the '
            'mode from the header, but this will fail for raw streams. However, even if a header is '
            'found, a manually specified mode will take precedence.'))] = None,
    ):
        super().__init__(mode=Arg.AsOption(mode, MODE))

    def process(self, data):
        """
        Decompress the input. If the input does not start with the signature and a mode was
        selected manually, the entire input is decompressed as one raw stream. Otherwise, the
        header is parsed and the chunks that follow it are decompressed one after another until
        the output has reached the size announced in the header.

        Raises a `ValueError` for an unknown compression type (unless a mode was selected) and
        for an oversized chunk size, an `IndexError` when a chunk is truncated, and a
        `RuntimeError` when a chunk does not decompress to the expected size.
        """
        mode: MODE = self.args.mode
        with StructReader(memoryview(data)) as reader, MemoryFile() as writer:
            reader: StructReader[memoryview]
            # The checksum starts with the first 6 bytes, i.e. the signature and the 2-byte
            # header size; it is continued below, after the check byte itself was read.
            check = zlib.crc32(reader.peek(6))
            magic = reader.read(4)
            if magic != self._SIGNATURE:
                if mode is None:
                    self.log_warn(
                        F'data starts with {magic.hex().upper()} rather than the expected sequence '
                        F'{self._SIGNATURE.hex().upper()}; this could be a raw stream.')
                else:
                    # no header, but the algorithm is known: treat the input as a raw stream
                    reader.seek(0)
                    handler = self._get_handler(mode)
                    handler(reader, writer, None)
                    return writer.getvalue()

            header_size = reader.u16()
            if header_size != 24:
                self.log_warn(F'the header size {header_size} was not equal to 24')

            crc32byte = reader.u8()
            # only the lowest byte of the CRC32 is compared to the check byte in the header
            check = zlib.crc32(reader.peek(0x11), check) & 0xFF
            if check != crc32byte:
                self.log_warn(F'the CRC32 check byte was {crc32byte}, computed value was {check}')

            _mode_code = reader.u8()

            try:
                _mode = MODE(_mode_code)
            except ValueError:
                msg = F'header contains unknown compression type code {_mode_code}'
                if mode is None:
                    raise ValueError(msg)
                else:
                    self.log_warn(msg)
            else:
                # a manually selected mode takes precedence over the one from the header
                if mode is not None and mode != _mode:
                    logger = self.log_warn
                else:
                    logger = self.log_info
                    mode = _mode
                logger(F'header specifies algorithm {_mode.name}')

            self.log_info(F'using algorithm {mode.name}')
            decompress = self._get_handler(mode)

            final_size = reader.u32()
            _unknown_1 = reader.u32()
            chunk_size = reader.u32()
            _unknown_2 = reader.u32()

            if _unknown_1 != 0:
                self.log_warn(F'unknown value 1 was unexpectedly nonzero: 0x{_unknown_1:08X}')
            if _unknown_2 != 0:
                self.log_warn(F'unknown value 2 was unexpectedly nonzero: 0x{_unknown_2:08X}')

            self.log_debug(F'final size: 0x{final_size:08X}')
            self.log_debug(F'chunk size: 0x{chunk_size:08X}')

            if chunk_size > COMPRESS_MAX_CHUNK:
                raise ValueError('the header chunk size is greater than the maximum value')

            while len(writer) < final_size:
                src_size = reader.u32()
                src_data = reader.read(src_size)
                if len(src_data) != src_size:
                    raise IndexError(F'Attempted to read {src_size} bytes, but got only {len(src_data)}.')
                if src_size + len(writer) == final_size:
                    # a chunk that exactly fills the remaining output is stored uncompressed
                    self.log_debug(F'final chunk is uncompressed, appending {src_size} raw bytes to output')
                    writer.write(src_data)
                    break
                self.log_debug(F'reading chunk of size {src_size}')
                start = writer.tell()
                chunk = StructReader(src_data)
                # the last chunk may be shorter than the chunk size from the header
                target = min(chunk_size, final_size - len(writer))
                decompress(chunk, writer, target)
                writer.flush()
                written = writer.tell() - start
                if written != target:
                    # report the size that was actually expected for this chunk
                    raise RuntimeError(F'decompressed output had unexpected size {written} instead of {target}')

            if not reader.eof:
                self.log_info(F'compression complete with {reader.remaining_bytes} bytes remaining in input')
            return writer.getvalue()

    def _get_handler(self, mode: MODE) -> Callable[[StructReader, MemoryFile, int | None], None]:
        """
        Return the decompression routine for the given mode. A `NotImplementedError` is raised
        for modes without an implementation.
        """
        decompress = {
            MODE.MSZIP       : self._decompress_mszip,
            MODE.XPRESS_HUFF : self._decompress_xpress_huffman,
            MODE.XPRESS      : self._decompress_xpress,
        }.get(mode, None)
        if decompress is None:
            raise NotImplementedError(F'algorithm {mode.name} is not yet implemented')
        return decompress

    def _decompress_mszip(self, reader: StructReader, writer: MemoryFile, target: int | None = None):
        """
        Decompress one MSZIP chunk: a two-byte `CK` header followed by a raw deflate stream. All
        output written so far is passed to zlib as the preset dictionary. The `target` argument
        is accepted for interface compatibility and not used.
        """
        header = bytes(reader.read(2))
        if header != B'CK':
            raise ValueError(F'chunk did not begin with CK header, got {header!r} instead')
        decompress = zlib.decompressobj(-zlib.MAX_WBITS, zdict=writer.getvalue())
        writer.write(decompress.decompress(reader.read()))
        writer.write(decompress.flush())

    def _decompress_xpress_huffman(
        self,
        reader: StructReader,
        writer: MemoryFile,
        target: int | None = None,
        max_chunk_size: int = 0x10000
    ) -> None:
        """
        Decompress XPRESS data with Huffman coding from the reader and append the output to the
        writer. Each block starts with a Huffman table and produces up to `max_chunk_size` bytes
        of output. If `target` is given, decompression stops as soon as that many bytes have been
        written by this call. An `IndexError` is raised when the input is too short to contain
        a Huffman table.
        """
        limit = writer.tell()
        if target is not None:
            # convert the relative target size to an absolute output position
            target += limit

        while not reader.eof:

            if reader.remaining_bytes < XPRESS_NUM_SYMBOLS // 2:
                raise IndexError(
                    F'There are only {reader.remaining_bytes} bytes reamining in the input buffer,'
                    F' but at least {XPRESS_NUM_SYMBOLS // 2} are required to read a Huffman table.')

            # NOTE(review): read_integer(4) presumably reads a 4-bit code length per symbol,
            # which matches the size check above - confirm against StructReader.
            table = bytearray(reader.read_integer(4) for _ in range(XPRESS_NUM_SYMBOLS))
            table = make_huffman_decode_table(table, XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN)
            limit = limit + max_chunk_size
            flags = BitBufferedReader(reader, 16)

            while True:
                position = writer.tell()
                if position == target:
                    if reader.remaining_bytes:
                        self.log_info(F'chunk decompressed with {reader.remaining_bytes} bytes remaining in input buffer')
                    return
                if position >= limit:
                    # the current block is complete, continue with the next Huffman table
                    if position > limit:
                        limit = position
                        self.log_info(F'decompression of one chunk generated more than the limit of {max_chunk_size} bytes')
                    flags.collect()
                    break
                try:
                    sym = read_huffman_symbol(flags, table, XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN)
                except EOFError:
                    self.log_debug('end of file while reading huffman symbol')
                    break
                if sym < XPRESS_NUM_CHARS:
                    # literal byte
                    writer.write_byte(sym)
                    continue
                # match: the low nibble is the length, the next nibble is the number of offset bits
                length = sym & 0xF
                offsetlog = (sym >> 4) & 0xF
                flags.collect()
                if reader.eof:
                    break
                offset = (1 << offsetlog) | flags.read(offsetlog)
                if length == 0xF:
                    # extended length: one more byte, or a 16-bit (and then 32-bit) value
                    nudge = reader.read_byte()
                    if nudge < 0xFF:
                        length += nudge
                    else:
                        length = reader.u16() or reader.u32()
                length += XPRESS_MIN_MATCH_LEN
                writer.replay(offset, length)

    def _decompress_xpress(self, reader: StructReader, writer: MemoryFile, target: int | None = None) -> None:
        """
        Decompress plain XPRESS (LZ77) data from the reader and append the output to the writer.
        If `target` is given, decompression stops once at least that many bytes have been
        written by this call. A `RuntimeError` is raised for an invalid long match length.
        """
        if target is not None:
            # convert the relative target size to an absolute output position
            target += writer.tell()
        flags = BitBufferedReader(reader)
        # Extended lengths are stored as nibbles; a byte that was read for its low nibble
        # keeps its high nibble here for the next match that requires one.
        nibble_cache = None
        while not reader.eof:
            if target is not None and writer.tell() >= target:
                return
            if not flags.next():
                # a cleared flag bit marks a literal byte
                writer.write(reader.read(1))
                continue
            # match: the low 3 bits are the length, the remaining 13 bits are the offset
            offset, length = divmod(reader.u16(), 8)
            offset += 1
            if length == 7:
                length = nibble_cache
                if length is None:
                    length_pair = reader.u8()
                    nibble_cache = length_pair >> 4
                    length = length_pair & 0xF
                else:
                    nibble_cache = None
                if length == 15:
                    length = reader.u8()
                    if length == 0xFF:
                        length = reader.u16() or reader.u32()
                        # the long form stores the length including the 15 + 7 added below
                        length -= 22
                        if length < 0:
                            raise RuntimeError(F'Invalid match length of {length} for long delta sequence')
                    length += 15
                length += 7
            length += 3
            writer.replay(offset, length)

    @classmethod
    def handles(cls, data) -> bool | None:
        """
        Return `True` if the input starts with the signature; otherwise return `None`.
        """
        if data[:len(cls._SIGNATURE)] == cls._SIGNATURE:
            return True

class msgpack

This unit is implemented in refinery.units.formats.msgpack and has the following commandline Interface:

usage: msgpack [-h] [-L] [-Q] [-0] [-v] [-R]

Converts a message-pack (msgpack) buffer to JSON and vice-versa.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class msgpack(Unit):
    """
    Converts a message-pack (msgpack) buffer to JSON and vice-versa.
    """
    def reverse(self, data):
        """
        Parse the input as JSON and serialize it as msgpack. Input that is not valid JSON is
        parsed once more after being wrapped in square brackets; if that attempt fails as well,
        the error from the first attempt is raised.
        """
        try:
            document = json.loads(data)
        except Exception as first_error:
            try:
                document = json.loads(B'[%s]' % data)
            except Exception:
                raise first_error
        return mp.dumps(document)

    def process(self, data):
        """
        Unpack msgpack items from the input and emit each of them as an encoded JSON document.
        Any error during unpacking raises a `RefineryPartialResult` that carries the input from
        the start of the failed item onward.
        """
        source = MemoryFile(data, output=bytes)
        unpacker: mp.fallback.Unpacker = mp.Unpacker(source)
        count = 0
        while True:
            try:
                last = unpacker.tell()
                item = unpacker.unpack()
            except Exception as E:
                # NOTE(review): running out of data ends the stream silently only after
                # exactly one item was unpacked; with any other number of items, the partial
                # result below is raised - confirm that this is intended.
                if isinstance(E, mp.OutOfData) and count == 1:
                    break
                raise RefineryPartialResult(str(E), memoryview(data)[last:]) from E
            yield json.dumps(item).encode(self.codec)
            count += 1

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Parse the input as JSON and serialize it as msgpack. Input that is not valid JSON is parsed
    once more after being wrapped in square brackets; if that attempt fails as well, the error
    from the first attempt is raised.
    """
    try:
        document = json.loads(data)
    except Exception as first_error:
        wrapped = B'[%s]' % data
        try:
            document = json.loads(wrapped)
        except Exception:
            raise first_error
    return mp.dumps(document)

class mspdb (size, salt, iter=100, hash='SHA1')

This unit is implemented in refinery.units.crypto.keyderive.mspdb and has the following commandline Interface:

usage: mspdb [-h] [-L] [-Q] [-0] [-v] N salt [iter] [hash]

An implementation of the PasswordDeriveBytes routine available from the .NET standard library.
According to documentation, it is an extension of PBKDF1.

positional arguments:
  N              The number of bytes to generate.
  salt           Salt for the derivation.
  iter           Number of iterations; default is 100.
  hash           Specify one of these algorithms (default is SHA1): md2, md4, md5, sha1, sha256,
                 sha512, sha224, sha384

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class mspdb(KeyDerivation):
    """
    An implementation of the PasswordDeriveBytes routine available from the .NET
    standard library. According to documentation, it is an extension of PBKDF1.
    """
    def __init__(self, size, salt, iter=100, hash='SHA1'):
        # vars() has to be evaluated before any other local name exists in this scope
        self.superinit(super(), **vars())

    def process(self, data):
        """
        Derive the requested number of key bytes from the input. The salt is appended to the
        UTF8-encoded input and the result is hashed once per iteration. The digest of the last
        round provides the first bytes of the output; further blocks are generated by hashing
        the result of the second to last round prefixed with a decimal counter.
        """
        def _hash(blob):
            return self.hash.new(blob).digest()

        if self.codec != 'UTF8':
            data = data.decode(self.codec).encode('UTF8')
        data += self.args.salt

        seed = data
        for _ in range(self.args.iter - 1):
            seed = _hash(seed)

        derived = _hash(seed)
        counter = 1
        while len(derived) < self.args.size:
            derived += _hash(B'%d%s' % (counter, seed))
            counter += 1
        return derived[:self.args.size]

class mvg (*names, top=False)

This unit is implemented in refinery.units.meta.mvg and has the following commandline Interface:

usage: mvg [-h] [-L] [-Q] [-0] [-v] [-t] [name ...]

Short for "Make Variable Global": This unit can move meta variables into the scope of the parent
frame. If used at the end of a frame, the variables will be moved to the scope of the frame that
the pipeline will return to. Otherwise and if the --top switch is being used, variables will be
moved to scope 0, i.e. to the topmost frame in the current tree.

Note that it is not possible to promote a variable to a parent frame if that variable does not
have the same value on all chunks in the current frame - such variables will always be removed
when the frame closes.

positional arguments:
  name           Name of a variable to be removed. If no variables are explicitly specified, all
                 variables in the current chunk will be rescoped.

options:
  -t, --top      Move the variable(s) to the topmost frame layer.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class mvg(Unit):
    """
    Short for "Make Variable Global": This unit can move meta variables into the scope of the
    parent frame. If used at the end of a frame, the variables will be moved to the scope of the
    frame that the pipeline will return to. Otherwise, or if the --top switch is used, variables
    will be moved to scope 0, i.e. to the topmost frame in the current tree.

    Note that it is not possible to promote a variable to a parent frame if that variable does not
    have the same value on all chunks in the current frame - such variables will always be removed
    when the frame closes.
    """
    def __init__(
        self,
        *names: Param[str, Arg.String(metavar='name', help=(
            'Name of a variable to be moved. If no variables are explicitly specified, all '
            'variables in the current chunk will be rescoped.'
        ))],
        top: Param[bool, Arg.Switch('-t', help='Move the variable(s) to the topmost frame layer.')] = False
    ):
        super().__init__(names=names, top=top)

    def process(self, data):
        meta = metavars(data)
        nest = self.args.nesting
        if nest < 0 and not self.args.top:
            # A negative nesting value is added to the current scope to obtain the target.
            spot = meta.scope + nest
        else:
            # NOTE(review): the documentation speaks of scope 0 for the topmost frame while
            # this branch targets scope 1 - confirm which numbering is intended.
            spot = 1
        # Without explicit names, every variable of the current chunk is a candidate.
        for name in self.args.names or meta.variable_names():
            try:
                if meta.get_scope(name) <= spot:
                    # Already visible at the target scope or above: nothing to do.
                    continue
                meta.set_scope(name, spot)
            except KeyError:
                self.log_info(F'variable not defined: {name}')
        return data

class n40 (key)

This unit is implemented in refinery.units.malware.n40 and has the following commandline Interface:

usage: n40 [-h] [-L] [-Q] [-0] [-v] key

Decrypts hex-encoded strings in various Latin American banker families, including N40.

positional arguments:
  key            Decryption key.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class n40(Unit):
    """
    Decrypts hex-encoded strings in various Latin American banker families, including N40.
    """
    def __init__(self, key: Param[buf, Arg(help='Decryption key.')]):
        ...

    def process(self, data):
        # Input that is not valid hexadecimal is processed as raw data instead.
        try:
            data = b16decode(data, casefold=True)
        except Error:
            self.log_info('Input was not hex-encoded; ignoring this step.')
        # Everything after the first byte is XORed with the key; each byte of the result is
        # then paired with the input byte one position earlier.
        mask = data[1:] | xor(self.args.key) | bytearray

        def difference(previous, current):
            delta = current - previous
            if delta > 0:
                return delta
            return delta + 0xFF

        return bytearray(difference(a, b) for a, b in zip(data, mask))

class neg (bigendian=False, blocksize=1)

This unit is implemented in refinery.units.blockwise.neg and has the following commandline Interface:

usage: neg [-h] [-L] [-Q] [-0] [-v] [-E] [-B N]

Each block of the input data is negated bitwise. This is sometimes also called the bitwise
complement or inverse.

options:
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes. The default is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class neg(UnaryOperation):
    """
    Each block of the input data is negated bitwise. This is sometimes
    also called the bitwise complement or inverse.
    """

    def operate(self, a):
        # complement of a single block value
        return ~a

    def inplace(self, a):
        # XOR with the block mask; the argument is updated through augmented assignment
        a ^= self.fmask

class netbios (key=b'A')

This unit is implemented in refinery.units.encoding.netbios and has the following commandline Interface:

usage: netbios [-h] [-L] [-Q] [-0] [-v] [-R] [key]

Encodes and decodes strings using the same algorithm that is used for NetBIOS labels. Each byte
0xUL is encoded as two bytes, which are the sum of 0xU and 0xL with an offset character,
respectively. The default offset is the capital letter A.

positional arguments:
  key            Provide a single letter to use as the offset.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class netbios(Unit):
    """
    Encodes and decodes strings using the same algorithm that is used for NetBIOS
    labels. Each byte 0xUL is encoded as two bytes, which are the sum of 0xU and
    0xL with an offset character, respectively. The default offset is the capital
    letter A.
    """

    def __init__(self, key: Param[buf, Arg(help="Provide a single letter to use as the offset.")] = B'A'):
        if len(key) != 1:
            raise ValueError("The key must be a binary string of length exactly 1")
        # only the integer value of the single key byte is stored
        super().__init__(key=key[0])

    def reverse(self, data):
        # Encoding: every input byte becomes two output bytes, one per nibble.
        offset = self.args.key
        encoded = bytearray()
        for byte in data:
            encoded.append((byte >> 4) + offset)
            encoded.append((byte & 15) + offset)
        return encoded

    def process(self, data):
        # Decoding: consume the input in pairs; an unpaired trailing byte is dropped.
        offset = self.args.key
        nibbles = range(16)
        decoded = bytearray()
        stream = iter(data)
        for upper, lower in zip(stream, stream):
            hi = upper - offset
            lo = lower - offset
            if not (hi in nibbles and lo in nibbles):
                raise ValueError(F'Invalid character encoding detected: hi={hi:X}, lo={lo:X}.')
            decoded.append((hi << 4) | lo)
        return decoded

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    # Encoding: every input byte becomes two output bytes, namely its upper and its lower
    # nibble, each shifted by the configured offset value.
    offset = self.args.key
    encoded = bytearray()
    for byte in data:
        encoded.append((byte >> 4) + offset)
        encoded.append((byte & 15) + offset)
    return encoded

class ngrams (size=slice(2, None, None))

This unit is implemented in refinery.units.strings.ngrams and has the following commandline Interface:

usage: ngrams [-h] [-L] [-Q] [-0] [-v] [start:end:step]

Extract all n-grams from the input. The algorithm is naive, i.e. it simply iterates all n-grams
and deduplicates using a set data structure. The number n is taken from an arbitrary range given
as a Python slice expression.

positional arguments:
  start:end:step  Specifies the sizes of each n-gram, i.e. the number n. Defaults to 2:.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class ngrams(Unit):
    """
    Extract all n-grams from the input. The algorithm is naive, i.e. it simply iterates all n-grams
    and deduplicates using a set data structure. The number n is taken from an arbitrary range given
    as a Python slice expression.
    """
    def __init__(
        self, size: Param[slice, Arg.Bounds(
            help='Specifies the sizes of each n-gram, i.e. the number n. Defaults to {default}.')] = slice(2, None),
    ):
        super().__init__(size=size)

    def process(self, data: bytearray):
        total = len(data)
        for n in integers_of_slice(self.args.size):
            self.log_info(F'emitting {n}-grams')
            if n > total:
                # no window of this size fits into the input; stop altogether
                break
            seen = set()
            view = memoryview(data)
            for start in range(total - n + 1):
                gram = bytes(view[start:start + n])
                if gram not in seen:
                    # only the first occurrence of each n-gram is emitted
                    seen.add(gram)
                    yield self.labelled(gram, offset=start)

class nop

This unit is implemented in refinery.units.misc.nop and has the following commandline Interface:

usage: nop [-h] [-L] [-Q] [-0] [-v]

The unit generates the exact output that was received as input. All unknown arguments passed to
nop are completely ignored, which is different from the behavior of other units. As such, nop can
be used to comment out other units in longer refinery pipelines by simply prefixing a command
with nop.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class nop(Unit):
    """
    The unit generates the exact output that was received as input. All unknown arguments passed
    to nop are completely ignored, which is different from the behavior of other units. As such,
    nop can be used to comment out other units in longer refinery pipelines by simply prefixing a
    command with nop.
    """
    @classmethod
    def argparser(cls, **keywords):
        # A dedicated parser type is used here in place of the default one.
        parser = NopArgParser(
            keywords,
            prog=cls.name,
            description=documentation(cls),
            add_help=False,
        )
        parser.set_defaults(nesting=0)
        return cls._interface(parser)

Static methods

def argparser(**keywords)

class nrv2b (bits=32)

This unit is implemented in refinery.units.compression.nrv and has the following commandline Interface:

usage: nrv2b [-h] [-L] [-Q] [-0] [-v] [N]

Decompress data using the NRV2B algorithm.

positional arguments:
  N              Specify the number of codec bits. The default is 32.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class nrv2b(NRVUnit):
    """
    Decompress data using the NRV2B algorithm.
    """
    def _decompress(self, src: StructReader, dst: MemoryFile, bb: BitBufferedReader):
        # Each round of the outer loop copies a run of literal bytes and then replays one match
        # from the output written so far. Control values are obtained via next(bb) - presumably
        # single bits, confirm against BitBufferedReader - while whole bytes come from src.
        # The order of the next(bb) calls is significant and must not be changed.
        last_offset = 1
        while not src.eof:
            # Literal run: copy one byte per truthy control value.
            while next(bb):
                dst.write_byte(src.read_byte())
            # Variable-length offset prefix: start at 2 plus one control value, then keep
            # doubling and adding a control value until a truthy terminator is read.
            offset = 2 + next(bb)
            while not next(bb):
                offset = 2 * offset + next(bb)
            if offset == 2:
                # The smallest prefix means that the previous match offset is reused.
                offset = last_offset
            else:
                # Otherwise the prefix supplies the high part and the next source byte the
                # low eight bits of the offset value.
                offset = (offset - 3) * 0x100 + src.read_byte()
                if offset & 0xFFFFFFFF == 0xFFFFFFFF:
                    # An all-ones 32-bit value terminates decompression.
                    break
                offset += 1
                last_offset = offset
            # The match length starts out as a two-bit value.
            length = next(bb)
            length = 2 * length + next(bb)
            if length == 0:
                # Zero selects the long form, encoded like the offset prefix above.
                length = 2 + next(bb)
                while not next(bb):
                    length = 2 * length + next(bb)
                length += 2
            # Matches with an offset above 0xD00 are one byte longer.
            length += int(bool(offset > 0xD00))
            dst.replay(offset, length + 1)

class nrv2d (bits=32)

This unit is implemented in refinery.units.compression.nrv and has the following commandline Interface:

usage: nrv2d [-h] [-L] [-Q] [-0] [-v] [N]

Decompress data using the NRV2D algorithm.

positional arguments:
  N              Specify the number of codec bits. The default is 32.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class nrv2d(NRVUnit):
    """
    Decompress data using the NRV2D algorithm.
    """
    def _decompress(self, src: StructReader, dst: MemoryFile, bb: BitBufferedReader):
        # Each round of the outer loop copies a run of literal bytes and then replays one match
        # from the output written so far. Control values are obtained via next(bb) - presumably
        # single bits, confirm against BitBufferedReader - while whole bytes come from src.
        # The order of the next(bb) calls is significant and must not be changed.
        last_offset = 1
        while not src.eof:
            # Literal run: copy one byte per truthy control value.
            while next(bb):
                dst.write_byte(src.read_byte())
            # Variable-length offset prefix; every continuation step consumes two control
            # values in addition to the loop condition.
            offset = 2 + next(bb)
            while not next(bb):
                offset = 2 * (offset - 1) + next(bb) # noqa
                offset = 2 *  offset      + next(bb) # noqa
            if offset == 2:
                # The smallest prefix means that the previous match offset is reused; the
                # first length bit is then read from the control stream.
                offset = last_offset
                length = next(bb)
            else:
                # Otherwise the prefix supplies the high part and the next source byte the
                # low eight bits of a combined value.
                offset = (offset - 3) * 0x100 + src.read_byte()
                if offset & 0xFFFFFFFF == 0xFFFFFFFF:
                    # An all-ones 32-bit value terminates decompression.
                    break
                # The inverted lowest bit of the combined value is the first length bit; the
                # remaining bits plus one are the match offset.
                length = (offset  ^ 1) & 1 # noqa
                offset = (offset >> 1) + 1
                last_offset = offset
            # A second length bit is always read from the control stream.
            length = 2 * length + next(bb)
            if length == 0:
                # Zero selects the long form: doubling steps until a truthy terminator.
                length = 2 + next(bb)
                while not next(bb):
                    length = 2 * length + next(bb)
                length += 2
            # Matches with an offset above 0x500 are one byte longer.
            length += int(bool(offset > 0x500))
            dst.replay(offset, length + 1)

class nrv2e (bits=32)

This unit is implemented in refinery.units.compression.nrv and has the following commandline Interface:

usage: nrv2e [-h] [-L] [-Q] [-0] [-v] [N]

Decompress data using the NRV2E algorithm.

positional arguments:
  N              Specify the number of codec bits. The default is 32.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class nrv2e(NRVUnit):
    """
    Decompress data using the NRV2E algorithm.
    """
    def _decompress(self, src: StructReader, dst: MemoryFile, bb: BitBufferedReader):
        # Each round of the outer loop copies a run of literal bytes and then replays one match
        # from the output written so far. Control values are obtained via next(bb) - presumably
        # single bits, confirm against BitBufferedReader - while whole bytes come from src.
        # The order of the next(bb) calls is significant and must not be changed.
        last_offset = 1
        while not src.eof:
            # Literal run: copy one byte per truthy control value.
            while next(bb):
                dst.write_byte(src.read_byte())
            # Variable-length offset prefix; every continuation step consumes two control
            # values in addition to the loop condition.
            offset = 2 + next(bb)
            while not next(bb):
                offset = 2 * (offset - 1) + next(bb) # noqa
                offset = 2 *  offset      + next(bb) # noqa
            if offset == 2:
                # The smallest prefix means that the previous match offset is reused; the
                # length selector is then read from the control stream.
                offset = last_offset
                length = next(bb)
            else:
                # Otherwise the prefix supplies the high part and the next source byte the
                # low eight bits of a combined value.
                offset = (offset - 3) * 0x100 + src.read_byte()
                if offset & 0xFFFFFFFF == 0xFFFFFFFF:
                    # An all-ones 32-bit value terminates decompression.
                    break
                # The inverted lowest bit of the combined value is the length selector; the
                # remaining bits plus one are the match offset.
                length = (offset ^  1) & 1 # noqa
                offset = (offset >> 1) + 1
                last_offset = offset
            if length:
                # Selector set: the length is 1 or 2, decided by one control value.
                length = 1 + next(bb)
            elif next(bb):
                # Selector clear, next control value truthy: the length is 3 or 4.
                length = 3 + next(bb)
            else:
                # Long form: doubling steps until a truthy terminator, plus a bias of 3.
                length = 2 + next(bb)
                while not next(bb):
                    length = 2 * length + next(bb)
                length += 3
            # Matches with an offset above 0x500 are one byte longer.
            length += int(bool(offset > 0x500))
            dst.replay(offset, length + 1)

class ntlm (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.password_hashes and has the following commandline Interface:

usage: ntlm [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the Windows NTLM hash of the input.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class ntlm(HashUnit):
    """
    Returns the Windows NTLM hash of the input.
    """
    def _algorithm(self, data) -> bytes:
        # imported lazily, only when a hash is actually computed
        from Cryptodome.Hash import MD4
        # The input is decoded with the unit codec and hashed as little-endian UTF-16.
        password = codecs.decode(data, self.codec)
        encoded = password.encode('utf-16le')
        return MD4.new(encoded).digest()

class officecrypt (password=b'VelvetSweatshop')

This unit is implemented in refinery.units.formats.office.officecrypt and has the following commandline Interface:

usage: officecrypt [-h] [-L] [-Q] [-0] [-v] [password]

A simple proxy for the msoffcrypto package to decrypt office documents.

positional arguments:
  password       The document password. By default, the Excel default password "VelvetSweatshop"
                 is used.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class officecrypt(Unit):
    """
    A simple proxy for the `msoffcrypto` package to decrypt office documents.
    """

    def __init__(self, password: Param[buf, Arg.Binary(help=(
        'The document password. By default, the Excel default password "{default}" is used.'
    ))] = b'VelvetSweatshop'):
        super().__init__(password=password)

    @Unit.Requires('msoffcrypto-tool', ['formats', 'office'])
    def _msoffcrypto():
        import msoffcrypto
        return msoffcrypto

    def process(self, data):
        password: bytes = self.args.password
        with MemoryFile(data) as stream:
            document = self._msoffcrypto.OfficeFile(stream)
            if document.is_encrypted():
                # A key is only loaded when a non-empty password was given.
                if password:
                    document.load_key(password=password.decode(self.codec))
                with MemoryFile(bytearray()) as output:
                    document.decrypt(output)
                    return output.getvalue()
            # Unencrypted documents are passed through unchanged.
            self.log_warn('the document is not encrypted; returning input')
            return data

class opc (mode='x32', *, count=None, until=None, nvar='name', avar='addr', ovar='arg')

This unit is implemented in refinery.units.formats.exe.opc and has the following commandline Interface:

usage: opc [-h] [-L] [-Q] [-0] [-v] [-c N] [-u STR] [-n STR] [-a STR] [-o STR] [[x32|x64|..]]

Disassembles the input data using capstone and generates opcodes with metadata as output. This is
useful for programmatic disassembly, while the asm unit outputs a human-readable representation.
Internally, asm uses this unit and pretty-prints the output.

positional arguments:
  [x32|x64|..]     Machine code architecture, default is x32. Select from the following list:
                   x16, x32, x64, ppc32, ppc64, mips32, mips64.

options:
  -c, --count N    Maximum number of bytes to disassemble, infinite by default.
  -u, --until STR  Disassemble until the given string appears among the disassembly.
  -n, --nvar STR   Variable to receive the disassembled mnemonic. Default is "name".
  -a, --avar STR   Variable to receive the address of the instruction. Default is "addr".
  -o, --ovar STR   Variable prefix for instruction operands. Default is "arg". The complete
                   operand string will be in args, the first argument in arg1, the second in
                   arg2, and so on.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class opc(Unit):
    """
    Disassembles the input data using capstone and generates opcodes with metadata as output. This
    is useful for programmatic disassembly, while the `refinery.asm` unit outputs a human-readable
    representation. Internally, `refinery.asm` uses this unit and pretty-prints the output.
    """
    def __init__(
        self,
        mode: Param[str, Arg.Choice(
            help='Machine code architecture, default is {default}. Select from the following list: {choices}.',
            choices=_ARCHES, metavar='[x32|x64|..]')] = 'x32', *,
        count: Param[int, Arg.Number('-c', help='Maximum number of bytes to disassemble, infinite by default.')] = None,
        until: Param[str, Arg.String('-u', help='Disassemble until the given string appears among the disassembly.')] = None,
        nvar: Param[str, Arg.String('-n', help=(
            'Variable to receive the disassembled mnemonic. Default is "{default}".'))] = 'name',
        avar: Param[str, Arg.String('-a', help=(
            'Variable to receive the address of the instruction. Default is "{default}".'))] = 'addr',
        ovar: Param[str, Arg.String('-o', help=(
            'Variable prefix for instruction operands. Default is "{default}". The complete operand '
            'string will be in {default}s, the first argument in {default}1, the second in {default}2, '
            'and so on.'))] = 'arg',
        **more
    ):
        super().__init__(mode=mode, count=count, until=until, nvar=nvar, avar=avar, ovar=ovar, **more)

    @property
    def _capstone_engine(self) -> Cs:
        # Map the lowercased mode name to the capstone architecture and mode constants.
        mode = self.args.mode.lower()
        engines = {
            'arm'    : (cs.CS_ARCH_ARM, cs.CS_MODE_ARM),
            'mips32' : (cs.CS_ARCH_MIPS, cs.CS_MODE_MIPS32),
            'mips64' : (cs.CS_ARCH_MIPS, cs.CS_MODE_MIPS64),
            'ppc32'  : (cs.CS_ARCH_PPC, cs.CS_MODE_32),
            'ppc64'  : (cs.CS_ARCH_PPC, cs.CS_MODE_64),
            'x16'    : (cs.CS_ARCH_X86, cs.CS_MODE_16),
            'x32'    : (cs.CS_ARCH_X86, cs.CS_MODE_32),
            'x64'    : (cs.CS_ARCH_X86, cs.CS_MODE_64),
        }
        if mode not in engines:
            raise AttributeError(F'invalid mode: {mode}')
        return cs.Cs(*engines[mode])

    def process(self, data):
        args = self.args
        limit = args.count or 0
        needle = args.until
        name_key, addr_key, operand_key = args.nvar, args.avar, args.ovar
        if isinstance(needle, str):
            # the stop condition is matched case-insensitively
            needle = needle.lower()
        for insn in self._capstone_engine.disasm(data, 0, limit):
            labels = {addr_key: insn.address, name_key: insn.mnemonic}
            ops: str = insn.op_str
            try:
                operands = [part.strip() for part in ops.split(',')]
            except Exception:
                operands = []
            else:
                labels[F'{operand_key}s'] = ops
            for position, operand in enumerate(operands, 1):
                if not operand:
                    # an empty operand ends the numbered operand list
                    break
                try:
                    # operands that parse as integer literals are stored as numbers
                    value = int(operand, 0)
                except Exception:
                    value = operand
                labels[F'{operand_key}{position}'] = value
            yield self.labelled(insn.bytes, **labels)
            if needle is not None and (needle in ops.lower() or needle in insn.mnemonic.lower()):
                break

class p1

This unit is implemented in refinery.units.meta.pick and has the following commandline Interface:

usage: p1 [-h] [-L] [-Q] [-0] [-v]

A shortcut for pick with the argument 0:1.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class p1(pick):
    """
    A shortcut for `refinery.pick` with the argument `0:1`.
    """
    def __init__(self):
        # The constructor takes no arguments; the fixed slice 0:1 is forwarded to the parent.
        super().__init__(slice(0, 1))

class p2

This unit is implemented in refinery.units.meta.pick and has the following commandline Interface:

usage: p2 [-h] [-L] [-Q] [-0] [-v]

A shortcut for pick with the argument 0:2.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class p2(pick):
    """
    A shortcut for `refinery.pick` with the argument `0:2`.
    """
    def __init__(self):
        # The constructor takes no arguments; the fixed slice 0:2 is forwarded to the parent.
        super().__init__(slice(0, 2))

class p3

This unit is implemented in refinery.units.meta.pick and has the following commandline Interface:

usage: p3 [-h] [-L] [-Q] [-0] [-v]

A shortcut for pick with the argument 0:3.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class p3(pick):
    """
    A shortcut for `refinery.pick` with the argument `0:3`.
    """
    def __init__(self):
        # The constructor takes no arguments; the fixed slice 0:3 is forwarded to the parent.
        super().__init__(slice(0, 3))

class pack (base=0, prefix=False, strict=False, width=0, single_floats=False, double_floats=False, bigendian=False, blocksize=1)

This unit is implemented in refinery.units.blockwise.pack and has the following commandline Interface:

usage: pack [-h] [-L] [-Q] [-0] [-v] [-R] [-r] [-s] [-w N] [-f] [-d] [-E] [-B N] [N]

Scans the input data for numeric constants and packs them into a binary format. This is useful to
convert the textual representation of an array of numbers into its binary form. For example,
123,34,256,12,1,234 would be transformed into the byte sequence 7B22000C01EA, where 256 was
wrapped and packed as a null byte because the default block size is one byte. If the above
sequence would be packed with options -EB2, the result is 007B00220100000C000100EA.

positional arguments:
  N                    Find only numbers in given base. Default of 0 means that common
                       expressions for hexadecimal, octal and binary are accepted.

options:
  -r, --prefix         Add numeric prefixes like 0x, 0b, and 0o in reverse mode.
  -s, --strict         Only parse integers that fit in one block of the given block size.
  -w, --width N        Pad numbers with the specified amount of leading zeros.
  -f, --single-floats  Pack single-precision floating-point numbers. Implies -B4.
  -d, --double-floats  Pack double-precision floating-point numbers. Implies -B8.
  -E, --bigendian      Read chunks in big endian.
  -B, --blocksize N    The size of each block in bytes. The default is 1.

generic options:
  -h, --help           Show this help message and exit.
  -L, --lenient        Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet          Disables all log output.
  -0, --devnull        Do not produce any output.
  -v, --verbose        Specify up to two times to increase log level.
  -R, --reverse        Use the reverse operation.

Expand source code Browse git

class pack(BlockTransformationBase):
    """
    Scans the input data for numeric constants and packs them into a binary format. This is useful
    to convert the textual representation of an array of numbers into its binary form. For example,
    `123,34,256,12,1,234` would be transformed into the byte sequence `7B22000C01EA`, where `256`
    was wrapped and packed as a null byte because the default block size is one byte. If the above
    sequence would be packed with options -EB2, the result is `007B00220100000C000100EA`.
    """

    def __init__(self,
        base: Param[int, Arg.Number(bound=(2, 36), help=(
            'Find only numbers in given base. Default of 0 means that common expressions for '
            'hexadecimal, octal and binary are accepted.'))] = 0,
        prefix: Param[bool, Arg.Switch('-r', group='FLT',
            help='Add numeric prefixes like 0x, 0b, and 0o in reverse mode.')] = False,
        strict: Param[bool, Arg.Switch('-s',
            help='Only parse integers that fit in one block of the given block size.')] = False,
        width: Param[int, Arg.Number('-w',
            help='Pad numbers with the specified amount of leading zeros.')] = 0,
        single_floats: Param[bool, Arg.Switch('-f', group='FLT',
            help='Pack single-precision floating-point numbers. Implies -B4.')] = False,
        double_floats: Param[bool, Arg.Switch('-d', group='FLT',
            help='Pack double-precision floating-point numbers. Implies -B8.')] = False,
        bigendian=False, blocksize=1
    ):
        # The two float switches force different block sizes and cannot be combined; either
        # one overrides any block size passed by the caller.
        if single_floats and double_floats:
            raise ValueError('The floats and doubles option are mutually exclusive.')
        elif single_floats:
            fmode = FMode.SINGLE
            blocksize = 4
        elif double_floats:
            fmode = FMode.DOUBLE
            blocksize = 8
        else:
            fmode = FMode.TO_INT
        super().__init__(
            base=base,
            prefix=prefix,
            strict=strict,
            width=width,
            bigendian=bigendian,
            blocksize=blocksize,
            fmode=fmode,
            _truncate=2,
        )

    @property
    def bytestream(self):
        # never allow bytes to be left unchunked
        return False

    def reverse(self, data):
        # Unpacking: turn binary blocks back into their textual number representation.
        base = self.args.base or 10
        width = self.args.width
        mode: FMode = self.args.fmode
        prefix = B''

        self.log_debug(F'using base {base:d}')

        if self.args.prefix:
            # Only binary, octal, and hexadecimal have a conventional literal prefix.
            prefix = {
                0x02: b'0b',
                0x08: b'0o',
                0x10: b'0x'
            }.get(base, B'')

        if mode is FMode.TO_INT:
            converter = BaseUnit(
                base,
                little_endian=not self.args.bigendian,
                strip_padding=True,
            )
            for n in self.chunk_into_bytes(data):
                converted = converter.reverse(n)
                if width:
                    converted = converted.rjust(width, B'0')
                if prefix:
                    converted = prefix + converted
                yield converted
            return

        elif mode is FMode.SINGLE:
            float_format = 'f'
            float_size = 4

        elif mode is FMode.DOUBLE:
            float_format = 'd'
            float_size = 8

        # Floating point modes: unpack all complete values with a single struct format.
        count, rest = divmod(len(data), float_size)
        if rest:
            self.log_warn(F'data contained {rest} trailing bytes that were ignored')
            data = memoryview(data)[:-rest]
        float_format *= count
        if self.args.bigendian:
            float_format = F'>{float_format}'
        else:
            float_format = F'<{float_format}'
        for n in struct.unpack(float_format, data):
            yield str(n).encode(self.codec)

    def process(self, data):
        # Packing: find all number literals in the input and emit them as binary blocks.
        base: int = self.args.base
        strict: bool = self.args.strict
        mode: FMode = self.args.fmode
        ep = '>' if self.args.bigendian else '<'

        def evaluate_literals(literals: Iterable[bytes]):
            for literal in literals:
                if mode is FMode.TO_INT:
                    if base == 0 and literal[0] == 0x30 and literal[1:].isdigit():
                        # With base 0, int() rejects decimal literals with leading zeros;
                        # such literals are parsed as octal instead.
                        literal = B'0o%s' % literal
                    N = int(literal, base)
                elif mode is FMode.SINGLE:
                    # Reinterpret the bits of the packed float as an unsigned integer.
                    N, = struct.unpack(F'{ep}I', struct.pack(F'{ep}f', float(literal)))
                elif mode is FMode.DOUBLE:
                    N, = struct.unpack(F'{ep}Q', struct.pack(F'{ep}d', float(literal)))
                else:
                    raise TypeError('unexpected floating point mode')
                M = N & self.fmask
                if strict and M != N:
                    # In strict mode, values that do not fit into one block are skipped.
                    continue
                yield M

        if base == 0:
            pattern = formats.number
        elif base <= 10:
            pattern = re.compile(B'[-+]?[0-%d]{1,64}' % (base - 1))
        else:
            # The digit with value d >= 10 is the letter with code 0x57 + d, and the largest
            # digit of the base has value base - 1. The upper bound of the letter range is
            # therefore 0x56 + base; anything higher would match a letter that int() rejects.
            pattern = re.compile(B'[-+]?[0-9a-%c]{1,20}' % (0x56 + base), re.IGNORECASE)

        return self.unchunk(evaluate_literals(m[0] for m in pattern.finditer(data)))

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Convert binary input into textual number literals, one output per number.

    In integer mode, every chunk produced by `chunk_into_bytes` is converted with a `BaseUnit`
    for the selected base (10 when the base argument is zero), optionally padded with zeros to
    the requested width, and optionally prefixed for the bases 2, 8 and 16. In the floating point
    modes, the input is unpacked as a sequence of 4-byte or 8-byte floats in the selected byte
    order and each value is emitted as its string representation.
    """
    fmode: FMode = self.args.fmode
    radix = self.args.base or 10
    digits = self.args.width

    self.log_debug(F'using base {radix:d}')

    # only the bases 2, 8, and 16 have a literal prefix
    marker = B''
    if self.args.prefix:
        if radix == 0x02:
            marker = b'0b'
        elif radix == 0x08:
            marker = b'0o'
        elif radix == 0x10:
            marker = b'0x'

    if fmode is FMode.TO_INT:
        encoder = BaseUnit(
            radix,
            little_endian=not self.args.bigendian,
            strip_padding=True,
        )
        for block in self.chunk_into_bytes(data):
            text = encoder.reverse(block)
            if digits:
                text = text.rjust(digits, B'0')
            yield marker + text if marker else text
        return

    if fmode is FMode.SINGLE:
        code, size = 'f', 4
    elif fmode is FMode.DOUBLE:
        code, size = 'd', 8

    count, surplus = divmod(len(data), size)
    if surplus:
        # incomplete trailing floats cannot be unpacked and are dropped
        self.log_warn(F'data contained {surplus} trailing bytes that were ignored')
        data = memoryview(data)[:-surplus]
    order = '>' if self.args.bigendian else '<'
    for value in struct.unpack(order + code * count, data):
        yield str(value).encode(self.codec)

class pad (width, padding=b'\x00', left=False, absolute=False)

This unit is implemented in refinery.units.meta.pad and has the following commandline Interface:

usage: pad [-h] [-L] [-Q] [-0] [-v] [-l] [-a] N [padding]

Allows padding of the input data.

positional arguments:
  N               Input is padded to the nearest multiple of this size.
  padding         This custom binary sequence is used (repeatedly, if necessary) to pad the
                  input. The default is a zero byte.

options:
  -l, --left      Pad on the left instead of the right.
  -a, --absolute  The width argument specifies an absolute size, not a block size.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class pad(Unit):
    """
    Allows padding of the input data.
    """

    def __init__(
        self,
        width: Param[int, Arg.Number(help='Input is padded to the nearest multiple of this size.')],
        padding: Param[buf, Arg(help=(
            'This custom binary sequence is used (repeatedly, if necessary) to pad the '
            'input. The default is a zero byte.'))] = B'\0',
        left: Param[bool, Arg.Switch('-l', help='Pad on the left instead of the right.')] = False,
        absolute: Param[bool, Arg.Switch('-a', help=(
            'The width argument specifies an absolute size, not a block size.'))] = False
    ):
        super().__init__(width=width, padding=padding, left=left, absolute=absolute)

    def process(self, data):
        """
        Return the input extended by the padding sequence to the target size.

        Without the absolute option, the target size is the smallest multiple of the width that
        is not less than the input length. With the absolute option, the target size is the
        width itself and input that is already at least this long is returned unchanged. The
        padding sequence is repeated as often as required and cut off at the target size.

        Raises:
            ValueError: when padding is required but the padding sequence is empty.
        """
        width = self.args.width
        length = len(data)
        if self.args.absolute:
            if length >= width:
                return data
            # This includes empty input, which has to grow to the full absolute width.
            target = width
        else:
            blocks, rest = divmod(length, width)
            target = (blocks + bool(rest)) * width
        missing = target - length
        if missing <= 0:
            return data
        pad = self.args.padding
        if not pad:
            raise ValueError('The padding sequence must not be empty.')
        # Ceiling division: the repeated sequence must cover all missing bytes even when
        # their number is not a multiple of the padding length.
        repeats = -(-missing // len(pad))
        if repeats > 1:
            # Deliberately not an in-place multiplication, which would permanently grow a
            # mutable padding argument and affect all subsequent chunks.
            pad = pad * repeats
        if self.args.left:
            return pad[:missing] + data
        else:
            data += pad[:missing]
            return data

class pbkdf1 (size, salt=b'\x00\x00\x00\x00\x00\x00\x00\x00', iter=1000, hash='SHA1')

This unit is implemented in refinery.units.crypto.keyderive.pbkdf1 and has the following commandline Interface:

usage: pbkdf1 [-h] [-L] [-Q] [-0] [-v] N [salt] [iter] [hash]

PBKDF1 Key derivation

positional arguments:
  N              The number of bytes to generate.
  salt           Salt for the derivation; the default is 8 null bytes.
  iter           Number of iterations; default is 1000.
  hash           Specify one of these algorithms (default is SHA1): md2, md4, md5, sha1, sha256,
                 sha512, sha224, sha384

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class pbkdf1(KeyDerivation):
    """PBKDF1 Key derivation"""

    @Arg('salt', help='Salt for the derivation; default are 8 null bytes.')
    def __init__(self, size, salt=bytes(8), iter=1000, hash='SHA1'):
        # vars() forwards exactly the local names of this frame, so no further locals
        # may be introduced in this method.
        self.superinit(super(), **vars())

    def process(self, data):
        # The input is the password; multidecode applies the derivation to it.
        from Cryptodome.Protocol.KDF import PBKDF1

        def derive(password):
            return PBKDF1(
                password,
                self.args.salt,
                dkLen=self.args.size,
                count=self.args.iter,
                hashAlgo=self.hash,
            )

        return multidecode(data, derive)

class pbkdf2 (size, salt, iter=1000, hash='SHA1')

This unit is implemented in refinery.units.crypto.keyderive.pbkdf2 and has the following commandline Interface:

usage: pbkdf2 [-h] [-L] [-Q] [-0] [-v] N salt [iter] [hash]

PBKDF2 Key derivation. This is implemented as Rfc2898DeriveBytes in .NET binaries.

positional arguments:
  N              The number of bytes to generate.
  salt           Salt for the derivation.
  iter           Number of iterations; default is 1000.
  hash           Specify one of these algorithms (default is SHA1): md2, md4, md5, sha1, sha256,
                 sha512, sha224, sha384

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class pbkdf2(KeyDerivation):
    """
    PBKDF2 Key derivation. This is implemented as Rfc2898DeriveBytes in .NET
    binaries.
    """

    def __init__(self, size, salt, iter=1000, hash='SHA1'):
        # vars() forwards exactly the local names of this frame, so no further locals
        # may be introduced in this method.
        self.superinit(super(), **vars())

    def process(self, data):
        # The input is the password; all remaining PBKDF2 parameters are bound up front
        # and multidecode supplies the password as the only positional argument.
        from Cryptodome.Protocol.KDF import PBKDF2
        options = dict(
            salt=self.args.salt,
            dkLen=self.args.size,
            hmac_hash_module=self.hash,
            count=self.args.iter,
        )
        derive = partial(PBKDF2, **options)
        return multidecode(data, derive)

class pbuf (try_repeated=False, encode=None, digest=None, arrays=False)

This unit is implemented in refinery.units.formats.pbuf and has the following commandline Interface:

usage: pbuf [-h] [-L] [-Q] [-0] [-v] [-r] [-e U | -d U | -a]

Converts a ProtoBuf message to JSON. Deserialization is ambiguous without the definition file, so
the output is partly based on heuristics. Some fields like fixed integers are never recovered,
fixed 32-bit and 64-bit data types are always recovered as floating point numbers. For variable
length data, the unit first attempts to decode the data as a printable UTF-8 string. If this
fails, it will attempt to deserialize it as ProtoBuf. If this also fails and the corresponding
option is set, it will try to reconstruct a sequence of repeated variable-length integers. The
final fallback is to return the body as a byte string.

options:
  -r, --try-repeated  Try to detect and decode repeated integer fields.
  -e, --encode U      Select an encoder unit used to represent binary data in the JSON output.
                      This unit must be reversible and produce UTF8 encoded string output when
                      operated in reverse. Common examples are hex and b64.
  -d, --digest U      Select a hashing unit to digest all byte strings: Instead of the data, only
                      the hash will be displayed.
  -a, --arrays        Encode all byte strings as integer arrays. These arrays will have unsigned
                      integer entries between 0 and 255.

generic options:
  -h, --help          Show this help message and exit.
  -L, --lenient       Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet         Disables all log output.
  -0, --devnull       Do not produce any output.
  -v, --verbose       Specify up to two times to increase log level.

Expand source code Browse git

class pbuf(JSONEncoderUnit):
    """
    Converts a ProtoBuf message to JSON. Deserialization is ambiguous without the definition file,
    so the output is partly based on heuristics. Some fields like fixed integers are never recovered, fixed 32-bit
    and 64-bit data types are always recovered as floating point numbers. For variable length data,
    the unit first attempts to decode the data as a printable UTF-8 string. If this fails, it will
    attempt to deserialize it as ProtoBuf. If this also fails and the corresponding option is set,
    it will try to reconstruct a sequence of repeated variable-length integers. The final fallback
    is to return the body as a byte string.
    """
    def __init__(
        self,
        try_repeated: Param[bool, Arg.Switch('-r',
            help='Try to detect and decode repeated integer fields.')] = False,
        encode=None,
        digest=None,
        arrays=False,
    ):
        # encode, digest, and arrays are passed through unchanged to the JSON encoder base class
        super().__init__(
            encode=encode,
            digest=digest,
            arrays=arrays,
            try_repeated=try_repeated
        )

    def process(self, data):
        # Parse the input as a single message without copying it, then serialize the result.
        parser = ProtoBufReader(memoryview(data))
        parser.try_repeated = self.args.try_repeated
        return self.to_json(parser.read_message())

class pcap (merge=False, client=False, server=False)

This unit is implemented in refinery.units.formats.pcap and has the following commandline Interface:

usage: pcap [-h] [-L] [-Q] [-0] [-v] [-m] [-c | -s]

Performs TCP stream reassembly from packet capture (PCAP) files. By default, the unit emits the
parts of each TCP conversation, attaching several pieces of metadata to each such output:
Included are the source and destination socket address as well as the variable stream which
identifies the conversation which it was part of. The chunks are returned in the order that the
bytes were exchanged between source and destination. When the --merge parameter is specified, the
unit instead collects all bytes going forward and backwards, respectively, and emits these as
two chunks for each TCP conversation that took place.

options:
  -m, --merge    Merge both parts of each TCP conversation into one chunk.
  -c, --client   Show only the client part of each conversation.
  -s, --server   Show only the server part of each conversation.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class pcap(Unit):
    """
    Performs TCP stream reassembly from packet capture (PCAP) files. By default, the unit emits the parts of
    each TCP conversation, attaching several pieces of metadata to each such output: Included are the source
    and destination socket address as well as the variable `stream` which identifies the conversation which
    it was part of. The chunks are returned in the order that the bytes were exchanged between source and
    destination. When the `--merge` parameter is specified, the unit instead collects all bytes going forward
    and backwards, respectively, and emitting these as two chunks, for each TCP conversation that took place.
    """

    def __init__(
        self,
        merge: Param[bool, Arg.Switch('-m', help='Merge both parts of each TCP conversation into one chunk.')] = False,
        client: Param[bool, Arg.Switch('-c', group='D', help='Show only the client part of each conversation.')] = False,
        server: Param[bool, Arg.Switch('-s', group='D', help='Show only the server part of each conversation.')] = False,
    ):
        super().__init__(merge=merge, client=client, server=server)

    @Unit.Requires('pypcapkit[scapy]>=1.3', ['all'])
    def _pcapkit():
        # Imports pcapkit with logging suppressed. The scapy TLS session module is imported
        # first; presumably pcapkit's scapy engine depends on it being loaded - TODO confirm.
        with NoLogging():
            import importlib
            importlib.import_module('scapy.layers.tls.session')
            import pcapkit
            return pcapkit

    @Unit.Requires('scapy', ['all'])
    def _scapy():
        # The packet submodule is imported explicitly because process() accesses scapy.packet.
        import scapy
        import scapy.packet
        return scapy

    def process(self, data):
        """
        Reassemble the TCP datagrams in the given capture and emit their payloads. Without the
        merge option, every non-empty datagram is emitted as its own chunk; with it, the payloads
        are accumulated in one buffer per direction and emitted whenever the conversation changes
        and once more after the last datagram.
        """
        pcapkit = self._pcapkit
        merge = self.args.merge

        # pcapkit reads from a file path, so the input is exposed through a virtual file.
        with NoLogging(), VirtualFileSystem() as fs:
            vf = VirtualFile(fs, data, 'pcap')
            pcap = pcapkit.extract(
                fin=vf.path,
                engine='scapy',
                store=True,
                nofile=True,
                extension=False,
                ip=True,
                tcp=True,
                reassembly=True,
                reasm_strict=True,
            )
            tcp: list[Datagram] = list(pcap.reassembly.tcp)
            # Order the datagrams by the smallest frame number they contain; datagrams
            # without any frame numbers sort first.
            tcp.sort(key=lambda p: min(p.index, default=0))

        # count numbers the conversations seen so far; convo is the current conversation.
        count, convo = 0, None
        # In merge mode, these collect the payloads of both directions of the conversation.
        src_buffer = MemoryFile()
        dst_buffer = MemoryFile()

        self.log_debug(F'extracted {len(pcap.frame)} packets, assembled {len(tcp)} datagrams')
        PT = self._scapy.packet

        def payload(packet: Packet):
            """
            Descend through the payload chain of the packet and return the original bytes of the
            first raw layer that is neither an empty payload nor padding. The result is an empty
            byte string if no such layer exists.
            """
            # ids of the layers that were already visited, to stop on circular payload chains
            circle = set()
            while True:
                try:
                    inner = packet.payload
                except AttributeError:
                    break
                if isinstance(packet, PT.Raw) and not isinstance(packet, (PT.NoPayload, PT.Padding)):
                    return packet.original
                if id(inner) in circle:
                    break
                packet = inner
                circle.add(id(inner))
            return B''

        def sequence(i: int):
            """
            Return the `seq` attribute of the outermost layer that has one in the frame with
            number i, or zero if no layer has it. Used as the sort key for frames of a datagram.
            """
            # The frame numbers are offset by one against the frame list.
            packet = pcap.frame[i - 1]
            while len(packet):
                try:
                    return packet.seq
                except AttributeError:
                    pass
                try:
                    packet = packet.payload
                except AttributeError:
                    break
            return 0

        client = self.args.client
        server = self.args.server

        def commit():
            """
            Emit the contents of both direction buffers for the current conversation and reset
            them. The server option suppresses the source-to-destination chunk and the client
            option suppresses the destination-to-source chunk; the buffers are reset regardless.
            """
            if src_buffer.tell():
                if not server:
                    assert convo is not None
                    yield self.labelled(src_buffer.getvalue(), **convo.src_to_dst())
                src_buffer.truncate(0)
            if dst_buffer.tell():
                if not client:
                    assert convo is not None
                    yield self.labelled(dst_buffer.getvalue(), **convo.dst_to_src())
                dst_buffer.truncate(0)

        for datagram in tcp:
            self.log_info(datagram.header)

            this_convo = Conversation.FromID(datagram.id)
            if this_convo != convo:
                # A new conversation starts: flush what was merged for the previous one. This
                # has to happen before convo is updated because commit() labels with convo.
                if count and merge:
                    yield from commit()
                count = count + 1
                convo = this_convo
            assert convo is not None
            # Concatenate the payloads of all frames of this datagram, ordered by sequence().
            data = bytearray()
            for index in sorted(datagram.index, key=sequence):
                data.extend(payload(pcap.frame[index - 1]))
            if not data:
                continue
            if not merge:
                yield self.labelled(data, **this_convo.src_to_dst(), stream=count)
            elif this_convo.src == convo.src:
                src_buffer.write(data)
            elif this_convo.dst == convo.src:
                dst_buffer.write(data)
            else:
                raise RuntimeError(F'direction of packet {convo!s} in conversation {count} is unknown')

        # Emit whatever is left in the buffers for the final conversation.
        yield from commit()

class pcap_http

This unit is implemented in refinery.units.formats.pcap_http and has the following commandline Interface:

usage: pcap-http [-h] [-L] [-Q] [-0] [-v]

Extracts HTTP payloads from packet capture (PCAP) files.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class pcap_http(Unit):
    """
    Extracts HTTP payloads from packet capture (PCAP) files.
    """
    def process(self, data):
        """
        Pass the input through the `pcap` unit and attempt to parse every resulting stream as an
        HTTP response. Streams that fail to parse as a response are parsed as HTTP requests and
        remembered. Each non-empty response body is matched against the first remembered request
        whose source and destination are the mirror image of its own, and is then emitted with
        the URL of that request attached as the variable `url`. Responses without a matching
        request are retried after all streams were processed; those that remain unmatched are
        emitted without a URL.
        """
        http_parser = httpresponse()
        requests: list[_HTTP_Request] = []
        # responses that had no matching request when they were parsed, as (src, dst, body)
        responses: list[tuple] = []

        def lookup(src, dst, body):
            # Find the first pending request that travelled in the opposite direction; it is
            # consumed so that it cannot be matched to a second response.
            for k, request in enumerate(requests):
                if request.src == dst and request.dst == src:
                    requests.pop(k)
                    return self.labelled(body, url=request.url)
            return None

        for stream in data | pcap():
            try:
                body = http_parser.process(stream)
            except Exception:
                # not a response; it may be a request that a later response belongs to
                try:
                    rq = _parse_http_request(stream)
                    requests.append(rq)
                except _HTTPParseError as E:
                    self.log_info(F'error parsing http request: {E!s}')
                except Exception as E:
                    # best effort: the stream is skipped, but the reason is not lost entirely
                    self.log_debug(F'unexpected error parsing http request: {E!s}')
                continue
            if not body:
                continue
            src, dst = stream['src'], stream['dst']
            item = lookup(src, dst, body)
            if item is None:
                responses.append((src, dst, body))
                continue
            yield item

        while responses:
            src, dst, body = responses.pop()
            item = lookup(src, dst, body)
            yield body if item is None else item

class pdfcrypt (owner='', user='')

This unit is implemented in refinery.units.formats.pdfcrypt and has the following commandline Interface:

usage: pdfcrypt [-h] [-L] [-Q] [-0] [-v] [-R] [-F] [-w PWD] [-u PWD]

The unit removes password protection from a PDF document. If the document is encrypted, either
the correct user or owner password must be specified to decrypt it. When the unit is operated in
reverse, the output is encrypted using the AES-256 mode.

options:
  -w, --owner PWD  Optionally specify an owner password.
  -u, --user PWD   Optionally specify a user password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class pdfcrypt(Unit):
    """
    The unit removes password protection from a PDF document. If the document is encrypted, either
    the correct user or owner password must be specified to decrypt it. When the unit is operated
    in reverse, the output is encrypted using the AES-256 mode.
    """

    @Unit.Requires('pymupdf', ['formats', 'default', 'extended'])
    def _mupdf():
        import os
        # Both settings are pointed at the null device before pymupdf is imported.
        sink = F'path:{os.devnull}'
        for setting in ('PYMUPDF_MESSAGE', 'PYMUPDF_LOG'):
            os.environ[setting] = sink
        import pymupdf
        import pymupdf.mupdf
        return pymupdf

    def __init__(
        self,
        owner: Param[str, Arg.String('-w', metavar='PWD', help='Optionally specify an owner password.')] = '',
        user: Param[str, Arg.String('-u', metavar='PWD', help='Optionally specify a user password.')] = '',
    ):
        super().__init__(user=user, owner=owner)

    def _ingest(self, data):
        """
        Open the input as a PDF document and authenticate with the user password first and the
        owner password second, as long as the document is still encrypted. A ValueError is
        raised if the document remains encrypted afterwards.
        """
        pdf = self._mupdf.open(stream=data, filetype='pdf')
        attempts = 0
        for password in (self.args.user, self.args.owner):
            if pdf.is_encrypted and password:
                attempts += 1
                pdf.authenticate(password)
        if not pdf.is_encrypted:
            return pdf
        # The explanation depends on how many passwords were tried without success.
        reasons = (
            'no password was specified',
            'the given password was incorrect',
            'neither of the given passwords worked',
        )
        raise ValueError(F'The input data is encrypted and {reasons[attempts]}.')

    def process(self, data):
        # Write the opened document back out without any encryption.
        with self._ingest(data) as pdf, MemoryFile() as out:
            pdf.save(out, encryption=self._mupdf.mupdf.PDF_ENCRYPT_NONE)
            return out.getvalue()

    def reverse(self, data):
        # Encrypting requires at least one of the two passwords.
        user_password = self.args.user
        owner_password = self.args.owner
        if not (user_password or owner_password):
            raise ValueError('Cannot encrypt document without a password.')
        with self._ingest(data) as pdf, MemoryFile() as out:
            pdf.save(
                out,
                encryption=self._mupdf.mupdf.PDF_ENCRYPT_AES_256,
                user_pw=user_password,
                owner_pw=owner_password,
            )
            return out.getvalue()

    @classmethod
    def handles(cls, data):
        # PDF documents are recognized by their five-byte header.
        return data[:5] == B'%PDF-'

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Encrypt the given PDF document in AES-256 mode using the configured user and owner
    passwords. A ValueError is raised when neither password was specified.
    """
    user_password = self.args.user
    owner_password = self.args.owner
    if not (user_password or owner_password):
        raise ValueError('Cannot encrypt document without a password.')
    with self._ingest(data) as pdf, MemoryFile() as out:
        pdf.save(
            out,
            encryption=self._mupdf.mupdf.PDF_ENCRYPT_AES_256,
            user_pw=user_password,
            owner_pw=owner_password,
        )
        return out.getvalue()

class pecdb

This unit is implemented in refinery.units.formats.pe.pecdb and has the following commandline Interface:

usage: pecdb [-h] [-L] [-Q] [-0] [-v]

Short for "PE: Clear Dynamic Base"; this unit will clear the bit in the PE header that allows for
address space layout randomization. It will also set the integrity flag. With both bits set, this
DLL when loaded into memory will usually be loaded at its header-defined base address, which can
make debugging easier.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class pecdb(Unit):
    """
    Short for "PE: Clear Dynamic Base"; this unit will clear the bit in the PE header that allows
    for address space layout randomization. It will also set the integrity flag. With both bits
    set, this DLL when loaded into memory will usually be loaded at its header-defined base address,
    which can make debugging easier.
    """
    @Unit.Requires('pefile', ['default', 'extended'])
    def _pefile():
        import pefile
        return pefile

    def process(self, data: bytearray):
        """
        Parse the input as a PE file, update the DllCharacteristics field of its optional header,
        and return the rewritten file. All flags other than the two modified here are preserved.
        """
        pe = self._pefile.PE(data=data, fast_load=True)
        dc = pe.OPTIONAL_HEADER.DllCharacteristics
        dc = dc & ~0x40  # clear IMAGE_DLLCHARACTERISTICS_DYNAMIC_BASE
        # The flag has to be combined with a bitwise OR; masking with AND would wipe every
        # other characteristic and never set the flag if it was not present already.
        dc = dc | 0x80  # set IMAGE_DLLCHARACTERISTICS_FORCE_INTEGRITY
        pe.OPTIONAL_HEADER.DllCharacteristics = dc
        return pe.write()

class pedebloat (*names, certificate=False, directories=False, memdump=False, resources=False, sections=False, trim_code=False, trim_rsrc=False, threshold=0.05, size_limit=10.0 MB, keep_limit=False, aggressive=False)

This unit is implemented in refinery.units.formats.pe.pedebloat and has the following commandline Interface:

usage: pedebloat [-h] [-L] [-Q] [-0] [-v] [-c] [-d] [-m] [-r] [-s] [-X] [-Y] [-t T] [-l N] [-k]
                 [-a]
                 [names ...]

Removes junk or excess data from PE files and returns the stripped executable. By default, only
the PE overlay is considered; use the flags -r and -s to also consider resources and entire
sections. Any buffer is only considered for removal if it exceeds a certain size. If this
condition is met, a binary search is performed to determine the offset inside the buffer up to
which the compression ratio is above a certain threshold; everything beyond that point is then
removed. By setting the threshold compression ratio to 1, each large buffer is removed entirely.

positional arguments:
  names

options:
  -c, --certificate   Include digital signatures for the size computation.
  -d, --directories   Include data directories for size computation.
  -m, --memdump       Assume that the file data was a memory-mapped PE file.
  -r, --resources     Strip large resources.
  -s, --sections      Strip large sections.
  -X, --trim-code     Lift the exception on code sections for stripping.
  -Y, --trim-rsrc     Lift the exception on rsrc sections for stripping.
  -t, --threshold T   Trailing data from resources and sections is stripped until the compression
                      ratio of the remaining data rises above this threshold. The default value
                      is 0.05. Set this to 1 to ignore the limit entirely and trim every
                      structure as much as possible without violating alignment. Setting this
                      value to 0 will only strip repeated occurrences of the last byte.
  -l, --size-limit N  Structures below this size are not stripped. Default is 10.0 MB.
  -k, --keep-limit    Do not strip structures to below the above size limit.
  -a, --aggressive    Equivalent to -srt1: Strip large sections and resources aggressively.

generic options:
  -h, --help          Show this help message and exit.
  -L, --lenient       Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet         Disables all log output.
  -0, --devnull       Do not produce any output.
  -v, --verbose       Specify up to two times to increase log level.

Expand source code Browse git

class pedebloat(OverlayUnit):
    """
    Removes junk or excess data from PE files and returns the stripped executable. By default, only
    the PE overlay is considered; use the flags `-r` and `-s` to also consider resources and entire
    sections. Any buffer is only considered for removal if it exceeds a certain size. If this
    condition is met, a binary search is performed to determine the offset inside the buffer up to
    which the compression ratio is above a certain threshold; everything beyond that point is then
    removed. By setting the threshold compression ratio to 1, each large buffer is removed entirely.
    """
    def __init__(
        self,
        # Glob patterns; sections and resources whose name matches one of them (via fnmatch)
        # are considered for stripping even when they are below the size limit.
        *names: Param[str, Arg.String()],
        # The next three options carry no Param annotation here and are forwarded positionally
        # to the parent initializer; presumably they are declared by OverlayUnit (confirm).
        certificate=False,
        directories=False,
        memdump=False,
        resources: Param[bool, Arg.Switch('-r', help='Strip large resources.')] = False,
        sections: Param[bool, Arg.Switch('-s', help='Strip large sections.')] = False,
        trim_code: Param[bool, Arg.Switch('-X', help='Lift the exception on code sections for stripping.')] = False,
        trim_rsrc: Param[bool, Arg.Switch('-Y', help='Lift the exception on rsrc sections for stripping.')] = False,
        threshold: Param[float, Arg.Double('-t', metavar='T', help=(
            'Trailing data from resources and sections is stripped until the compression ratio '
            'of the remaining data rises above this threshold. The default value is {default}. '
            'Set this to 1 to ignore the limit entirely and trim every structure as much as '
            'possible without violating alignment. Setting this value to 0 will only strip repeated '
            'occurrences of the last byte.'))] = 0.05,
        size_limit: Param[int, Arg.Number('-l', help=(
            'Structures below this size are not stripped. Default is {default!r}.'))] = _STRIP,
        keep_limit: Param[bool, Arg.Switch('-k', help=(
            'Do not strip structures to below the above size limit.'))] = False,
        aggressive: Param[bool, Arg.Switch('-a', help=(
            'Equivalent to -srt1: Strip large sections and resources aggressively.'))] = False,
    ):
        # The aggressive switch is a shorthand: it overrides the individual settings for
        # sections, resources and threshold regardless of what was passed for them.
        if aggressive:
            sections = True
            resources = True
            threshold = 1

        # All remaining options are handed to the parent initializer by keyword; the methods
        # of this class read them back through self.args.
        super().__init__(
            certificate,
            directories,
            memdump,
            sections=sections,
            resources=resources,
            size_limit=size_limit,
            keep_limit=keep_limit,
            threshold=threshold,
            trim_rsrc=trim_rsrc,
            trim_code=trim_code,
            names=names,
        )

    @OverlayUnit.Requires('pefile', ['default', 'extended'])
    def _pefile():
        # The import is deferred to the body of this function so that pefile is only loaded
        # when it is actually needed. Other methods access the module as the attribute
        # self._pefile; presumably the Requires decorator turns this function into such an
        # attribute and reports a missing dependency (confirm against its implementation).
        import pefile
        return pefile

    def _right_strip_data(self, data: memoryview, alignment=1, block_size=_MB) -> int:
        """
        Compute how many leading bytes of `data` should be kept when junk is stripped from its
        end. Nothing is modified; the caller removes the bytes beyond the returned length.

        The result depends on the configured threshold:

        - strictly between 0 and 1: a binary search over prefix lengths is performed, based on
          the zlib compression ratio of each prefix.
        - equal to 0: only the trailing run of identical bytes is cut off.
        - equal to 1: as little as the alignment permits is kept.

        The returned length is congruent to `len(data)` modulo `alignment`, so the number of
        removed bytes is always a multiple of `alignment`. The `block_size` is the interval
        width at which the binary search stops refining.
        """
        if not data:
            return 0
        threshold = self.args.threshold
        # The part of the buffer that extends beyond the last multiple of the alignment. This
        # is also the smallest length that can be returned.
        data_overhang = len(data) % alignment
        result = data_overhang

        if 0 < threshold < 1:
            def compression_ratio(offset: int):
                # Size of the zlib-compressed prefix (fastest level) relative to its raw size.
                ratio = len(zlib.compress(data[:offset], level=1)) / offset
                self.log_debug(F'compressing {SizeInt(offset)!r} ratio={ratio:6.4f}')
                return ratio
            upper = len(data)
            lower = result
            # The search only runs when the buffer as a whole compresses to at most the
            # threshold; otherwise upper remains at len(data) and this step removes nothing.
            if compression_ratio(upper) <= threshold:
                while block_size < upper - lower:
                    pivot = (lower + upper) // 2
                    ratio = compression_ratio(pivot)
                    if ratio > threshold:
                        # The prefix is still above the threshold: the cut is further right.
                        lower = pivot + 1
                        continue
                    upper = pivot
                    # Stop early when the ratio matches the threshold almost exactly.
                    if abs(ratio - threshold) < 1e-10:
                        break
            result = upper
        elif threshold == 0:
            result = len(data)
        elif threshold == 1:
            result = 0

        # Strip the run of identical bytes at the end of what is kept; one copy of the
        # repeated byte remains.
        while result > 1 and data[result - 2] == data[result - 1]:
            result -= 1

        result = max(result, data_overhang)

        if self.args.keep_limit:
            # Never strip a structure to below the configured size limit.
            result = max(result, self.args.size_limit)

        # Round up to the next length that is congruent to len(data) modulo the alignment.
        result = result + (data_overhang - result) % alignment

        if result > len(data):
            # The size limit or the rounding pushed the result beyond the buffer; step back by
            # a multiple of the alignment.
            excess = result - len(data)
            excess = excess + (-excess % alignment)
            result = result - excess

        return result

    def _adjust_offsets(self, pe: PE, gap_offset: int, gap_size: int):
        """
        Update the parsed structures of `pe` to reflect that `gap_size` bytes are removed from
        the file, starting at file offset `gap_offset`. The file data itself is not touched;
        callers delete the bytes after this method returns.

        The following adjustments are made:

        - Structures that start inside the removed region are dropped from the PE object.
        - Structures that start behind the removed region are moved back by `gap_size`.
        - RVA and VA fields that point into the affected section behind the gap, as well as
          raw file pointers behind the gap, are reduced by `gap_size`. Structures with a field
          that would end up pointing into the removed region are dropped.
        - The raw size of the section containing the gap is reduced by `gap_size`.

        Raises a RuntimeError when the new section size violates the file alignment, when the
        gap starts beyond the new end of the section, or when a section structure would be
        moved to an unaligned offset.
        """
        base = pe.OPTIONAL_HEADER.ImageBase
        alignment = pe.OPTIONAL_HEADER.FileAlignment
        rva_offset = pe.get_rva_from_offset(gap_offset)
        tva_offset = rva_offset + base

        section = pe.get_section_by_offset(gap_offset)
        new_section_size = section.SizeOfRawData - gap_size
        if new_section_size % alignment != 0:
            raise RuntimeError(
                F'trimming 0x{gap_size:X} bytes from section {_ASCII(section.Name)} of size 0x{section.SizeOfRawData:X} '
                F'violates required section alignment of 0x{alignment:X} bytes')
        inside_section_offset = gap_offset - section.PointerToRawData
        if inside_section_offset > new_section_size:
            overlap = inside_section_offset - new_section_size
            raise RuntimeError(F'trimming from section {_ASCII(section.Name)}; data extends {overlap} beyond section')

        # Only addresses that fall into the virtual range of the affected section are adjusted.
        rva_lbound = section.VirtualAddress
        rva_ubound = section.VirtualAddress + section.Misc_VirtualSize - 1
        tva_lbound = rva_lbound + base
        tva_ubound = rva_ubound + base

        def adjust_attributes_of_structure(
            structure: Structure,
            gap_offset: int,
            valid_values_lower_bound: int | None,
            valid_values_upper_bound: int | None,
            attributes: Iterable[str]
        ):
            # Reduce each listed attribute by gap_size if its value lies behind gap_offset and
            # within the optional bounds. Missing attributes read as 0 and are thus skipped.
            for attribute in attributes:
                old_value = getattr(structure, attribute, 0)
                if old_value <= gap_offset:
                    continue
                if valid_values_lower_bound is not None and old_value < valid_values_lower_bound:
                    continue
                if valid_values_upper_bound is not None and old_value > valid_values_upper_bound:
                    continue
                new_value = old_value - gap_size
                if new_value < gap_offset:
                    raise BrokenLink(F'attribute {attribute} points into removed region')
                self.log_debug(F'adjusting field in {structure.name}: {attribute}')
                setattr(structure, attribute, new_value)

        it: Iterable[Structure] = iter(pe.__structures__)
        structure_class = self._pefile.SectionStructure
        remove = []

        for index, structure in enumerate(it):
            old_offset = structure.get_file_offset()
            # Everything behind the gap moves towards the start of the file by the number of
            # removed bytes. Subtracting gap_offset here would relocate the structure to its
            # distance from the gap instead, and it would be written to the wrong place.
            new_offset = old_offset - gap_size

            if old_offset > gap_offset:
                if old_offset < gap_offset + gap_size:
                    self.log_debug(F'removing structure {structure.name}; starts inside removed region')
                    remove.append(index)
                    continue
                if isinstance(structure, structure_class) and new_offset % alignment != 0:
                    raise RuntimeError(
                        F'structure {structure.name} would be moved to offset 0x{new_offset:X}, '
                        F'violating section alignment value 0x{alignment:X}.')
                structure.set_file_offset(new_offset)

            try:
                # Fields that hold relative virtual addresses.
                adjust_attributes_of_structure(structure, rva_offset, rva_lbound, rva_ubound, (
                    'OffsetToData',
                    'AddressOfData',
                    'VirtualAddress',
                    'AddressOfNames',
                    'AddressOfNameOrdinals',
                    'AddressOfFunctions',
                    'AddressOfEntryPoint',
                    'AddressOfRawData',
                    'BaseOfCode',
                    'BaseOfData',
                ))
                # Fields that hold virtual addresses including the image base.
                adjust_attributes_of_structure(structure, tva_offset, tva_lbound, tva_ubound, (
                    'StartAddressOfRawData',
                    'EndAddressOfRawData',
                    'AddressOfIndex',
                    'AddressOfCallBacks',
                ))
                # Fields that hold raw file offsets; these are adjusted without bounds.
                adjust_attributes_of_structure(structure, gap_offset, None, None, (
                    'OffsetModuleName',
                    'PointerToRawData',
                ))
            except BrokenLink as error:
                self.log_debug(F'removing structure {structure.name}; {error!s}')
                remove.append(index)
                continue

            # These fields may also contain offsets, but they are not adjusted; a warning is
            # logged to make this visible.
            for attribute in (
                'CvHeaderOffset',
                'OffsetIn2Qwords',
                'OffsetInQwords',
                'Offset',
                'OffsetLow',
                'OffsetHigh'
            ):
                if not hasattr(structure, attribute):
                    continue
                self.log_warn(F'potential offset in structure {structure.name} ignored: {attribute}')

        # The indices were collected in ascending order; deleting from the back keeps the
        # remaining indices valid.
        while remove:
            index = remove.pop()
            pe.__structures__[index:index + 1] = []

        section.SizeOfRawData = new_section_size

    def _trim_sections(self, pe: PE, data: bytearray) -> int:
        """
        Strip trailing junk from the sections of `pe` and delete the corresponding bytes from
        `data` in place. Code sections and the resource section are skipped unless the options
        trim_code and trim_rsrc lift that exception. Any other section is a candidate when it
        is larger than the size limit or when its name matches one of the given patterns.
        Returns the total number of bytes that were removed.
        """
        size_limit = self.args.size_limit
        patterns = self.args.names
        trimmed = 0
        for section in pe.sections:
            offset = section.PointerToRawData
            name = _ASCII(section.Name)
            if not self.args.trim_code and name.lower() in ('.text', '.code'):
                self.log_debug(F'skipping code section {name}; specify --trim-code to override.')
                continue
            if not self.args.trim_rsrc and name.lower() == '.rsrc':
                self.log_debug(F'skipping rsrc section {name}; specify --trim-rsrc to override.')
                continue
            old_size = section.SizeOfRawData
            if not (old_size > size_limit or any(fnmatch(name, pattern) for pattern in patterns)):
                self.log_debug(F'criteria not satisfied for section: {SizeInt(old_size)!r} {name}')
                continue
            # The view is passed as a temporary on purpose: a view that is still referenced
            # would prevent the buffer from being resized below.
            new_size = self._right_strip_data(
                memoryview(data)[offset:offset + old_size], pe.OPTIONAL_HEADER.FileAlignment)
            if new_size == old_size:
                continue
            self.log_info(F'stripping section {name} from {TI(old_size)!r} to {TI(new_size)!r}')
            gap_size = old_size - new_size
            gap_offset = offset + new_size
            if gap_size > 0:
                # Fix up the headers first, then cut the bytes out of the buffer.
                self._adjust_offsets(pe, gap_offset, gap_size)
                trimmed += gap_size
                data[gap_offset:gap_offset + gap_size] = []
        return trimmed

    def _trim_pe_resources(self, pe: PE, data: bytearray) -> int:
        """
        Strip trailing junk from the resources of `pe` and delete the corresponding bytes from
        `data` in place. A resource is a candidate when it is larger than the size limit or
        when its path matches one of the given patterns. The size of the resource data
        directory is reduced by the number of removed bytes, which is also the return value.
        Returns 0 when the PE file has no resource directory.
        """
        size_limit = self.args.size_limit
        patterns = self.args.names

        def candidates(directory, level: int = 0, *path) -> Generator[Structure]:
            # Walk the resource tree lazily and yield (path, data entry) for every leaf that
            # satisfies the stripping criteria.
            for entry in directory.entries:
                label = entry.name
                numeric = entry.id
                if not label:
                    if level == 0 and numeric in iter(RSRC):
                        label = RSRC(numeric)
                    elif numeric is not None:
                        label = str(numeric)
                label = str(label) if label else ''
                if not label:
                    label = '?'
                if entry.struct.DataIsDirectory:
                    yield from candidates(entry.directory, level + 1, *path, label)
                    continue
                struct: Structure = entry.data.struct
                name = '/'.join((*path, label))
                if not (struct.Size > size_limit or any(fnmatch(name, pattern) for pattern in patterns)):
                    self.log_debug(F'criteria not satisfied for resource: {SizeInt(struct.Size)!r} {name}')
                    continue
                yield name, struct

        RSRC_INDEX = self._pefile.DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_RESOURCE']
        pe.parse_data_directories(directories=[RSRC_INDEX])

        try:
            resources = pe.DIRECTORY_ENTRY_RESOURCE
        except AttributeError:
            return 0

        trimmed = 0

        for name, resource in candidates(resources):
            offset = pe.get_offset_from_rva(resource.OffsetToData)
            old_size = resource.Size
            # The view is passed as a temporary on purpose: a view that is still referenced
            # would prevent the buffer from being resized below.
            new_size = self._right_strip_data(
                memoryview(data)[offset:offset + old_size], pe.OPTIONAL_HEADER.FileAlignment)
            self.log_info(F'stripping resource {name} from {old_size} to {new_size}')
            gap_size = old_size - new_size
            gap_offset = offset + new_size
            if gap_size > 0:
                resource.Size = new_size
                self._adjust_offsets(pe, gap_offset, gap_size)
                trimmed += gap_size
                data[gap_offset:gap_offset + gap_size] = []

        pe.OPTIONAL_HEADER.DATA_DIRECTORY[RSRC_INDEX].Size -= trimmed
        self.log_info(F'trimming size of resource data directory by {TI(trimmed)!r}')
        return trimmed

    def process(self, data: bytearray) -> bytearray:
        """
        Strip the input PE file. The overlay is stripped first if it is at least as large as
        the size limit. If resources or sections are to be stripped as well, the file is
        parsed, the selected structures are trimmed, and the rewritten file is returned.
        """
        overlay_offset = self._get_size(data)
        if len(data) - overlay_offset >= self.args.size_limit:
            # Both views are released before the buffer is truncated: a bytearray refuses to
            # change its size while a memoryview of it is still exported, which would make the
            # in-place truncation below fail every time and force a copy of the data.
            with memoryview(data) as view, view[overlay_offset:] as overlay:
                overlay_length = self._right_strip_data(overlay)
            body_size = overlay_offset + overlay_length
            try:
                data[body_size:] = []
            except Exception:
                # Best effort: the input cannot be resized in place (it may be immutable, for
                # example), so continue with a truncated copy.
                data = data[:body_size]
        if not self.args.resources and not self.args.sections:
            return data
        pe = self._pefile.PE(data=data, fast_load=True)
        total = len(data)
        trimmed = 0
        view = pe.__data__
        copy = False
        if not isinstance(view, bytearray):
            view = memoryview(view)
            try:
                # Probe whether the buffer is writable by storing the first byte of the MZ
                # signature.
                view[0] = 0x4D
            except Exception:
                copy = True
                view = bytearray(pe.__data__)
        if self.args.resources:
            trimmed += self._trim_pe_resources(pe, view)
        if self.args.sections:
            trimmed += self._trim_sections(pe, view)
        if copy:
            # The trimming was done on a private copy; hand it back to the PE object so that
            # it is used when the file is written.
            pe.__data__ = view
        data = pe.write()
        end = total - trimmed
        if end < len(data):
            self.log_warn(F'output contains {len(data) - end} trailing bytes')
        return data

class peek (lines=10, all=False, brief=False, decode=0, escape=False, bare=False, meta=0, gray=False, index=False, stdout=False, narrow=False, blocks=1, dense=False, expand=False, width=0)

This unit is implemented in refinery.units.sinks.peek and has the following command-line interface:

usage: peek [-h] [-L] [-Q] [-0] [-v] [-l N | -a | -b] [-d | -e] [-r | -m] [-g] [-i] [-2] [-N]
            [-B N] [-D] [-E] [-W N]

The unit extracts preview information of the input data and displays it on the standard error
stream. If the standard output of this unit is connected by a pipe, the incoming data is
forwarded. However, if the unit outputs to a terminal, the data is discarded instead.

options:
  -l, --lines N   Specify number N of lines in the preview, default is 10.
  -a, --all       Output all possible preview lines without restriction
  -b, --brief     One line peek, implies --lines=1.
  -d, --decode    Attempt to decode and display printable data. Specify twice to enable line
                  wrapping.
  -e, --escape    Always peek data as string, escape characters if necessary.
  -r, --bare      Only peek the data itself, do not show a metadata preview.
  -m, --meta      Show more auto-derivable metadata. Specify multiple times to populate more
                  variables.
  -g, --gray      Do not colorize the output.
  -i, --index     Display the index of each chunk within the current frame.
  -2, --stdout    Print the peek to STDOUT rather than STDERR; the input data is lost.
  -N, --narrow    Do not show addresses in hexdump
  -B, --blocks N  Group hexadecimal bytes in blocks of the given size; default is 1.
  -D, --dense     Do not insert spaces in hexdump.
  -E, --expand    Do not compress sequences of identical lines in hexdump
  -W, --width N   Specify the number of hexadecimal characters to use in preview.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class peek(HexViewer):
    """
    The unit extracts preview information of the input data and displays it on the standard error stream. If the standard
    output of this unit is connected by a pipe, the incoming data is forwarded. However, if the unit outputs to a terminal,
    the data is discarded instead.
    """

    def __init__(
        self,
        lines: Param[int, Arg.Number('-l', group='SIZE', help='Specify number N of lines in the preview, default is 10.')] = 10,
        all: Param[bool, Arg.Switch('-a', group='SIZE', help='Output all possible preview lines without restriction')] = False,
        brief: Param[bool, Arg.Switch('-b', group='SIZE', help='One line peek, implies --lines=1.')] = False,
        decode: Param[int, Arg.Counts('-d', group='MODE', help=(
            'Attempt to decode and display printable data. Specify twice to enable line wrapping.'))] = 0,
        escape: Param[bool, Arg.Switch('-e', group='MODE', help='Always peek data as string, escape characters if necessary.')] = False,
        bare: Param[bool, Arg.Switch('-r', group='META', help='Only peek the data itself, do not show a metadata preview.')] = False,
        meta: Param[int, Arg.Counts('-m', group='META', help=(
            'Show more auto-derivable metadata. Specify multiple times to populate more variables.'))] = 0,
        gray: Param[bool, Arg.Switch('-g', help='Do not colorize the output.')] = False,
        index: Param[bool, Arg.Switch('-i', help='Display the index of each chunk within the current frame.')] = False,
        stdout: Param[bool, Arg.Switch('-2', help='Print the peek to STDOUT rather than STDERR; the input data is lost.')] = False,
        # The hexdump layout options carry no Param annotation here; presumably they are
        # declared by the HexViewer base class (confirm).
        narrow=False, blocks=1, dense=False, expand=False, width=0
    ):
        # Decoding and escaping are two different ways to render the data as text and cannot
        # be combined.
        if decode and escape:
            raise ValueError('The decode and esc options are exclusive.')
        # A brief peek always uses the narrow layout.
        if brief:
            narrow = True
        # The environment can disable colors regardless of the command line.
        if environment.colorless.value:
            gray = True
        # Brief mode shows a single line, the all switch lifts the limit entirely, and
        # otherwise the given line count is used.
        lines = 1 if brief else INF if all else lines
        super().__init__(
            brief=brief,
            gray=gray,
            blocks=blocks,
            decode=decode,
            dense=dense,
            index=index,
            escape=escape,
            expand=expand,
            narrow=narrow,
            lines=lines,
            meta=meta,
            bare=bare,
            width=width,
            stdout=stdout,
        )

    @HexViewer.Requires('colorama', ['display', 'default', 'extended'])
    def _colorama():
        # The import is deferred to the body of this function so that colorama is only loaded
        # when colored output is produced. Other methods access the module as the attribute
        # self._colorama; presumably the Requires decorator turns this function into such an
        # attribute and reports a missing dependency (confirm against its implementation).
        import colorama
        return colorama

    def process(self, data):
        """
        Emit the preview of one chunk. With the stdout option, the preview lines are yielded
        as encoded output and the chunk itself is dropped. Otherwise, the preview is printed
        to the standard error stream and the chunk is forwarded unless the unit writes to a
        terminal.
        """
        colorize = not (self.args.gray or self.args.stdout)
        preview = self._peeklines(data, colorize)

        if self.args.stdout:
            for text in preview:
                yield text.encode(self.codec)
            return

        stderr = sys.stderr
        _reset = ''

        if colorize:
            colorama = self._colorama
            if os.name == 'nt':
                stderr = colorama.AnsiToWin32(stderr).stream
            _erase = ' ' * get_terminal_size()
            _reset = F'\r{colorama.Style.RESET_ALL}{_erase}\r'

        try:
            for text in preview:
                print(text, file=stderr)
        except BaseException:
            # Do not leave the terminal in a colored state when the output is interrupted.
            stderr.write(_reset)
            raise

        if self.isatty():
            return
        self.log_info('forwarding input to next unit')
        yield data

    def _peekmeta(self, linewidth, sep, meta: dict, peek=None) -> Generator[str]:
        """
        Generate the lines of the metadata preview. The separator `sep` is emitted once,
        immediately before the first line of actual content. The optional summary line `peek`
        comes first, followed by one line for each variable in `meta` whose value is not None.
        Every line is clipped to `linewidth` characters.
        """
        if not meta and not peek:
            return

        width = max((len(name) for name in meta), default=0)
        separator_pending = True

        def clip(text):
            # Shorten overlong lines and mark the cut with an ellipsis.
            if len(text) > linewidth:
                return text[:linewidth - 3] + '...'
            return text

        def order(key):
            # Names with more than three characters are listed before the shorter ones.
            return (len(key) <= 3, key)

        if peek is not None:
            separator_pending = False
            yield sep
            yield clip(peek)

        for name in sorted(meta, key=order):
            if name == LazyMetaOracle.IndexKey and not self.args.index:
                continue
            value = meta[name]
            if value is None:
                continue
            if isinstance(value, CustomStringRepresentation):
                value = repr(value).strip()
            elif isbuffer(value):
                value = repr(ByteStringWrapper(value))
            elif isinstance(value, int):
                # Small numbers are shown in decimal, everything else in hexadecimal.
                if -999 <= value <= 999:
                    value = str(value)
                elif value > 0:
                    value = F'0x{value:X}'
                else:
                    value = F'-0x{-value:X}'
            elif isinstance(value, float):
                value = F'{value:.4f}'
            metavar = F'{name:>{width + 2}} = {value!s}'
            if separator_pending:
                separator_pending = False
                yield sep
            yield clip(metavar)

    def _trydecode(self, data, codec: str | None, width: int, linecount: int) -> list[str] | None:
        """
        Render `data` as text and return the preview as a list of lines, each of them at most
        `width` characters wide, limited to `linecount` lines.

        When `codec` is None, the data is escaped rather than decoded and cut into lines of
        `width` characters. Otherwise, the data is decoded with the given codec. In this case,
        None is returned when decoding fails, when the result is empty, or when fewer than 80%
        of the decoded characters are printable. Lines that are too long are either clipped or
        wrapped, depending on how often the decode option was specified.
        """
        remaining = linecount
        result = []
        wrap = self.args.decode > 1
        if codec is None:
            from refinery.units.encoding.esc import esc
            decoded = data[:abs(width * linecount)]
            decoded = str(decoded | -esc(bare=True))
            limit = abs(min(linecount * width, len(decoded)))
            for k in range(0, limit, width):
                result.append(decoded[k:k + width])
            return result
        try:
            import unicodedata
            unprintable = {'Cc', 'Cf', 'Co', 'Cs'}
            self.log_info(F'trying to decode as {codec}.')
            decoded = codecs.decode(data, codec, errors='strict')
            if not decoded:
                # A buffer that consists of nothing but a byte order mark decodes to an empty
                # string; there is nothing to display and the ratio below would be undefined.
                self.log_info('decoding failed: no characters were decoded')
                return None
            count = sum(unicodedata.category(c) not in unprintable for c in decoded)
            ratio = count / len(decoded)
        except UnicodeDecodeError as DE:
            self.log_info('decoding failed:', DE.reason)
            return None
        except ValueError as V:
            self.log_info('decoding failed:', V)
            return None
        if ratio < 0.8:
            self.log_info(F'data contains {ratio * 100:.2f}% printable characters, this is too low.')
            return None
        decoded = decoded.splitlines(False)
        if not wrap:
            for k, line in enumerate(decoded):
                # NOTE(review): the expanded tabs only end up in the output for lines that are
                # clipped; lines that fit keep their tabs. Confirm whether this is intended.
                line = line.replace('\t', '\x20' * 4)
                if len(line) <= width:
                    continue
                clipped = line[:width - 3]
                if self.args.gray:
                    color = ''
                    reset = ''
                else:
                    colorama = self._colorama
                    color = colorama.Fore.LIGHTRED_EX
                    reset = colorama.Style.RESET_ALL
                decoded[k] = F'{clipped}{color}...{reset}'
            return decoded[:abs(linecount)]
        for paragraph in decoded:
            if not remaining:
                break
            wrapped = [
                line for chunk in textwrap.wrap(
                    paragraph,
                    width,
                    break_long_words=True,
                    break_on_hyphens=False,
                    drop_whitespace=False,
                    expand_tabs=True,
                    max_lines=abs(remaining + 1),
                    replace_whitespace=False,
                    tabsize=4,
                )
                for line in chunk.splitlines(keepends=False)
            ]
            remaining -= len(wrapped)
            result.extend(wrapped)
        return result[:abs(linecount)]

    def _peeklines(self, data: Chunk, colorize: bool) -> Generator[str]:
        """
        Generate all lines of the preview for one chunk: the metadata block (unless the bare
        or brief option is set), followed by the data preview. The data preview is either a
        hexdump or, when escaping or decoding was requested and succeeded, text. In brief
        mode, a single line is produced instead.
        """

        meta = metavars(data)

        codec = None
        lines = None
        # The filter method stores in the temp attribute whether this is the last chunk; only
        # then is a closing separator printed.
        final = data.temp or False
        empty = True

        if not self.args.index:
            index = None
        else:
            index = meta.get('index', None)

        if not self.args.brief:
            padding = 0
        else:
            # Reserve room for the size prefix and, if shown, the six characters of the index
            # prefix that are added to the brief line further below.
            padding = SizeInt.width + 2
            if index is not None:
                padding += 6

        metrics = self._get_metrics(len(data), self.args.lines, padding)

        if self.args.brief:
            metrics.address_width = 0
            metrics.fit_to_width(allow_increase=True)

        sepsize = metrics.hexdump_width
        txtsize = self.args.width or sepsize

        if self.args.lines and data:
            if self.args.escape:
                lines = self._trydecode(data, None, txtsize, metrics.line_count)
            if self.args.decode > 0:
                # Try the codecs in order and keep the first one that yields any lines.
                for codec in ('utf8', 'cp1251', 'cp1252', 'utf-16le', 'utf-16', 'utf-16be'):
                    lines = self._trydecode(data, codec, txtsize, metrics.line_count)
                    if lines:
                        # This assignment has no effect; the loop variable already holds the
                        # codec that succeeded.
                        codec = codec
                        break
                else:
                    # No codec worked; the separator above the data gets no title.
                    codec = None
            if lines is None:
                # Fall back to a hexdump when no text preview is available.
                lines = list(self.hexdump(data, metrics, colorize))
            else:
                sepsize = txtsize

        def separator(title=None):
            # A horizontal rule as wide as the preview; the title is embedded near the right
            # end if there is enough room for it.
            if title is None or sepsize <= len(title) + 8:
                return sepsize * '-'
            return '-' * (sepsize - len(title) - 5) + F'[{title}]---'

        if self.args.brief:
            final = False
        elif not self.args.bare:
            # The summary line consists of size, entropy (only for data up to 5 MB), and magic.
            peek = repr(meta.size)
            line = separator()
            if len(data) <= 5_000_000:
                peek = F'{peek}; {meta.entropy!r} entropy'
            peek = F'{peek}; {meta.magic!s}'
            if self.args.lines == 0:
                peek = None
            elif not data:
                peek = None
                line = separator('empty chunk')
            if self.args.meta > 0:
                # The derived variables are listed individually, so the summary line is
                # dropped.
                meta.derive('size')
                meta.derive('magic')
                meta.derive('entropy')
                peek = None
            if self.args.meta > 1:
                meta.derive('crc32')
                meta.derive('sha256')
            if self.args.meta > 2:
                # Accessing each derivation presumably computes and stores it (confirm).
                for name in meta.derivations:
                    meta[name]
            for line in self._peekmeta(metrics.hexdump_width, line, meta, peek=peek):
                empty = False
                yield line

        if lines:
            empty = False
            if not self.args.brief:
                yield separator(codec or None)
                yield from lines
            else:
                # Brief mode: only the first preview line, prefixed with size and index.
                brief = next(iter(lines))
                brief = F'{SizeInt(len(data))!r}: {brief}'
                if index is not None:
                    brief = F'#{index:03d}: {brief}'
                yield brief

        if final and (self.args.bare or not empty):
            yield separator()

    def filter(self, chunks):
        """
        Pass the incoming chunks on for processing and mark the last one of them by setting
        its temp attribute. When the unit writes to a terminal, invisible chunks are discarded
        and a warning with their count is logged at the end.
        """
        try:
            self._colorama.init(wrap=False)
        except ImportError:
            # Colorama is optional here; continue without initializing it.
            pass

        discarded = 0

        if self.args.brief:
            # Brief mode has no closing separator, so the last chunk need not be identified.
            for chunk in chunks:
                if not chunk.visible and self.isatty():
                    discarded += 1
                    continue
                self.log_debug(chunk)
                yield chunk
        else:
            # Keep a look-ahead of one chunk: the oldest chunk sits at the right end of the
            # buffer and the most recently read one at the left end.
            it = iter(chunks)
            buffer = collections.deque(itertools.islice(it, 0, 2))
            buffer.reverse()
            while buffer:
                if self.isatty() and not buffer[0].visible:
                    # Drop an invisible chunk right away so that it is never mistaken for a
                    # successor of the chunk that is about to be emitted.
                    buffer.popleft()
                    discarded += 1
                else:
                    item = buffer.pop()
                    # If nothing is left in the buffer, this chunk is the last one.
                    last = not bool(buffer)
                    item.temp = last
                    if not item.visible and self.isatty():
                        discarded += 1
                    else:
                        yield item
                try:
                    buffer.appendleft(next(it))
                except StopIteration:
                    pass

        if discarded:
            self.log_warn(F'discarded {discarded} invisible chunks to prevent them from leaking into the terminal.')

class pefix

This unit is implemented in refinery.units.formats.pe.pefix and has the following command-line interface:

usage: pefix [-h] [-L] [-Q] [-0] [-v]

Take as input a buffer that represents a stripped PE file, i.e. magic numbers and other relevant
parts of the header have been stripped. The unit attempts to repair the damage and return
something that can be parsed.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class pefix(Unit):
    """
    Take as input a buffer that represents a stripped PE file, i.e. magic numbers and other
    relevant parts of the header have been stripped. The unit attempts to repair the damage
    and return something that can be parsed.
    """
    @Unit.Requires('pefile', ['default', 'extended'])
    def _pefile():
        # The import is deferred to the body of this function so that pefile is only loaded
        # when it is actually needed. The process method accesses the module as the attribute
        # self._pefile; presumably the Requires decorator turns this function into such an
        # attribute and reports a missing dependency (confirm against its implementation).
        import pefile
        return pefile

    def process(self, data):
        """
        Repair the headers of a stripped PE file and return the rewritten file. The MZ and PE
        signatures are restored; the machine type and the magic of the optional header are
        each derived from the other one if only one of them is valid. If the file alignment is
        not a power of two between 0x200 and 0x8000, a value is chosen for which the padding
        after the headers is followed by data. Raises a ValueError when this search fails.
        """
        reader = StructReader(data)
        reader.write(B'MZ')
        reader.seekset(0x3C)
        nt_offset = reader.u16()
        oh_offset = nt_offset + 0x18
        reader.seekset(nt_offset)
        reader.write(B'PE')
        # Skip the remaining two bytes of the signature; the machine type follows.
        reader.seekrel(2)
        machine_value = reader.u16()

        try:
            machine = MachineType(machine_value)
        except Exception:
            machine = None

        reader.seekset(oh_offset)
        magic = bytes(reader.peek(2))

        try:
            state = ImgState(magic)
        except ValueError:
            # The magic is damaged; derive it from the machine type if that one is known.
            state_by_machine = {
                None: None,
                MachineType.I386  : ImgState.x32,
                MachineType.IA64  : ImgState.x64,
                MachineType.AMD64 : ImgState.x64,
            }
            state = state_by_machine.get(machine)

        if state is not None:
            reader.write(state.value)
        else:
            self.log_warn('could not determine image state; nulling field')
            reader.write(B'\0\0')

        if machine is None:
            # The machine type is damaged; derive it from the magic if that one is known.
            machine_by_state = {
                None: None,
                ImgState.x32: MachineType.I386,
                ImgState.x64: MachineType.AMD64,
            }
            machine = machine_by_state.get(state)
            if machine:
                assert isinstance(machine, MachineType)
                reader.seekset(nt_offset + 4)
                reader.write(machine.value.to_bytes(2, 'little'))

        pe = self._pefile.PE(data=data, fast_load=True)

        valid_alignments = [1 << k for k in range(9, 16)]
        alignment = pe.OPTIONAL_HEADER.FileAlignment

        if alignment not in valid_alignments:
            size_of_headers = 0x28 * len(pe.sections) + oh_offset + 0xF0
            for alignment in valid_alignments:
                soh = align(alignment, size_of_headers)
                if any(data[size_of_headers:soh]):
                    raise ValueError('nonzero bytes in what must be header padding')
                if any(data[soh:soh + 8]):
                    # The first nonzero bytes after the padding mark the end of the headers.
                    pe.OPTIONAL_HEADER.SizeOfHeaders = soh
                    break
            else:
                raise ValueError('unable to find a valid file alignment')

        pe.OPTIONAL_HEADER.FileAlignment = alignment
        pe.OPTIONAL_HEADER.SectionAlignment = max(pe.OPTIONAL_HEADER.SectionAlignment, alignment)

        return pe.write()

class pemeta (custom=False, debug=False, dotnet=False, signatures=False, timestamps=0, version=False, header=False, exports=0, imports=0, tabular=False, timeraw=False)

This unit is implemented in refinery.units.formats.pe.pemeta and has the following command-line interface:

usage: pemeta [-h] [-L] [-Q] [-0] [-v] [-F] [-c] [-D] [-N] [-S] [-T] [-V] [-H] [-E] [-I] [-t]
              [-r]

Extract metadata from PE files. By default, all information except for imports and exports are
extracted.

options:
  -c, --custom      Unless enabled, all default categories will be extracted.
  -D, --debug       Parse the PDB path from the debug directory.
  -N, --dotnet      Parse the .NET header.
  -S, --signatures  Parse digital signatures.
  -T, --timestamps  Extract time stamps. Specify twice for more detail.
  -V, --version     Parse the VERSION resource.
  -H, --header      Parse base data from the PE header.
  -E, --exports     List all exported functions. Specify twice to include addresses.
  -I, --imports     List all imported functions. Specify twice to include addresses.
  -t, --tabular     Print information in a table rather than as JSON
  -r, --timeraw     Extract time stamps as numbers instead of human-readable format.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.
  -F, --iff         Only apply unit if it can handle the input format. Specify twice to drop all
                    other chunks.

Expand source code Browse git

class pemeta(Unit):
    """
    Extract metadata from PE files. By default, all information except for imports and exports are
    extracted.
    """
    def __init__(
        self,
        custom: Param[bool, Arg('-c', '--custom',
            help='Unless enabled, all default categories will be extracted.')] = False,
        debug: Param[bool, Arg.Switch('-D',
            help='Parse the PDB path from the debug directory.')] = False,
        dotnet: Param[bool, Arg.Switch('-N',
            help='Parse the .NET header.')] = False,
        signatures: Param[bool, Arg.Switch('-S',
            help='Parse digital signatures.')] = False,
        timestamps: Param[int, Arg.Counts('-T',
            help='Extract time stamps. Specify twice for more detail.')] = 0,
        version: Param[bool, Arg.Switch('-V',
            help='Parse the VERSION resource.')] = False,
        header: Param[bool, Arg.Switch('-H',
            help='Parse base data from the PE header.')] = False,
        exports: Param[int, Arg.Counts('-E',
            help='List all exported functions. Specify twice to include addresses.')] = 0,
        imports: Param[int, Arg.Counts('-I',
            help='List all imported functions. Specify twice to include addresses.')] = 0,
        tabular: Param[bool, Arg.Switch('-t',
            help='Print information in a table rather than as JSON')] = False,
        timeraw: Param[bool, Arg.Switch('-r',
            help='Extract time stamps as numbers instead of human-readable format.')] = False,
    ):
        # The six default categories are switched on together when the caller neither asked
        # for a custom selection nor picked any of them explicitly. Imports and exports are
        # never part of that default set; `custom` itself is not forwarded to the base class.
        default_categories = (debug, dotnet, signatures, timestamps, version, header)
        if not (custom or any(default_categories)):
            debug = dotnet = signatures = timestamps = version = header = True
        super().__init__(
            debug=debug,
            dotnet=dotnet,
            signatures=signatures,
            timestamps=timestamps,
            version=version,
            header=header,
            imports=imports,
            exports=exports,
            timeraw=timeraw,
            tabular=tabular,
        )

    @classmethod
    def handles(cls, data):
        """
        Return the verdict of `is_likely_pe` for the given input data.
        """
        return is_likely_pe(data)

    @classmethod
    def _ensure_string(cls, x):
        """
        Convert the input to a string: strings are returned unchanged, byte strings are decoded
        with the unit codec (undecodable bytes become backslash escapes), and any other object
        is represented by its `repr`.
        """
        if isinstance(x, str):
            return x
        if isinstance(x, bytes):
            return x.decode(cls.codec, 'backslashreplace')
        return repr(x)

    @classmethod
    def _parse_pedict(cls, bin: dict):
        """
        Convert a dictionary to one with string keys and string values. Entries with a falsy
        value are dropped and spaces are removed from the keys.
        """
        converted = {}
        for key, val in bin.items():
            if not val:
                continue
            name = cls._ensure_string(key).replace(" ", "")
            converted[name] = cls._ensure_string(val)
        return converted

    @classmethod
    def parse_signature(cls, data: bytearray) -> dict:
        """
        Extracts a JSON-serializable and human-readable dictionary with information about
        time stamp and code signing certificates that are attached to the input PE file.

        The input is piped through the pkcs7 unit and the result is parsed as JSON; if this
        fails for any reason, a `ValueError` is raised that is chained to the original error.
        When exactly one signer certificate is identified, its fields are merged into the
        top level of the result. When there are several, they are listed under `Signer`.
        """
        from refinery.units.formats.pkcs7 import pkcs7

        try:
            signature = data | pkcs7 | json.loads
        except Exception as E:
            raise ValueError(F'PKCS7 parser failed with error: {E!s}') from E

        info = {}

        def _value(doc: dict, require_type=None):
            # Returns the first value of an attribute document. A LookupError signals a
            # type mismatch or the absence of any value.
            if require_type is not None:
                if doc.get('type', None) != require_type:
                    raise LookupError
            value = doc.get('value', None)
            value = [value] if value else doc.get('values', [])
            if not value:
                raise LookupError
            return value[0]

        def find_timestamps(entry) -> dict | None:
            # Depth-first search for the first 'signing_time' attribute. On the way back up,
            # the innermost enclosing document that has a signer id provides the issuer.
            if isinstance(entry, dict):
                try:
                    return {'Timestamp': _value(entry, 'signing_time')}
                except LookupError:
                    pass
                for value in entry.values():
                    result = find_timestamps(value)
                    if result is None:
                        continue
                    with suppress(KeyError):
                        result.setdefault('TimestampIssuer', entry['sid']['issuer']['common_name'])
                    return result
            elif isinstance(entry, list):
                for value in entry:
                    result = find_timestamps(value)
                    if result is None:
                        continue
                    return result
            return None

        timestamp_info = find_timestamps(signature)
        if timestamp_info is not None:
            info.update(timestamp_info)

        try:
            certificates = signature['content']['certificates']
            signer_infos = signature['content']['signer_infos']
        except KeyError:
            return info

        try:
            signer_serials = {
                signer_info['sid']['serial_number']: signer_info for signer_info in signer_infos}
        except KeyError:
            return info

        signer_certificates = []

        for certificate in certificates:
            # Best effort: any certificate that cannot be processed completely is skipped.
            with suppress(Exception):
                crt = certificate['tbs_certificate']
                serial = crt['serial_number']
                signer = signer_serials[serial]
                if isinstance(serial, int):
                    serial = F'{serial:x}'
                if len(serial) % 2 != 0:
                    serial = F'0{serial}'
                # This is an explicit test rather than an assertion because assertions are
                # removed when Python runs with -O, which would disable the check.
                if bytes.fromhex(serial) not in data:
                    continue
                subject = crt['subject']
                location = [subject.get(t, '') for t in (
                    'locality_name', 'state_or_province_name', 'country_name')]
                cert_info = {}
                cert_info.update(Subject=subject['common_name'])
                if any(location):
                    cert_info.update(SubjectLocation=', '.join(filter(None, location)))
                for attr in signer['signed_attrs']:
                    if attr['type'] == 'authenticode_info':
                        auth = _value(attr)
                        cert_info.update(ProgramName=auth['programName'])
                        cert_info.update(MoreInfo=auth['moreInfo'])
                try:
                    valid_since = crt['validity']['not_before']
                    valid_until = crt['validity']['not_after']
                except KeyError:
                    pass
                else:
                    cert_info.update(ValidSince=valid_since, ValidUntil=valid_until)
                cert_info.update(
                    Issuer=crt['issuer']['common_name'], Fingerprint=certificate['fingerprint'], Serial=serial)
                signer_certificates.append(cert_info)

        if len(signer_certificates) == 1:
            info.update(signer_certificates[0])
        elif len(signer_certificates) >= 2:
            info['Signer'] = signer_certificates
        return info

    def _pe_characteristics(self, pe: lief.PE.Binary):
        """
        Return the set of `IMAGE_FILE_*` names for all flags that are set in the
        characteristics field of the PE header.
        """
        names = set()
        for flag in lief.PE.Header.CHARACTERISTICS:
            if pe.header.characteristics & flag.value:
                names.add(F'IMAGE_FILE_{flag.name}')
        if pe.header.characteristics & 0x40:
            # TODO: Missing from LIEF
            names.add('IMAGE_FILE_16BIT_MACHINE')
        return names

    def _pe_address_width(self, pe: lief.PE.Binary, default=16) -> int:
        """
        Return the number of hexadecimal digits used to print an address for the given PE:
        4 if the 16-bit machine flag is set, 8 for I386, 16 for AMD64 and IA64, and the given
        default for every other machine type.
        """
        # TODO: missing from LIEF
        if pe.header.characteristics & 0x40:
            return 4
        machine = pe.header.machine
        if machine == lief.PE.Header.MACHINE_TYPES.I386:
            return 8
        wide = (
            lief.PE.Header.MACHINE_TYPES.AMD64,
            lief.PE.Header.MACHINE_TYPES.IA64,
        )
        return 16 if machine in wide else default

    def _vint(self, pe: lief.PE.Binary, value: int):
        """
        Format an integer for output: in tabular mode it becomes an upper-case hexadecimal
        string padded to the address width of the PE, otherwise it is returned unchanged.
        """
        if self.args.tabular:
            width = self._pe_address_width(pe)
            return F'0x{value:0{width}X}'
        return value

    def parse_version(self, pe: lief.PE.Binary, data=None) -> dict | None:
        """
        Build a JSON-serializable and human-readable dictionary from the version resource of
        the given PE file. The result is `None` when the resource manager is unavailable, when
        there is no version resource, or when no information could be collected.
        """
        manager = unwrap(pe.resources_manager)
        if isinstance(manager, lief.lib.lief_errors) or not manager.has_version:
            return None

        collected = {}
        version = manager.version[0]

        # String tables: spaces are removed from the keys.
        if string_info := version.string_file_info:
            for table in string_info.children:
                pairs = {item.key: item.value for item in table.entries}
                for key, text in pairs.items():
                    collected[key.replace(' ', '')] = _STRING(text)

        # Language information is taken from the first icon resource.
        if manager.has_icons:
            first_icon = next(iter(manager.icons))
            lang_id = self._vint(pe, first_icon.lang << 0x10 | first_icon.sublang)
            language = LCID.get(first_icon.lang, 'Language Neutral')
            charset = self._CHARSET.get(first_icon.sublang, 'Unknown Charset')
            collected.update(LangID=lang_id, Language=language, Charset=charset)

        def _code_pages(root):
            # Walks the resource tree without recursion and yields the code page of each leaf.
            pending = [root]
            while pending:
                node = pending.pop()
                if isinstance(node, lief.PE.ResourceData):
                    yield node.code_page
                else:
                    pending.extend(node.childs)

        code_pages = set()
        for kind in manager.types:
            code_pages.update(_code_pages(manager.get_node_type(kind)))

        # A code page is only reported when all resources agree on it.
        if len(code_pages) == 1:
            (only_page,) = code_pages
            collected['CodePage'] = only_page

        def _to_version_string(hi: int, lo: int):
            parts = (hi >> 0x10, hi & 0xFFFF, lo >> 0x10, lo & 0xFFFF)
            return '.'.join(str(part) for part in parts)

        # TODO: Missing: Version.CompanyName
        # TODO: Missing: Version.FileDescription
        # TODO: Missing: Version.LegalCopyright
        # TODO: Missing: Version.ProductName

        if fixed := version.file_info:
            enumerated = (
                ('FileType', fixed.file_type, fixed.FILE_TYPE),
                ('OSName', fixed.file_os, fixed.VERSION_OS),
                ('FileSubType', fixed.file_subtype, fixed.FILE_TYPE_DETAILS),
            )
            for label, raw, enum_type in enumerated:
                if not raw:
                    continue
                try:
                    collected[label] = enum_type(raw).name
                except Exception:
                    # values without a known name are omitted
                    pass
            filetime = (fixed.file_date_ms << 32) | fixed.file_date_ls
            if filetime:
                collected['Timestamp'] = _FILETIME(filetime)
            product = _to_version_string(fixed.product_version_ms, fixed.product_version_ls)
            release = _to_version_string(fixed.file_version_ms, fixed.file_version_ls)
            collected.update(ProductVersion=product, FileVersion=release)

        # The variable file information is queried but not evaluated yet.
        if version.var_file_info:
            pass

        return collected or None

    def parse_exports(self, pe: lief.PE.Binary, data=None, include_addresses=False) -> list | None:
        """
        List the exported functions of the given PE file. The result is `None` when the PE has
        no exports. Otherwise it is a list with one item per export entry: the name, or a
        dictionary with the keys `Name` and `Address` if `include_addresses` is set. Entries
        without a name are listed as `@k` where `k` is the position of the entry. The `data`
        argument is not used.
        """
        # Check for exports first so that nothing else is read from an export-free PE.
        if not pe.has_exports:
            return None
        base = pe.optional_header.imagebase
        info = []
        for k, exp in enumerate(pe.get_export().entries):
            name = exp.demangled_name or exp.name or F'@{k}'
            if not isinstance(name, str):
                name = name.decode('latin1')
            if include_addresses:
                info.append({'Name': name, 'Address': self._vint(pe, exp.address + base)})
            else:
                info.append(name)
        return info

    def parse_imports(self, pe: lief.PE.Binary, data=None, include_addresses=False) -> dict:
        """
        Collect the imported functions of the given PE file, including delay imports. The
        result is a dictionary that maps each library name, with a trailing `.dll` extension
        removed, to the list of its imports. Each import is the symbol name, or a dictionary
        with the keys `Name` and `Address` if `include_addresses` is set. Imports without a
        name are listed as `@` followed by the ordinal. The `data` argument is not used.
        """
        info = {}
        for idd in itertools.chain(pe.imports, pe.delay_imports):
            dll = _STRING(idd.name)
            if dll.lower().endswith('.dll'):
                dll = dll[:-4]
            # The same library can occur several times; its imports are merged.
            imports: list = info.setdefault(dll, [])
            for imp in idd.entries:
                name = _STRING(imp.name) or F'@{imp.ordinal}'
                if include_addresses:
                    imports.append(dict(Name=name, Address=self._vint(pe, imp.value)))
                else:
                    imports.append(name)
        return info

    def parse_header(self, pe: lief.PE.Binary, data=None) -> dict:
        """
        Extract base information from the PE header: machine, subsystem, minimum operating
        system, export name, rich header entries, file type, image base and size, computed
        size, bitness, and entry point. The `data` argument is not used.
        """
        major = pe.optional_header.major_operating_system_version
        minor = pe.optional_header.minor_operating_system_version
        version = self._WINVER.get(major, {0: 'Unknown'})

        try:
            MinimumOS = version[minor]
        except LookupError:
            # unknown minor version: fall back to the generic name for the major version
            MinimumOS = version[0]
        header_information = {
            'Machine': pe.header.machine.name,
            'Subsystem': pe.optional_header.subsystem.name,
            'MinimumOS': MinimumOS,
        }
        if pe.has_exports:
            export_name = _STRING(pe.get_export().name)
            if export_name.isprintable():
                header_information['ExportName'] = export_name

        if pe.has_rich_header:
            rich = []
            if self.args.tabular:
                # Width of the widest counter. The default prevents a ValueError from max()
                # when the rich header has no entries.
                cw = max((len(F'{entry.count:d}') for entry in pe.rich_header.entries), default=0)
            for entry in pe.rich_header.entries:
                idv = entry.build_id | (entry.id << 0x10)
                count = entry.count
                info = get_rich_info(idv)
                if not info:
                    continue
                pid = info.pid.upper()
                if self.args.tabular:
                    short_pid = get_rich_short_pid(pid)
                    rich.append(F'[{idv:08x}] {count:>0{cw}d} {short_pid!s} {info.ver}')
                else:
                    rich.append({
                        'Counter': count,
                        'Encoded': F'{idv:08x}',
                        'Library': pid,
                        'Product': info.ver,
                    })
            header_information['RICH'] = rich

        characteristics = self._pe_characteristics(pe)
        # Later matches overwrite earlier ones, so the last flag that is set determines the type.
        for typespec, flag in {
            'EXE': 'IMAGE_FILE_EXECUTABLE_IMAGE',
            'DLL': 'IMAGE_FILE_DLL',
            'SYS': 'IMAGE_FILE_SYSTEM'
        }.items():
            if flag in characteristics:
                header_information['Type'] = typespec

        base = pe.optional_header.imagebase
        header_information['ImageBase'] = self._vint(pe, base)
        header_information['ImageSize'] = self._vint(pe, pe.optional_header.sizeof_image)
        header_information['ComputedSize'] = get_pe_size(pe)
        # The address width counts hexadecimal digits, each of which represents 4 bits.
        header_information['Bits'] = 4 * self._pe_address_width(pe, 16)
        header_information['EntryPoint'] = self._vint(pe, pe.optional_header.addressof_entrypoint + base)
        return header_information

    def parse_time_stamps(self, pe: lief.PE.Binary, raw_time_stamps: bool, more_detail: bool) -> dict:
        """
        Extracts time stamps from the PE header (link time), as well as from the imports,
        exports, debug, and resource directory. The resource time stamp is also parsed as
        a DOS time stamp and returned as the "Delphi" time stamp.

        With `raw_time_stamps`, the values are returned as numbers; otherwise they are
        converted using `date_from_timestamp` and returned as strings. Unless `more_detail`
        is set, the import time stamps of all libraries are replaced by their minimum when
        they are less than two hours apart; the same holds for the delay import time stamps.
        """
        def _id(x): return x
        dt = _id if raw_time_stamps else date_from_timestamp
        info = {}

        with suppress(AttributeError):
            info.update(Linker=dt(pe.header.time_date_stamps))

        import_timestamps = {}
        for entry in pe.imports:
            ts = entry.timedatestamp
            if ts == 0 or ts == 0xFFFFFFFF:
                continue
            import_timestamps[_STRING(entry.name, True)] = dt(ts)

        symbol_timestamps = {}
        for entry in pe.delay_imports:
            ts = entry.timestamp
            if ts == 0 or ts == 0xFFFFFFFF:
                continue
            symbol_timestamps[_STRING(entry.name, True)] = dt(ts)

        for key, impts in [
            ('Import', import_timestamps),
            ('Symbol', symbol_timestamps),
        ]:
            if not impts:
                continue
            if not more_detail:
                dmin = min(impts.values())
                dmax = max(impts.values())
                small_delta = 2 * 60 * 60
                if not raw_time_stamps:
                    small_delta = timedelta(seconds=small_delta)
                if dmax - dmin < small_delta:
                    impts = dmin
            info[key] = impts

        if pe.has_exports and (ts := pe.get_export().timestamp):
            info.update(Export=dt(ts))

        if pe.has_resources and pe.resources.is_directory:
            rsrc: lief.PE.ResourceDirectory = pe.resources
            if res_timestamp := rsrc.time_date_stamp:
                # The two interpretations are independent: a value that is not a valid DOS
                # time stamp must not prevent the plain resource time stamp from being added.
                with suppress(ValueError):
                    from refinery.units.misc.datefix import datefix
                    info.update(Delphi=datefix.dostime(res_timestamp))
                with suppress(ValueError):
                    info.update(RsrcTS=dt(res_timestamp))

        def norm(value):
            # Integers are kept; everything else that is not a container becomes a string.
            if isinstance(value, list):
                return [norm(v) for v in value]
            if isinstance(value, dict):
                return {k: norm(v) for k, v in value.items()}
            if isinstance(value, int):
                return value
            return str(value)

        return {key: norm(value) for key, value in info.items()}

    def parse_dotnet(self, pe: lief.PE.Binary, data):
        """
        Build a JSON-serializable and human-readable dictionary from the .NET metadata of the
        input PE file. Assembly and module information is only included when the respective
        table contains exactly one row.
        """
        header = DotNetHeader(data, pe)
        tables = header.meta.Streams.Tables

        runtime = F'{header.head.MajorRuntimeVersion}.{header.head.MinorRuntimeVersion}'
        release = F'{header.meta.MajorVersion}.{header.meta.MinorVersion}'
        info = {
            'RuntimeVersion': runtime,
            'Version': release,
            'VersionString': header.meta.VersionString,
        }

        flags = []
        for name, check in header.head.KnownFlags.items():
            if check:
                flags.append(name)
        info['Flags'] = flags

        if len(tables.Assembly) == 1:
            assembly = tables.Assembly[0]
            numbers = (
                assembly.MajorVersion,
                assembly.MinorVersion,
                assembly.BuildNumber,
                assembly.RevisionNumber,
            )
            info['AssemblyName'] = assembly.Name
            info['Release'] = '{}.{}.{}.{}'.format(*numbers)

        # The entry point is omitted if any of the involved attributes are missing.
        with suppress(AttributeError):
            info['EntryPoint'] = self._vint(
                pe, header.head.EntryPointToken + pe.optional_header.imagebase)

        if len(tables.Module) == 1:
            info['ModuleName'] = tables.Module[0].Name

        return info

    def parse_debug(self, pe: lief.PE.Binary, data=None):
        """
        Collect PDB path, GUID, and age from all CodeView entries of the debug directory.
        Returns `None` if the PE has no debug information, a single dictionary if exactly one
        entry was collected, and a list of dictionaries otherwise. The `data` argument is not
        used.
        """
        if not pe.has_debug:
            return None
        found = []
        for entry in pe.debug:
            if entry.type != lief.PE.Debug.TYPES.CODEVIEW:
                continue
            # Entries that lack any of the expected attributes are skipped.
            with suppress(AttributeError):
                found.append(dict(
                    PdbPath=_STRING(entry.filename),
                    PdbGUID=entry.guid,
                    PdbAge=entry.age,
                ))
        return found[0] if len(found) == 1 else found

    def process(self, data):
        """
        Parse the input as a PE file, run every enabled metadata parser, and yield the
        combined result formatted by `ppjson`. Raises a `ValueError` if `lief.load_pe` does
        not return a PE object for the input. Nothing is yielded when no information was
        collected.
        """
        result = {}

        pe = lief.load_pe(
            data,
            parse_exports=True,
            parse_imports=self.args.imports,
            parse_rsrc=self.args.version,
            parse_reloc=False,
            parse_signature=self.args.timestamps or self.args.signatures,
        )

        if pe is None:
            raise ValueError('Input not recognized as a PE file.')

        pe = NoLoggingProxy(pe)

        # Each row is: the argument that enables the parser, the parser, and the result key.
        for switch, resolver, name in [
            (self.args.debug,   self.parse_debug,    'Debug'),    # noqa
            (self.args.dotnet,  self.parse_dotnet,   'DotNet'),   # noqa
            (self.args.header,  self.parse_header,   'Header'),   # noqa
            (self.args.version, self.parse_version,  'Version'),  # noqa
            (self.args.imports, self.parse_imports,  'Imports'),  # noqa
            (self.args.exports, self.parse_exports,  'Exports'),  # noqa
        ]:
            if not switch:
                continue
            self.log_debug(F'parsing: {name}')
            args = pe, data
            # A count above one passes True as a third positional argument; in this table,
            # only the import and export parsers accept one (include_addresses).
            if switch > 1:
                args = *args, True
            try:
                info = resolver(*args)
            except Exception as E:
                # A failing parser only costs its own section; the error is logged.
                self.log_info(F'failed to obtain {name}: {E!s}')
                continue
            if info:
                result[name] = info

        signature = {}

        if self.args.timestamps or self.args.signatures:
            # Best effort: any failure here leaves the signature information empty.
            with suppress(Exception):
                from refinery.units.formats.pe.pesig import pesig
                signature = self.parse_signature(next(data | pesig))

        if signature:
            try:
                verification = pe.verify_signature()
            except Exception:
                # No verification verdict is added when the verification itself fails.
                pass
            else:
                from lief.PE import Signature
                if verification == Signature.VERIFICATION_FLAGS.OK:
                    signature['IsValid'] = True
                else:
                    signature['Flags'] = [
                        vf.name for vf in Signature.VERIFICATION_FLAGS if vf & verification]
                    signature['IsValid'] = False

        if self.args.timestamps:
            ts = self.parse_time_stamps(pe, self.args.timeraw, self.args.timestamps > 1)
            # The signing time is added to the time stamps when the signature provides one.
            with suppress(KeyError):
                ts.update(Signed=signature['Timestamp'])
            result.update(TimeStamp=ts)

        if signature and self.args.signatures:
            result['Signature'] = signature

        if result:
            yield from ppjson(tabular=self.args.tabular)._pretty_output(result, indent=4, ensure_ascii=False)

    # Display names for character set identifiers; parse_version looks up the sublang value
    # of the first icon resource in this table.
    # NOTE(review): the '?' in the Japan and Korea entries is presumably a dash that was lost
    # to an encoding problem in the source these names were copied from - confirm before
    # changing, since these strings are part of the output.
    _CHARSET = {
        0x0000: '7-bit ASCII',
        0x03A4: 'Japan (Shift ? JIS X-0208)',
        0x03B5: 'Korea (Shift ? KSC 5601)',
        0x03B6: 'Taiwan (Big5)',
        0x04B0: 'Unicode',
        0x04E2: 'Latin-2 (Eastern European)',
        0x04E3: 'Cyrillic',
        0x04E4: 'Multilingual',
        0x04E5: 'Greek',
        0x04E6: 'Turkish',
        0x04E7: 'Hebrew',
        0x04E8: 'Arabic',
    }

    # Maps the major operating system version from the optional header to a table of known
    # minor versions. parse_header falls back to the entry for minor version 0 when the
    # minor version is not listed, so every table must contain the key 0.
    _WINVER = {
        3: {
            0x00: 'Windows NT 3',
            0x0A: 'Windows NT 3.1',
            0x32: 'Windows NT 3.5',
            0x33: 'Windows NT 3.51',
        },
        4: {
            0x00: 'Windows 95',
            0x0A: 'Windows 98',
            # Windows Me is version 4.90 and therefore belongs to major version 4.
            0x5A: 'Windows Me',
        },
        5: {
            0x00: 'Windows 2000',
            0x01: 'Windows XP',
            0x02: 'Windows Server 2003',
        },
        6: {
            0x00: 'Windows Vista',
            0x01: 'Windows 7',
            0x02: 'Windows 8',
            0x03: 'Windows 8.1',
        },
        10: {
            0x00: 'Windows 10',
        }
    }

Static methods

def parse_signature(data): Extracts a JSON-serializable and human-readable dictionary with information about time stamp and code signing certificates that are attached to the input PE file.

Methods

def parse_version(self, pe, data=None)

Extracts a JSON-serializable and human-readable dictionary with information about the version resource of an input PE file, if available.

Expand source code Browse git

def parse_version(self, pe: lief.PE.Binary, data=None) -> dict | None:
    """
    Build a JSON-serializable and human-readable dictionary from the version resource of
    the given PE file. The result is `None` when the resource manager is unavailable, when
    there is no version resource, or when no information could be collected.
    """
    manager = unwrap(pe.resources_manager)
    if isinstance(manager, lief.lib.lief_errors) or not manager.has_version:
        return None

    collected = {}
    version = manager.version[0]

    # String tables: spaces are removed from the keys.
    if string_info := version.string_file_info:
        for table in string_info.children:
            pairs = {item.key: item.value for item in table.entries}
            for key, text in pairs.items():
                collected[key.replace(' ', '')] = _STRING(text)

    # Language information is taken from the first icon resource.
    if manager.has_icons:
        first_icon = next(iter(manager.icons))
        lang_id = self._vint(pe, first_icon.lang << 0x10 | first_icon.sublang)
        language = LCID.get(first_icon.lang, 'Language Neutral')
        charset = self._CHARSET.get(first_icon.sublang, 'Unknown Charset')
        collected.update(LangID=lang_id, Language=language, Charset=charset)

    def _code_pages(root):
        # Walks the resource tree without recursion and yields the code page of each leaf.
        pending = [root]
        while pending:
            node = pending.pop()
            if isinstance(node, lief.PE.ResourceData):
                yield node.code_page
            else:
                pending.extend(node.childs)

    code_pages = set()
    for kind in manager.types:
        code_pages.update(_code_pages(manager.get_node_type(kind)))

    # A code page is only reported when all resources agree on it.
    if len(code_pages) == 1:
        (only_page,) = code_pages
        collected['CodePage'] = only_page

    def _to_version_string(hi: int, lo: int):
        parts = (hi >> 0x10, hi & 0xFFFF, lo >> 0x10, lo & 0xFFFF)
        return '.'.join(str(part) for part in parts)

    # TODO: Missing: Version.CompanyName
    # TODO: Missing: Version.FileDescription
    # TODO: Missing: Version.LegalCopyright
    # TODO: Missing: Version.ProductName

    if fixed := version.file_info:
        enumerated = (
            ('FileType', fixed.file_type, fixed.FILE_TYPE),
            ('OSName', fixed.file_os, fixed.VERSION_OS),
            ('FileSubType', fixed.file_subtype, fixed.FILE_TYPE_DETAILS),
        )
        for label, raw, enum_type in enumerated:
            if not raw:
                continue
            try:
                collected[label] = enum_type(raw).name
            except Exception:
                # values without a known name are omitted
                pass
        filetime = (fixed.file_date_ms << 32) | fixed.file_date_ls
        if filetime:
            collected['Timestamp'] = _FILETIME(filetime)
        product = _to_version_string(fixed.product_version_ms, fixed.product_version_ls)
        release = _to_version_string(fixed.file_version_ms, fixed.file_version_ls)
        collected.update(ProductVersion=product, FileVersion=release)

    # The variable file information is queried but not evaluated yet.
    if version.var_file_info:
        pass

    return collected or None

def parse_exports(self, pe, data=None, include_addresses=False)

Expand source code Browse git

def parse_exports(self, pe: lief.PE.Binary, data=None, include_addresses=False) -> list | None:
    """
    List the exported functions of the given PE file. The result is `None` when the PE has
    no exports. Otherwise it is a list with one item per export entry: the name, or a
    dictionary with the keys `Name` and `Address` if `include_addresses` is set. Entries
    without a name are listed as `@k` where `k` is the position of the entry. The `data`
    argument is not used.
    """
    # Check for exports first so that nothing else is read from an export-free PE.
    if not pe.has_exports:
        return None
    base = pe.optional_header.imagebase
    info = []
    for k, exp in enumerate(pe.get_export().entries):
        name = exp.demangled_name or exp.name or F'@{k}'
        if not isinstance(name, str):
            name = name.decode('latin1')
        if include_addresses:
            info.append({'Name': name, 'Address': self._vint(pe, exp.address + base)})
        else:
            info.append(name)
    return info

def parse_imports(self, pe, data=None, include_addresses=False)

Expand source code Browse git

def parse_imports(self, pe: lief.PE.Binary, data=None, include_addresses=False) -> dict:
    """
    Collect the imported functions of the given PE file, including delay imports. The
    result is a dictionary that maps each library name, with a trailing `.dll` extension
    removed, to the list of its imports. Each import is the symbol name, or a dictionary
    with the keys `Name` and `Address` if `include_addresses` is set. Imports without a
    name are listed as `@` followed by the ordinal. The `data` argument is not used.
    """
    info = {}
    for idd in itertools.chain(pe.imports, pe.delay_imports):
        dll = _STRING(idd.name)
        if dll.lower().endswith('.dll'):
            dll = dll[:-4]
        # The same library can occur several times; its imports are merged.
        imports: list = info.setdefault(dll, [])
        for imp in idd.entries:
            name = _STRING(imp.name) or F'@{imp.ordinal}'
            if include_addresses:
                imports.append(dict(Name=name, Address=self._vint(pe, imp.value)))
            else:
                imports.append(name)
    return info

def parse_header(self, pe, data=None)

Expand source code Browse git

def parse_header(self, pe: lief.PE.Binary, data=None) -> dict:
    """
    Extract base information from the PE header: machine, subsystem, minimum operating
    system, export name, rich header entries, file type, image base and size, computed
    size, bitness, and entry point. The `data` argument is not used.
    """
    major = pe.optional_header.major_operating_system_version
    minor = pe.optional_header.minor_operating_system_version
    version = self._WINVER.get(major, {0: 'Unknown'})

    try:
        MinimumOS = version[minor]
    except LookupError:
        # unknown minor version: fall back to the generic name for the major version
        MinimumOS = version[0]
    header_information = {
        'Machine': pe.header.machine.name,
        'Subsystem': pe.optional_header.subsystem.name,
        'MinimumOS': MinimumOS,
    }
    if pe.has_exports:
        export_name = _STRING(pe.get_export().name)
        if export_name.isprintable():
            header_information['ExportName'] = export_name

    if pe.has_rich_header:
        rich = []
        if self.args.tabular:
            # Width of the widest counter. The default prevents a ValueError from max()
            # when the rich header has no entries.
            cw = max((len(F'{entry.count:d}') for entry in pe.rich_header.entries), default=0)
        for entry in pe.rich_header.entries:
            idv = entry.build_id | (entry.id << 0x10)
            count = entry.count
            info = get_rich_info(idv)
            if not info:
                continue
            pid = info.pid.upper()
            if self.args.tabular:
                short_pid = get_rich_short_pid(pid)
                rich.append(F'[{idv:08x}] {count:>0{cw}d} {short_pid!s} {info.ver}')
            else:
                rich.append({
                    'Counter': count,
                    'Encoded': F'{idv:08x}',
                    'Library': pid,
                    'Product': info.ver,
                })
        header_information['RICH'] = rich

    characteristics = self._pe_characteristics(pe)
    # Later matches overwrite earlier ones, so the last flag that is set determines the type.
    for typespec, flag in {
        'EXE': 'IMAGE_FILE_EXECUTABLE_IMAGE',
        'DLL': 'IMAGE_FILE_DLL',
        'SYS': 'IMAGE_FILE_SYSTEM'
    }.items():
        if flag in characteristics:
            header_information['Type'] = typespec

    base = pe.optional_header.imagebase
    header_information['ImageBase'] = self._vint(pe, base)
    header_information['ImageSize'] = self._vint(pe, pe.optional_header.sizeof_image)
    header_information['ComputedSize'] = get_pe_size(pe)
    # The address width counts hexadecimal digits, each of which represents 4 bits.
    header_information['Bits'] = 4 * self._pe_address_width(pe, 16)
    header_information['EntryPoint'] = self._vint(pe, pe.optional_header.addressof_entrypoint + base)
    return header_information

def parse_time_stamps(self, pe, raw_time_stamps, more_detail)

Extracts time stamps from the PE header (link time), as well as from the imports, exports, debug, and resource directory. The resource time stamp is also parsed as a DOS time stamp and returned as the "Delphi" time stamp.

Expand source code Browse git

def parse_time_stamps(self, pe: lief.PE.Binary, raw_time_stamps: bool, more_detail: bool) -> dict:
    """
    Collects the time stamps stored in a PE file: the link time from the PE header and the time
    stamps of the import, delay import, export, and resource directories. The resource time stamp
    is additionally decoded as a DOS time stamp and reported under the key "Delphi".

    When `raw_time_stamps` is set, the values are reported as read from the file, otherwise they
    are passed through `date_from_timestamp`. Unless `more_detail` is set, per-library import time
    stamps that are less than two hours apart are collapsed into the earliest one.
    """
    if raw_time_stamps:
        def render(ts):
            return ts
    else:
        render = date_from_timestamp

    def collect(entries, attribute):
        # Map library names to their time stamps; the placeholder values 0 and 0xFFFFFFFF
        # are skipped.
        stamps = {}
        for item in entries:
            value = getattr(item, attribute)
            if value in (0, 0xFFFFFFFF):
                continue
            stamps[_STRING(item.name, True)] = render(value)
        return stamps

    report = {}

    with suppress(AttributeError):
        report['Linker'] = render(pe.header.time_date_stamps)

    grouped = (
        ('Import', collect(pe.imports, 'timedatestamp')),
        ('Symbol', collect(pe.delay_imports, 'timestamp')),
    )

    for label, stamps in grouped:
        if not stamps:
            continue
        if not more_detail:
            earliest = min(stamps.values())
            latest = max(stamps.values())
            # Two hours, expressed in the same unit as the rendered time stamps.
            threshold = 2 * 60 * 60
            if not raw_time_stamps:
                threshold = timedelta(seconds=threshold)
            if latest - earliest < threshold:
                stamps = earliest
        report[label] = stamps

    if pe.has_exports:
        export_ts = pe.get_export().timestamp
        if export_ts:
            report['Export'] = render(export_ts)

    if pe.has_resources and pe.resources.is_directory:
        resource_ts = pe.resources.time_date_stamp
        if resource_ts:
            # A ValueError from either conversion abandons the remaining resource entries.
            with suppress(ValueError):
                from refinery.units.misc.datefix import datefix
                report['Delphi'] = datefix.dostime(resource_ts)
                report['RsrcTS'] = render(resource_ts)

    def normalize(value):
        # Rebuild containers recursively, keep integers, and turn everything else into a string.
        if isinstance(value, dict):
            return {k: normalize(v) for k, v in value.items()}
        if isinstance(value, list):
            return [normalize(v) for v in value]
        return value if isinstance(value, int) else str(value)

    return {name: normalize(value) for name, value in report.items()}

def parse_dotnet(self, pe, data)

Extracts a JSON-serializable and human-readable dictionary with information about the .NET metadata of an input PE file.

Expand source code Browse git

def parse_dotnet(self, pe: lief.PE.Binary, data):
    """
    Builds a JSON-serializable, human-readable dictionary that summarizes the .NET metadata of
    the given PE file. Assembly and module details are only included when the corresponding
    metadata table contains exactly one row.
    """
    header = DotNetHeader(data, pe)
    meta = header.meta
    tables = meta.Streams.Tables
    head = header.head

    info = {
        'RuntimeVersion': F'{head.MajorRuntimeVersion}.{head.MinorRuntimeVersion}',
        'Version': F'{meta.MajorVersion}.{meta.MinorVersion}',
        'VersionString': meta.VersionString,
        'Flags': [name for name, check in head.KnownFlags.items() if check],
    }

    assemblies = tables.Assembly
    if len(assemblies) == 1:
        assembly = assemblies[0]
        name = assembly.Name
        version = (
            assembly.MajorVersion,
            assembly.MinorVersion,
            assembly.BuildNumber,
            assembly.RevisionNumber,
        )
        info.update(AssemblyName=name, Release='{}.{}.{}.{}'.format(*version))

    try:
        # The entry point is reported relative to the image base; it is omitted if any of the
        # required attributes is missing.
        address = head.EntryPointToken + pe.optional_header.imagebase
        info['EntryPoint'] = self._vint(pe, address)
    except AttributeError:
        pass

    modules = tables.Module
    if len(modules) == 1:
        info['ModuleName'] = modules[0].Name

    return info

def parse_debug(self, pe, data=None)

Expand source code Browse git

def parse_debug(self, pe: lief.PE.Binary, data=None):
    """
    Collects PDB references from the CodeView entries of the PE debug directory. Returns `None`
    if the file has no debug information, a single dictionary if exactly one record was read,
    and a list of dictionaries otherwise. The `data` argument is not used.
    """
    if not pe.has_debug:
        return None
    records = []
    for entry in pe.debug:
        if entry.type != lief.PE.Debug.TYPES.CODEVIEW:
            continue
        try:
            record = {
                'PdbPath': _STRING(entry.filename),
                'PdbGUID': entry.guid,
                'PdbAge': entry.age,
            }
        except AttributeError:
            # CodeView entries that lack one of the PDB attributes are skipped.
            continue
        records.append(record)
    if len(records) == 1:
        return records[0]
    return records

class peoverlay (certificate=False, directories=False, memdump=False)

This unit is implemented in refinery.units.formats.pe.peoverlay and has the following commandline Interface:

usage: peoverlay [-h] [-L] [-Q] [-0] [-v] [-c] [-d] [-m]

Returns the overlay of a PE file, i.e. anything that may have been appended to the file. This
does not include digital signatures. Use pestrip to obtain only the body of the PE file after
removing the overlay.

options:
  -c, --cert     Include digital signatures for the size computation.
  -d, --dirs     Include data directories for size computation.
  -m, --memdump  Assume that the file data was a memory-mapped PE file.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class peoverlay(OverlayUnit):
    """
    Returns the overlay of a PE file, i.e. anything that may have been appended to the file.
    This does not include digital signatures. Use `refinery.pestrip` to obtain only the body
    of the PE file after removing the overlay.
    """
    def process(self, data: bytearray) -> bytearray:
        # The size of the PE body is computed by the base class; the overlay is what follows.
        body = self._get_size(data)
        try:
            # Preferably, the body is removed from the input buffer in place.
            data[:body] = []
        except Exception:
            # If the buffer cannot be modified, the overlay is returned as a slice instead.
            return data[body:]
        return data

class perc (*paths, pretty=False, path=b'path', regex=False, exact=False, fuzzy=0, drop_path=False, join_path=False, list=False)

This unit is implemented in refinery.units.formats.pe.perc and has the following commandline Interface:

usage: perc [-h] [-L] [-Q] [-0] [-v] [-p] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path ...]

Extract PE file resources.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards. The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -p, --pretty     Add missing headers to bitmap and icon resources.
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class perc(PathExtractorUnit):
    """
    Extract PE file resources.
    """
    def __init__(
        self, *paths,
        pretty: Param[bool, Arg.Switch('-p', help='Add missing headers to bitmap and icon resources.')] = False,
        **kwargs
    ):
        # Forward the path patterns and all inherited options to the base class and register the
        # additional `pretty` switch, which is read as `self.args.pretty` during the search.
        super().__init__(*paths, pretty=pretty, **kwargs)

    def _get_icon_dir(self, pe: lief.PE.Binary):
        """
        Parses the first icon group resource of the given PE file as a `GRPICONDIR`. The result
        is `None` if the root of the resource tree has no icon group node.
        """
        for node in pe.resources.childs:
            if node.id == RSRC.ICON_GROUP.value:
                # Only the first leaf of the first icon group is considered.
                leaf: lief.PE.ResourceData = node.childs[0].childs[0]
                return GRPICONDIR(bytearray(leaf.content))
        return None

    def _search(self, pe: lief.PE.Binary, directory: lief.PE.ResourceDirectory, *parts):
        """
        Recursively walks the resource tree below `directory` and generates one `UnpackResult`
        for each data entry. The identifiers of all parent nodes are passed down in `parts`;
        joined with the identifier of the entry itself along forward slashes, they form the path
        of the result. If the `pretty` option is set, the extraction of bitmap, icon, and icon
        group resources is wrapped to post-process the raw resource data.
        """
        if directory.depth >= 3:
            self.log_warn(F'unexpected resource tree level {directory.depth + 1:d}')
        for entry in directory.childs:
            entry: lief.PE.ResourceData
            if entry.has_name:
                identifier = str(entry.name)
            elif directory.depth == 0 and entry.id in iter(RSRC):
                # Numeric identifiers are only translated to RSRC members at the root level.
                identifier = RSRC(entry.id)
            elif entry.id is not None:
                identifier = entry.id
            else:
                self.log_warn(F'resource entry has name {entry.name} and id {entry.id} at level {directory.depth + 1:d}')
                continue
            if entry.is_directory:
                yield from self._search(pe, entry, *parts, identifier)
            else:
                # The entry is bound as a default argument so that the closure keeps referring
                # to this entry after the loop has moved on.
                def extract(_=pe, e=entry):
                    return bytearray(e.content)
                path = '/'.join(str(p) for p in (*parts, identifier))
                # A data entry directly below the root has no parent identifiers. There is no
                # resource type to dispatch on in that case, so the raw data is returned rather
                # than failing with an IndexError on the empty tuple.
                if self.args.pretty and parts:
                    if parts[0] is RSRC.BITMAP:
                        extract = self._handle_bitmap(extract)
                    elif parts[0] is RSRC.ICON:
                        extract = self._handle_icon(pe, extract, parts)
                    elif parts[0] is RSRC.ICON_GROUP:
                        # Icon groups are rendered as a JSON document that lists each icon.
                        def extract(_=pe, e=entry):
                            data = GRPICONDIR(e.content)
                            return json.dumps({
                                entry.nid: {
                                    'width'         : entry.width,
                                    'height'        : entry.height,
                                    'bytes'         : entry.bytes_in_res,
                                    'color'         : {
                                        'count'     : entry.color_count,
                                        'planes'    : entry.planes,
                                        'bits'      : entry.bit_count,
                                    },
                                } for entry in data.entries},
                                indent=4
                            ).encode(self.codec)

                yield UnpackResult(
                    path,
                    extract,
                    lcid=self._get_lcid(entry),
                    offset=entry.offset,
                )

    def _get_lcid(self, node_data) -> str | None:
        try:
            pid = node_data.id & 0x3FF
            sid = node_data.id >> 0x0A
        except AttributeError:
            return None
        try:
            pid = self._LANG_ID_TO_LCID[pid]
        except KeyError:
            return None
        lcid = pid.get(sid, 0)
        return LCID.get(lcid)

    def _handle_bitmap(self, extract_raw_data: Callable[[], buf]) -> buf:
        def extract():
            bitmap = extract_raw_data()
            total = (len(bitmap) + 14).to_bytes(4, 'little')
            return B'BM' + total + B'\0\0\0\0\x36\0\0\0' + bitmap
        return extract

    def _handle_icon(
        self,
        pe: lief.PE.Binary,
        extract_raw_data: Callable[[], buf],
        parts: tuple[RSRC, int, int]
    ) -> buf:
        try:
            icondir = self._get_icon_dir(pe)
            index = int(parts[1]) - 1
            info = icondir.entries[index]
        except IndexError:
            return extract_raw_data
        except Exception as E:
            self.log_warn(F'unable to generate icon header: {E!s}')
            return extract_raw_data

        def extract(info=info):
            icon = extract_raw_data()
            if icon.startswith(B'(\0\0\0'):
                header = struct.pack('<HHHBBBBHHII',
                    0,
                    1,
                    1,
                    info.width,
                    info.height,
                    info.color_count,
                    0,
                    info.planes,
                    info.bit_count,
                    len(icon),
                    0x16
                )
                icon = header + icon
            return icon

        return extract

    def unpack(self, data):
        """
        Parses the input as a PE file including its resources and generates one result for
        each resource data entry. Nothing is generated for files without resources.
        """
        pe = lief.load_pe_fast(data, parse_rsrc=True)
        if pe.has_resources:
            yield from self._search(pe, pe.resources)

    def _mktbl(ids: list[tuple[int, int, int]]) -> dict[int, dict[int, int]]:
        table = {}
        for pid, sid, lcid in ids:
            if pid not in table:
                table[pid] = {0: lcid}
            table[pid][sid] = lcid
        return table

    _LANG_ID_TO_LCID = _mktbl([
        (0x00, 0x03, 0x0C00),
        (0x00, 0x05, 0x1400),
        (0x7F, 0x00, 0x007F),
        (0x00, 0x00, 0x0000),
        (0x02, 0x02, 0x0800),
        (0x00, 0x04, 0x1000),
        (0x00, 0x01, 0x0400),
        (0x36, 0x01, 0x0436),
        (0x1c, 0x01, 0x041C),
        (0x84, 0x01, 0x0484),
        (0x5E, 0x01, 0x045E),
        (0x01, 0x05, 0x1401),
        (0x01, 0x0f, 0x3C01),
        (0x01, 0x03, 0x0C01),
        (0x01, 0x02, 0x0801),
        (0x01, 0x0B, 0x2C01),
        (0x01, 0x0D, 0x3401),
        (0x01, 0x0C, 0x3001),
        (0x01, 0x04, 0x1001),
        (0x01, 0x06, 0x1801),
        (0x01, 0x08, 0x2001),
        (0x01, 0x10, 0x4001),
        (0x01, 0x01, 0x0401),
        (0x01, 0x0A, 0x2801),
        (0x01, 0x07, 0x1C01),
        (0x01, 0x0E, 0x3801),
        (0x01, 0x09, 0x2401),
        (0x2B, 0x01, 0x042B),
        (0x4D, 0x01, 0x044D),
        (0x2C, 0x02, 0x082C),
        (0x2C, 0x01, 0x042C),
        (0x45, 0x02, 0x0445),
        (0x6D, 0x01, 0x046D),
        (0x2d, 0x01, 0x042D),
        (0x23, 0x01, 0x0423),
        (0x1A, 0x08, 0x201A),
        (0x1A, 0x05, 0x141A),
        (0x7E, 0x01, 0x047E),
        (0x02, 0x01, 0x0402),
        (0x92, 0x01, 0x0492),
        (0x5C, 0x01, 0x045C),
        (0x03, 0x01, 0x0403),
        (0x04, 0x03, 0x0C04),
        (0x04, 0x05, 0x1404),
        (0x04, 0x04, 0x1004),
        (0x04, 0x02, 0x0004),
        (0x04, 0x01, 0x7C04),
        (0x83, 0x01, 0x0483),
        (0x1A, None, 0x001A),
        (0x1a, 0x04, 0x101A),
        (0x1a, 0x01, 0x041A),
        (0x05, 0x01, 0x0405),
        (0x06, 0x01, 0x0406),
        (0x8C, 0x01, 0x048C),
        (0x65, 0x01, 0x0465),
        (0x13, 0x02, 0x0813),
        (0x13, 0x01, 0x0413),
        (0x09, 0x03, 0x0C09),
        (0x09, 0x0A, 0x2809),
        (0x09, 0x04, 0x1009),
        (0x09, 0x09, 0x2409),
        (0x09, 0x10, 0x4009),
        (0x09, 0x06, 0x1809),
        (0x09, 0x08, 0x2009),
        (0x09, 0x11, 0x4409),
        (0x09, 0x05, 0x1409),
        (0x09, 0x0D, 0x3409),
        (0x09, 0x12, 0x4809),
        (0x09, 0x07, 0x1c09),
        (0x09, 0x0B, 0x2C09),
        (0x09, 0x02, 0x0809),
        (0x09, 0x01, 0x0409),
        (0x09, 0x0C, 0x3009),
        (0x25, 0x01, 0x0425),
        (0x38, 0x01, 0x0438),
        (0x64, 0x01, 0x0464),
        (0x0B, 0x01, 0x040B),
        (0x0C, 0x02, 0x080c),
        (0x0C, 0x03, 0x0C0C),
        (0x0C, 0x01, 0x040c),
        (0x0C, 0x05, 0x140C),
        (0x0C, 0x06, 0x180C),
        (0x0C, 0x04, 0x100C),
        (0x62, 0x01, 0x0462),
        (0x56, 0x01, 0x0456),
        (0x37, 0x01, 0x0437),
        (0x07, 0x03, 0x0C07),
        (0x07, 0x01, 0x0407),
        (0x07, 0x05, 0x1407),
        (0x07, 0x04, 0x1007),
        (0x07, 0x02, 0x0807),
        (0x08, 0x01, 0x0408),
        (0x6F, 0x01, 0x046F),
        (0x47, 0x01, 0x0447),
        (0x68, 0x01, 0x0468),
        (0x75, 0x01, 0x0475),
        (0x0D, 0x01, 0x040D),
        (0x39, 0x01, 0x0439),
        (0x0E, 0x01, 0x040E),
        (0x0F, 0x01, 0x040F),
        (0x70, 0x01, 0x0470),
        (0x21, 0x01, 0x0421),
        (0x5D, 0x02, 0x085D),
        (0x5D, 0x01, 0x045D),
        (0x3C, 0x02, 0x083C),
        (0x34, 0x01, 0x0434),
        (0x35, 0x01, 0x0435),
        (0x10, 0x01, 0x0410),
        (0x10, 0x02, 0x0810),
        (0x11, 0x01, 0x0411),
        (0x4B, 0x01, 0x044B),
        (0x3F, 0x01, 0x043F),
        (0x53, 0x01, 0x0453),
        (0x86, 0x01, 0x0486),
        (0x87, 0x01, 0x0487),
        (0x57, 0x01, 0x0457),
        (0x12, 0x01, 0x0412),
        (0x40, 0x01, 0x0440),
        (0x54, 0x01, 0x0454),
        (0x26, 0x01, 0x0426),
        (0x27, 0x01, 0x0427),
        (0x2E, 0x02, 0x082E),
        (0x6E, 0x01, 0x046E),
        (0x2F, 0x01, 0x042F),
        (0x3E, 0x02, 0x083E),
        (0x3E, 0x01, 0x043e),
        (0x4C, 0x01, 0x044C),
        (0x3A, 0x01, 0x043A),
        (0x81, 0x01, 0x0481),
        (0x7A, 0x01, 0x047A),
        (0x4E, 0x01, 0x044E),
        (0x7C, 0x01, 0x047C),
        (0x50, 0x01, 0x0450),
        (0x50, 0x02, 0x0850),
        (0x61, 0x01, 0x0461),
        (0x14, 0x01, 0x0414),
        (0x14, 0x02, 0x0814),
        (0x82, 0x01, 0x0482),
        (0x48, 0x01, 0x0448),
        (0x63, 0x01, 0x0463),
        (0x29, 0x01, 0x0429),
        (0x15, 0x01, 0x0415),
        (0x16, 0x01, 0x0416),
        (0x16, 0x02, 0x0816),
        (0x67, 0x02, 0x0867),
        (0x46, 0x01, 0x0446),
        (0x46, 0x02, 0x0846),
        (0x6B, 0x01, 0x046B),
        (0x6B, 0x02, 0x086B),
        (0x6B, 0x03, 0x0C6B),
        (0x18, 0x01, 0x0418),
        (0x17, 0x01, 0x0417),
        (0x19, 0x01, 0x0419),
        (0x85, 0x01, 0x0485),
        (0x3B, 0x09, 0x243B),
        (0x3B, 0x04, 0x103B),
        (0x3B, 0x05, 0x143B),
        (0x3B, 0x03, 0x0C3B),
        (0x3B, 0x01, 0x043B),
        (0x3B, 0x02, 0x083B),
        (0x3B, 0x08, 0x203B),
        (0x3B, 0x06, 0x183B),
        (0x3B, 0x07, 0x1C3B),
        (0x4F, 0x01, 0x044F),
        (0x1a, 0x07, 0x1C1A),
        (0x1a, 0x06, 0x181A),
        (0x1a, 0x03, 0x0C1A),
        (0x1a, 0x02, 0x081A),
        (0x6C, 0x01, 0x046C),
        (0x32, 0x02, 0x0832),
        (0x32, 0x01, 0x0432),
        (0x32, 0x01, 0x0459),
        (0x32, 0x02, 0x0859),
        (0x5B, 0x01, 0x045B),
        (0x1b, 0x01, 0x041B),
        (0x24, 0x01, 0x0424),
        (0x0A, 0x0b, 0x2C0A),
        (0x0A, 0x10, 0x400A),
        (0x0A, 0x0D, 0x340A),
        (0x0A, 0x09, 0x240A),
        (0x0A, 0x05, 0x140A),
        (0x0A, 0x07, 0x1C0A),
        (0x0A, 0x0C, 0x300A),
        (0x0A, 0x11, 0x440A),
        (0x0A, 0x04, 0x100A),
        (0x0A, 0x12, 0x480A),
        (0x0A, 0x02, 0x080A),
        (0x0A, 0x13, 0x4C0A),
        (0x0A, 0x06, 0x180A),
        (0x0A, 0x0F, 0x3C0A),
        (0x0A, 0x0A, 0x280A),
        (0x0A, 0x14, 0x500A),
        (0x0A, 0x03, 0x0C0A),
        (0x0A, 0x01, 0x040A),
        (0x0A, 0x15, 0x540A),
        (0x0A, 0x0E, 0x380A),
        (0x0A, 0x08, 0x200A),
        (0x41, 0x01, 0x0441),
        (0x1D, 0x02, 0x081D),
        (0x1D, 0x01, 0x041D),
        (0x5A, 0x01, 0x045A),
        (0x28, 0x01, 0x0428),
        (0x5F, 0x02, 0x085F),
        (0x49, 0x01, 0x0449),
        (0x49, 0x02, 0x0849),
        (0x44, 0x01, 0x0444),
        (0x4A, 0x01, 0x044A),
        (0x1E, 0x01, 0x041E),
        (0x51, 0x01, 0x0451),
        (0x73, 0x02, 0x0873),
        (0x73, 0x01, 0x0473),
        (0x1F, 0x01, 0x041F),
        (0x42, 0x01, 0x0442),
        (0x22, 0x01, 0x0422),
        (0x2E, 0x01, 0x042E),
        (0x20, 0x02, 0x0820),
        (0x20, 0x01, 0x0420),
        (0x80, 0x01, 0x0480),
        (0x43, 0x02, 0x0843),
        (0x43, 0x01, 0x0443),
        (0x03, 0x02, 0x0803),
        (0x2A, 0x01, 0x042A),
        (0x52, 0x01, 0x0452),
        (0x88, 0x01, 0x0488),
        (0x78, 0x01, 0x0478),
        (0x6A, 0x01, 0x046A),
    ])

class pesig

This unit is implemented in refinery.units.formats.pe.pesig and has the following commandline Interface:

usage: pesig [-h] [-L] [-Q] [-0] [-v]

Extracts the contents of the IMAGE_DIRECTORY_ENTRY_SECURITY entry of a PE file, i.e. the digital
signatures in DER format.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class pesig(Unit):
    """
    Extracts the contents of the IMAGE_DIRECTORY_ENTRY_SECURITY entry of a PE file,
    i.e. the digital signatures in DER format.
    """
    def process(self, data: bytearray):
        # Work on a memory view so that slicing the signature does not copy the input.
        view = memoryview(data)
        pe = lief.load_pe_fast(view)
        security = pe.data_directory(lief.PE.DataDirectory.TYPES.CERTIFICATE_TABLE)
        offset = security.rva
        size = security.size
        self.log_info(F'signature offset: 0x{offset:08X}')
        self.log_info(F'signature length: 0x{size:08X}')
        if not offset or not size:
            raise ValueError('IMAGE_DIRECTORY_ENTRY_SECURITY is corrupt.')
        # The entry starts with an 8 byte header: a 32-bit length followed by two 16-bit fields
        # that are not evaluated here. The signature data follows this header.
        start = offset + 8
        length, _, _ = struct.unpack('<IHH', view[offset:start])
        # NOTE(review): the slice end adds the full directory size to an offset that is already
        # past the header. Presumably the slice is normally cut short by the end of the file;
        # confirm whether trailing data after the certificate table needs to be handled.
        signature = view[start:start + size]
        found = len(signature) + 8
        if found != length:
            raise RefineryPartialResult(
                F'Found {found} bytes of signature, but length should be {length}.',
                partial=signature)
        return signature

class pestrip (certificate=False, directories=False, memdump=False)

This unit is implemented in refinery.units.formats.pe.pestrip and has the following commandline Interface:

usage: pestrip [-h] [-L] [-Q] [-0] [-v] [-c] [-d] [-m]

Removes the overlay of a PE file and returns the main executable. Use peoverlay to extract the
overlay.

options:
  -c, --cert     Include digital signatures for the size computation.
  -d, --dirs     Include data directories for size computation.
  -m, --memdump  Assume that the file data was a memory-mapped PE file.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class pestrip(OverlayUnit):
    """
    Removes the overlay of a PE file and returns the main executable. Use `refinery.peoverlay` to
    extract the overlay.
    """

    def process(self, data: bytearray) -> bytearray:
        # The size of the PE body is computed by the base class; everything after it is removed.
        size = self._get_size(data)
        try:
            # Preferably, the input buffer is truncated in place.
            data[size:] = []
        except Exception:
            # If the buffer cannot be modified in place, the body is returned as a slice. This
            # branch has to return as well; otherwise the unit would produce None.
            return data[:size]
        else:
            return data

class pf (*formats, variable=None, separator=' ', multiplex=False, binary=False, unescape=False)

This unit is implemented in refinery.units.strings.pf and has the following commandline Interface:

usage: pf [-h] [-L] [-Q] [-0] [-v] [-n N] [-s S | -m] [-b] [-e] [format ...]

Stands for "Print Format": Transform a given chunk by applying a format string operation. The
positional format string placeholder {} will be replaced by the incoming data, named placeholders
have to exist as meta variables in the current chunk. For example, the following pipeline can be
used to print all files in a given directory with their corresponding SHA-256 hash:

    ef ** [| sha256 -t | pf {} {path} ]]

By default, format string arguments are simply joined along a space character to form a single
format string.

positional arguments:
  format             Format strings.

options:
  -n, --variable N   Store the formatted string in a meta variable.
  -s, --separator S  Separator to insert between format strings. The default is a space
                     character.
  -m, --multiplex    Do not join the format strings along the separator, generate one output for
                     each.
  -b, --binary       Use the binary formatter instead of the string formatter.
  -e, --unescape     Interpret escape sequences in format strings.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class pf(Unit):
    """
    Stands for "Print Format": Transform a given chunk by applying a format string operation. The
    positional format string placeholder `{}` will be replaced by the incoming data, named
    placeholders have to exist as meta variables in the current chunk. For example, the following
    pipeline can be used to print all files in a given directory with their corresponding SHA-256
    hash:

        ef ** [| sha256 -t | pf {} {path} ]]

    By default, format string arguments are simply joined along a space character to form a single
    format string.
    """

    def __init__(
        self,
        *formats: Param[buf, Arg.Binary(help='Format strings.', metavar='format')],
        variable: Param[str, Arg.String('-n', metavar='N', help='Store the formatted string in a meta variable.')] = None,
        separator: Param[str, Arg.String('-s', group='SEP', metavar='S',
            help='Separator to insert between format strings. The default is a space character.')] = ' ',
        multiplex: Param[bool, Arg.Switch('-m', group='SEP',
            help='Do not join the format strings along the separator, generate one output for each.')] = False,
        binary: Param[bool, Arg.Switch('-b', help='Use the binary formatter instead of the string formatter.')] = False,
        unescape: Param[bool, Arg.Switch('-e', help='Interpret escape sequences in format strings.')] = False,
    ):
        def as_text(fmt: bytes | str) -> str:
            # Convert a format specification to a string. With the unescape option, escape
            # sequences are interpreted; otherwise binary input is decoded with the unit codec.
            if unescape:
                raw = fmt.encode('latin1') if isinstance(fmt, str) else fmt
                return bytes(raw).decode('unicode-escape')
            if isinstance(fmt, str):
                return fmt
            return bytes(fmt).decode(self.codec)

        specs = [as_text(f) for f in formats]
        if not multiplex:
            # Without multiplexing, all format strings are merged into a single one.
            specs = [as_text(separator).join(specs)]
        super().__init__(formats=specs, variable=variable, binary=binary)

    def process(self, data):
        meta = metavars(data)
        meta.ghost = True
        args = [data]
        name = self.args.variable
        if self.args.binary:
            render = partial(meta.format_bin, codec=self.codec, args=args)
        else:
            def render(spec):
                return meta.format_str(spec, self.codec, args).encode(self.codec)
        for spec in self.args.formats:
            output = render(spec)
            if name is None:
                yield output
            else:
                # Emit the input data and attach the formatted result as a meta variable.
                yield self.labelled(data, **{name: output})

class pick (*bounds)

This unit is implemented in refinery.units.meta.pick and has the following commandline Interface:

usage: pick [-h] [-L] [-Q] [-0] [-v] [start:end:step ...]

Picks sequences from the array of multiple inputs. For example, pick 0 2: will return all but the
second ingested input (which has index 1).

positional arguments:
  start:end:step  Specify start:end:step in Python slice syntax. The default is 0.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class pick(Unit):
    """
    Picks sequences from the array of multiple inputs. For example, `pick 0 2:`
    will return all but the second ingested input (which has index `1`).
    """
    def __init__(self, *bounds: Param[slice, Arg.Bounds(nargs='*', default=[0])]):
        # Every bound is passed through sliceobj before it is stored as a unit argument.
        super().__init__(bounds=[sliceobj(s) for s in bounds])

    def process(self, data: Chunk):
        # Emits the chunks selected by one bound. The visible chunks that arrive here are the
        # containers generated by `filter`, which carry the shared pick state in `temp`.

        # Chunks that are not visible are forwarded untouched.
        if not data.visible:
            yield data
            return

        state: _PickState = data.temp
        a = state.accessor
        lower = a.start
        upper = a.stop

        # Shift the bounds by the number of visible chunks that were already taken from the input.
        if lower is not None:
            lower -= state.discarded
        if upper is not None:
            upper -= state.discarded
        # If the input was buffered by `filter`, the bound is applied to the buffer directly.
        if state.consumed:
            yield from state.remaining[slice(lower, upper, a.step)]
            return

        # Skip visible chunks until the lower bound is reached; invisible chunks pass through.
        while lower:
            try:
                chunk = next(state.chunks)
            except StopIteration:
                upper = None
                break
            if chunk.visible:
                lower -= 1
                # NOTE(review): this raises a TypeError if the bound has no stop value;
                # presumably `discardable` prevents such bounds from reaching this branch. Confirm.
                upper -= 1
                state.discarded += 1
            else:
                yield chunk
        if upper is None:
            yield from state.chunks
            return
        # Emit chunks until the upper bound is reached; only visible chunks count towards it.
        while upper:
            try:
                chunk = next(state.chunks)
            except StopIteration:
                break
            if chunk.visible:
                upper -= 1
                state.discarded += 1
            yield chunk

    def filter(self, chunks: Iterable[Chunk]):
        # Generates one visible container chunk for each bound; `process` then emits the chunks
        # selected by that bound. Raises a RuntimeError when used outside of a frame.
        chunks = begin(chunks)
        if chunks is None:
            return
        container, chunks = chunks
        if container.scope < 1:
            raise RuntimeError(F'{self.__class__.__name__} cannot be used outside a frame; maybe you meant to use snip?')
        container = container.copy()
        container.visible = True
        state = _PickState(deque(self.args.bounds), chunks)
        while state.next():
            if not state.consumed:
                if not state.discardable():
                    # The current bound cannot be served from the stream, so all remaining
                    # visible chunks are buffered; invisible chunks are forwarded right away.
                    self.log_debug(F'consumed input into buffer after {state.discarded} skips')
                    for chunk in state.chunks:
                        if not chunk.visible:
                            yield chunk
                            continue
                        state.remaining.append(chunk)
                    state.consumed = True
            # The same container is emitted for every bound, with the state attached.
            container.temp = state
            yield container

class pkcs7

This unit is implemented in refinery.units.formats.pkcs7 and has the following commandline Interface:

usage: pkcs7 [-h] [-L] [-Q] [-0] [-v]

Converts PKCS7 encoded data to a JSON representation.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class pkcs7(Unit):
    """
    Converts PKCS7 encoded data to a JSON representation.
    """
    @Unit.Requires('asn1crypto', ['default', 'extended'])
    def _asn1crypto():
        import asn1crypto
        import asn1crypto.cms
        import asn1crypto.core
        import asn1crypto.x509
        return asn1crypto

    def process(self, data):
        """
        Load the input as a CMS `ContentInfo` structure, serialize it to JSON with a custom
        encoder, post-process all integers via `unsign`, and return the result as indented JSON
        encoded with the unit's codec.
        """
        asn1 = self._asn1crypto.core
        cms = self._asn1crypto.cms
        signature = cms.ContentInfo.load(convert(data, bytes))

        def unsign(data):
            # Recursively walks decoded JSON data. Negative integers are mapped to the
            # non-negative value 2**(bits+1) + data, and every integer that exceeds 64 bits is
            # rendered as a big-endian hex string. All other values are returned unchanged.
            if isinstance(data, int):
                if data < 0:
                    data = (1 << (data.bit_length() + 1)) - ~data - 1
                if data > 0xFFFFFFFF_FFFFFFFF:
                    # The byte width must be measured on the value that is serialized here: after
                    # the mapping above it is one bit wider than the original negative number, so
                    # using the original bit length made to_bytes raise an OverflowError whenever
                    # that length was a multiple of 8.
                    size, r = divmod(data.bit_length(), 8)
                    size += bool(r)
                    data = data.to_bytes(size, 'big').hex()
                return data
            elif isinstance(data, dict):
                return {key: unsign(value) for key, value in data.items()}
            elif isinstance(data, list):
                return [unsign(x) for x in data]
            else:
                return data

        # ASN.1 specifications for the attribute that is registered below under the
        # name 'authenticode_info'.

        class SpcString(asn1.Choice):
            _alternatives = [
                ('unicode', asn1.BMPString, {'implicit': 0}),
                ('ascii', asn1.IA5String, {'implicit': 1})
            ]

        SpcUuid = asn1.OctetString

        class SpcSerializedObject(asn1.Sequence):
            _fields = [
                ('classId', SpcUuid),
                ('serializedData', asn1.OctetString),
            ]

        class SpcLink(asn1.Choice):
            _alternatives = [
                ('url', asn1.IA5String, {'implicit': 0}),
                ('monikier', SpcSerializedObject, {'implicit': 1}),
                ('file', SpcString, {'explicit': 2})
            ]

        class SpcSpOpusInfo(asn1.Sequence):
            _fields = [
                ('programName', SpcString, {'optional': True, 'explicit': 0}),
                ('moreInfo', SpcLink, {'optional': True, 'explicit': 1}),
            ]

        class SetOfInfos(asn1.SetOf):
            _child_spec = SpcSpOpusInfo

        # Teach asn1crypto to parse this attribute OID with the specification defined above.
        cms.CMSAttributeType._map['1.3.6.1.4.1.311.2.1.12'] = 'authenticode_info'
        cms.CMSAttribute._oid_specs['authenticode_info'] = SetOfInfos

        class ParsedASN1ToJSON(BytesAsStringEncoder):
            # The encoder needs access to the lazily imported asn1crypto module of the unit.
            unit = self

            @classmethod
            def _is_keyval(cls, obj):
                # True for dictionaries with exactly the keys 'type' and 'values' where
                # 'values' holds a single item.
                return (
                    isinstance(obj, dict)
                    and set(obj.keys()) == {'type', 'values'}
                    and len(obj['values']) == 1
                )

            @classmethod
            def handled(cls, obj) -> bool:
                return BytesAsStringEncoder.handled(obj) or cls._is_keyval(obj)

            def encode_bytes(self, obj: bytes):
                # Byte strings that decode to printable text are emitted as that text; anything
                # else is left to the parent class.
                with suppress(Exception):
                    string = obj.decode('latin1')
                    if string.isprintable():
                        return string
                return super().encode_bytes(obj)

            def default(self, obj):
                # The checks below are ordered fallbacks; the first one that produces a result
                # determines the JSON representation of the object.
                if self._is_keyval(obj):
                    # collapse single-valued attributes to a flat type/value pair
                    return dict(type=obj['type'], value=obj['values'][0])
                with suppress(TypeError):
                    return super().default(obj)
                if isinstance(obj, (set, tuple)):
                    return list(obj)
                if isinstance(obj, datetime):
                    return str(obj)
                dict_result = {}
                list_result = None
                if isinstance(obj, self.unit._asn1crypto.x509.Certificate):
                    dict_result.update(fingerprint=obj.sha1.hex())
                if isinstance(obj, asn1.BitString):
                    return {'bit_string': obj.native}
                with suppress(Exception):
                    # Objects that iterate over string keys are converted to dictionaries.
                    list_result = list(obj)
                    if all(isinstance(k, str) for k in list_result):
                        dict_result.update((key, obj[key]) for key in list_result)
                if dict_result:
                    return dict_result
                if list_result is not None:
                    return list_result
                if isinstance(obj, self.unit._asn1crypto.cms.CertificateChoices):
                    return obj.chosen
                if isinstance(obj, asn1.Sequence):
                    children = obj.children
                    if children:
                        return children
                    return obj.dump()
                with suppress(Exception):
                    return obj.native
                if isinstance(obj, asn1.Any):
                    parsed = None
                    with suppress(Exception):
                        parsed = obj.parse()
                    if parsed:
                        return parsed
                    return obj.dump()
                if isinstance(obj, asn1.Asn1Value):
                    return obj.dump()
                raise ValueError(F'Unable to determine JSON encoding of {obj.__class__.__name__} object.')

        with ParsedASN1ToJSON as encoder:
            encoded = encoder.dumps(signature)
            converted = unsign(json.loads(encoded))
            return json.dumps(converted, indent=4).encode(self.codec)

class pkcs7sig (tabular=False)

This unit is implemented in refinery.units.formats.pkcs7sig and has the following commandline Interface:

usage: pkcs7sig [-h] [-L] [-Q] [-0] [-v] [-t]

Converts PKCS7 encoded signatures into a human-readable JSON representation. This can be used to
parse authenticode signatures appended to files that are not PE files to get the same output that
is produced by the pemeta unit.

options:
  -t, --tabular  Print information in a table rather than as JSON

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class pkcs7sig(Unit):
    """
    Converts PKCS7 encoded signatures into a human-readable JSON representation. This can be used
    to parse authenticode signatures appended to files that are not PE files to get the same output
    that is produced by the pemeta unit.
    """
    def __init__(
        self,
        tabular: Param[bool, Arg('-t', help='Print information in a table rather than as JSON')] = False,
    ):
        super().__init__(tabular=tabular)

    def process(self, data):
        # Parsing is delegated to pemeta and the rendering to a ppjson instance that is
        # configured with this unit's tabular setting.
        signature = pemeta.parse_signature(data)
        printer = ppjson(tabular=self.args.tabular)
        yield from printer._pretty_output(signature, indent=4, ensure_ascii=False)

class pkw

This unit is implemented in refinery.units.compression.pkw and has the following commandline Interface:

usage: pkw [-h] [-L] [-Q] [-0] [-v] [-F]

This unit implements PKWare decompression.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class pkw(Unit):
    """
    This unit implements PKWare decompression.
    """
    def process(self, data):

        def read_from_table(table: dict[tuple[int, int], int], start: int, stop: int):
            # Reads a prefix code bit by bit: the first `start` bits are read unconditionally,
            # then the table is probed with (bit count, code) and one more bit is appended after
            # each miss until the bit count reaches `stop`.
            code = 0
            for _ in range(start):
                code = (code << 1) | getint(1)
            for width in range(start, stop):
                try:
                    return table[width, code]
                except KeyError:
                    code = (code << 1) | getint(1)
            raise ValueError(
                'Failed to decode a symbol in the compressed data stream.')

        reader = StructReader(data)
        codelit = reader.u8()  # First byte is 0 if literals are uncoded, otherwise 1
        maxdict = reader.u8()  # Second byte is 4, 5, or 6 (max size of dictionary)

        if codelit not in (0, 1):
            raise ValueError(F'Invalid literal encoding value {codelit}.')

        if maxdict not in (4, 5, 6):
            raise ValueError(F'Invalid dictionary size {maxdict}.')

        output = MemoryFile()
        getint = reader.read_integer

        while not reader.eof:
            try:
                if getint(1):
                    # a set flag bit introduces a back-reference into the output
                    length = read_from_table(_COPY_LENGTHS, 2, 0x10)
                    if length == 519:
                        # this length value terminates the stream
                        break
                    offset = read_from_table(_COPY_OFFSETS, 2, 0x09)
                    extra = maxdict if length != 2 else 2
                    offset = (offset << extra) + getint(extra) + 1
                    output.replay(offset, length)
                else:
                    # a cleared flag bit introduces a literal byte
                    if codelit:
                        literal = read_from_table(_LITERALS, 4, 14)
                    else:
                        literal = getint(8)
                    output.write_byte(literal)
            except Exception as E:
                # Anything decoded so far is handed out as a partial result.
                partial = output.getvalue()
                if not partial:
                    raise
                raise RefineryPartialResult(str(E), partial) from E

        return output.getvalue()

    @classmethod
    def handles(cls, data) -> bool:
        # More than two bytes are required, and the two header bytes must be in range.
        if len(data) <= 2:
            return False
        if not 0 <= data[0] <= 1:
            return False
        return 4 <= data[1] <= 6

class pop (*names)

This unit is implemented in refinery.units.meta.pop and has the following commandline Interface:

usage: pop [-h] [-L] [-Q] [-0] [-v] [instruction ...]

In processing order, remove visible chunks from the current frame and store their contents in the
given meta variables on all chunks that remain. All chunks in the input stream are consequently
made visible again. If pop is used at the end of a frame, then variables will be local to the
parent frame. A pop instruction has the following format:

    count | @ | name[=source][:conversion]

If the instruction is an integer, it is interpreted as count, specifying a number of chunks to be
skipped from the frame without storing them. The letter "@" can be used to remove a single chunk
from the input and merge all of its meta data into the ones that follow. Otherwise, the pop
instruction consists of the name of the variable to be created, an optional source variable name,
and an optional conversion sequence. If no source variable is specified, the chunk contents are
used as the source. The conversion is a sequence of multibin handlers that are applied to the
source data from right to left before storing it. For example, the argument k:le:b64 first
decodes the chunk data using base64, then converts it to an integer in little endian format, and
stores the integer result in the variable k. The visual aid is that the content is passed from
right to left through all conversions, into the variable k. Similarly, the argument k=size will
store the current chunk's size in k.

positional arguments:
  instruction    A sequence of instructions, see above.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class pop(Unit):
    """
    In processing order, remove visible chunks from the current frame and store their contents in
    the given meta variables on all chunks that remain. All chunks in the input stream are
    consequently made visible again. If pop is used at the end of a frame, then variables will be
    local to the parent frame. A pop instruction has the following format:

        count | {_MERGE_META} | name[{_CHERRYPICK}source][{_CONVERSION}conversion]

    If the instruction is an integer, it is interpreted as `count`, specifying a number of chunks
    to be skipped from the frame without storing them. The letter "{_MERGE_META}" can be used to
    remove a single chunk from the input and merge all of its meta data into the ones that follow.
    Otherwise, the pop instruction consists of the name of the variable to be created, an optional
    source variable name, and an optional conversion sequence. If no source variable is specified,
    the chunk contents are used as the source. The conversion is a sequence of multibin handlers
    that are applied to the source data from right to left before storing it.
    For example, the argument `k:le:b64` first decodes the chunk data using base64, then converts
    it to an integer in little endian format, and store the integer result in the variable `k`. The
    visual aid is that the content is passed from right to left through all conversions, into the
    variable `k`. Similarly, the argument k=size will store the current chunk's size in `k`.
    """
    def __init__(
        self,
        *names: Param[str, Arg.String(metavar='instruction', help='A sequence of instructions, see above.')]
    ):
        # Without any instruction, a single meta-merge instruction is used as the default.
        if not names:
            names = _MERGE_META,
        # Each instruction string is wrapped in a _popcount object before it is stored.
        super().__init__(names=[_popcount(n) for n in names])

    def process(self, data):
        # Chunks are returned unchanged here; the work of this unit happens in filter.
        return data

    def filter(self, chunks: Iterable[Chunk]):
        """
        Feed the visible chunks of the input to the stored instructions one after another and
        collect the results in a dictionary of variables. Afterwards, all buffered and all
        remaining chunks are emitted as visible chunks with these variables added to their meta
        data. If the input ends before every instruction was satisfied, a `ValueError` is raised
        unless the unit's leniency is set, in which case only a warning is logged.
        """
        # Chunks that are emitted first in the final loop: invisible chunks encountered while
        # popping, and the visible chunk for which no instruction was left.
        invisible = []
        # Filled by the instructions via pop.into and merged into every emitted chunk below.
        variables = {}
        remaining: Iterator[_popcount] = iter(self.args.names)

        it = iter(chunks)
        # The constructor always stores at least one instruction.
        pop = next(remaining).reset()
        # Becomes True once every instruction has been used up.
        done = False

        for chunk in it:
            if not chunk.visible:
                self.log_debug('buffering invisible chunk')
                invisible.append(chunk)
                continue
            try:
                # While the current instruction reports (by a falsy return value) that it did not
                # take the chunk, advance to the next instruction and offer the chunk again.
                while not pop.into(variables, chunk):
                    pop = next(remaining).reset()
            except StopIteration:
                # No instruction is left: this chunk was not popped and is kept as output. The
                # rest of the iterator is left untouched for the final loop.
                done = True
                invisible.append(chunk)
                break

        # The input ended before an unpopped chunk was seen. This still counts as success when
        # the current instruction reports completion and no further instruction follows.
        if not done and pop.done:
            try:
                next(remaining)
            except StopIteration:
                done = True

        if not done:
            msg = 'Not all variables could be assigned.'
            if not self.leniency:
                raise ValueError(F'{msg} Increase leniency to downgrade this failure to a warning.')
            self.log_warn(msg)

        nesting = self.args.nesting

        for chunk in chain(invisible, it):
            meta = chunk.meta
            meta.update(variables)
            if nesting < 0:
                # NOTE(review): presumably the case where pop closes one or more frames, which
                # the class docstring describes as making the variables local to the parent
                # frame; confirm against the semantics of set_scope.
                for name in variables:
                    meta.set_scope(name, chunk.scope + nesting)
            chunk.visible = True
            yield chunk

class ppjscript (indent=4, strip_comments=False, strip_lines=False, keep_lines=False, keep_escapes=False)

This unit is implemented in refinery.units.sinks.ppjscript and has the following commandline Interface:

usage: ppjscript [-h] [-L] [-Q] [-0] [-v] [-i N] [-c] [-b | -B] [-E]

Pretty-prints JavaScript without any reflection or evaluation.

options:
  -i, --indent N        Number of space characters used for indentation in the output. Default is
                        4.
  -c, --strip-comments  Remove all comments from the input before pretty-printing.
  -b, --strip-lines     Remove all line breaks after potentially stripping comments, before
                        beautifying.
  -B, --keep-lines      Preserve line breaks as they occur in the input.
  -E, --keep-escapes    Preserve unnecessary escape sequences in string literals.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.

Expand source code Browse git

class ppjscript(Unit):
    """
    Pretty-prints JavaScript without any reflection or evaluation.
    """
    def __init__(
        self,
        indent: Param[int, Arg.Number(
            '-i',
            help='Number of space characters used for indentation in the output. Default is {default}.',
        )] = 4,
        strip_comments: Param[bool, Arg.Switch(
            '-c',
            help='Remove all comments from the input before pretty-printing.',
        )] = False,
        strip_lines: Param[bool, Arg.Switch(
            '-b',
            group='LINES',
            help='Remove all line breaks after potentially stripping comments, before beautifying.',
        )] = False,
        keep_lines: Param[bool, Arg.Switch(
            '-B',
            group='LINES',
            help='Preserve line breaks as they occur in the input.',
        )] = False,
        keep_escapes: Param[bool, Arg.Switch(
            '-E',
            help='Preserve unnecessary escape sequences in string literals.',
        )] = False,
    ):
        super().__init__(
            indent=indent,
            strip_comments=strip_comments,
            strip_lines=strip_lines,
            keep_lines=keep_lines,
            keep_escapes=keep_escapes,
        )

    @Unit.Requires('jsbeautifier', ['display', 'extended'])
    def _jsb():
        import jsbeautifier
        import jsbeautifier.unpackers.javascriptobfuscator

        # TODO: This is a workaround for the following bug:
        # https://github.com/beautify-web/js-beautify/issues/1350
        jsbeautifier.unpackers.javascriptobfuscator.detect = lambda *_: False
        return jsbeautifier

    def process(self, data: bytearray):
        # Obtain the script as text, optionally with comments removed by a dedicated unit.
        if not self.args.strip_comments:
            source = data.decode(self.codec)
        else:
            from refinery.units.obfuscation.js.comments import deob_js_comments
            source = data | deob_js_comments | str
        if self.args.strip_lines:
            source = ' '.join(source.splitlines())
        beautifier = self._jsb
        settings = beautifier.default_options()
        overrides = {
            'eval_code'              : False,
            'indent_size'            : self.args.indent,
            'unescape_strings'       : not self.args.keep_escapes,
            'preserve_newlines'      : self.args.keep_lines,
            'indent_level'           : 0,
            'keep_array_indentation' : False,
        }
        for option, value in overrides.items():
            setattr(settings, option, value)
        pretty = beautifier.beautify(source.strip(), settings)
        return pretty.encode(self.codec)

class ppjson (tabular=False, indent=4)

This unit is implemented in refinery.units.sinks.ppjson and has the following commandline Interface:

usage: ppjson [-h] [-L] [-Q] [-0] [-v] [-t | -i N]

Expects JSON input data and outputs it in a neatly formatted manner. If the indentation is set to
zero, the output is minified.

options:
  -t, --tabular   Convert JSON input into a flattened table.
  -i, --indent N  Number of spaces used for indentation. Default is 4.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class ppjson(Unit):
    """
    Expects JSON input data and outputs it in a neatly formatted manner.
    If the indentation is set to zero, the output is minified.
    """
    # A comma that is followed, after optional whitespace, by a closing brace or bracket.
    _TRAILING_COMMA = re.compile(BR',\s*(}|])')

    def __init__(
        self,
        tabular: Param[bool, Arg.Switch('-t', group='OUT', help='Convert JSON input into a flattened table.')] = False,
        indent: Param[int, Arg.Number('-i', group='OUT', help='Number of spaces used for indentation. Default is {default}.')] = 4
    ):
        return super().__init__(indent=indent, tabular=tabular)

    def _pretty_output(self, parsed, **kwargs):
        """
        Serialize `parsed` with `json.dumps`, forwarding all keyword arguments, and generate the
        output chunks. Without the tabular option this is the encoded JSON document as a single
        chunk. With the tabular option, the document is flattened into key/value pairs and one
        chunk is generated per output line: the key column is padded to the longest key, and
        values are wrapped to the remaining terminal width with continuation lines indented to
        the value column. Entries whose value renders as an empty string are skipped. An empty
        table produces no output.
        """
        encoded = json.dumps(parsed, **kwargs)
        if not self.args.tabular:
            yield encoded.encode(self.codec)
            return
        table = list(flattened(json.loads(encoded)))
        # The default guards against an empty table, for which max would raise a ValueError.
        width = max((len(key) for key, _ in table), default=0)
        # textwrap rejects non-positive widths, which would otherwise occur as soon as the key
        # column is about as wide as the terminal itself.
        tsize = max(get_terminal_size(80) - width - 4, 1)
        for key, value in table:
            if isinstance(value, str):
                value = value.strip()
                # Strings that are not printable but fit into single bytes are shown as hex.
                if not is_printable(value) and all(ord(c) < 0x100 for c in value):
                    value = value.encode('latin1').hex(':')
            value = str(value).rstrip()
            value = textwrap.wrap(value, tsize)
            it = iter(value)
            try:
                item = next(it)
            except StopIteration:
                continue
            yield F'{key:<{width}} : {item}'.encode(self.codec)
            for wrap in it:
                yield F'{"":<{width + 3}}{wrap}'.encode(self.codec)

    def process(self, data):
        if self._TRAILING_COMMA.search(data):
            def smartfix(match):
                # Trailing commas are only removed outside of string literals: a match that
                # starts inside a string is substituted with itself.
                k = match.start()
                return match.group(0 if any(k in s for s in strings) else 1)
            from refinery.lib.patterns import formats
            strings = {range(*m.span()) for m in formats.string.finditer(data)}
            data = self._TRAILING_COMMA.sub(smartfix, data)
        # An indentation of zero selects the most compact separators instead.
        kwargs = {'indent': self.args.indent} if self.args.indent else {'separators': (',', ':')}
        yield from self._pretty_output(json.loads(data), **kwargs)

class ppxml (indent=4, header=False)

This unit is implemented in refinery.units.sinks.ppxml and has the following commandline Interface:

usage: ppxml [-h] [-L] [-Q] [-0] [-v] [-F] [-i N] [-x]

Expects XML input data and outputs it in a neatly formatted manner.

options:
  -i, --indent N  Controls the amount of space characters used for indentation in the output.
                  Default is 4.
  -x, --header    Add an XML header to the formatted output.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.
  -F, --iff       Only apply unit if it can handle the input format. Specify twice to drop all
                  other chunks.

Expand source code Browse git

class ppxml(Unit):
    """
    Expects XML input data and outputs it in a neatly formatted manner.
    """

    def __init__(
        self,
        indent: Param[int, Arg.Number(
            '-i',
            help='Controls the amount of space characters used for indentation in the output. Default is 4.',
        )] = 4,
        header: Param[bool, Arg.Switch('-x', help='Add an XML header to the formatted output.')] = False,
    ):
        super().__init__(indent=indent, header=header)

    def process(self, data):
        pad = self.args.indent * ' '
        # Filled by ForgivingParse; every key is replaced by an entity named after its value
        # once the document has been serialized again.
        etm = {}

        try:
            dom = ForgivingParse(data, etm)
        except Exception:
            from refinery.lib.meta import metavars
            msg = 'error parsing as XML, returning original content'
            path = metavars(data).get('path')
            if path:
                msg = F'{msg}: {path}'
            self.log_warn(msg)
            return data

        def indent(node, depth=0, has_next=False):
            """
            The credit for this one goes to:
            https://stackoverflow.com/a/12940014
            """
            newline = '\n'
            if depth:
                newline += (depth - 1) * pad
            count = len(node)
            if count:
                # Only text that is empty or consists of whitespace is replaced.
                if not node.text or not node.text.strip():
                    if depth:
                        node.text = newline + pad + pad
                    else:
                        node.text = newline + pad
                final = count - 1
                for position, child in enumerate(node):
                    indent(child, depth + 1, position < final)
            # The tail is handled in the same way regardless of whether the node has children.
            if depth and (not node.tail or node.tail.isspace()):
                if has_next:
                    node.tail = newline + pad
                else:
                    node.tail = newline

        indent(dom.getroot())

        with io.BytesIO() as stream:
            dom.write(stream, encoding=self.codec, xml_declaration=self.args.header)
            result = stream.getvalue()

        for uid, name in etm.items():
            entity = F'&{name};'.encode(self.codec)
            placeholder = uid.encode(self.codec)
            result = result.replace(placeholder, entity)

        return result

    @classmethod
    def handles(cls, data):
        return is_likely_xml(data)

class push (data=b'')

This unit is implemented in refinery.units.meta.push and has the following commandline Interface:

usage: push [-h] [-L] [-Q] [-0] [-v] [data]

The unit inserts an additional chunk before each input chunk and moves the original data out of
scope. This chunk is considered the "original" data, while the one inserted in front of it is
used as an intermediate result. By default, this intermediate data is a copy of the input data.
For example:

    emit key=value | push [[| rex =(.*)$ {1} | pop v ]| repl var:v censored ]

will output key=censored. The application of rex turns the (duplicated) data into just the value,
which is then stored in the variable v. The application of repl replaces this value with the
hard-coded string censored.

positional arguments:
  data           The data to be pushed, by default a copy of the input.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class push(Unit):
    """
    The unit inserts an additional chunk before each input chunk and moves the original
    data out of scope. This chunk is considered the "original" data, while the one inserted
    in front of it is used as an intermediate result. By default, this intermediate data is
    a copy of the input data. For example:

        emit key=value | push [[| rex =(.*)$ {1} | pop v ]| repl var:v censored ]

    will output `key=censored`. The application of `refinery.rex` turns the (duplicated)
    data into just the value, which is then stored in the variable `v`. The application
    of `refinery.repl` replaces this value with the hard-coded string `censored`.
    """
    def __init__(
        self,
        data: Param[buf, Arg(help='The data to be pushed, by default a copy of the input.')] = B'',
    ):
        super().__init__(data=data)

    def process(self, data: Chunk):
        payload = self.args.data
        # The pushed chunk inherits the meta data of the input; its body is the given payload
        # or, when that is empty, the input data.
        pushed = data.copy(meta=True, data=False)
        pushed[:] = payload or data
        if self.args.nesting <= 0:
            try:
                data.visible = False
            except AttributeError:
                self.log_warn('application has no effect outside frame.')
        else:
            data.set_next_scope(False)
        yield data
        yield pushed

class put (name, value=<object object>)

This unit is implemented in refinery.units.meta.put and has the following commandline Interface:

usage: put [-h] [-L] [-Q] [-0] [-v] name [value]

Can be used to add a meta variable to the processed chunk. Note that meta variables cease to
exist outside a frame.

positional arguments:
  name           The name of the variable to be used.
  value          The value for the variable. If no value is given, the entire current chunk is
                 stored.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class put(Unit):
    """
    Can be used to add a meta variable to the processed chunk. Note that meta variables
    cease to exist outside a frame.
    """
    def __init__(
        self,
        name: Param[str, Arg.String(help='The name of the variable to be used.')],
        value: Param[isq, Arg.NumSeq(
            check=False,
            help='The value for the variable. If no value is given, the entire current chunk is stored.',
        )] = _EMPTY
    ):
        name = check_variable_name(name)
        super().__init__(name=name, value=value)

    def process(self, data: Chunk):
        numeric = (int, float)
        value = self.args.value
        if value is _EMPTY:
            # no value was given: the chunk itself is stored
            value = data
        if not (isinstance(value, numeric) or isbuffer(value)):
            try:
                len(value)
            except TypeError:
                # Unsized values are only supported when they are an endless repetition of a
                # number, which is reduced to that number.
                if isinstance(value, itertools.repeat):
                    value = next(value)
                if not isinstance(value, numeric):
                    raise NotImplementedError(F'put does not support {value.__class__.__name__} values.')
            else:
                # Sized values other than dictionaries and lists are converted to lists.
                if not isinstance(value, (dict, list)):
                    value = list(value)
        self.log_debug(F'storing {typename(value)}:', value, clip=True)
        data.meta[self.args.name] = value
        return data

class pyc (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.pyc and has the following commandline Interface:

usage: pyc [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD]
           [path ...]

Decompiles Python bytecode (PYC) files back to source code. A known limitation is that it does
not work on recent Python versions, but anything below 3.9 should work.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class pyc(ArchiveUnit):
    """
    Decompiles Python bytecode (PYC) files back to source code. A known limitation is that it does
    not work on recent Python versions, but anything below 3.9 should work.
    """
    def unpack(self, data):
        # The current value of the path meta variable is handed to the code extractor.
        meta = metavars(data)
        origin = meta.get(self.args.path.decode(self.codec))
        extracted = extract_code_from_buffer(bytes(data), origin)
        for index, unit in enumerate(extracted):
            container = unit.container
            if container is None:
                raise ValueError('could not find code in buffer')
            # Code objects without a file name receive a numbered placeholder name.
            name = container.co_filename or F'__unknown_name_{index:02d}.py'
            stamp = datetime.fromtimestamp(unit.timestamp)
            yield self._pack(name, stamp, decompile_buffer(unit))

class pym (version=None, system=False, redump=False)

This unit is implemented in refinery.units.formats.pym and has the following commandline Interface:

usage: pym [-h] [-L] [-Q] [-0] [-v] [-R] [-V V] [-s] [-r]

Converts Python-Marshaled code objects to the PYC (Python Bytecode) format. If it is an older
Python version, you can use the pyc unit to then decompile the code, but for more recent versions
a separate Python decompiler will be required.

options:
  -V, --version V  Optionally select the (known) Python version.
  -s, --system     Try to use the built-in marshal.loads before using the parser.
  -r, --redump     Load marshaled code objects before re-dumping them.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation.

Expand source code Browse git

class pym(Unit):
    """
    Converts Python-Marshaled code objects to the PYC (Python Bytecode) format. If it is an
    older Python version, you can use the `refinery.pyc` unit to then decompile the code, but
    for more recent versions a separate Python decompiler will be required.
    """
    def __init__(
        self,
        version: Param[str | None, Arg.String('-V', metavar='V',
            help='Optionally select the (known) Python version.')] = None,
        system: Param[bool, Arg.Switch('-s',
            help='Try to use the built-in marshal.loads before using the parser.')] = False,
        redump: Param[bool, Arg.Switch('-r',
            help='Load marshaled code objects before re-dumping them.')] = False,
    ):
        super().__init__(version=version, system=system, redump=redump)

    def reverse(self, data):
        # The reverse operation marshals the raw input with the interpreter's own marshal module.
        return marshal.dumps(data)

    def process(self, data):
        def toblob(obj):
            # Convert one unmarshaled object into a binary output chunk.
            if isinstance(obj, (bytes, bytearray)):
                self.log_info('unmarshalled a byte string, returning as is')
                return obj
            if isinstance(obj, str):
                self.log_info(F'unmarshalled a string object, encoding as {self.codec}')
                return obj.encode(self.codec)
            if isinstance(obj, CodeType):
                self.log_info('unmarshalled a code object, converting to pyc')
                blob = code_header()
                blob.extend(marshal.dumps(obj))
                return blob
            if isinstance(obj, int):
                self.log_info('unmarshalled an integer, returning big endian encoding')
                # Smallest number of whole bytes that can hold all bits of the integer.
                size = (obj.bit_length() + 7) // 8
                return obj.to_bytes(size, 'big')
            if isinstance(obj, dict):
                with BytesAsStringEncoder as encoder:
                    return encoder.dumps(obj).encode(self.codec)
            raise NotImplementedError(
                F'No serialization implemented for object of type {obj.__class__.__name__}')

        version = self.args.version
        if version:
            version = version2tuple(version)

        # The built-in marshal.loads is only attempted when it was requested and the selected
        # version (if any) equals SYS_PYTHON.
        out = None
        version_mismatch = version and version != SYS_PYTHON
        if not version_mismatch and self.args.system:
            try:
                out = marshal.loads(data)
            except Exception as error:
                self.log_info(F'the marshal.loads method failed: {error!s}')
            else:
                v = sys.version_info
                self.log_info(F'unmarshaled using the {v.major}.{v.minor}.{v.micro} built-in marshal.loads')

        if out is None:
            # Fall back to the Marshal parser class.
            unpacker = Marshal(
                memoryview(data),
                version=version,
                dumpcode=not self.args.redump,
            )
            out = unpacker.object()

        if not isinstance(out, (list, tuple, set, frozenset)):
            yield toblob(out)
            return

        self.log_info('object is a collection, converting each item individually')
        for item in out:
            yield toblob(item)

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    # Serialize the input object with the interpreter's built-in marshal module.
    marshaled = marshal.dumps(data)
    return marshaled

class pymstr (buffers=False, strings=False)

This unit is implemented in refinery.units.formats.pymstr and has the following commandline Interface:

usage: pymstr [-h] [-L] [-Q] [-0] [-v] [-b] [-s]

Extract string constants from Python-Marshaled objects.

options:
  -b, --buffers  Dump byte strings.
  -s, --strings  Dump strings.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class pymstr(Unit):
    """
    Extract string constants from Python-Marshaled objects.
    """
    def __init__(
        self,
        buffers: Param[bool, Arg.Switch('-b', help='Dump byte strings.')] = False,
        strings: Param[bool, Arg.Switch('-s', help='Dump strings.')] = False,
    ):
        # When neither switch is given, both kinds of constants are dumped.
        if not (buffers or strings):
            buffers = strings = True
        super().__init__(buffers=buffers, strings=strings)

    def process(self, data):
        # Parsing the object populates the parser's collections of byte strings and strings.
        parser = Marshal(memoryview(data))
        parser.object()
        if self.args.buffers:
            yield from parser.buffers
        if self.args.strings:
            yield from (text.encode(self.codec) for text in parser.strings)

class qb (*data)

This unit is implemented in refinery.units.meta.queue and has the following commandline Interface:

usage: qb [-h] [-L] [-Q] [-0] [-v] [data ...]

Short for "queue back": Insert new chunks at the end of the current frame.

positional arguments:
  data           The arguments are inserted into the current frame in the given order. These
                 arguments are multibin expressions; If the expression depends on the input data,
                 it will always refer to the first chunk in the current frame. If no argument is
                 given, a single empty chunk is inserted.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class qb(QueueUnit):
    """
    Short for "queue back": Insert new chunks at the end of the current frame.
    """
    def filter(self, chunks: Iterable[Chunk]):
        # NOTE(review): the second argument presumably selects front insertion; this unit
        # passes False to append at the back. Confirm against QueueUnit._queue.
        queued = self._queue(chunks, False)
        yield from queued

class qf (*data)

This unit is implemented in refinery.units.meta.queue and has the following commandline Interface:

usage: qf [-h] [-L] [-Q] [-0] [-v] [data ...]

Short for "queue front": Insert new chunks at the beginning of the current frame.

positional arguments:
  data           The arguments are inserted into the current frame in the given order. These
                 arguments are multibin expressions; If the expression depends on the input data,
                 it will always refer to the first chunk in the current frame. If no argument is
                 given, a single empty chunk is inserted.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class qf(QueueUnit):
    """
    Short for "queue front": Insert new chunks at the beginning of the current frame.
    """
    def filter(self, chunks: Iterable[Chunk]):
        # NOTE(review): the second argument presumably selects front insertion; this unit
        # passes True to insert at the front. Confirm against QueueUnit._queue.
        queued = self._queue(chunks, True)
        yield from queued

class qlz

This unit is implemented in refinery.units.compression.qlz and has the following commandline Interface:

usage: qlz [-h] [-L] [-Q] [-0] [-v]

This unit implements QuickLZ decompression levels 1 and 3.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class qlz(Unit):
    """
    This unit implements QuickLZ decompression levels 1 and 3.
    """

    def process(self, data):
        source = memoryview(data)
        # The first byte holds the flags: bit 0 marks compressed data, bit 1 selects the long
        # header format, and bits 2-3 hold the compression level.
        head = source[0]
        clvl = (head >> 2) & 0x3

        if head & 2:
            # Long header (9 bytes): flags, 4-byte compressed size, 4-byte decompressed size.
            self.log_info('long header detected')
            size = int.from_bytes(source[5:9], 'little')
            source = source[9:]
        else:
            # Short header (3 bytes): flags, 1-byte compressed size, 1-byte decompressed size.
            # The decompressed size is therefore the byte at index 2; index 3 is already the
            # first byte following the header.
            self.log_info('short header detected')
            size = source[2]
            source = source[3:]
        if head & 1 != 1:
            self.log_warn('header indicates that data is uncompressed, returning remaining data')
            return source
        else:
            self.log_info(F'compression level {clvl}, decompressed size {SizeInt(size)!r}')

        def fetchhash():
            # Reads the three output bytes that follow the last hashed position.
            return int.from_bytes(destination[hashvalue + 1:hashvalue + 4], byteorder='little')

        # A codeword value of 1 means that all control bits are consumed and a new 32-bit
        # control word has to be read from the input.
        codeword = 1
        destination = bytearray()
        hashtable = [0] * _HASH_VALUES
        # Index of the last output position that was entered into the hash table.
        hashvalue = -1
        # Beyond this output offset, the remaining data is copied as plain literals.
        last_matchstart = size - _UNCONDITIONAL_MATCHLEN - _UNCOMPRESSED_END - 1
        fetch = 0

        if clvl == 2:
            raise ValueError("This version only supports level 1 and 3")
        while source:
            if codeword == 1:
                codeword = int.from_bytes(source[:4], byteorder='little')
                source = source[4:]
                if len(destination) <= last_matchstart:
                    c = 3 if clvl == 1 else 4
                    fetch = int.from_bytes(source[:c], byteorder='little')
            if codeword & 1:
                # Control bit set: the next token is a back-reference into the output.
                codeword = codeword >> 1
                if clvl == 1:
                    # Level 1 encodes a hash table slot and a match length.
                    hash = (fetch >> 4) & 0xFFF
                    offset = hashtable[hash]
                    if fetch & 0xF:
                        matchlen = (fetch & 0xF) + 2
                        source = source[2:]
                    else:
                        matchlen = source[2]
                        source = source[3:]
                else:
                    # Otherwise, the low bits of the token select one of five encodings for
                    # the backwards distance and the match length.
                    if (fetch & 3) == 0:
                        delta = (fetch & 0xFF) >> 2
                        matchlen = 3
                        source = source[1:]
                    elif (fetch & 2) == 0:
                        delta = (fetch & 0xFFFF) >> 2
                        matchlen = 3
                        source = source[2:]
                    elif (fetch & 1) == 0:
                        delta = (fetch & 0xFFFF) >> 6
                        matchlen = ((fetch >> 2) & 15) + 3
                        source = source[2:]
                    elif (fetch & 127) != 3:
                        delta = (fetch >> 7) & 0x1FFFF
                        matchlen = ((fetch >> 2) & 0x1F) + 2
                        source = source[3:]
                    else:
                        delta = fetch >> 15
                        matchlen = ((fetch >> 7) & 255) + 3
                        source = source[4:]
                    offset = (len(destination) - delta) & 0xFFFFFFFF

                # Byte-wise copy so that a match may overlap the bytes it is producing.
                for i in range(offset, offset + matchlen):
                    destination.append(destination[i])

                if clvl == 1:
                    # Update the hash table for all positions up to the start of the match.
                    fetch = fetchhash()
                    while hashvalue < len(destination) - matchlen:
                        hashvalue += 1
                        hash = ((fetch >> 12) ^ fetch) & _HASH_MASK
                        hashtable[hash] = hashvalue
                        fetch = fetch >> 8 & 0xFFFF
                        try:
                            fetch |= destination[hashvalue + 3] << 16
                        except IndexError:
                            # No further output byte is available yet; keep the upper byte zero.
                            pass
                    fetch = int.from_bytes(source[:3], byteorder='little')
                else:
                    fetch = int.from_bytes(source[:4], byteorder='little')
                hashvalue = len(destination) - 1
            else:
                if len(destination) <= last_matchstart:
                    # Control bit clear: copy a single literal byte.
                    destination.append(source[0])
                    source = source[1:]
                    codeword = codeword >> 1
                    if clvl == 1:
                        while hashvalue < len(destination) - 3:
                            fetch2 = fetchhash()
                            hashvalue += 1
                            hash = ((fetch2 >> 12) ^ fetch2) & _HASH_MASK
                            hashtable[hash] = hashvalue
                        fetch = fetch >> 8 & 0xFFFF | source[2] << 16
                    else:
                        fetch = fetch >> 8 & 0xFFFF
                        fetch |= source[2] << 16
                        fetch |= source[3] << 24
                else:
                    # Tail of the stream: only literals remain, control words are skipped.
                    while len(destination) <= size - 1:
                        if codeword == 1:
                            source = source[4:]
                            codeword = 0x80000000
                        destination.append(source[0])
                        source = source[1:]
                        codeword = codeword >> 1
                    break
        if len(destination) != size:
            raise RefineryPartialResult(
                F'Header indicates decompressed size 0x{size:X}, but 0x{len(destination):X} bytes '
                F'were decompressed.', destination)
        return destination

class qr

This unit is implemented in refinery.units.formats.qr and has the following commandline Interface:

usage: qr [-h] [-L] [-Q] [-0] [-v]

Extract information from bar codes, especially QR codes. This unit is a thin proxy around the
pyzbar library, which itself only provides Python bindings for the ZBar library.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class qr(Unit):
    """
    Extract information from bar codes, especially QR codes. This unit is a thin proxy around the
    pyzbar library, which itself only provides Python bindings for the ZBar library.
    """
    @Unit.Requires('pyzbar', ['formats', 'extended', 'all'], info=_ZBAR_FOOTNOTE)
    def _pyzbar():
        try:
            import pyzbar
            import pyzbar.pyzbar
        except ModuleNotFoundError:
            # A missing package is passed through unchanged.
            raise
        except ImportError as ie:
            # Other import errors are inspected: a message mentioning both "zbar" and "shared"
            # is answered with the dedicated hint, if one is configured.
            words = str(ie).split()
            if 'zbar' in words and 'shared' in words and (info := _ZBAR_ON_ERROR):
                raise RefineryImportError(info)
            raise RefineryImportError(F'there was an unexpected error importing pyzbar: {ie!s}')
        return pyzbar

    @Unit.Requires('Pillow', ['formats', 'extended', 'all'])
    def _image():
        from PIL import Image
        return Image

    def process(self, data):
        try:
            image = self._image.open(MemoryFile(data, output=bytes))
        except ImportError:
            raise
        except Exception:
            raise ValueError('the input data is not recognized as an image')
        for symbol in self._pyzbar.pyzbar.decode(image):
            self.log_debug(symbol)
            payload = getattr(symbol, 'data', None)
            if not payload:
                # Decoded symbols without any data are skipped silently.
                continue
            if isinstance(payload, str):
                payload = payload.encode(self.codec)
            if isinstance(payload, (bytes, bytearray)):
                yield payload
            else:
                self.log_warn(
                    F'skipping unknown data generated by zbar: {payload!r}', clip=True)

class rabbit (key, discard=0, stateful=False, iv=b'')

This unit is implemented in refinery.units.crypto.cipher.rabbit and has the following commandline Interface:

usage: rabbit [-h] [-L] [-Q] [-0] [-v] [-R] [-d N] [-s] [-i IV] key

RABBIT encryption and decryption.

positional arguments:
  key              The encryption key.

options:
  -d, --discard N  Discard the first N bytes of the keystream, 0 by default.
  -s, --stateful   Do not reset the key stream while processing the chunks of one frame.
  -i, --iv IV      Optional initialization vector.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation.

Expand source code Browse git

class rabbit(StreamCipherUnit):
    """
    RABBIT encryption and decryption.
    """
    key_size = {16}

    def __init__(
        self,
        key,
        discard=0,
        stateful=False,
        iv: Param[buf, Arg('-i', '--iv', help='Optional initialization vector.')] = B'',
    ):
        super().__init__(key=key, iv=iv, stateful=stateful, discard=discard)

    def keystream(self) -> Iterable[int]:
        # The IV is optional: it is either absent (empty) or exactly 8 bytes long.
        iv = self.args.iv
        if len(iv) in (0, 8):
            return RabbitCipher(self.args.key, iv)
        raise ValueError('The IV length must be exactly 8 bytes.')

class rc2 (key, *, iv=b'', eks=1024, derive_eks=False, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, tag=None, aad=None)

This unit is implemented in refinery.units.crypto.cipher.rc2 and has the following commandline Interface:

usage: rc2 [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-k N | -d] [-p P] [-m M] [-r] [-e] [-S N]
           [-t TAG] [-a AAD]
           key

RC2 encryption and decryption.

positional arguments:
  key                   The encryption key.

options:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -k, --eks N           Set the effective key size. Default is 1024.
  -d, --dks             Act as .NET and derive the effective key size from the key length.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, EAX,
                        ECB, OFB. By default, the CBC mode is used when an IV is provided, and
                        ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of segmentation bits. It must be a multiple of 8.
                        The default of 0 means that the block size will be used as the segment
                        size.
  -t, --tag TAG         Only for EAX, GCM, OCB, and CCM: An authentication tag to verify the
                        message. For encryption, this parameter specifies the tag length, and the
                        tag is provided as a meta variable named "tag".
  -a, --aad AAD         Only for EAX, GCM, OCB, and CCM: Set additional authenticated data.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation.

Expand source code Browse git

class rc2(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(ARC2)):
    """
    RC2 encryption and decryption.
    """

    def __init__(
        self, key, *,
        iv=b'',
        eks: Param[int, Arg.Number('-k', '--eks', group='EKS',
            help='Set the effective key size. Default is {default}.')] = 1024,
        derive_eks: Param[bool, Arg.Switch('-d', '--dks', group='EKS',
            help='Act as .NET and derive the effective key size from the key length.')] = False,
        padding=None,
        mode=None,
        raw=False,
        little_endian=False,
        segment_size=0,
        tag=None,
        aad=None,
        **keywords
    ):
        super().__init__(
            key, iv=iv, eks=eks, derive_eks=derive_eks,
            padding=padding, mode=mode, raw=raw,
            little_endian=little_endian, segment_size=segment_size,
            tag=tag, aad=aad, **keywords
        )

    def _new_cipher(self, **optionals) -> CipherInterface:
        # The effective key length is either derived from the key (its length in bits) or taken
        # from the explicit option.
        if self.args.derive_eks:
            eks = len(self.args.key) * 8
        else:
            eks = self.args.eks
        optionals['effective_keylen'] = eks
        return super()._new_cipher(**optionals)

class rc4 (key, discard=0)

This unit is implemented in refinery.units.crypto.cipher.rc4 and has the following commandline Interface:

usage: rc4 [-h] [-L] [-Q] [-0] [-v] [-R] [-d N] key

RC4 encryption and decryption.

positional arguments:
  key              The encryption key.

options:
  -d, --discard N  Discard the first N bytes of the keystream, 0 by default.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation.

Expand source code Browse git

class rc4(StandardCipherUnit, cipher=PyCryptoFactoryWrapper(ARC4)):
    """
    RC4 encryption and decryption.
    """
    def __init__(
        self,
        key,
        discard: Param[int, Arg.Number(
            '-d', help='Discard the first {varname} bytes of the keystream, {default} by default.')] = 0,
    ):
        super().__init__(key, discard=discard)

    def _new_cipher(self, **optionals):
        # The number of keystream bytes to discard is forwarded as the "drop" option.
        drop = self.args.discard
        return super()._new_cipher(drop=drop, **optionals)

class rc4mod (key, stateful=False, discard=0, *, size=256)

This unit is implemented in refinery.units.crypto.cipher.rc4mod and has the following commandline Interface:

usage: rc4mod [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-d N] [-t N] key

Implements a modified version of the RC4 stream cipher where the size of the RC4 SBox can be
altered.

positional arguments:
  key              The encryption key.

options:
  -s, --stateful   Do not reset the key stream while processing the chunks of one frame.
  -d, --discard N  Discard the first N bytes of the keystream, 0 by default.
  -t, --size N     Table size, 256 by default.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation.

Expand source code Browse git

class rc4mod(StreamCipherUnit):
    """
    Implements a modified version of the RC4 stream cipher where the size of the RC4 SBox can be altered.
    """

    def __init__(
        self, key, stateful=False, discard=0, *,
        size: Param[int, Arg.Number('-t', help='Table size, {default} by default.', bound=(1, None))] = 0x100
    ):
        super().__init__(key=key, stateful=stateful, discard=discard, size=size)

    def keystream(self):
        size = self.args.size
        # The table has at least 256 entries, each initialized to its index reduced to a byte.
        indices = range(max(size, 0x100))
        sbox = bytearray(k & 0xFF for k in indices)
        # Key scheduling: one swap per table entry, cycling through the key bytes.
        j = 0
        for i, keybyte in zip(indices, cycle(self.args.key)):
            t = sbox[i]
            j = (j + keybyte + t) % size
            sbox[i], sbox[j] = sbox[j], t
        self.log_debug(lambda: F'SBOX = {sbox.hex(" ").upper()}', clip=True)
        # Keystream generation: all index arithmetic is reduced modulo the configured size.
        i = j = 0
        while True:
            i = (i + 1) % size
            t = sbox[i]
            j = (j + t) % size
            sbox[i], sbox[j] = sbox[j], t
            yield sbox[(sbox[i] + t) % size]

class rc5 (key, *, iv=b'', padding=None, mode=None, raw=False, little_endian=False, segment_size=0, rounds=12, word_size=32, aad=b'', tag=())

This unit is implemented in refinery.units.crypto.cipher.rc5 and has the following commandline Interface:

usage: rc5 [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-k N] [-w N] key

RC5 encryption and decryption.

positional arguments:
  key                   The encryption key.

options:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB,
                        OFB, PCBC. By default, the CBC mode is used when an IV is provided,
                        and ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of segmentation bits. It must be a multiple of 8.
                        The default of 0 means that the block size will be used as the segment
                        size.
  -k, --rounds N        Number of rounds to use, the default is 12
  -w, --word-size N     The word size in bits, 32 by default.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation.

Expand source code Browse git

class rc5(StandardBlockCipherUnit, cipher=BlockCipherFactory(RC5)):
    """
    RC5 encryption and decryption.
    """
    def __init__(
        self, key, *, iv=b'', padding=None, mode=None, raw=False, little_endian=False, segment_size=0,
        rounds: Param[int, Arg.Number('-k', help='Number of rounds to use, the default is {default}')] = _R,
        word_size: Param[int, Arg.Number('-w', help='The word size in bits, {default} by default.')] = _W,
        **more
    ):
        super().__init__(
            key, iv=iv, padding=padding, mode=mode, raw=raw,
            little_endian=little_endian, segment_size=segment_size,
            rounds=rounds, word_size=word_size, **more
        )

    @property
    def block_size(self):
        # The word size is given in bits; the block size is a quarter of that number.
        word_bits = self.args.word_size
        return word_bits // 4

    def _new_cipher(self, **optionals) -> CipherInterface:
        # Forward the configured round count and word size to the cipher factory.
        args = self.args
        return super()._new_cipher(rounds=args.rounds, word_size=args.word_size, **optionals)

class rc6 (key, *, iv=b'', padding=None, mode=None, raw=False, little_endian=False, segment_size=0, rounds=20, word_size=32)

This unit is implemented in refinery.units.crypto.cipher.rc6 and has the following commandline Interface:

usage: rc6 [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-k N] [-w N] key

RC6 encryption and decryption. The parameter defaults are the RC6 parameters that were chosen for
the AES candidacy. Only key sizes of 128, 192, and 256 bits are used for AES candidates, but the
unit will allow any key size up to 256 bits.

positional arguments:
  key                   The encryption key.

options:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB,
                        OFB, PCBC. By default, the CBC mode is used when an IV is provided,
                        and ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of segmentation bits. It must be a multiple of 8.
                        The default of 0 means that the block size will be used as the segment
                        size.
  -k, --rounds N        Number of rounds to use, the default is 20
  -w, --word-size N     The word size in bits, 32 by default.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation.

Expand source code Browse git

class rc6(StandardBlockCipherUnit, cipher=BlockCipherFactory(RC6)):
    """
    RC6 encryption and decryption. The parameter defaults are the RC6 parameters that were chosen
    for the AES candidacy. Only key sizes of 128, 192, and 256 bits are used for AES candidates, but
    the unit will allow any key size up to 256 bits.
    """
    def __init__(
        self, key, *, iv=b'', padding=None, mode=None, raw=False, little_endian=False, segment_size=0,
        rounds: Param[int, Arg.Number('-k', help='Number of rounds to use, the default is {default}')] = _R,
        word_size: Param[int, Arg.Number('-w', help='The word size in bits, {default} by default.')] = _W,
    ):
        super().__init__(
            key, iv=iv, padding=padding, mode=mode, raw=raw,
            little_endian=little_endian, segment_size=segment_size,
            rounds=rounds, word_size=word_size
        )

    @property
    def block_size(self):
        # The word size is given in bits; the block size is half of that number.
        word_bits = self.args.word_size
        return word_bits // 2

    def _new_cipher(self, **optionals) -> CipherInterface:
        # Forward the configured round count and word size to the cipher factory.
        args = self.args
        return super()._new_cipher(rounds=args.rounds, word_size=args.word_size, **optionals)

class recode (decode=None, encode='UTF8', decerr=None, encerr=None, errors=None)

This unit is implemented in refinery.units.encoding.recode and has the following commandline Interface:

usage: recode [-h] [-L] [-Q] [-0] [-v] [-R]
              [-d {strict,ignore,replace,xmlref,backslash,surrogate}]
              [-e {strict,ignore,replace,xmlref,backslash,surrogate}]
              [-E {strict,ignore,replace,xmlref,backslash,surrogate}]
              [decode-as] [encode-as]

Expects input string data encoded in the `from` encoding and encodes it in the `to` encoding, then
outputs the result.

positional arguments:
  decode-as                   Input encoding; Guess encoding by default.
  encode-as                   Output encoding; The default is UTF8.

options:
  -d, --decerr {strict,ignore,replace,xmlref,backslash,surrogate}
                              Specify an error handler for decoding.
  -e, --encerr {strict,ignore,replace,xmlref,backslash,surrogate}
                              Specify an error handler for encoding.
  -E, --errors {strict,ignore,replace,xmlref,backslash,surrogate}
                              Specify an error handler for both encoding and decoding. The
                              possible choices are the following: strict, ignore, replace,
                              xmlref, backslash, surrogate

generic options:
  -h, --help                  Show this help message and exit.
  -L, --lenient               Increase the leniency, allowing partial results and ignoring more
                              errors.
  -Q, --quiet                 Disables all log output.
  -0, --devnull               Do not produce any output.
  -v, --verbose               Specify up to two times to increase log level.
  -R, --reverse               Use the reverse operation.

Expand source code Browse git

class recode(Unit):
    """
    Expects input string data encoded in the `from` encoding and encodes it in
    the `to` encoding, then outputs the result.
    """

    def __init__(
        self,
        decode: Param[str, Arg.String(metavar='decode-as', help='Input encoding; Guess encoding by default.')] = None,
        encode: Param[str, Arg.String(metavar='encode-as', help=F'Output encoding; The default is {Unit.codec}.')] = Unit.codec,
        decerr: Param[str, Arg.Option('-d', choices=Handler,
            help='Specify an error handler for decoding.')] = None,
        encerr: Param[str, Arg.Option('-e', choices=Handler,
            help='Specify an error handler for encoding.')] = None,
        errors: Param[str, Arg.Option('-E', choices=Handler, help=(
            'Specify an error handler for both encoding and decoding. '
            'The possible choices are the following: {choices}'))] = None,
    ):
        # A direction-specific handler takes precedence over the shared one; if
        # neither was given, the STRICT handler is used.
        decoding_handler = Arg.AsOption(decerr or errors or 'STRICT', Handler).value
        encoding_handler = Arg.AsOption(encerr or errors or 'STRICT', Handler).value
        super().__init__(
            decode=decode,
            encode=encode,
            decerr=decoding_handler,
            encerr=encoding_handler,
        )

    @Unit.Requires('chardet', ['default', 'extended'])
    def _chardet():
        import chardet
        return chardet

    def _detect(self, data):
        # Guess the codec of the input. If every second byte is zero, the data is
        # treated as UTF-16 without consulting chardet.
        view = memoryview(data)
        if not any(view[1::2]):
            return 'utf-16le'
        if not any(view[0::2]):
            return 'utf-16be'
        detection = self._chardet.detect(data)
        codec = detection['encoding']
        self.log_info(lambda: F'Using input encoding: {codec}, detected with {int(detection["confidence"] * 100)}% confidence.')
        return codec

    def _recode(self, enc, dec, encerr, decerr, data):
        # Decode with `dec` (detected from the data when not given), then encode with `enc`.
        if not dec:
            dec = self._detect(data)
        text = codecs.decode(data, dec, errors=decerr)
        return codecs.encode(text, enc, errors=encerr)

    def reverse(self, data):
        # Same as process, but with the roles of the two codecs and handlers swapped.
        args = self.args
        return self._recode(args.decode, args.encode, args.decerr, args.encerr, data)

    def process(self, data):
        # Decode using the input codec and re-encode using the output codec.
        args = self.args
        return self._recode(args.encode, args.decode, args.encerr, args.decerr, data)

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    # The reverse operation passes the decode settings in the encoder position of
    # _recode and the encode settings in the decoder position.
    args = self.args
    return self._recode(args.decode, args.encode, args.decerr, args.encerr, data)

class reduce (suffix, just=0, temp='t')

This unit is implemented in refinery.units.meta.reduce and has the following commandline Interface:

usage: reduce [-h] [-L] [-Q] [-0] [-v] [-j N] [-t name] suffix

The reduce unit applies an arbitrary multibin suffix repeatedly to reduce a complete frame to a
single chunk. The first chunk in the frame serves as initialization.

positional arguments:
  suffix           The remaining command line is a multibin suffix. The reduction accumulator is
                   initialized with the first chunk in the frame. Then, each remaining chunk is
                   processed with the given suffix and the result is used to overwrite the
                   accumulator.

options:
  -j, --just N     Optionally specify a maximum number of chunks to process beyond the first.
  -t, --temp name  The name of the accumulator variable. The default is "t".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class reduce(Unit):
    """
    The reduce unit applies an arbitrary multibin suffix repeatedly to reduce a complete frame to a
    single chunk. The first chunk in the frame serves as initialization.
    """

    def __init__(self,
        suffix: Param[str, Arg.String(help=(
            'The remaining command line is a multibin suffix. The reduction accumulator is initialized '
            'with the first chunk in the frame. Then, each remaining chunk is processed with the given '
            'suffix and the result is used to overwrite the accumulator.'
        ))],
        just: Param[int, Arg.Number('-j',
            help='Optionally specify a maximum number of chunks to process beyond the first.')] = 0,
        temp: Param[str, Arg.String('-t', metavar='name',
            help='The name of the accumulator variable. The default is "{default}".')] = 't',
    ):
        super().__init__(suffix=suffix, temp=temp, just=just)

    def filter(self, chunks: Iterable[Chunk]):
        # Folds the chunks of a frame into the first chunk (the accumulator) and
        # yields the accumulator, followed by any chunks that were not consumed
        # because of the `just` limit. An empty frame yields nothing.
        it = iter(chunks)
        just = self.args.just
        name = self.args.temp
        try:
            accu = next(it)
        except StopIteration:
            # Without this guard, the StopIteration would escape from a generator
            # body and be converted into a RuntimeError (PEP 479).
            return
        if not just:
            scope = it
        else:
            import itertools
            self.log_info(F'reducing only the next {just} chunks')
            scope = itertools.islice(it, 0, just)
        for chunk in scope:
            # Expose the accumulator to the suffix expression under the configured
            # variable name, then overwrite the accumulator with the result.
            chunk.meta[name] = accu
            accu[:] = DelayedBinaryArgument(self.args.suffix, reverse=True, seed=chunk)(chunk)
            self.log_debug('reduced:', accu, clip=True)
        # Make sure the accumulator variable is not left behind on the output chunk.
        accu.meta.discard(name)
        yield accu
        # Only non-empty when the `just` limit stopped the reduction early.
        yield from it

class rep (count=2, label='')

This unit is implemented in refinery.units.strings.rep and has the following commandline Interface:

usage: rep [-h] [-L] [-Q] [-0] [-v] [count] [label]

Duplicates the given input a given number of times. It is also possible to specify an iterable
instead of a number, in which case the input will be replicated once for each item in this
iterable.

positional arguments:
  count          Defines the number of outputs to generate for each input. The default is 2. You
                 can specify any multibin expression that defines an integer iterable here: Each
                 input chunk will be replicated once for each element of that sequence.
  label          If specified, the meta variable with this name will be populated with the index
                 of the replicated chunk. When the count parameter is an integer, this label will
                 be equivalent to the index meta variable.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class rep(Unit):
    """
    Duplicates the given input a given number of times. It is also possible to specify
    an iterable instead of a number, in which case the input will be replicated once for
    each item in this iterable.
    """

    def __init__(
        self,
        count: Param[isq, Arg.NumSeq(help=(
            'Defines the number of outputs to generate for each input. The default is {default}. '
            'You can specify any multibin expression that defines an integer iterable here: Each '
            'input chunk will be replicated once for each element of that sequence.'))] = 2,
        label: Param[str, Arg.String(help=(
            'If specified, the meta variable with this name will be populated with the index of '
            'the replicated chunk. When the count parameter is an integer, this label will be '
            'equivalent to the index meta variable.'))] = ''
    ):
        super().__init__(count=count, label=label)

    def process(self, data: bytearray):
        # Outside of a frame (or when squeezing), a single chunk containing the
        # repeated data is emitted. Otherwise, each repetition becomes a separate
        # chunk, optionally labelled with the corresponding sequence element.
        def count():
            # Number of repetitions: the integer itself, or the length of the sequence.
            count = self.args.count
            if isinstance(count, int):
                return count
            return sum(1 for _ in count)

        if self.args.squeeze or not self._framed:
            self.log_debug('compressing all repeated items into a single chunk')
            yield data * count()
            return

        self.log_debug('emitting each repeated item as an individual chunk')

        if label := self.args.label:
            counters = self.args.count
            if isinstance(counters, int):
                # A plain integer (such as the default) is not iterable; label the
                # copies with their index, as the help text for `label` describes.
                counters = range(counters)
            meta = {}
            for counter in counters:
                meta[label] = counter
                yield self.labelled(data, **meta)
        else:
            yield from repeat(data, count())

class repl (search, replace=b'', count=-1)

This unit is implemented in refinery.units.strings.repl and has the following commandline Interface:

usage: repl [-h] [-L] [-Q] [-0] [-v] [-n N] search [replace]

Performs a simple binary string replacement on the input data.

positional arguments:
  search         This is the search term.
  replace        The substitution string. Leave this empty to remove all occurrences of the
                 search term.

options:
  -n, --count N  Only replace the given number of occurrences

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class repl(Unit):
    """
    Performs a simple binary string replacement on the input data.
    """

    def __init__(
        self,
        search: Param[buf, Arg(help='This is the search term.')],
        replace: Param[buf, Arg(help='The substitution string. Leave this empty to remove all occurrences of the search term.')] = B'',
        count: Param[int, Arg.Number('-n', help='Only replace the given number of occurrences')] = -1
    ):
        super().__init__(
            search=search,
            replace=replace,
            count=count,
        )

    def process(self, data: bytearray):
        # Delegates to bytearray.replace; the default count of -1 replaces every occurrence.
        args = self.args
        return data.replace(args.search, args.replace, args.count)

class resplit (regex=b'\\r?\\n', multiline=False, ignorecase=False, count=0)

This unit is implemented in refinery.units.pattern.resplit and has the following commandline Interface:

usage: resplit [-h] [-L] [-Q] [-0] [-v] [-M] [-I] [-c N] [regex]

Splits the data at the given regular expression and returns the sequence of chunks between the
separators. By default, the input is split along line breaks.

positional arguments:
  regex             Regular expression to match.

options:
  -M, --multiline   Caret and dollar in regular expressions match the beginning and end of a line
                    and a dot does not match line breaks.
  -I, --ignorecase  Ignore capitalization for alphabetic characters in regular expressions.
  -c, --count N     Specify the maximum number of operations to perform.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class resplit(SingleRegexUnit):
    """
    Splits the data at the given regular expression and returns the sequence of
    chunks between the separators. By default, the input is split along line breaks.
    """

    def __init__(
        self, regex=RB'\r?\n', multiline=False, ignorecase=False, count=0
    ):
        super().__init__(
            regex=regex,
            multiline=multiline,
            ignorecase=ignorecase,
            count=count,
        )

    def process(self, data):
        # Yields the piece in front of each separator match, followed by that
        # match's capture groups, and finally whatever remains after the last
        # processed separator.
        view = memoryview(data)
        limit = self.args.count
        position = 0
        # Numbering starts at 2 so that the loop ends right after `limit` matches.
        for number, match in enumerate(self.regex.finditer(view), 2):
            yield view[position:match.start()]
            position = match.end()
            yield from match.groups()
            # A limit of zero means that the number of splits is unrestricted.
            if 0 < limit < number:
                break
        yield view[position:]

class resub (regex='\\s+', subst=b'', multiline=False, ignorecase=False, count=0)

This unit is implemented in refinery.units.pattern.resub and has the following commandline Interface:

usage: resub [-h] [-L] [-Q] [-0] [-v] [-M] [-I] [-c N] [regex] [subst]

A unit for performing substitutions based on a binary regular expression pattern. Besides the
syntax {k} to insert the k-th match group, the unit supports processing the contents of match
groups with arbitrary refinery units. To do so, use the following F-string-like syntax:

    {match-group:handlers}

where :handlers is an optional reverse multibin expression that is used to post-process the
binary data from the match. For example, {2:hex:b64} represents the base64-decoding of the hex-
decoding of the second match group.

positional arguments:
  regex             Regular expression to be searched and replaced. The default is "\s+".
  subst             Substitution value: use {1} for group 1, {0} for entire match. Matches are
                    removed (replaced by an empty string) by default.

options:
  -M, --multiline   Caret and dollar in regular expressions match the beginning and end of a line
                    and a dot does not match line breaks.
  -I, --ignorecase  Ignore capitalization for alphabetic characters in regular expressions.
  -c, --count N     Specify the maximum number of operations to perform.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class resub(SingleRegexUnit):
    """
    A unit for performing substitutions based on a binary regular expression pattern. Besides the
    syntax `{k}` to insert the `k`-th match group, the unit supports processing the contents of
    match groups with arbitrary refinery units. To do so, use the following F-string-like syntax:

        {match-group:handlers}

    where `:handlers` is an optional reverse multibin expression that is used to post-process the
    binary data from the match. For example, `{2:hex:b64}` represents the base64-decoding of the
    hex-decoding of the second match group.
    """
    def __init__(
        self,
        regex: Param[str, Arg(help='Regular expression to be searched and replaced. The default is "{default}".')] = '\\s+',
        subst: Param[buf, Arg('subst', help=(
            'Substitution value: use {1} for group 1, {0} for entire match. Matches are removed '
            '(replaced by an empty string) by default.'
        ))] = B'',
        multiline=False,
        ignorecase=False,
        count=0
    ):
        super().__init__(
            regex=regex,
            subst=subst,
            multiline=multiline,
            ignorecase=ignorecase,
            count=count,
        )

    def process(self, data):
        self.log_info('pattern:', getattr(self.regex, 'pattern', self.regex))
        self.log_info('replace:', self.args.subst)
        meta = metavars(data)
        # The substitution is a format specification; bytes outside the ASCII range
        # are turned into backslash escapes when it is converted to a string.
        spec = self.args.subst.decode('ascii', 'backslashreplace')

        def repl(match: Match):
            # Positional fields are the whole match followed by all groups, named
            # fields are the named groups of the match.
            positional = [match[0], *match.groups()]
            return meta.format_bin(spec, self.codec, positional, match.groupdict())

        limit = self.args.count
        # A count of zero means that all occurrences are substituted.
        if limit:
            return self.regex.sub(repl, data, count=limit)
        return self.regex.sub(repl, data)

class rev (blocksize=1)

This unit is implemented in refinery.units.blockwise.rev and has the following commandline Interface:

usage: rev [-h] [-L] [-Q] [-0] [-v] [-B N]

The blocks of the input data are output in reverse order. If the length of the input data is not
a multiple of the block size, the data is truncated.

options:
  -B, --blocksize N  The size of each block in bytes. The default is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class rev(UnaryOperation):
    """
    The blocks of the input data are output in reverse order. If the length of
    the input data is not a multiple of the block size, the data is truncated.
    """

    def __init__(self, blocksize=1):
        # NOTE(review): the meaning of `_truncate=2` is defined by the base class,
        # which is not visible here; presumably it selects the truncation behavior
        # described in the class documentation - confirm.
        super().__init__(blocksize=blocksize, _truncate=2)

    def inplace(self, block: ndarray):
        # Reverses the order of the elements of the given numpy array.
        return self._numpy.flip(block)

    # There is no per-block operation for this unit.
    # NOTE(review): presumably the base class checks for this marker - confirm.
    operate = NotImplemented

    def process(self, data: bytearray):
        # Reverses the block order of `data` in place and returns it.
        if self.bytestream:
            # Shortcut that reverses the buffer byte by byte.
            # NOTE(review): presumably `bytestream` means a block size of 1 - confirm.
            data.reverse()
            return data
        try:
            return self._fastblock(data)
        except FastBlockError:
            # Fallback: swap blocks manually, working from both ends towards the middle.
            b = self.blocksize      # block size in bytes
            n = len(data)           # total input length
            q = n // b              # number of complete blocks
            m = q * b               # length covered by complete blocks
            view = memoryview(data)
            temp = bytearray(b)
            # Only the first half of the blocks has to be visited; for an odd
            # number of blocks, the middle block stays where it is.
            for k in range(0, (q // 2) * b, b):
                lhs = slice(k, k + b)
                rhs = slice(m - k - b, m - k)
                # The right-hand block is saved first because it is overwritten next.
                temp[:] = view[rhs]
                data[rhs] = view[lhs]
                data[lhs] = temp
            if m < n:
                # A bytearray cannot be resized while a memoryview of it exists, so
                # the view has to be released before the partial block is cut off.
                del view
                del temp
                del data[m:]
            return data

class rex (regex, /, *transformation, unicode=False, unique=False, multiline=False, ignorecase=False, min=1, max=None, len=None, stripspace=False, longest=False, take=None)

This unit is implemented in refinery.units.pattern.rex and has the following commandline Interface:

usage: rex [-h] [-L] [-Q] [-0] [-v] [-u] [-q] [-M] [-I] [-n N] [-m N] [-e N] [-x] [-l] [-t K]
           regex [transformation ...]

Short for Regular Expression eXtractor: A binary grep which can apply a transformation to each
match. Each match is an individual output. Besides the syntax {k} to insert the k-th match group,
the unit supports processing the contents of match groups with arbitrary refinery units. To do
so, use the following F-string-like syntax:

    {match-group:pipeline}

where :pipeline is an optional pipeline of refinery commands as it would be specified on the
command line. The value of the corresponding match is post-processed with this command. The unit
also supports the special output format {.} which represents the input data.

positional arguments:
  regex             Regular expression to match.
  transformation    An optional sequence of transformations to be applied to each match. Each
                    transformation produces one output in the order in which they are given. The
                    default transformation is {0}, i.e. the entire match.

options:
  -u, --unicode     Also find unicode strings.
  -q, --unique      Yield every (transformed) match only once.
  -M, --multiline   Caret and dollar in regular expressions match the beginning and end of a line
                    and a dot does not match line breaks.
  -I, --ignorecase  Ignore capitalization for alphabetic characters in regular expressions.
  -n, --min N       Matches must have length at least N.
  -m, --max N       Matches must have length at most N.
  -e, --len N       Matches must be of length N.
  -x, --stripspace  Strip all whitespace from input data.
  -l, --longest     Pick longer results first. The output will be sorted by length unless the
                    --take option is specified, in which case the longest K results will be
                    returned in order of appearance.
  -t, --take K      Return only the first K occurrences in order of appearance. If --longest is
                    specified, the K longest results will be returned in order of appearance
                    within the input.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class rex(SingleRegexUnit, PatternExtractor):
    """
    Short for Regular Expression eXtractor: A binary grep which can apply a transformation to each
    match. Each match is an individual output. Besides the syntax `{k}` to insert the `k`-th match
    group, the unit supports processing the contents of match groups with arbitrary refinery units.
    To do so, use the following F-string-like syntax:

        {match-group:pipeline}

    where `:pipeline` is an optional pipeline of refinery commands as it would be specified on
    the command line. The value of the corresponding match is post-processed with this command. The
    unit also supports the special output format `{%s}` which represents the input data.
    """
    def __init__(
        self, regex,
        /,
        *transformation: Param[str, Arg.String(help=(
            'An optional sequence of transformations to be applied to each match. '
            'Each transformation produces one output in the order in which they '
            'are given. The default transformation is {0}, i.e. the entire match.'
        ))],
        unicode: Param[bool, Arg.Switch('-u', help='Also find unicode strings.')] = False,
        unique: Param[bool, Arg.Switch('-q', help='Yield every (transformed) match only once.')] = False,
        multiline=False, ignorecase=False, min=1, max=None, len=None, stripspace=False,
        longest=False, take=None
    ):
        # Besides forwarding all parameters, three additional settings are derived
        # here: `utf16` mirrors the unicode switch, `ascii` is always enabled, and
        # `duplicates` is the negation of the unique switch.
        super().__init__(
            regex=regex,
            transformation=transformation,
            unicode=unicode,
            unique=unique,
            multiline=multiline,
            ignorecase=ignorecase,
            min=min,
            max=max,
            len=len,
            stripspace=stripspace,
            longest=longest,
            take=take,
            utf16=unicode,
            ascii=True,
            duplicates=not unique
        )

    def process(self, data):
        # Translates each transformation specification into either a group index
        # or a callable, and hands them to the filtered match generator.
        meta = metavars(data)
        wrap = ByteStringWrapper.Wrap(data)
        self.log_debug('regular expression:', getattr(self.regex, 'pattern', self.regex))
        transformations = []
        specs: list[str] = list(self.args.transformation)
        if not specs:
            # Without any transformation, the entire match is extracted.
            specs.append('{0}')
        for spec in specs:
            if spec.startswith('{') and spec.endswith('}') and (group := spec[1:-1]).isdigit():
                # A plain group reference like {3} is passed on as the integer 3.
                transformations.append(int(group))
            else:
                # The spec is bound as a default argument so that each closure keeps
                # its own value instead of the last value of the loop variable.
                def transformation(match: Match, s=spec):
                    # Named groups, except those whose name starts with a double
                    # underscore; groups without a value become empty strings.
                    symb: dict = {
                        key: (value or b'') for key, value in match.groupdict().items()
                        if not key.startswith('__')}
                    args: list = [match.group(0), *match.groups()]
                    used = set()
                    # NOTE(review): the comprehension above already replaced every
                    # None value, so this loop appears to have no effect.
                    for key, value in symb.items():
                        if value is None:
                            symb[key] = B''
                    # Make the input data available to the format specification.
                    symb[_FORWARD_VAR] = wrap
                    # NOTE(review): `used` is presumably filled by the formatter with
                    # the names of the variables that the spec consumed - confirm.
                    item = meta.format(s, self.codec, args, symb, True, True, used)
                    # Empty groups and the input data reference are removed together
                    # with the names in `used` before the groups become metadata.
                    used.update(key for key, value in symb.items() if not value)
                    used.add(_FORWARD_VAR)
                    for variable in used:
                        symb.pop(variable, None)
                    symb.update(offset=match.start())
                    # The output chunk receives the input's meta variables first and
                    # the remaining group values plus the match offset on top.
                    chunk = Chunk(item)
                    chunk.meta.update(meta)
                    chunk.meta.update(symb)
                    return chunk
                transformations.append(transformation)
        yield from self.matches_filtered(
            memoryview(data),
            self.regex,
            *transformations,
            expose_named_groups=True
        )

class rijndael (key, iv=b'', block_size=16, *, aad=b'', tag=(), segment_size=0, little_endian=False, raw=False, mode=None, padding=None)

This unit is implemented in refinery.units.crypto.cipher.rijndael and has the following commandline Interface:

usage: rijndael [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-b N] [-p P] [-m M] [-r] [-e] [-S N] key

Rijndael encryption and decryption. Note that there is also an aes unit which has much better
performance because it calls into the PyCryptodome library. You would have to use this specific
Rijndael unit only if Rijndael is used with a block size that is different from 16 bytes; with a
block size of 16 bytes, it is equivalent to AES.

positional arguments:
  key                   The encryption key.

options:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -b, --block-size N    Cipher block size, default is 16. Valid choices are 16, 24, and 32.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB,
                        OFB, PCBC. By default, the CBC mode is used when an IV is provided,
                        and ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of segmentation bits. It must be a multiple of 8.
                        The default of 0 means that the block size will be used as the segment
                        size.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation.

Expand source code Browse git

class rijndael(StandardBlockCipherUnit, cipher=BlockCipherFactory(Rijndael)):
    """
    Rijndael encryption and decryption. Note that there is also a `refinery.aes` unit which has
    much better performance because it calls into the PyCryptodome library. You would have to
    use this specific Rijndael unit only if Rijndael is used with a block size that is different
    from 16 bytes, in which case it is equivalent to AES.
    """
    def __init__(
        self, key, iv=b'',
        block_size: Param[int, Arg.Number('-b', help='Cipher block size, default is {default}. Valid choices are 16, 24, and 32.')] = 16,
        **more
    ):
        # An initializer must not return a value, so the base class initializer is
        # called as a plain statement. All remaining keyword arguments are passed on
        # to the block cipher base class.
        super().__init__(key, iv=iv, block_size=block_size, **more)

    @property
    def block_size(self):
        # Unlike for ciphers with a fixed block size, this is a unit argument.
        return self.args.block_size

    def _new_cipher(self, **optionals) -> CipherInterface:
        # Adds the configured block size to the options for the cipher factory.
        return super()._new_cipher(block_size=self.args.block_size, **optionals)

class ripemd128 (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: ripemd128 [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the RIPEMD-128 hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class ripemd128(HashUnit):
    """
    Returns the RIPEMD-128 hash of the input data.
    """
    def _algorithm(self, data):
        # The library function has the same name as this unit; it is imported
        # under an alias to keep the two apart.
        from refinery.lib.ripemd128 import ripemd128 as digest
        return digest(data)

class ripemd160 (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: ripemd160 [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the RIPEMD-160 hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

class rmv (*names)

This unit is implemented in refinery.units.meta.rmv and has the following commandline Interface:

usage: rmv [-h] [-L] [-Q] [-0] [-v] [name ...]

Short for "ReMove Variable": Removes meta variables that were created in the current frame. If no
variable names are given, the unit removes all of them. Note that this can recover variables from
outer frames that were previously shadowed.

positional arguments:
  name           Name of a variable to be removed.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class rmv(Unit):
    """
    Short for "ReMove Variable": Removes meta variables that were created in the current frame. If no
    variable names are given, the unit removes all of them. Note that this can recover variables from
    outer frames that were previously shadowed.
    """
    def __init__(self, *names: Param[str, Arg.String(metavar='name', help='Name of a variable to be removed.')]):
        super().__init__(names=names)

    def process(self, data: Chunk):
        meta = metavars(data)
        targets = self.args.names
        if not targets:
            # No names given: collect all variable names up front, because the
            # variables are discarded while looping.
            targets = list(meta.variable_names())
        for name in targets:
            meta.discard(name)
        return data

class rncrypt (password)

This unit is implemented in refinery.units.crypto.cipher.rncrypt and has the following commandline Interface:

usage: rncrypt [-h] [-L] [-Q] [-0] [-v] [-R] password

Implements encryption and decryption using the RNCryptor specification. See also:
https://github.com/RNCryptor

positional arguments:
  password

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class rncrypt(Unit):
    """
    Implements encryption and decryption using the RNCryptor specification.
    See also: https://github.com/RNCryptor
    """
    def __init__(self, password: bytearray):
        super().__init__(password=password)

    def process(self, data: bytearray) -> bytes:
        # Layout as sliced below: 2 header bytes, 8 bytes of encryption salt, 8 bytes of HMAC
        # salt, a 16 byte IV, the ciphertext, and a trailing 32 byte HMAC-SHA256 signature.
        password = self.args.password
        salt_for_cipher = data[2:10]
        salt_for_hmac = data[10:18]
        key_for_cipher = self._pbkdf2(password, salt_for_cipher)
        key_for_hmac = self._pbkdf2(password, salt_for_hmac)
        # The signature covers everything except the signature itself.
        expected = self._hmac(key_for_hmac, data[:-32])
        if not hmac.compare_digest(expected, data[-32:]):
            raise ValueError("Failed to verify signature.")
        plaintext = self._aes_decrypt(key_for_cipher, data[18:34], data[34:-32])
        return unpad(plaintext, block_size=AES.block_size)

    def reverse(self, data: bytes) -> bytes:
        # Salts and IV are drawn from the random source in this order: encryption salt, HMAC
        # salt, IV. The emitted message starts with the header bytes 03 01.
        rng = Random.new()
        password = self.args.password
        padded = pad(data, block_size=AES.block_size)
        salt_for_cipher = rng.read(8)
        key_for_cipher = self._pbkdf2(password, salt_for_cipher)
        salt_for_hmac = rng.read(8)
        key_for_hmac = self._pbkdf2(password, salt_for_hmac)
        iv = rng.read(AES.block_size)
        encrypted = self._aes_encrypt(key_for_cipher, iv, padded)
        message = b''.join((b'\x03\x01', salt_for_cipher, salt_for_hmac, iv, encrypted))
        return message + self._hmac(key_for_hmac, message)

    def _aes_encrypt(self, key, iv, text):
        # AES in CBC mode with the given key and IV.
        cipher = AES.new(key, AES.MODE_CBC, iv)
        return cipher.encrypt(text)

    def _aes_decrypt(self, key, iv, text):
        # AES in CBC mode with the given key and IV.
        cipher = AES.new(key, AES.MODE_CBC, iv)
        return cipher.decrypt(text)

    def _hmac(self, key, data):
        # HMAC-SHA256 digest used as the message signature.
        signer = hmac.new(key, data, hashlib.sha256)
        return signer.digest()

    def _prf(self, secret, salt):
        # HMAC-SHA1, used as the pseudo-random function for PBKDF2 below.
        mac = hmac.new(secret, salt, hashlib.sha1)
        return mac.digest()

    def _pbkdf2(self, password, salt, iterations=10000, key_length=32):
        # Derive key_length bytes from the password with PBKDF2 using the HMAC-SHA1 based PRF.
        return KDF.PBKDF2(
            password,
            salt,
            dkLen=key_length,
            count=iterations,
            prf=self._prf,
        )

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data: bytes) -> bytes:
    # Salts and IV are drawn from the random source in this order: encryption salt, HMAC
    # salt, IV. The emitted message starts with the header bytes 03 01.
    rng = Random.new()
    password = self.args.password
    padded = pad(data, block_size=AES.block_size)
    salt_for_cipher = rng.read(8)
    key_for_cipher = self._pbkdf2(password, salt_for_cipher)
    salt_for_hmac = rng.read(8)
    key_for_hmac = self._pbkdf2(password, salt_for_hmac)
    iv = rng.read(AES.block_size)
    encrypted = self._aes_encrypt(key_for_cipher, iv, padded)
    message = b''.join((b'\x03\x01', salt_for_cipher, salt_for_hmac, iv, encrypted))
    return message + self._hmac(key_for_hmac, message)

class rot (amount=13)

This unit is implemented in refinery.units.crypto.cipher.rot and has the following commandline Interface:

usage: rot [-h] [-L] [-Q] [-0] [-v] [N]

Rotate the characters of the alphabet by the given amount. The default amount is 13, providing
the common (and weak) string obfuscation method.

positional arguments:
  N              Number of letters to rotate by; Default is 13.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class rot(Unit):
    """
    Rotate the characters of the alphabet by the given amount. The default
    amount is 13, providing the common (and weak) string obfuscation method.
    """

    def __init__(self, amount: Param[int, Arg.Number(help='Number of letters to rotate by; Default is 13.')] = 13):
        super().__init__(amount=amount)

    def process(self, data: bytearray):
        shift = self.args.amount % 26
        for position, value in enumerate(data):
            # Pick the first alphabet that contains this byte; other bytes stay untouched.
            alphabet = next((a for a in (_LCASE, _UCASE) if value in a), None)
            if alphabet is None:
                continue
            base = alphabet[0]
            data[position] = base + (value - base + shift) % 26
        return data

class rotl (*argument, bigendian=False, blocksize=1)

This unit is implemented in refinery.units.blockwise.rotl and has the following commandline Interface:

usage: rotl [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] [argument ...]

Rotate the bits of each block left.

positional arguments:
  argument           A single numeric expression which provides the right argument to the
                     operation, where the left argument is each block in the input data. This
                     argument can also contain a sequence of bytes which is then split into
                     blocks of the same size as the input data and used cyclically.

options:
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes. The default is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class rotl(BinaryOperation):
    """
    Rotate the bits of each block left.
    """
    def operate(self, value, shift):
        # Reduce the rotation amount modulo self.fbits (presumably the block width in bits;
        # confirm in BinaryOperation).
        shift %= self.fbits
        # Combine the left-shifted value with the bits that wrap around from the top. The result
        # is not truncated to self.fbits bits here; presumably the caller masks it - TODO confirm.
        return (value << shift) | (value >> (self.fbits - shift))

    def inplace(self, value, shift):
        # Same rotation as operate, but written with augmented assignments and without a return
        # value. NOTE(review): this only has a visible effect for the caller if value is a
        # mutable object supporting in-place operators (e.g. an array type) - confirm.
        shift %= self.fbits
        # Save the wrapped-around bits before value is modified.
        lower = value >> (self.fbits - shift)
        value <<= shift
        value |= lower

class rotr (*argument, bigendian=False, blocksize=1)

This unit is implemented in refinery.units.blockwise.rotr and has the following commandline Interface:

usage: rotr [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] [argument ...]

Rotate the bits of each block right.

positional arguments:
  argument           A single numeric expression which provides the right argument to the
                     operation, where the left argument is each block in the input data. This
                     argument can also contain a sequence of bytes which is then split into
                     blocks of the same size as the input data and used cyclically.

options:
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes. The default is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class rotr(BinaryOperation):
    """
    Rotate the bits of each block right.
    """
    def operate(self, value, shift):
        # Reduce the rotation amount modulo self.fbits (presumably the block width in bits;
        # confirm in BinaryOperation).
        shift %= self.fbits
        # Combine the right-shifted value with the bits that wrap around from the bottom. The
        # result is not truncated to self.fbits bits here; presumably the caller masks it - TODO
        # confirm.
        return (value >> shift) | (value << (self.fbits - shift))

    def inplace(self, value, shift):
        # Same rotation as operate, but written with augmented assignments and without a return
        # value. NOTE(review): this only has a visible effect for the caller if value is a
        # mutable object supporting in-place operators (e.g. an array type) - confirm.
        shift %= self.fbits
        # Save the right-shifted part before value is modified.
        lower = value >> shift
        value <<= self.fbits - shift
        value |= lower

class rsa (key, swapkeys=False, textbook=False, padding=0, rsautl=False)

This unit is implemented in refinery.units.crypto.cipher.rsa and has the following commandline Interface:

usage: rsa [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-t | -p P | -r] key

Implements single block RSA encryption and decryption. This unit can be used to encrypt and
decrypt blocks generated by openssl's rsautl tool when using the mode -verify. When it is
executed with a public key for decryption or with a private key for encryption, it will perform a
raw RSA operation. The results of these operations are (un)padded using EMSA-PKCS1-v1_5.

positional arguments:
  key              RSA key in PEM, DER, or Microsoft BLOB format.

options:
  -s, --swapkeys   Swap public and private exponent.
  -t, --textbook   Equivalent to --padding=NONE.
  -p, --padding P  Choose one of the following padding modes: auto, none, oaep, pkcs15, pkcs10.
                   The default is AUTO.
  -r, --rsautl     Act as rsautl from OpenSSL; This is equivalent to --swapkeys --padding=PKCS10

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation.

Expand source code Browse git

class rsa(Unit):
    """
    Implements single block RSA encryption and decryption. This unit can be used to encrypt
    and decrypt blocks generated by openssl's `rsautl` tool when using the mode `-verify`.
    When it is executed with a public key for decryption or with a private key for encryption,
    it will perform a raw RSA operation. The results of these operations are (un)padded using
    EMSA-PKCS1-v1_5.
    """
    def __init__(
        self,
        key: Param[buf, Arg(help='RSA key in PEM, DER, or Microsoft BLOB format.')],
        swapkeys: Param[bool, Arg.Switch('-s', help='Swap public and private exponent.')] = False,
        textbook: Param[bool, Arg.Switch('-t', group='PAD', help='Equivalent to --padding=NONE.')] = False,
        padding: Param[str, Arg.Option('-p', group='PAD', metavar='P', choices=PAD,
            help='Choose one of the following padding modes: {choices}. The default is AUTO.')] = PAD.AUTO,
        rsautl: Param[bool, Arg.Switch('-r', group='PAD',
            help='Act as rsautl from OpenSSL; This is equivalent to --swapkeys --padding=PKCS10')] = False,
    ):
        padding = Arg.AsOption(padding, PAD)
        # The textbook and rsautl switches are shortcuts for specific padding modes; reject any
        # combination with an explicitly chosen, different padding mode.
        if textbook:
            if padding != PAD.AUTO:
                raise ValueError('Conflicting padding options!')
            padding = PAD.NONE
        if rsautl:
            if padding and padding != PAD.PKCS10:
                raise ValueError('Conflicting padding options!')
            swapkeys = True
            padding = PAD.PKCS10

        super().__init__(key=key, textbook=textbook, padding=padding, swapkeys=swapkeys)

        # Cache for the parsed key, see the key property.
        self._key_hash = None
        self._key_data = None

    @property
    def blocksize(self) -> int:
        """
        The size of the key modulus in bytes, as reported by the parsed key.
        """
        return self.key.size_in_bytes()

    @property
    def _blocksize_plain(self) -> int:
        # PKCS#1 v1.5 padding is at least 11 bytes.
        return self.blocksize - 11

    @property
    def pub(self):
        """
        The exponent used for encryption; this is the private exponent if swapkeys is set.
        """
        return self.key.d if self.args.swapkeys else self.key.e

    @property
    def prv(self):
        """
        The exponent used for decryption; this is the public exponent if swapkeys is set.
        """
        return self.key.e if self.args.swapkeys else self.key.d

    def _get_msg(self, data):
        """
        Interpret the input bytes as a big endian integer. Raises a ValueError if the integer is
        not smaller than the key modulus.
        """
        msg = int.from_bytes(data, byteorder='big')
        # A valid RSA message representative is strictly smaller than the modulus.
        if msg >= self.key.n:
            raise ValueError(F'This key can only handle messages of size {self.blocksize}.')
        return msg

    def _encrypt_raw(self, data):
        """
        Raw modular exponentiation with the encryption exponent; the result is encoded as a big
        endian integer of exactly blocksize bytes.
        """
        return pow(
            self._get_msg(data),
            self.pub,
            self.key.n
        ).to_bytes(self.blocksize, byteorder='big')

    def _decrypt_raw(self, data):
        """
        Raw modular exponentiation with the decryption exponent; the result is encoded as a big
        endian integer of exactly blocksize bytes.
        """
        return pow(
            self._get_msg(data),
            self.prv,
            self.key.n
        ).to_bytes(self.blocksize, byteorder='big')

    def _unpad(self, data, head, padbyte=None):
        """
        Remove padding of the form [head][pad bytes][00][payload] and return the payload. If a
        padbyte is given, every pad byte has to be equal to it. Raises a ValueError if the input
        is longer than one block or does not have this shape.
        """
        if len(data) > self.blocksize:
            raise ValueError(F'This key can only handle messages of size {self.blocksize}.')
        if data.startswith(head):
            # The search for the separator starts at offset 2, i.e. after the two byte head.
            pos = data.find(B'\0', 2)
            if pos > 0:
                pad = data[2:pos]
                if padbyte is None or all(b == padbyte for b in pad):
                    return data[pos + 1:]
        raise ValueError('Incorrect padding')

    def _pad(self, data, head, padbyte=None):
        """
        Pad the input to a full block of the form [head][pad bytes][00][data]. The pad bytes are
        all equal to padbyte if it is given, and random nonzero bytes otherwise. Raises a
        ValueError if the input is too long to be padded.
        """
        if len(data) > self._blocksize_plain:
            raise ValueError(F'This key can only encrypt messages of size at most {self._blocksize_plain}.')
        pad = self.blocksize - len(data) - len(head) - 1
        if padbyte is not None:
            padding = pad * bytes((padbyte,))
        else:
            # Start with a single zero byte so that the loop body runs at least once; each pass
            # drops all zero bytes and refills with random bytes until none of them is zero.
            padding = bytearray(1)
            while not all(padding):
                padding = bytearray(filter(None, padding))
                padding.extend(get_random_bytes(pad - len(padding)))
        return head + padding + B'\0' + data

    def _unpad_pkcs10(self, data):
        return self._unpad(data, B'\x00\x01', 0xFF)

    def _unpad_pkcs15(self, data):
        return self._unpad(data, B'\x00\x02', None)

    def _pad_pkcs10(self, data):
        return self._pad(data, B'\x00\x01', 0xFF)

    def _pad_pkcs15(self, data):
        return self._pad(data, B'\x00\x02', None)

    def _decrypt_block_OAEP(self, data):
        self.log_debug('Attempting decryption with PyCrypto PKCS1 OAEP.')
        return PKCS1_OAEP.new(self.key).decrypt(data)

    def _encrypt_block_OAEP(self, data):
        self.log_debug('Attempting encryption with PyCrypto PKCS1 OAEP.')
        return PKCS1_OAEP.new(self.key).encrypt(data)

    def _decrypt_block(self, data):
        """
        Decrypt a single block. OAEP is attempted first when the padding mode is AUTO or OAEP;
        in AUTO mode, an OAEP failure disables further OAEP attempts and the block is decrypted
        with the raw operation and unpadded according to the padding mode.
        """
        if self._oaep and self._pads in {PAD.AUTO, PAD.OAEP}:
            try:
                return self._decrypt_block_OAEP(data)
            except ValueError as E:
                # A truthy padding mode here means OAEP was requested explicitly.
                if self._pads:
                    raise
                self.log_debug(F'{E!s} No longer attempting OAEP.')
                self._oaep = False

        data = self._decrypt_raw(data)
        return self._unpad_per_argument(data)

    def _unpad_per_argument(self, data):
        """
        Remove the padding from a raw block according to the current padding mode. In AUTO mode,
        PKCS1.0 and PKCS1.5 are tried in this order and the first one that works is remembered
        for subsequent blocks. If neither works, a RefineryPartialResult carrying the raw data
        is raised.
        """
        if self._pads == PAD.NONE:
            return data
        elif self._pads == PAD.PKCS10:
            return self._unpad_pkcs10(data)
        elif self._pads == PAD.PKCS15:
            return self._unpad_pkcs15(data)
        elif self._pads == PAD.AUTO:
            with suppress(ValueError):
                data = self._unpad_pkcs10(data)
                self.log_info('Detected PKCS1.0 padding.')
                self._pads = PAD.PKCS10
                return data
            with suppress(ValueError):
                data = self._unpad_pkcs15(data)
                self.log_info('Detected PKCS1.5 padding.')
                self._pads = PAD.PKCS15
                return data
            raise RefineryPartialResult('No padding worked, returning raw decrypted blocks.', data)
        else:
            raise ValueError(F'Invalid padding value: {self._pads!r}')

    def _encrypt_block(self, data):
        """
        Encrypt a single block. OAEP is attempted when the padding mode is AUTO or OAEP; in AUTO
        mode, an OAEP failure switches the padding mode to PKCS1.5 for this and later blocks.
        """
        if self._pads in {PAD.AUTO, PAD.OAEP}:
            try:
                return self._encrypt_block_OAEP(data)
            except ValueError:
                # A truthy padding mode here means OAEP was requested explicitly.
                if self._pads: raise
                self.log_debug('PyCrypto primitives for OAEP failed, falling back to PKCS1.5.')
                self._pads = PAD.PKCS15

        if self._pads == PAD.PKCS15:
            data = self._pad_pkcs15(data)
        elif self._pads == PAD.PKCS10:
            data = self._pad_pkcs10(data)

        return self._encrypt_raw(data)

    @property
    def key(self) -> RSA.RsaKey:
        """
        The parsed RSA key. The key argument is only parsed again when its hash differs from the
        hash of the blob that was parsed last.
        """
        key_blob = self.args.key
        key_hash = hash(key_blob)
        if key_hash != self._key_hash:
            fmt, key_data = normalize_rsa_key(key_blob)
            self.log_info(F'successfully parsed RSA key as {fmt.value}')
            self._key_hash = key_hash
            self._key_data = key_data
        return self._key_data

    def process(self, data):
        self._oaep = True
        self._pads = self.args.padding
        if not self.key.has_private():
            # Only a public key is available: apply the raw operation with the encryption
            # exponent to the whole input and remove the padding from the result.
            try:
                return self._unpad_per_argument(self._encrypt_raw(data))
            except RefineryPartialResult:
                raise
            except Exception as E:
                raise ValueError(F'A public key was given for decryption and rsautl mode resulted in an error: {E}') from E
        return B''.join(self._decrypt_block(block) for block in splitchunks(data, self.blocksize))

    def reverse(self, data):
        self._pads = self.args.padding
        return B''.join(self._encrypt_block(block) for block in splitchunks(data, self._blocksize_plain))

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    # Encrypt the input in pieces of at most _blocksize_plain bytes and concatenate the results.
    self._pads = self.args.padding
    plaintext_blocks = splitchunks(data, self._blocksize_plain)
    return B''.join(map(self._encrypt_block, plaintext_blocks))

class rsakey (output=RSAFormat.PEM, public=False)

This unit is implemented in refinery.units.crypto.cipher.rsakey and has the following commandline Interface:

usage: rsakey [-h] [-L] [-Q] [-0] [-v] [-p] [{pem,der,xkms,text,json,blob}]

Parse RSA keys in various formats; PEM, DER, Microsoft BLOB, and W3C-XKMS (XML) format are
supported. The same formats are supported for the input format, but you can also specify a key in
the following format, where both modulus and exponent have to be hex-encoded:
[modulus]:[exponent]

positional arguments:
  {pem,der,xkms,text,json,blob}
                              Select an output format (pem, der, xkms, text, json, blob), default
                              is PEM.

options:
  -p, --public                Force public key output even if the input is private.

generic options:
  -h, --help                  Show this help message and exit.
  -L, --lenient               Increase the leniency, allowing partial results and ignoring more
                              errors.
  -Q, --quiet                 Disables all log output.
  -0, --devnull               Do not produce any output.
  -v, --verbose               Specify up to two times to increase log level.

Expand source code Browse git

class rsakey(Unit):
    """
    Parse RSA keys in various formats; PEM, DER, Microsoft BLOB, and W3C-XKMS (XML) format are supported.
    The same formats are supported for the input format, but you can also specify a key in the following
    format, where both modulus and exponent have to be hex-encoded: `[modulus]:[exponent]`
    """
    def __init__(
        self,
        output: Param[str, Arg.Option(choices=RSAFormat,
            help='Select an output format ({choices}), default is {default}.')] = RSAFormat.PEM,
        public: Param[bool, Arg.Switch('-p',
            help='Force public key output even if the input is private.')] = False,
    ):
        super().__init__(output=Arg.AsOption(output, RSAFormat), public=public)

    def _xkms_wrap(self, number: int):
        # Base64 of the shortest big endian byte encoding of the given integer.
        byte_count = (number.bit_length() + 7) // 8
        return base64.b64encode(number.to_bytes(byte_count, 'big'))

    def process(self, data):
        from refinery.lib.mscrypto import ALGORITHMS, TYPES
        fmt, rsa_key = normalize_rsa_key(data, force_public=self.args.public)
        self.log_info(F'parsing input as {fmt.value} format')
        target = self.args.output

        if target is RSAFormat.PEM:
            yield rsa_key.export_key('PEM')
            return

        if target is RSAFormat.DER:
            yield rsa_key.export_key('DER')
            return

        if target is RSAFormat.BLOB:
            private = rsa_key.has_private()

            def little(value: int, length: int):
                return value.to_bytes(length, 'little')

            # Header: blob type, a 3 byte field with value 2, the algorithm id, and the magic.
            blob = bytearray()
            blob.append(TYPES.PRIVATEKEYBLOB if private else TYPES.PUBLICKEYBLOB)
            blob += little(2, 3)
            blob += little(ALGORITHMS.CALG_RSA_KEYX, 4)
            blob += B'RSA2' if private else B'RSA1'
            # The bit size is the modulus bit length rounded up to a power of two.
            modulus_bits = rsa_key.n.bit_length()
            bits = 2
            while bits < modulus_bits:
                bits <<= 1
            self.log_info(F'using bit size {bits}')
            blob += little(bits, 4)
            full = bits // 8
            blob += little(rsa_key.e, 4)
            blob += little(rsa_key.n, full)
            if private:
                half = full // 2
                private_parts = (
                    (rsa_key.p, half),
                    (rsa_key.q, half),
                    (rsa_key.d % (rsa_key.p - 1), half),
                    (rsa_key.d % (rsa_key.q - 1), half),
                    (pow(rsa_key.q, -1, rsa_key.p), half),
                    (rsa_key.d, full),
                )
                for value, length in private_parts:
                    blob += little(value, length)
            yield blob
            return

        components = {'Modulus': rsa_key.n, 'Exponent': rsa_key.e}

        if rsa_key.has_private():
            # The private values are read from the DER export, skipping its first three items.
            sequence = DerSequence()
            sequence.decode(rsa_key.export_key('DER'))
            private_fields = ('D', 'P', 'Q', 'DP', 'DQ', 'InverseQ')
            components.update(zip(private_fields, itertools.islice(sequence, 3, None)))

        if target is RSAFormat.XKMS:
            lines = []
            for tag, value in components.items():
                encoded = base64.b64encode(number.long_to_bytes(value)).decode('ascii')
                lines.append(F'\t<{tag}>{encoded}</{tag}>')
            tags = '\n'.join(lines)
            yield F'<RSAKeyPair>\n{tags}\n</RSAKeyPair>'.encode(self.codec)
            return

        components['BitSize'] = rsa_key.n.bit_length()
        # Values that do not fit into 32 bits are rendered as uppercase hex strings.
        rendered = {
            tag: F'{value:X}' if value.bit_length() > 32 else value
            for tag, value in components.items()
        }

        if target is RSAFormat.JSON:
            yield json.dumps(rendered, indent=4).encode(self.codec)
            return

        if target is RSAFormat.TEXT:
            for label, value in list(flattened(rendered)):
                text = F'0x{value}' if isinstance(value, str) else str(value)
                text = '\n'.join(textwrap.wrap(text, 80))
                yield F'-- {label + " ":-<77}\n{text!s}'.encode(self.codec)

class rtfc

This unit is implemented in refinery.units.formats.office.rtfc and has the following commandline Interface:

usage: rtfc [-h] [-L] [-Q] [-0] [-v] [-R]

Implements the RTF compression format. This compression algorithm is used, for example, to
compress RTF data in Outlook messages.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class rtfc(Unit):
    """
    Implements the RTF compression format. This compression algorithm is used, for example, to
    compress RTF data in Outlook messages.
    """
    @Unit.Requires('compressed_rtf', ['formats', 'office', 'default', 'extended'])
    def _rtfc():
        import compressed_rtf as library
        return library

    def process(self, data):
        # Decompress with the compressed_rtf module exposed as self._rtfc.
        library = self._rtfc
        return library.decompress(data)

    def reverse(self, data):
        # Compress with the compressed_rtf module exposed as self._rtfc.
        library = self._rtfc
        return library.compress(data)

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    # Compress the input with the object exposed as self._rtfc.
    library = self._rtfc
    return library.compress(data)

class run (*commandline, stream=False, noinput=False, errors=False, timeout=0.0)

This unit is implemented in refinery.units.misc.run and has the following commandline Interface:

usage: run [-h] [-L] [-Q] [-0] [-v] [-s] [-x] [-m] [-t T] ...

Turns any command into a refinery unit. Data is processed by feeding it to the standard input of
a process spawned from the given command line, and then reading the standard output of that
process as the result of the operation. The main purpose of this unit is to allow using the
syntax from frame with other command line tools. By default, the unit buffers all output of a
single execution and emits it as one chunk, but the stream option can be set to emit the output
of the executed command as individual outputs while it arrives. The format string expression {}
or {0} can be used as one of the arguments
passed to the external command to represent the incoming data. In this case, the data will not be
sent to the standard input device of the new process.

positional arguments:
  (all remaining)  All remaining command line tokens form an arbitrary command line to be
                   executed. Use format string syntax to insert meta variables and incoming data
                   chunks.

options:
  -s, --stream     Stream the command output rather than buffering it.
  -x, --noinput    Do not send any input to the new process.
  -m, --errors     Merge stdout and stderr. By default, the standard error stream of the coupled
                   command is forwarded to the logger, i.e. it is only visible if -v is also
                   specified.
  -t, --timeout T  Optionally set an execution timeout as a floating point number in seconds.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class run(Unit):
    """
    Turns any command into a refinery unit. Data is processed by feeding it to the standard input
    of a process spawned from the given command line, and then reading the standard output of that
    process as the result of the operation. The main purpose of this unit is to allow using the
    syntax from `refinery.lib.frame` with other command line tools. By default, the unit buffers
    all output of a single execution and emits it as one chunk, but the `stream` option can be set
    to emit the output of the executed command as individual outputs while it arrives. The format
    string expression `{}` or `{0}` can be used as one of the arguments passed to the external
    command to represent the incoming data. In this case, the data will not be sent to the
    standard input device of the new process.
    """

    # Time in seconds to wait for receiver threads and for process termination.
    _JOIN_TIME = 0.1

    def __init__(
        self, *commandline: Param[str, Arg.String(nargs='...', metavar='(all remaining)', help=(
            'All remaining command line tokens form an arbitrary command line to be executed. Use'
            ' format string syntax to insert meta variables and incoming data chunks.'))],
        stream: Param[bool, Arg.Switch('-s',
            help='Stream the command output rather than buffering it.')] = False,
        noinput: Param[bool, Arg.Switch('-x', help='Do not send any input to the new process.')] = False,
        errors: Param[bool, Arg.Switch('-m', help=(
            'Merge stdout and stderr. By default, the standard error stream of the coupled command'
            ' is forwarded to the logger, i.e. it is only visible if -v is also specified.'
        ))] = False,
        timeout: Param[float, Arg.Double('-t', metavar='T', help=(
            'Optionally set an execution timeout as a floating point number in seconds.'
        ))] = 0.0
    ):
        if not commandline:
            raise ValueError('you need to provide a command line.')
        super().__init__(
            commandline=commandline, errors=errors, noinput=noinput, stream=stream, timeout=timeout)

    def process(self, data):
        def shlexjoin():
            import shlex
            return shlex.join(commandline)

        meta = metavars(data)
        meta.ghost = True
        # NOTE(review): used is handed to meta.format but never inspected afterwards, although
        # the class documentation states that data inserted via {} is not sent to stdin - verify.
        used = set()
        commandline = [
            meta.format(cmd, self.codec, [data], None, False, used=used)
            for cmd in self.args.commandline
        ]

        if self.args.noinput:
            self.log_info('sending no input to process stdin')
            data = None

        if not self.log_debug(commandline):
            self.log_info(shlexjoin)

        posix = 'posix' in sys.builtin_module_names
        # The command line is a list of arguments and must not be routed through a shell: On
        # POSIX, shell=True would run only the first list item as the command, and formatted
        # chunk data and meta variables would be exposed to shell interpretation.
        process = Popen(commandline, shell=False,
            stdin=PIPE, stdout=PIPE, stderr=PIPE, close_fds=posix)

        if not self.args.stream and not self.args.timeout:
            # Simple case: wait for the process and emit its entire output as one chunk.
            out, err = process.communicate(data)
            for line in err.splitlines():
                self.log_info(line)
            yield out
            return

        from queue import Empty, Queue
        from threading import Event, Thread
        from time import monotonic, sleep

        result = None
        # Set when the timeout expires; raised after the polling loop has finished.
        failure = None

        qerr = Queue()
        qout = Queue()
        done = Event()

        def adapter(stream, queue: Queue, event: Event):
            # Forward everything read from the stream to the queue until the stream is exhausted
            # or the event is set.
            while not event.is_set():
                out = stream.read1()
                if out: queue.put(out)
                else: break
            stream.close()

        recvout = Thread(target=adapter, args=(process.stdout, qout, done), daemon=True)
        recverr = Thread(target=adapter, args=(process.stderr, qerr, done), daemon=True)

        recvout.start()
        recverr.start()

        if data:
            process.stdin.write(data)
        process.stdin.close()
        # The timeout is an elapsed time in seconds, so it is measured with a wall clock that
        # cannot go backwards rather than with the CPU time consumed by this process.
        start = monotonic()

        if not self.args.stream or self.args.timeout:
            result = MemoryFile()

        def queue_read(q: Queue):
            try: return q.get_nowait()
            except Empty: return None

        # Collects stderr output until it can be logged as complete lines.
        errbuf = MemoryFile()

        while True:
            out = queue_read(qout)
            err = None

            if self.args.errors:
                out = out or queue_read(qerr)
            else:
                err = queue_read(qerr)

            if err and self.log_info():
                errbuf.write(err)
                errbuf.seek(0)
                lines = errbuf.readlines()
                errbuf.seek(0)
                errbuf.truncate()
                if lines:
                    # Keep an incomplete last line in the buffer unless the process is done.
                    if not (done.is_set() or lines[~0].endswith(B'\n')):
                        errbuf.write(lines.pop())
                    for line in lines:
                        msg = line.rstrip(B'\n')
                        if msg: self.log_info(msg)
            if out:
                if not self.args.stream or self.args.timeout:
                    result.write(out)
                if self.args.stream:
                    yield out

            if done.is_set():
                if recverr.is_alive():
                    self.log_warn('stderr receiver thread zombied')
                if recvout.is_alive():
                    self.log_warn('stdout receiver thread zombied')
                break
            elif not err and not out and process.poll() is not None:
                recverr.join(self._JOIN_TIME)
                recvout.join(self._JOIN_TIME)
                done.set()
            elif self.args.timeout:
                if monotonic() - start > self.args.timeout:
                    self.log_info('terminating process after timeout expired')
                    done.set()
                    process.terminate()
                    for _ in range(4):
                        if process.poll() is not None:
                            break
                        sleep(self._JOIN_TIME)
                    else:
                        self.log_warn('process termination may have failed')
                    recverr.join(self._JOIN_TIME)
                    recvout.join(self._JOIN_TIME)
                    # The error is stored separately from the output buffer because the loop
                    # runs once more and may still write late output to result.
                    if not len(result):
                        failure = RuntimeError('timeout reached, process had no output')
                    else:
                        failure = RefineryPartialResult(
                            'timeout reached, returning all collected output',
                            partial=result.getvalue())

        if failure is not None:
            raise failure
        elif not self.args.stream:
            yield result.getvalue()

class salsa (key, stateful=False, discard=0, nonce=b'REFINERY', magic=b'', offset=0, rounds=20)

This unit is implemented in refinery.units.crypto.cipher.salsa and has the following commandline Interface:

usage: salsa [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-d N] [-m MAGIC] [-x N] [-r N] key [nonce]

Salsa encryption and decryption. The nonce must be 8 bytes long. When 64 bytes are provided as
the key, this data is interpreted as the initial state box and all other parameters are ignored.

positional arguments:
  key                The encryption key.
  nonce              The nonce. Default is the string REFINERY.

options:
  -s, --stateful     Do not reset the key stream while processing the chunks of one frame.
  -d, --discard N    Discard the first N bytes of the keystream, 0 by default.
  -m, --magic MAGIC  The magic constant; depends on the key size by default.
  -x, --offset N     Optionally specify the stream index, default is 0.
  -r, --rounds N     The number of rounds. Has to be an even number. Default is 20.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.
  -R, --reverse      Use the reverse operation.

Expand source code Browse git

class salsa(LatinCipherUnit):
    """
    Salsa encryption and decryption. The nonce must be 8 bytes long. When 64 bytes are provided
    as the key, this data is interpreted as the initial state box and all other parameters are
    ignored.
    """
    def keystream(self) -> Iterable[int]:
        # A 64-byte key is treated as a complete initial state; in that case none of the
        # other arguments (nonce, magic, rounds, offset) are consulted.
        state_or_key = self.args.key
        if len(state_or_key) != 64:
            cipher = SalsaCipher(
                state_or_key,
                self.args.nonce,
                self.args.magic,
                self.args.rounds,
                self.args.offset,
            )
        else:
            cipher = SalsaCipher.FromState(state_or_key)
        yield from cipher

class salsa20 (key, nonce=b'REFINERY')

This unit is implemented in refinery.units.crypto.cipher.salsa and has the following commandline Interface:

usage: salsa20 [-h] [-L] [-Q] [-0] [-v] [-R] key [nonce]

Salsa20 encryption and decryption. This unit is functionally equivalent to salsa with 20 rounds,
but it uses the PyCryptodome library C implementation rather than the pure Python implementation
used by salsa.

positional arguments:
  key            The encryption key.
  nonce          The nonce. Default is the string REFINERY.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class salsa20(LatinCipherStandardUnit, cipher=PyCryptoFactoryWrapper(Salsa20)):
    """
    Salsa20 encryption and decryption. This unit is functionally equivalent to `refinery.salsa`
    with 20 rounds, but it uses the PyCryptodome library C implementation rather than the pure
    Python implementation used by `refinery.salsa`.
    """
    # Intentionally empty: this class overrides nothing. The cipher to use is handed to the
    # base class through the `cipher` class keyword argument in the class header.

class scope (*slice, visible=True)

This unit is implemented in refinery.units.meta.scope and has the following commandline Interface:

usage: scope [-h] [-L] [-Q] [-0] [-v] [-n] [start:end:step ...]

After using scope within a frame, all the following operations will be applied only to the
selected indices. All remaining chunks still exist, they are just not operated on. When the frame
closes or the frame is being rescoped by a second application of this unit, they become visible
again.

positional arguments:
  start:end:step  Specify start:end:step in Python slice syntax. The default is :.

options:
  -n, --not       Hide the given chunks instead of making them the only ones visible.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class scope(FrameSlicer):
    """
    After using `refinery.scope` within a `refinery.lib.frame`, all the
    following operations will be applied only to the selected indices. All
    remaining chunks still exist, they are just not operated on. When the
    frame closes or the frame is being rescoped by a second application of
    this unit, they become visible again.
    """
    def __init__(self, *slice, visible: Param[bool, Arg.Switch('-n', '--not', off=True, help=(
        'Hide the given chunks instead of making them the only ones visible.'))] = True
    ):
        super().__init__(*slice, visible=visible)

        def descending_bounds(bounds):
            # Missing bounds count as zero for the purpose of ordering.
            return (bounds.start or 0, bounds.stop or 0)

        # Sorting in descending order moves slices with negative bounds to the
        # end of the list. They are therefore tested last, which postpones the
        # point at which the chunk iterator may have to be consumed entirely.
        self.args.slice.sort(key=descending_bounds, reverse=True)

    def filter(self, chunks):
        source = iter(chunks)
        # Both remain None until a negative bound forces us to read ahead.
        drained = None
        total = None

        def pending() -> Generator[Chunk]:
            # First hand out chunks straight from the source; once the source
            # was drained by a look-ahead, continue with the buffered chunks.
            for item in source:
                yield item
            while drained:
                yield drained.popleft()

        def resolve(bound, fallback):
            # Convert a slice bound into a non-negative index. Negative bounds
            # are relative to the total chunk count, which is only known after
            # the remaining source has been read into a buffer.
            nonlocal drained, total
            if bound is None:
                return fallback
            if bound >= 0:
                return bound
            if drained is None:
                from collections import deque
                self.log_info(F'consuming iterator to compute negative offset {bound}.')
                drained = deque(source)
                # Chunks 0..index were already handed out by pending().
                total = len(drained) + index + 1
            return max(0, bound + total)

        for index, chunk in enumerate(pending()):
            # any() stops at the first matching slice, so later slices (which
            # are the ones with negative bounds) are only resolved if needed.
            selected = any(
                index in range(resolve(s.start, 0), resolve(s.stop, index + 1), s.step or 1)
                for s in self.args.slice
            )
            if selected:
                chunk.visible = self.args.visible
            else:
                chunk.visible = not self.args.visible
            self.log_debug(chunk)
            yield chunk

class seal (key, discard=0, stateful=False)

This unit is implemented in refinery.units.crypto.cipher.seal and has the following commandline Interface:

usage: seal [-h] [-L] [-Q] [-0] [-v] [-R] [-d N] [-s] key

SEAL encryption and decryption.

positional arguments:
  key              The encryption key.

options:
  -d, --discard N  Discard the first N bytes of the keystream, 0 by default.
  -s, --stateful   Do not reset the key stream while processing the chunks of one frame.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation.

Expand source code Browse git

class seal(StreamCipherUnit):
    """
    SEAL encryption and decryption.
    """
    # The only accepted key length, in bytes.
    key_size = {20}

    def keystream(self) -> Iterable[bytes]:
        # The cipher object itself serves as the key stream iterable.
        secret = self.args.key
        return SEAL_Cipher(secret)

class secstr (key=b'\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10', iv=b'')

This unit is implemented in refinery.units.crypto.cipher.secstr and has the following commandline Interface:

usage: secstr [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [key]

Implements the AES-based encryption scheme used by the PowerShell commands ConvertFrom-
SecureString and ConvertTo-SecureString.

positional arguments:
  key            Secure string encryption 16-byte AES key; the default are the bytes from 1 to
                 16.

options:
  -i, --iv IV    Optionally specify an IV to use for encryption.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class secstr(Unit):
    """
    Implements the AES-based encryption scheme used by the PowerShell commands
    `ConvertFrom-SecureString` and `ConvertTo-SecureString`.
    """

    # This is a magic header value used for PowerShell secure strings.
    _MAGIC = bytes((
        0xEF, 0xAE, 0x3D, 0xD9, 0xDD, 0x75, 0xD7, 0xAE, 0xF8, 0xDD, 0xFD, 0x38,
        0xDB, 0x7E, 0x35, 0xDD, 0xBD, 0x7A, 0xD3, 0x9D, 0x1A, 0xE7, 0x7E, 0x39))

    # Secure strings include a decimal number formatted as a string directly
    # following the header. Presumably, this is the PowerShell version.
    _PSVER = 2

    def __init__(
        self, key: Param[buf, Arg(
            help='Secure string encryption 16-byte AES key; the default are the bytes from 1 to 16.'
        )] = bytes(range(1, 17)),
        iv: Param[buf, Arg('-i', help='Optionally specify an IV to use for encryption.')] = B''
    ):
        super().__init__(key=key, iv=iv)

    @property
    def key(self):
        """
        The AES key given on the command line. Raises a ValueError unless its length is
        16, 24, or 32 bytes.
        """
        key = self.args.key
        if len(key) not in (0x10, 0x18, 0x20):
            # The message lists every length that the check above accepts.
            raise ValueError('The encryption key has to be 16, 24, or 32 bytes long.')
        return key

    @property
    def iv(self):
        """
        The IV given on the command line; may be empty. Raises a ValueError when a
        non-empty IV is not exactly 16 bytes long.
        """
        iv = self.args.iv
        if iv and len(iv) != 0x10:
            raise ValueError('The IV has to be 16 bytes long.')
        return iv

    def reverse(self, data):
        """
        Encrypt the input and return it as a base64-encoded secure string. The input is
        re-encoded as UTF-16LE, padded, and encrypted with AES-CBC. A random IV is
        generated when none was specified.
        """
        ivec = self.iv or urandom(0x10)
        if len(ivec) != 0x10:
            # Defensive check only; the iv property already rejects bad lengths.
            raise ValueError('The IV has to be 16 bytes long.')
        cipher = AES.new(self.key, AES.MODE_CBC, ivec)
        data = data.decode('latin-1').encode('utf-16LE')
        data = cipher.encrypt(pad(data, block_size=0x10))
        data = base64.b16encode(data).lower().decode('ascii')
        ivec = base64.b64encode(ivec).decode('ascii')
        # Layout after the magic header: VERSION|BASE64(IV)|HEX(CIPHERTEXT) in UTF-16LE.
        data = '|'.join(('%d' % self._PSVER, ivec, data)).encode('utf-16LE')
        return base64.b64encode(self._MAGIC + data)

    def process(self, data):
        """
        Decrypt a base64-encoded secure string. The result is the decrypted text encoded
        as latin-1 when it can be decoded as UTF-16LE; otherwise the raw decrypted bytes
        are returned after logging a warning.
        """
        # The three fields are separated by the UTF-16LE encoding of the pipe character.
        head, ivec, data = base64.b64decode(data).split(b'|\0')
        self.log_info('head:', head.hex())
        ivec = base64.b64decode(ivec.decode('utf-16LE'))
        self.log_info('ivec:', ivec.hex())
        data = base64.b16decode(data.decode('utf-16LE'), casefold=True)
        if len(data) % 0x10 != 0:
            self.log_info('data not block-aligned, padding with zeros')
            data += B'\0' * (0x10 - len(data) % 0x10)
        cipher = AES.new(self.key, AES.MODE_CBC, ivec)
        data = cipher.decrypt(data)
        try:
            data = unpad(data, block_size=0x10)
        except Exception:
            # Best effort: continue with the unmodified plaintext.
            self.log_warn('decrypted data does not have PKCS7 padding')
        for p in range(0x10):
            # Attempt to decode after stripping p trailing bytes. The slice end is
            # computed explicitly: data[-p:] would select only the last p bytes, and
            # data[:-p] would be empty for p == 0.
            try:
                return data[:len(data) - p].decode('utf-16LE').encode('latin-1')
            except (UnicodeDecodeError, UnicodeEncodeError):
                continue
        self.log_warn('result is not a padded unicode string, key is likely wrong')
        return data

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Encrypt the input and return it as a base64-encoded secure string. The input is
    re-encoded as UTF-16LE, padded, and encrypted with AES-CBC. A random IV is
    generated when none was specified.
    """
    ivec = self.iv or urandom(0x10)
    if len(ivec) != 0x10:
        # The original referenced an undefined attribute here; the message is now
        # spelled out so that this check raises the intended ValueError.
        raise ValueError('The IV has to be 16 bytes long.')
    cipher = AES.new(self.key, AES.MODE_CBC, ivec)
    data = data.decode('latin-1').encode('utf-16LE')
    data = cipher.encrypt(pad(data, block_size=0x10))
    data = base64.b16encode(data).lower().decode('ascii')
    ivec = base64.b64encode(ivec).decode('ascii')
    # Layout after the magic header: VERSION|BASE64(IV)|HEX(CIPHERTEXT) in UTF-16LE.
    data = '|'.join(('%d' % self._PSVER, ivec, data)).encode('utf-16LE')
    return base64.b64encode(self._MAGIC + data)

class sep (separator=b'\n', scoped=False)

This unit is implemented in refinery.units.meta.sep and has the following commandline Interface:

usage: sep [-h] [-L] [-Q] [-0] [-v] [-s] [separator]

Multiple inputs are joined along a specified separator. If any of the input Chunks is currently
out of scope, sep makes them visible by default. This can be prevented by using the -s
flag.

positional arguments:
  separator      Separator; the default is a line break.

options:
  -s, --scoped   Maintain chunk scope; i.e. do not turn all input chunks visible.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class sep(Unit):
    """
    Multiple inputs are joined along a specified separator. If any of the input
    `refinery.lib.frame.Chunk`s is currently out of scope, `refinery.sep`
    makes them visible by default. This can be prevented by using the `-s` flag.
    """

    def __init__(
        self, separator: Param[buf, Arg(help='Separator; the default is a line break.')] = B'\n',
        scoped: Param[bool, Arg.Switch('-s', help=(
            'Maintain chunk scope; i.e. do not turn all input chunks visible.'))] = False
    ):
        super().__init__(separator=separator, scoped=scoped)
        # Tells process() whether a separator has to follow the current chunk.
        self.separate = False

    def filter(self, chunks):
        """
        Pass all chunks through, making each one visible unless the scoped option is
        set. One chunk is read ahead so that the final chunk can be recognized: the
        separate flag is set for every chunk except the last.
        """
        it = iter(chunks)
        try:
            chunk = next(it)
        except StopIteration:
            return
        self.separate = True
        for upcoming in it:
            if not self.args.scoped:
                chunk.visible = True
            yield chunk
            chunk = upcoming
        # The final chunk is emitted outside of the loop and must receive the same
        # visibility treatment as all of its predecessors.
        if not self.args.scoped:
            chunk.visible = True
        self.separate = False
        yield chunk

    def process(self, data):
        """
        Emit the chunk, followed by the separator while the separate flag is set.
        """
        yield data
        if self.separate:
            yield self.args.separator

class serpent (key, iv=b'', padding=None, mode=None, raw=False, swap=False)

This unit is implemented in refinery.units.crypto.cipher.serpent and has the following commandline Interface:

usage: serpent [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-s] key

Serpent encryption and decryption. Some Serpent implementations read the bytes of each block in
one direction, some in the other. When decryption results with this unit do not yield the
expected result, try using the --swap (or -s) option to swap the bytes in each block.
Furthermore, it is sometimes necessary to swap the bytes of the input key, which can be done by
prefixing the input key by the multibin handler snip[::-1].

positional arguments:
  key              The encryption key.

options:
  -i, --iv IV      Specifies the initialization vector. If none is specified, then a block of
                   zero bytes is used.
  -p, --padding P  Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does
                   nothing. By default, all other algorithms are attempted. In most cases, the
                   data was not correctly decrypted if none of these work.
  -m, --mode M     Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB, OFB,
                   PCBC. By default, the CBC mode is used when an IV is provided, and ECB
                   otherwise.
  -r, --raw        Set the padding to raw; ignored when a padding is specified.
  -s, --swap       Read the bytes in each block in reverse order.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation.

Expand source code Browse git

class serpent(StandardBlockCipherUnit, cipher=BlockCipherFactory(Serpent)):
    """
    Serpent encryption and decryption. Some Serpent implementations read the bytes of each block
    in one direction, some in the other. When decryption results with this unit do not yield the
    expected result, try using the `--swap` (or `-s`) option to swap the bytes in each block.
    Furthermore, it is sometimes necessary to swap the bytes of the input key, which can be done
    by prefixing the input key by the multibin handler `snip[::-1]`.
    """
    def __init__(
        self, key, iv=b'', padding=None, mode=None, raw=False,
        swap: Param[bool, Arg.Switch('-s', help='Read the bytes in each block in reverse order.')] = False
    ):
        super().__init__(key, iv=iv, padding=padding, mode=mode, raw=raw, swap=swap)

    def _new_cipher(self, **optionals) -> CipherInterface:
        """
        Create the cipher object through the base class and set its `swap` attribute
        from the `swap` argument of this unit.
        """
        # Forward the keyword arguments instead of silently discarding them.
        # NOTE(review): assumes the base class method accepts the same keyword
        # arguments as this override - confirm against StandardBlockCipherUnit.
        instance: Serpent = super()._new_cipher(**optionals)
        instance.swap = self.args.swap
        return instance

class sha1 (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: sha1 [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the SHA1 hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

class sha224 (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: sha224 [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the SHA224 hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

class sha256 (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: sha256 [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the SHA256 hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

class sha384 (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: sha384 [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the SHA384 hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

class sha3_224 (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: sha3-224 [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the SHA3-224 hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

class sha3_256 (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: sha3-256 [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the SHA3-256 hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

class sha3_384 (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: sha3-384 [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the SHA3-384 hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

class sha3_512 (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: sha3-512 [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the SHA3-512 hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

class sha512 (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: sha512 [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the SHA512 hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

class shake128 (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: shake128 [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the SHAKE-128 hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

class shake256 (reps=1, text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: shake256 [-h] [-L] [-Q] [-0] [-v] [-r N] [-t]

Returns the SHAKE-256 hash of the input data.

options:
  -r, --reps N   Optionally specify a number of times to apply the hash to its own output.
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

class shl (*argument, bigendian=False, blocksize=1)

This unit is implemented in refinery.units.blockwise.shl and has the following commandline Interface:

usage: shl [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] [argument ...]

Shift the bits of each block left, filling with zero bits.

positional arguments:
  argument           A single numeric expression which provides the right argument to the
                     operation, where the left argument is each block in the input data. This
                     argument can also contain a sequence of bytes which is then split into
                     blocks of the same size as the input data and used cyclically.

options:
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes. The default is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class shl(BinaryOperation):
    """
    Shift the bits of each block left, filling with zero bits.
    """
    @staticmethod
    def operate(a, b):
        # Returns the shifted value as a new object.
        return a << b

    @staticmethod
    def inplace(a, b):
        # Applies the augmented assignment to the first argument; nothing is returned.
        a <<= b

class shr (*argument, bigendian=False, blocksize=1)

This unit is implemented in refinery.units.blockwise.shr and has the following commandline Interface:

usage: shr [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] [argument ...]

Shift the bits of each block right, filling with zero bits.

positional arguments:
  argument           A single numeric expression which provides the right argument to the
                     operation, where the left argument is each block in the input data. This
                     argument can also contain a sequence of bytes which is then split into
                     blocks of the same size as the input data and used cyclically.

options:
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes. The default is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class shr(BinaryOperation):
    """
    Shift the bits of each block right, filling with zero bits.
    """
    @staticmethod
    def operate(a, b):
        # Returns the shifted value as a new object.
        return a >> b

    @staticmethod
    def inplace(a, b):
        # Applies the augmented assignment to the first argument; nothing is returned.
        a >>= b

class sm4 (key, *, iv=b'', padding=None, mode=None, raw=False, little_endian=False, segment_size=0, tag=(), aad=b'')

This unit is implemented in refinery.units.crypto.cipher.sm4 and has the following commandline Interface:

usage: sm4 [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] key

The SM4 symmetric blockcipher algorithm published as GB/T 32907-2016 by the State Cryptography
Administration of China (SCA).

positional arguments:
  key                   The encryption key.

options:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB,
                        OFB, PCBC. By default, the CBC mode is used when an IV is provided,
                        and ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of segmentation bits. It must be a multiple of 8.
                        The default of 0 means that the block size will be used as the segment
                        size.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation.

Expand source code Browse git

class sm4(StandardBlockCipherUnit, cipher=BlockCipherFactory(SM4)):
    """
    The SM4 symmetric blockcipher algorithm published as GB/T 32907-2016 by the State Cryptography
    Administration of China (SCA).
    """
    # Intentionally empty: this class overrides nothing. The cipher to use is handed to the
    # base class through the `cipher` class keyword argument in the class header.

class snip (slices=[slice(None, None, None)], length=False, stream=False, remove=False)

This unit is implemented in refinery.units.strings.snip and has the following commandline Interface:

usage: snip [-h] [-L] [-Q] [-0] [-v] [-l] [-s] [-r] [slices ...]

Snips the input data based on a Python slice expression. For example, the initialization slice
0::1 1::1 would yield a unit that first extracts every byte at an even position and then, every
byte at an odd position. In this case, multiple outputs are produced. The unit can be used in
reverse mode, in which case the specified ranges are deleted sequentially from the input.

positional arguments:
  slices         Specify start:stop:step in Python slice syntax.

options:
  -l, --length   Interpret the end of a slice as a length rather than as an offset.
  -s, --stream   After each slice, consider only the data that follows after it for subsequent
                 slicing. This mode is incompatible with negative step sizes.
  -r, --remove   Remove the slices from the input rather than selecting them.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class snip(Unit):
    """
    Snips the input data based on a Python slice expression. For example, the
    initialization `snip 0::2 1::2` would yield a unit that first extracts
    every byte at an even position and then, every byte at an odd position. In
    this case, multiple outputs are produced. When the remove option is set,
    each output is instead a copy of the input with the corresponding slice
    deleted.
    """
    def __init__(
        self,
        slices: Param[list[slice], Arg(help='Specify start:stop:step in Python slice syntax.')] = [slice(None, None)],
        length: Param[bool, Arg.Switch('-l', help=(
            'Interpret the end of a slice as a length rather than as an offset.'))] = False,
        stream: Param[bool, Arg.Switch('-s', help=(
            'After each slice, consider only the data that follows after it for subsequent '
            'slicing. This mode is incompatible with negative step sizes.'))] = False,
        remove: Param[bool, Arg.Switch('-r', help=(
            'Remove the slices from the input rather than selecting them.'))] = False,
    ):
        super().__init__(slices=slices, length=length, stream=stream, remove=remove)

    def process(self, data: bytearray):
        """
        Yield one output per configured slice.

        Without the remove option, each output is a memoryview slice of the input. With it,
        each output is a fresh bytearray copy of the input from which the slice was deleted.

        Raises:
            RuntimeError: if stream mode is combined with a slice that has a negative step.
        """
        slices: list[slice] = list(self.args.slices)
        opt_stream = self.args.stream
        opt_remove = self.args.remove
        opt_length = self.args.length
        view = memoryview(data)
        offset = 0

        # The parentheses matter: `b.step or 1 < 0` parses as `b.step or (1 < 0)`, which is
        # truthy for every explicit nonzero step and would reject positive steps as well.
        if opt_stream and any((b.step or 1) < 0 for b in slices):
            raise RuntimeError('Streaming is incompatible with negative step slices.')

        for bounds in slices:
            step = bounds.step or 1
            stop = bounds.stop
            start = bounds.start
            if opt_length:
                # In length mode, the stop value counts from the start of the slice.
                if stop is not None:
                    if start is None:
                        # A reversed slice without a start begins at the end of the data.
                        if step < 0:
                            stop += len(data)
                    else:
                        stop += start
            if opt_stream:
                # In stream mode, bounds are relative to the end of the previous slice.
                start = start or 0
                stop = stop or len(data)
                start += offset
                stop += offset

            bounds = slice(start, stop, bounds.step)

            if not opt_remove:
                temp = view[bounds]
            else:
                temp = bytearray(data)
                del temp[bounds]
            yield temp

            if opt_stream:
                offset = stop

class sorted (key=None, ascending=False)

This unit is implemented in refinery.units.meta.sorted and has the following commandline Interface:

usage: sorted [-h] [-L] [-Q] [-0] [-v] [-a] [key]

Sorts all elements of the input frame lexicographically. This unit is a nop on single inputs.

positional arguments:
  key              A meta variable expression to sort by instead of sorting the content.

options:
  -a, --ascending  Sort in ascending order, the default is descending.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class sorted(Unit):
    """
    Sorts all elements of the input `refinery.lib.frame` lexicographically.
    This unit is a `refinery.nop` on single inputs.
    """

    def __init__(
        self,
        key: Param[str, Arg.String('key', help='A meta variable expression to sort by instead of sorting the content.')] = None,
        ascending: Param[bool, Arg.Switch('-a', help='Sort in ascending order, the default is descending.')] = False
    ):
        super().__init__(key=key, ascending=ascending)

    def filter(self, chunks):
        # Visible chunks are collected and emitted in sorted order; invisible chunks are passed
        # through unsorted and delimit the runs of visible chunks that get sorted together.
        pending = []
        hidden = []
        descending = not self.args.ascending
        expr = self.args.key
        sort_key = None

        if expr is not None:
            evaluate = PythonExpression(expr, all_variables_allowed=True)

            def sort_key(chunk):
                # The chunk itself is the second tuple component and breaks ties.
                return evaluate(metavars(chunk)), chunk

        def flush():
            if pending:
                pending.sort(key=sort_key, reverse=descending)
                yield from pending
                pending.clear()

        for chunk in chunks:
            if not chunk.visible:
                yield from flush()
                hidden.append(chunk)
                continue
            yield from hidden
            hidden.clear()
            pending.append(chunk)

        yield from hidden
        yield from flush()

class sosemanuk (key, stateful=False, discard=0, nonce=b'')

This unit is implemented in refinery.units.crypto.cipher.sosemanuk and has the following commandline Interface:

usage: sosemanuk [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-d N] key [nonce]

positional arguments:
  key              The encryption key.
  nonce            The nonce. Default is empty, which is equivalent to 16 null bytes.

options:
  -s, --stateful   Do not reset the key stream while processing the chunks of one frame.
  -d, --discard N  Discard the first N bytes of the keystream, 0 by default.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation.

Expand source code Browse git

class sosemanuk(StreamCipherUnit):

    def __init__(
        self, key, stateful=False, discard=0,
        nonce: Param[buf, Arg(help='The nonce. Default is empty, which is equivalent to 16 null bytes.')] = B'',
    ):
        # All parameters are forwarded unchanged to the stream cipher base class.
        super().__init__(key=key, nonce=nonce, stateful=stateful, discard=discard)

    def keystream(self):
        # The key stream is whatever iterating the Sosemanuk object produces for the
        # configured key and nonce.
        args = self.args
        cipher = Sosemanuk(args.key, args.nonce)
        yield from cipher

class speck (key, iv=b'', padding=None, mode=None, raw=False, block_size=16, *, aad=b'', tag=(), segment_size=0, little_endian=False)

This unit is implemented in refinery.units.crypto.cipher.speck and has the following commandline Interface:

usage: speck [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-b N] [-e] [-S N] key

SPECK encryption and decryption. It supports block sizes of 8 and 16 bytes.

positional arguments:
  key                   The encryption key.

options:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB,
                        OFB, PCBC. By default, the CBC mode is used when an IV is provided,
                        and ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -b, --block-size N    Cipher block size, default is 16. Valid choices are 8 and 16.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of segmentation bits. It must be a multiple of 8.
                        The default of 0 means that the block size will be used as the segment
                        size.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation.

Expand source code Browse git

class speck(StandardBlockCipherUnit, cipher=BlockCipherFactory(Speck)):
    """
    SPECK encryption and decryption. It supports block sizes of 8 and 16 bytes.
    """
    def __init__(
        self, key, iv=b'', padding=None, mode=None, raw=False,
        block_size: Param[int, Arg.Number('-b', help='Cipher block size, default is {default}. Valid choices are 8 and 16.')] = 16,
        **more
    ):
        # Collect the explicitly named options first, then add any extra keyword arguments.
        options = dict(iv=iv, padding=padding, mode=mode, raw=raw, block_size=block_size)
        options.update(more)
        super().__init__(key, **options)

    @property
    def block_size(self):
        # The block size is configurable for this unit and is read from its arguments.
        size = self.args.block_size
        return size

    def _new_cipher(self, **optionals) -> CipherInterface:
        # The configured block size is always passed on to the cipher construction.
        size = self.args.block_size
        return super()._new_cipher(block_size=size, **optionals)

class sqlite (query="SELECT * FROM sqlite_master WHERE type='table';")

This unit is implemented in refinery.units.formats.sqlite and has the following commandline Interface:

usage: sqlite [-h] [-L] [-Q] [-0] [-v] [query]

Extracts data from SQLite3 databases. Each row is returned as a single output chunk in JSON
format. If no query is provided, the unit will extract all table metadata from the database.

positional arguments:
  query          The SQL query to execute.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class sqlite(Unit):
    """
    Extracts data from SQLite3 databases. Each row is returned as a single output chunk in JSON
    format. If no query is provided, the unit will extract all table metadata from the database.
    """
    def __init__(
        self,
        query: Param[
            str, Arg.String('query', help='The SQL query to execute.')
        ] = "SELECT * FROM sqlite_master WHERE type='table';",
    ):
        super().__init__(query=query)

    def process(self, data):
        """
        Load the input as an in-memory SQLite database, run the configured query, and yield
        one JSON-encoded output per result row.

        When the cursor reports column names, each row is emitted as a JSON object that maps
        column name to value; otherwise it is emitted as a JSON array. Values of type bytes
        are decoded with the unit codec before serialization.

        Raises:
            NotImplementedError: if the connection has no `deserialize` method.
            ValueError: if sqlite3 reports an error.
        """
        try:
            with sqlite3.connect(':memory:') as database:
                try:
                    database.deserialize(data)
                except AttributeError:
                    raise NotImplementedError(F'Python >= 3.11 is required to use {self.name}.')
                cursor = database.cursor().execute(self.args.query)
                description = cursor.description
                fields = [column[0] for column in description] if description else []
                for row in cursor:
                    if fields:
                        # Every value is keyed by its column name. Decoded bytes values were
                        # previously stored under their integer column index instead.
                        cleaned_row = {
                            name: value.decode(self.codec) if isinstance(value, bytes) else value
                            for name, value in zip(fields, row)
                        }
                        yield json.dumps(cleaned_row).encode(self.codec)
                    else:
                        yield json.dumps(list(row)).encode(self.codec)
        except sqlite3.Error as e:
            raise ValueError(F'Failed to process SQLite database: {e}') from e

class stego (split=False, parts='RGB')

This unit is implemented in refinery.units.formats.stego and has the following commandline Interface:

usage: stego [-h] [-L] [-Q] [-0] [-v] [-m] [parts]

Decodes the RGBA (red/green/blue/alpha) values of the pixels of a given image file and outputs
these values as bytes. By default, the pixels are converted left to right, top to bottom. When
the input image is grayscale, the color channels are ignored. Colored images are converted to
RGBA mode.

positional arguments:
  parts          A string containing any ordering of the letters R, G, B, and A (case-
                 insensitive). These pixel components will be extracted from every pixel in the
                 given order. The default value is RGB.

options:
  -m, --split    Emit the individual rows or columns as separate outputs.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class stego(Unit):
    """
    Decodes the RGBA (red/green/blue/alpha) values of the pixels of a given image file and outputs
    these values as bytes. By default, the pixels are converted left to right, top to bottom. When
    the input image is grayscale, the color channels are ignored. Colored images are converted to
    RGBA mode.
    """
    def __init__(
        self,
        split: Param[bool, Arg.Switch('-m', help='Emit the individual rows or columns as separate outputs.')] = False,
        parts: Param[str, Arg.String('parts', nargs='?', help=(
            'A string containing any ordering of the letters R, G, B, and A (case-insensitive). '
            'These pixel components will be extracted from every pixel in the given order. The '
            'default value is {default}.'
        ))] = 'RGB'
    ):
        # Each letter of the parts string is converted to a PIXEL_PART option up front.
        super().__init__(
            split=split,
            parts=tuple(Arg.AsOption(p, PIXEL_PART) for p in parts)
        )

    @Unit.Requires('Pillow', ['formats'])
    def _image():
        from PIL import Image
        return Image

    def process(self, data):
        """
        Decode the image and emit the selected pixel components row by row.

        In split mode one output is yielded per image row; otherwise all rows are concatenated
        into a single output.
        """
        split = self.args.split
        parts = self.args.parts
        image = self._image.open(MemoryFile(data, output=bytes))

        grayscale = image.mode.startswith('L')
        bw_bitmap = image.mode.startswith('1')
        no_colors = grayscale or bw_bitmap

        if not no_colors:
            image = image.convert('RGBA')

        width, height = image.size
        # Images without color contribute one byte per pixel, others one byte per selected part.
        chunk_size = 1 if no_colors else len(parts)
        output = MemoryFile()
        # The row buffer is allocated once and overwritten for every row.
        buffer = bytearray(chunk_size * width)
        pixels = iter(image.getdata())

        for _ in range(height):
            offset = 0
            for _ in range(width):
                pixel = next(pixels)
                next_offset = offset + chunk_size
                if no_colors:
                    buffer[offset] = pixel
                else:
                    buffer[offset:next_offset] = (pixel[p] for p in parts)
                offset = next_offset
            if split:
                # Yield a copy: the row buffer is reused, so yielding it directly would make
                # all collected outputs alias the same object holding the last row.
                yield bytearray(buffer)
            else:
                output.write(buffer)
        if not split:
            yield output.getvalue()

class stretch (*count)

This unit is implemented in refinery.units.strings.stretch and has the following commandline Interface:

usage: stretch [-h] [-L] [-Q] [-0] [-v] [-R] [count ...]

Stretch the input data by repeating every byte a number of times.

positional arguments:
  count          The number of times every byte should be repeated. By default, every byte is
                 repeated once.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class stretch(Unit):
    """
    Stretch the input data by repeating every byte a number of times.
    """
    def __init__(self, *count: Param[int, Arg.Number(metavar='count', help=(
        'The number of times every byte should be repeated. By default,  '
        'every byte is repeated once.'
    ))]):
        # Without any explicit factor, every byte is emitted twice.
        factors = count if count else (2,)
        for factor in factors:
            if factor <= 0:
                raise ValueError('You can not use a stretching factor of less than 1.')
        super().__init__(count=factors)

    def process(self, data):
        # The factors are applied cyclically, one factor per input byte.
        factors = cycle(self.args.count)
        out = bytearray()
        for byte in data:
            out.extend(repeat(byte, next(factors)))
        return out

    def reverse(self, data):
        # one-sided inverse: only the first byte of every group is kept
        def clinched(stream):
            for width in cycle(self.args.count):
                group = islice(stream, width)
                try:
                    head = next(group)
                except StopIteration:
                    return
                yield head
                # consume the rest of the group so that the next one starts at the right byte
                for _ in group:
                    pass
        return bytearray(clinched(iter(data)))

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    # one-sided inverse: only the first byte of every group is kept
    def clinched(stream):
        for width in cycle(self.args.count):
            group = islice(stream, width)
            try:
                head = next(group)
            except StopIteration:
                return
            yield head
            # consume the rest of the group so that the next one starts at the right byte
            for _ in group:
                pass
    return bytearray(clinched(iter(data)))

class struct (spec, *outputs, multi=False, count=∞, until=None, format=None, name=None, more=False)

This unit is implemented in refinery.units.pattern.struct_parser and has the following commandline Interface:

usage: struct [-h] [-L] [-Q] [-0] [-v] [-m] [-c N] [-u E] [-f F] [-n VAR] [-M] spec [output ...]

Read structured data from the beginning of a chunk and store the extracted fields in chunk meta
variables. The structure format is specified in extended Python struct format, and all remaining
arguments to this unit are the names of the variables that receive the values from this struct.
The extended struct format supports all field types supported by Python, as well as the
following:

- a for null-terminated ASCII strings,
- u to read encoded, null-terminated UTF16 strings,
- w to read decoded, null-terminated UTF16 strings,
- g to read Microsoft GUID values,
- E to read 7-bit encoded integers.

For example, the string LLxxHaa will read two unsigned 32bit integers, then skip two bytes, then
read one unsigned 16bit integer, then two null-terminated ASCII strings. The unit defaults to
using native byte order with no alignment. The spec parameter may additionally contain format
expressions of the following form:

    {name[!alignment]:format}

The alignment parameter is optional. It must be an expression that evaluates to an integer value.
The current data pointer is aligned to a multiple of this value before reading the field. The
format can either be an integer expression specifying a number of bytes to read, or any format
string. If name is specified for an extracted field, its value is made available as a meta
variable under the given name. For example, the expression LLxxH{foo:a}{bar:a} would be parsed in
the same way as the previous example, but the two ASCII strings would also be stored in meta
variables under the names foo and bar, respectively. The format string of a named field is itself
parsed as a format string expression, where all the previously parsed fields are already
available. For example, I{:{}} reads a single 32-bit integer length prefix and then reads as many
bytes as that prefix specifies.

A second format string expression is used to specify the output format. For example, the format
string LLxxH{foo:a}{bar:a} together with the output format {foo}/{bar} would parse data as
before, but the output body would be the concatenation of the field foo, a forward slash, and the
field bar. Variables used in the output expression are not included as meta variables. As format
fields in the output expression, one can also use {1}, {2} or {-1} to access extracted fields by
index. The value {0} represents the entire chunk of structured data. By default, the output
format {#} is used, which represents either the last byte string field that was extracted, or the
entire chunk of structured data if none of the fields were extracted.

Reverse multibin expressions can be used to post-process the fields included in any output
format. For example, {F:b64:zl} will be the base64-decoded and inflate- decompressed contents of
the data that was read as field F.

Finally, it is possible to specify a byte alignment by using the syntax {field!T:a:b:c} where the
letter T is either a single digit specifying the alignment, or a single letter variable that
holds the byte alignment value in the current metadata. It is also possible to specify the
alignment as 0 which instructs the parser to only peek the contents of this field, i.e. the read
pointer is not advanced after reading it.

positional arguments:
  spec            Structure format as explained above.
  output          Output format as explained above.

options:
  -m, --multi     Read as many pieces of structured data as possible instead of just one.
  -c, --count N   A limit on the number of chunks to read in multi mode; default is ∞.
  -u, --until E   An expression evaluated on each chunk in multi mode. New chunks will be parsed
                  only if the result is nonzero.
  -f, --format F  Optionally specify a format string expression to auto-name extracted fields
                  without a given name. The format string accepts the field {c} for the type code
                  and {n} for the variable index.
  -n, --name VAR  Equivalent to --format=VAR{n}.
  -M, --more      After parsing the struct, emit one chunk that contains the data that was left
                  over in the buffer. If no data was left over, this chunk will be empty.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class struct(Unit):
    """
    Read structured data from the beginning of a chunk and store the extracted fields in chunk meta
    variables. The structure format is specified in extended Python struct format, and all
    remaining arguments to this unit are the names of the variables that receive the values from
    this struct. The extended struct format supports all field types supported by Python, as well
    as the following:

    - `a` for null-terminated ASCII strings,
    - `u` to read encoded, null-terminated UTF16 strings,
    - `w` to read decoded, null-terminated UTF16 strings,
    - `g` to read Microsoft GUID values,
    - `E` to read 7-bit encoded integers.

    For example, the string `LLxxHaa` will read two unsigned 32bit integers, then skip two bytes,
    then read one unsigned 16bit integer, then two null-terminated ASCII strings. The unit defaults
    to using native byte order with no alignment. The `spec` parameter may additionally contain
    format expressions of the following form:

        {name[!alignment]:format}

    The `alignment` parameter is optional. It must be an expression that evaluates to an integer
    value. The current data pointer is aligned to a multiple of this value before reading the field.
    The `format` can either be an integer expression specifying a number of bytes to read, or any
    format string. If `name` is specified for an extracted field, its value is made available as a
    meta variable under the given name. For example, the expression `LLxxH{foo:a}{bar:a}` would be
    parsed in the same way as the previous example, but the two ASCII strings would also be stored
    in meta variables under the names `foo` and `bar`, respectively. The `format` string of a named
    field is itself parsed as a format string expression, where all the previously parsed fields
    are already available. For example, `I{:{}}` reads a single 32-bit integer length prefix and
    then reads as many bytes as that prefix specifies.

    A second format string expression is used to specify the output format. For example, the format
    string `LLxxH{foo:a}{bar:a}` together with the output format `{foo}/{bar}` would parse data as
    before, but the output body would be the concatenation of the field `foo`, a forward slash, and
    the field `bar`. Variables used in the output expression are not included as meta variables. As
    format fields in the output expression, one can also use `{1}`, `{2}` or `{-1}` to access
    extracted fields by index. The value `{0}` represents the entire chunk of structured data. By
    default, the output format `{%s}` is used, which represents either the last byte string field
    that was extracted, or the entire chunk of structured data if none of the fields were extracted.

    Reverse `refinery.lib.argformats.multibin` expressions can be used to post-process the fields
    included in any output format. For example, `{F:b64:zl}` will be the base64-decoded and inflate-
    decompressed contents of the data that was read as field `F`.

    Finally, it is possible to specify a byte alignment by using the syntax `{field!T:a:b:c}` where
    the letter `T` is either a single digit specifying the alignment, or a single letter variable
    that holds the byte alignment value in the current metadata. It is also possible to specify the
    alignment as `0` which instructs the parser to only peek the contents of this field, i.e. the
    read pointer is not advanced after reading it.
    """

    def __init__(
        self,
        spec: Param[str, Arg.String(help='Structure format as explained above.')],
        *outputs: Param[str, Arg.String(metavar='output', help='Output format as explained above.')],
        multi: Param[bool, Arg.Switch('-m', help=(
            'Read as many pieces of structured data as possible intead of just one.'))] = False,
        count: Param[int, Arg.Number('-c', help=(
            'A limit on the number of chunks to read in multi mode; default is {default}.'))] = INF,
        until: Param[str, Arg.String('-u', metavar='E', help=(
            'An expression evaluated on each chunk in multi mode. New chunks will be parsed '
            'only if the result is nonzero.'))] = None,
        format: Param[str, Arg.String('-f', metavar='F', help=(
            'Optionally specify a format string expression to auto-name extracted fields without a '
            'given name. The format string accepts the field {{c}} for the type code and {{n}} for '
            'the variable index.'))] = None,
        name: Param[str, Arg.String('-n', metavar='VAR', group='FIELDS', help=(
            'Equivalent to --format=VAR{{n}}.'))] = None,
        more: Param[bool, Arg.Switch('-M', help=(
            'After parsing the struct, emit one chunk that contains the data that was left '
            'over in the buffer. If no data was left over, this chunk will be empty.'))] = False
    ):
        # The name option is shorthand for a format; an explicit format takes precedence.
        if name:
            format = format or F'{name}{{n}}'
        # Without explicit outputs, the single default output is the rest marker field.
        outputs = outputs or [F'{{{_REST_MARKER}}}']
        super().__init__(spec=spec, outputs=outputs, until=until, format=format, count=count, multi=multi, more=more)

    def process(self, data: Chunk):
        """
        Parse one struct (or, in multi mode, as many as possible) from the start of the chunk
        and yield one output chunk per configured output template for each parsed struct.
        Afterwards, the unparsed remainder is either emitted as one additional output when the
        more option is set, or its size is logged as discarded.
        """
        formatter = string.Formatter()
        field_format: str | None = self.args.format
        until = self.args.until
        until = until and PythonExpression(until, all_variables_allowed=True)
        reader = StructReader(memoryview(data))
        checkpoint = 0
        mainspec = self.args.spec
        # A leading byte order character applies to the entire spec; '=' is the default.
        byteorder = mainspec[:1]
        if byteorder in '<@=!>':
            mainspec = mainspec[1:]
        else:
            byteorder = '='

        def fixorder(spec):
            # Prefix the global byte order unless the partial spec brings its own.
            if spec[0] not in '<@=!>':
                spec = byteorder + spec
            return spec

        previously_existing_variables = set(metavars(data).variable_names())

        it = itertools.count() if self.args.multi else (0,)
        for index in it:

            field_counter = 0
            # The checkpoint marks where the struct that is currently being parsed begins.
            checkpoint = reader.tell()

            if reader.eof:
                break
            if index >= self.args.count:
                break

            meta = metavars(data)
            meta.ghost = True
            meta.index = index

            args = []
            last = None
            self.log_debug(F'starting new read at: 0x{checkpoint:08X}')

            try:
                for prefix, name, spec, conversion in formatter.parse(mainspec):
                    name: str
                    spec: str = spec and spec.strip()
                    if prefix:
                        # Literal text between format fields is a plain struct format string.
                        fields = reader.read_struct(fixorder(prefix))
                        if field_format is not None:
                            codes = re.findall('[?cbBhHiIlLqQnNefdspPauwgk]', prefix)
                            if len(codes) != len(fields):
                                codes = 'v' * len(fields)
                            for code, field in zip(codes, fields):
                                code = 'b' if code == '?' else code.lower()
                                v = field_format.format_map({'c': code, 'n': field_counter})
                                meta[v] = field
                                field_counter += 1
                        args.extend(fields)

                    if name is None:
                        continue

                    field_counter += 1

                    if name and not name.isdecimal():
                        check_variable_name(name)

                    # Fields are consumed unless the alignment is explicitly zero. Assigning the
                    # default here guarantees that a nonzero alignment never leaves `peek`
                    # unbound or carrying over the value from a previous field.
                    peek = False
                    if conversion:
                        alignment = PythonExpression.Evaluate(conversion, meta)
                        if alignment == 0:
                            peek = True
                        else:
                            _aa = reader.tell()
                            reader.byte_align(alignment)
                            _ab = reader.tell()
                            if _aa != _ab:
                                self.log_info(F'aligned from 0x{_aa:X} to 0x{_ab:X}')

                    spec, _, pipeline = spec.partition(':')

                    if spec:
                        spec = meta.format_str(spec, self.codec, args)

                    if spec:
                        # A spec that parses as an expression is replaced by its value;
                        # otherwise it is kept as a struct format string.
                        try:
                            _exp = PythonExpression.Evaluate(spec, meta)
                        except ParserError:
                            pass
                        else:
                            spec = _exp

                    if spec == '':
                        last = value = reader.read(peek=peek)
                    elif isinstance(spec, int):
                        # Negative sizes are counted from the end of the remaining data.
                        if spec < 0:
                            spec += reader.remaining_bytes
                        if spec < 0:
                            raise ValueError(F'The specified negative read offset is {-spec} beyond the cursor.')
                        last = value = reader.read_bytes(spec, peek=peek)
                    else:
                        value = reader.read_struct(fixorder(spec), peek=peek)
                        if not value:
                            self.log_debug(F'field {name} was empty, ignoring.')
                            continue
                        if len(value) > 1:
                            self.log_info(F'parsing field {name} produced {len(value)} items reading a tuple')
                        else:
                            value = value[0]

                    if pipeline:
                        # NOTE(review): the class docstring refers to multibin expressions while
                        # numseq is used here - confirm that this is intended.
                        value = numseq(pipeline, reverse=True, seed=value)
                    args.append(value)

                    if name == _REST_MARKER:
                        raise ValueError(F'Extracting a field with name {_REST_MARKER} is forbidden.')
                    elif name.isdecimal():
                        # A separate name is used for the positional slot so that the loop
                        # variable `index` keeps identifying the current struct for the
                        # set_next_batch call below.
                        slot = int(name)
                        limit = len(args) - 1
                        if slot > limit:
                            self.log_warn(F'cannot assign index field {name}, the highest index is {limit}')
                        else:
                            args[slot] = value
                        continue
                    elif name:
                        meta[name] = value

                # NOTE(review): the help text of the until option says that parsing continues
                # only while the result is nonzero, but this aborts when the expression is
                # truthy - confirm which of the two is intended.
                if until and until(meta):
                    self.log_info(F'the expression ({until}) evaluated to true; aborting.')
                    break

                # Re-read everything between the checkpoint and the current position.
                with StreamDetour(reader, checkpoint) as detour:
                    full = reader.read(detour.cursor - checkpoint)
                if last is None:
                    last = full

                outputs = []
                symbols = dict(meta)
                symbols[_REST_MARKER] = last

                for template in self.args.outputs:
                    used = set()
                    outputs.append(meta.format(template, self.codec, [full, *args], symbols, True, used=used))
                    # Variables consumed by the output template are removed again unless they
                    # already existed on the input chunk.
                    for key in used:
                        if key in previously_existing_variables:
                            continue
                        meta.discard(key)

                for output in outputs:
                    chunk = Chunk(output)
                    chunk.meta.update(meta)
                    chunk.set_next_batch(index)
                    yield chunk

                # This struct was fully parsed and emitted, so it is not left over. In single
                # mode there is no further iteration that would advance the checkpoint.
                checkpoint = reader.tell()

            except EOFError:
                break

        leftover = len(reader) - checkpoint

        if not leftover:
            return
        elif self.args.more:
            reader.seekset(checkpoint)
            yield reader.read()
        else:
            leftover = repr(SizeInt(leftover)).strip()
            self.log_info(F'discarding {leftover} left in buffer')

class sub (*argument, bigendian=False, blocksize=0)

This unit is implemented in refinery.units.blockwise.sub and has the following commandline Interface:

usage: sub [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] [argument ...]

Subtract the given argument from each block.

positional arguments:
  argument           A single numeric expression which provides the right argument to the
                     operation, where the left argument is each block in the input data. This
                     argument can also contain a sequence of bytes which is then split into
                     blocks of the same size as the input data and used cyclically.

options:
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes. It is chosen, by default, to be the
                     smallest size that can hold the provided argument without loss of precision.
                     For example, passing the value 0x1234 will result in a default block size of
                     2, while passing the value 12 will mean that the default block size is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class sub(BinaryOperationWithAutoBlockAdjustment):
    """
    Subtract the given argument from each block.
    """
    @staticmethod
    def operate(a, b):
        # out-of-place variant: the difference is handed back as the result
        return a - b

    @staticmethod
    def inplace(a, b):
        # in-place variant: relies on the augmented assignment of the left operand, returns nothing
        a -= b

class subfiles (memdump=False, recursive=False)

This unit is implemented in refinery.units.pattern.subfiles and has the following commandline Interface:

usage: subfiles [-h] [-L] [-Q] [-0] [-v] [-m] [-r]

Deploys carvers for ZIP, 7-Zip, PE-File, Windows Shortcuts (LNK files), JSON and XML documents
against the input data and generates one output chunk for each successfully carved subfile.

options:
  -m, --memdump    Assume that the input is a memdump for PE file carving.
  -r, --recursive  Extract files that are subfiles of other extracted files as separate chunks.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class subfiles(Unit):
    """
    Deploys carvers for ZIP, 7-Zip, PE-File, Windows Shortcuts (LNK files), JSON, XML and RTF documents
    against the input data and generates one output chunk for each successfully carved subfile.
    """

    # Carved chunks of the given type are dropped when they are shorter than this many bytes;
    # all other types only have to be non-empty.
    _MINLENGTH = {
        'json': 300,
        'xml' : 300,
        'rtf' : 100,
    }

    def __init__(
        self,
        memdump: Param[bool, Arg.Switch('-m',
            help='Assume that the input is a memdump for PE file carving.')] = False,
        recursive: Param[bool, Arg.Switch('-r',
            help='Extract files that are subfiles of other extracted files as separate chunks.')] = False,
    ):
        super().__init__(memdump=memdump, recursive=recursive)

    def process(self, data: bytearray):
        # The carvers run in exactly this order, one complete pass over the input per carver.
        carvers = {
            'zip'  : carve_zip(),
            '7z'   : carve_7z(),
            'pe'   : carve_pe(memdump=self.args.memdump, fileinfo=True, recursive=True, keep_root=True),
            'lnk'  : carve_lnk(),
            'json' : carve_json(dictonly=True),
            'xml'  : carve_xml(),
            'rtf'  : carve_rtf(),
        }

        # Ranges of the chunks emitted so far; this is only recorded when the recursive
        # switch is off, so that nested results can be suppressed.
        emitted = []
        remember = not self.args.recursive

        for extension, carver in carvers.items():
            self.log_info(F'carving {extension} files')
            minimum = self._MINLENGTH.get(extension, 1)
            for chunk in data | carver:
                size = len(chunk)
                if size < minimum:
                    continue
                start = chunk['offset']
                end = start + size
                # skip anything that lies strictly inside a previously emitted range
                if any(left < start and end < right for left, right in emitted):
                    continue
                if remember:
                    emitted.append((start, end))
                yield chunk

class swap (src, dst=None)

This unit is implemented in refinery.units.meta.swap and has the following commandline Interface:

usage: swap [-h] [-L] [-Q] [-0] [-v] src [dst]

Swap the contents of an existing variable with the contents of the chunk or with another meta
variable. When swapping with the chunk, the variable has to contain a binary string. When
swapping with a variable that does not exist, the original variable is cleared, essentially
renaming the variable.

positional arguments:
  src            The meta variable name.
  dst            Optional name of the second meta variable.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class swap(Unit):
    """
    Swap the contents of an existing variable with the contents of the chunk or with another meta variable.
    When swapping with the chunk, the variable has to contain a binary string. When swapping with a variable
    that does not exist, the original variable is cleared, essentially renaming the variable.
    """
    def __init__(
        self,
        src: Param[str, Arg.String(help='The meta variable name.')],
        dst: Param[str, Arg.String(help='Optional name of the second meta variable.')] = None
    ):
        super().__init__(
            src=check_variable_name(src),
            dst=check_variable_name(dst)
        )

    def filter(self, chunks: Iterable[Chunk]):
        src = self.args.src
        dst = self.args.dst
        for chunk in chunks:
            # chunks that are not visible pass through untouched
            if not chunk.visible:
                yield chunk
                continue

            if dst is not None:
                # variable <-> variable: the source has to exist, the destination does not
                try:
                    value = chunk.meta.pop(src)
                except KeyError:
                    raise KeyError(F'The variable {src} does not exist.')
                try:
                    other = chunk.meta.pop(dst)
                except KeyError:
                    # no destination value: the source stays removed, which amounts to a rename
                    chunk.meta[dst] = value
                else:
                    chunk.meta[src] = other
                    chunk.meta[dst] = value
                yield chunk
                continue

            # variable <-> chunk body: a missing variable counts as empty data
            try:
                value = chunk.meta[src]
            except KeyError:
                value = bytearray()
            if isinstance(value, str):
                value = value.encode(self.codec)
            elif not isbuffer(value):
                raise ValueError(F'Unable to swap data with variable {src} because it has type {type(value).__name__}.')
            if chunk:
                chunk.meta[src] = bytes(chunk)
            else:
                # an empty body is not stored, the variable is removed instead
                chunk.meta.discard(src)
            chunk[:] = value
            yield chunk

class szdd

This unit is implemented in refinery.units.compression.szdd and has the following commandline Interface:

usage: szdd [-h] [-L] [-Q] [-0] [-v] [-F]

Extract files from SZDD archives.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class szdd(Unit):
    """
    Extract files from SZDD archives.
    """
    def process(self, data):
        with StructReader(data) as reader:
            if reader.read(8) != b'SZDD\x88\xF0\x27\x33':
                if not self.args.lenient:
                    raise ValueError('signature missing')
                self.log_warn('the header signature is invalid, this is likely not an SZDD archive')
            if reader.read_byte() != 0x41:
                raise ValueError('Unsupported compression mode')
            # the next byte holds the missing file extension letter, it is skipped:
            reader.seekrel(1)
            result = bytearray(reader.u32())
            # the sliding window spans 0x1000 bytes and starts out filled with spaces
            ring = bytearray(B'\x20' * 0x1000)
            ring_at = 0x1000 - 0x10
            out_at = 0
            while not reader.eof:
                # each control byte describes up to eight items, lowest bit first
                flags = reader.read_byte()
                for bit in range(8):
                    if reader.eof:
                        break
                    if flags & (1 << bit):
                        # set bit: one literal byte, copied to output and window alike
                        literal = reader.read_byte()
                        result[out_at] = literal
                        ring[ring_at] = literal
                        out_at += 1
                        ring_at = (ring_at + 1) & 0xFFF
                    else:
                        # clear bit: two bytes encode a 12-bit window position and a 4-bit length
                        lo = reader.read_byte()
                        hi = reader.read_byte()
                        source = (lo | ((hi & 0xF0) << 4)) & 0xFFF
                        for _ in range((hi & 0x0F) + 3):
                            value = ring[source]
                            ring[ring_at] = value
                            result[out_at] = value
                            out_at += 1
                            ring_at = (ring_at + 1) & 0xFFF
                            source = (source + 1) & 0xFFF
            return result

    @classmethod
    def handles(cls, data):
        # only the first four signature bytes are inspected here
        return data[:4] == B'SZDD'

class tea (key, iv=b'', padding=None, mode=None, raw=False, swap=False, rounds=32)

This unit is implemented in refinery.units.crypto.cipher.tea and has the following commandline Interface:

usage: tea [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-s] [-k N] key

TEA encryption and decryption.

positional arguments:
  key              The encryption key.

options:
  -i, --iv IV      Specifies the initialization vector. If none is specified, then a block of
                   zero bytes is used.
  -p, --padding P  Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does
                   nothing. By default, all other algorithms are attempted. In most cases, the
                   data was not correctly decrypted if none of these work.
  -m, --mode M     Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB, OFB,
                   PCBC. By default, the CBC mode is used when an IV is provided, and ECB
                   otherwise.
  -r, --raw        Set the padding to raw; ignored when a padding is specified.
  -s, --swap       Decode blocks as big endian rather than little endian.
  -k, --rounds N   Specify the number of rounds, 32 by default.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation.

Expand source code Browse git

class tea(TEAUnit, cipher=BlockCipherFactory(TEA)):
    """
    TEA encryption and decryption.
    """
    # This class defines no members of its own: the cipher is handed over through the
    # `cipher` class keyword in the class statement above.

class termfit (width=0, delta=0, tight=False)

This unit is implemented in refinery.units.strings.termfit and has the following commandline Interface:

usage: termfit [-h] [-L] [-Q] [-0] [-v] [-d N] [-t] [width]

Reformat incoming text data to fit a certain width.

positional arguments:
  width          Optionally specify the width, by default the current terminal width is used.

options:
  -d, --delta N  Subtract this number from the calculated width (0 by default).
  -t, --tight    Separate paragraphs by a single line break instead of two.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class termfit(Unit):
    """
    Reformat incoming text data to fit a certain width.
    """

    def __init__(
        self,
        width: Param[int, Arg('width', help='Optionally specify the width, by default the current terminal width is used.')] = 0,
        delta: Param[int, Arg.Number('-d', help='Subtract this number from the calculated width (0 by default).')] = 0,
        tight: Param[bool, Arg.Switch('-t', help='Separate paragraphs by a single line break instead of two.')] = False,
    ):
        super().__init__(width=width, delta=delta, tight=tight)

    @unicoded
    def process(self, data: str) -> str:
        args = self.args
        # paragraphs are joined by one line break in tight mode and by two otherwise
        if args.tight:
            separator = '\n'
        else:
            separator = '\n\n'
        return terminalfit(data, args.delta, args.width, separator)

class terminate (sentinel=b'\x00', blocksize=1, bigendian=False)

This unit is implemented in refinery.units.blockwise.terminate and has the following commandline Interface:

usage: terminate [-h] [-L] [-Q] [-0] [-v] [-R] [-B N] [-E] [sentinel]

The unit reads data from the incoming chunk in blocks of any given size until the sentinel value
is encountered. The output of the unit is all data that was read, excluding the sentinel. The
default block size is one and the default sentinel value is zero, which corresponds to reading a
null-terminated string from the input. If the sentinel value is not found anywhere in the
incoming data, the complete input is returned as output.

positional arguments:
  sentinel           sentinel value to look for; default is H:00

options:
  -B, --blocksize N  The size of each block in bytes. The default is 1.
  -E, --bigendian    Read chunks in big endian.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.
  -R, --reverse      Use the reverse operation.

Expand source code Browse git

class terminate(BlockTransformationBase):
    """
    The unit reads data from the incoming chunk in blocks of any given size until the
    sentinel value is encountered. The output of the unit is all data that was read,
    excluding the sentinel. The default block size is one and the default sentinel value
    is zero, which corresponds to reading a null-terminated string from the input.
    If the sentinel value is not found anywhere in the incoming data, the complete input
    is returned as output.
    """
    def __init__(
        self,
        sentinel: Param[buf, Arg(help='sentinel value to look for; default is {default}')] = B'\0',
        blocksize=1, bigendian=False
    ):
        super().__init__(blocksize=blocksize, bigendian=bigendian, sentinel=sentinel)

    def process(self, data: bytearray):
        sentinel = self.args.sentinel
        blocksize = self.blocksize

        self.log_info('blocksize:', blocksize)
        self.log_debug('separator:', sentinel)

        cursor = 0
        while True:
            cursor = data.find(sentinel, cursor)
            if cursor < 0:
                self.log_info(F'The sentinel value {sentinel} was not found.')
                return data
            misalignment = cursor % blocksize
            if not misalignment:
                # the sentinel starts on a block boundary: cut it and everything after it
                del data[cursor:]
                return data
            # a hit inside a block does not count; resume at the next block boundary
            cursor += blocksize - misalignment

    def reverse(self, data: bytearray):
        sentinel = self.args.sentinel
        found = data.find(sentinel)
        while found >= 0:
            # only a sentinel that starts on a block boundary counts as a terminator
            if found % self.blocksize == 0:
                self.log_warn('input string already contains the termination character; returning unmodified input')
                return data
            found = data.find(sentinel, found + 1)
        data.extend(sentinel)
        return data

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data: bytearray):
    sentinel = self.args.sentinel
    found = data.find(sentinel)
    while found >= 0:
        # only a sentinel that starts on a block boundary counts as a terminator
        if found % self.blocksize == 0:
            self.log_warn('input string already contains the termination character; returning unmodified input')
            return data
        found = data.find(sentinel, found + 1)
    data.extend(sentinel)
    return data

class transpose (padding=b'')

This unit is implemented in refinery.units.meta.transpose and has the following commandline Interface:

usage: transpose [-h] [-L] [-Q] [-0] [-v] [padding]

Interprets the chunks in the current frame as rows of a matrix and yields the columns of that
matrix. When chunks are not all of the same length, the matrix is considered to have empty entries in
some positions. Optionally, a padding sequence can be provided to pad all rows to the same
length.

positional arguments:
  padding        Optional byte sequence to use as padding for incomplete rows.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class transpose(Unit):
    """
    Interprets the chunks in the current frame as rows of a matrix and yields the columns
    of that matrix. When chunks do not all have the same length, the matrix is considered
    to have empty entries in some positions. Optionally, a padding sequence can be provided
    to pad all rows to the same length.
    """
    @Unit.Requires('numpy', ['speed', 'default', 'extended'])
    def _numpy():
        import numpy
        return numpy

    def __init__(
        self,
        padding: Param[buf, Arg(help='Optional byte sequence to use as padding for incomplete rows.')] = B'',
    ):
        super().__init__(bigendian=False, padding=padding)

    def filter(self, chunks: Iterable[Chunk]):
        # Collect all visible chunks; invisible ones are forwarded right away.
        collected = []
        for chunk in chunks:
            if chunk.visible:
                collected.append(chunk)
            else:
                yield chunk
        if collected:
            # The first row carries the complete list of rows to the process method.
            carrier = collected[0]
            carrier.temp = collected
            yield carrier

    def process(self, data: Chunk):
        rows: list[Chunk] = data.temp
        if not rows:
            return
        sizes = [len(row) for row in rows]
        shortest = min(sizes)
        longest = max(sizes)
        padding = self.args.padding
        if padding:
            # extend each row with the padding sequence, then cut it to the longest row length
            for row in rows:
                while len(row) < longest:
                    row.extend(padding)
                del row[longest:]
        if shortest > 0:
            try:
                np = self._numpy
            except ImportError:
                pass
            else:
                # The first `shortest` columns are complete in every row and go through numpy;
                # what remains of the longer rows is handled by the loop below.
                tails = [row[shortest:] for row in rows if len(row) > shortest]
                for row in rows:
                    del row[shortest:]
                grid = np.array(rows, dtype=np.uint8).transpose()
                for column in grid:
                    yield column.tobytes('C')
                longest -= shortest
                rows = tails
        for index in range(longest):
            yield bytes(row[index] for row in rows if len(row) > index)

class trim (*junk, unpad=False, left=False, right=False, nocase=False)

This unit is implemented in refinery.units.strings.trim and has the following commandline Interface:

usage: trim [-h] [-L] [-Q] [-0] [-v] [-u] [-l | -r] [-i] [junk ...]

Removes byte sequences at beginning and end of input data.

positional arguments:
  junk           Binary strings to be removed, default are all whitespace characters.

options:
  -u, --unpad    Also trim partial occurrences of the junk string.
  -l, --left     Trim only left.
  -r, --right    Trim only right.
  -i, --nocase   Ignore capitalization for alphabetic characters.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class trim(Unit):
    """
    Removes byte sequences at beginning and end of input data.
    """

    def __init__(
        self,
        *junk : Param[buf, Arg(help='Binary strings to be removed, default are all whitespace characters.')],
        unpad : Param[bool, Arg.Switch('-u', help='Also trim partial occurrences of the junk string.')] = False,
        left  : Param[bool, Arg.Switch('-l', group='SIDE', help='Trim only left.')] = False,
        right : Param[bool, Arg.Switch('-r', group='SIDE', help='Trim only right.')] = False,
        nocase: Param[bool, Arg.Switch('-i', help='Ignore capitalization for alphabetic characters.')] = False,
    ):
        # without an explicit side, both sides are trimmed
        if not left and not right:
            left = right = True
        super().__init__(junk=junk, left=left, right=right, unpad=unpad, nocase=nocase)

    def _trimfast(self, view: memoryview, *junks: bytes, right=False) -> int:
        """
        Return the number of leading bytes of `view` that are made up of the given junk strings.
        The `right` flag signals that `view` and the junk strings were reversed by the caller; it
        only changes how partial occurrences are matched when the unpad option is set.
        """
        done = False
        pos = 0
        while not done:
            done = True
            for junk in junks:
                size = len(junk)
                if not size:
                    # An empty junk string matches everywhere without consuming input; the
                    # doubling loop below would never terminate for it.
                    continue
                # Work on an immutable copy: the doubling below uses `+=`, which would grow a
                # mutable junk buffer in place and corrupt it for the next pass.
                temp = bytes(junk)
                if right and self.args.unpad:
                    # right side: a partial occurrence is a suffix of the (reversed) junk
                    for k in range(size):
                        n = size - k
                        if view[pos:pos + n] == junk[k:]:
                            pos += n
                            done = False
                            break
                if view[pos:pos + size] == temp:
                    # Double the pattern for as long as it keeps matching, then consume the
                    # repeated junk in chunks of decreasing size.
                    m = len(temp)
                    while True:
                        mm = m << 1
                        if view[pos + m:pos + mm] != temp:
                            break
                        temp += temp
                        m = mm
                    temp = memoryview(temp)
                    while m >= size:
                        if view[pos:pos + m] == temp[:m]:
                            done = False
                            pos += m
                        m //= 2
                if right or not self.args.unpad:
                    continue
                # left side: a partial occurrence is a prefix of the junk, longest first
                while size > 0:
                    if view[pos:pos + size] == temp[:size]:
                        done = False
                        pos += size
                        break
                    size -= 1
        return pos

    def process(self, data: bytearray):
        junk = list(self.args.junk)
        if not junk:
            import string
            space = string.whitespace.encode('ascii')
            # one single-byte junk string for every whitespace character, the last one included
            junk = [space[k:k + 1] for k in range(len(space))]
        lpos = 0
        rpos = 0
        if self.args.nocase:
            # matching happens on a lowercase copy, the cut is applied to the original data
            work = data.lower()
            junk = [j.lower() for j in junk]
        else:
            work = data
        if self.args.left:
            lpos = self._trimfast(memoryview(work), *junk)
        if self.args.right:
            # the right side is trimmed by reversing buffer and junk; the buffer is restored after
            work.reverse()
            junk = [bytes(reversed(j)) for j in junk]
            rpos = self._trimfast(memoryview(work), *junk, right=True)
            work.reverse()
        view = memoryview(data)
        if lpos:
            view = view[+lpos:]
        if rpos:
            view = view[:-rpos]
        return view

class u16

This unit is implemented in refinery.units.encoding.u16 and has the following commandline Interface:

usage: u16 [-h] [-L] [-Q] [-0] [-v] [-R] [-F]

Encodes and decodes UTF-16 encoded string data.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class u16(Unit):
    """
    Encodes and decodes UTF-16 encoded string data.
    """

    def reverse(self, data: bytearray):
        # unit codec -> text -> little-endian UTF-16
        text = data.decode(self.codec)
        return text.encode('utf-16LE')

    def process(self, data: bytearray):
        # UTF-16 -> text -> unit codec
        text = data.decode('utf-16')
        return text.encode(self.codec)

    @classmethod
    def handles(cls, data):
        encoding = guess_text_encoding(data)
        if not encoding:
            # no guess available: no verdict is given
            return None
        return encoding.step == 2

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data: bytearray):
    # unit codec -> text -> little-endian UTF-16
    text = data.decode(self.codec)
    return text.encode('utf-16LE')

class ucrypt (size=13, salt=b'AA')

This unit is implemented in refinery.units.crypto.keyderive.unixcrypt and has the following commandline Interface:

usage: ucrypt [-h] [-L] [-Q] [-0] [-v] [N] [salt]

Implements the classic Unix crypt algorithm.

positional arguments:
  N              The number of bytes to generate, default is 13.
  salt           Salt for the derivation, the default is "AA".

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class ucrypt(KeyDerivation):
    """
    Implements the classic Unix crypt algorithm.
    """
    def __init__(
        self,
        size: Param[int, Arg(help='The number of bytes to generate, default is 13.')] = 13,
        salt: Param[buf, Arg(help='Salt for the derivation, the default is "AA".')] = B'AA'
    ):
        super().__init__(size=size, salt=salt)

    def process(self, data):
        crypted = bytes(UnixCrypt(data, salt=self.args.salt))
        available = len(crypted)
        requested = self.args.size
        if available >= requested:
            return crypted[:requested]
        # too few bytes: report the shortfall and attach what was computed as a partial result
        raise RefineryPartialResult(
            F'unix crypt only provided {available} bytes, but {requested} '
            F'were requested.', partial=crypted
        )

class url (plus=False, hex=False)

This unit is implemented in refinery.units.encoding.url and has the following commandline Interface:

usage: url [-h] [-L] [-Q] [-0] [-v] [-R] [-p] [-x]

Decodes and encodes URL-encoding, which preserves only alphanumeric characters and the following
symbols: _, ., -, ~, \, /. Every other character is escaped by hex-encoding it and prefixing it
with a percent symbol.

options:
  -p, --plus     also replace plus signs by spaces
  -x, --hex      hex encode every character in reverse mode

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class url(Unit):
    """
    Decodes and encodes URL-encoding, which preserves only alphanumeric characters and the
    following symbols: `_`, `.`, `-`, `~`, `\\`, `/`. Every other character is escaped by
    hex-encoding it and prefixing it with a percent symbol.
    """

    def __init__(
        self,
        plus: Param[bool, Arg.Switch('-p', help='also replace plus signs by spaces')] = False,
        hex: Param[bool, Arg.Switch('-x', help='hex encode every character in reverse mode')] = False
    ):
        super().__init__(plus=plus, hex=hex)

    def process(self, data):
        if self.args.plus:
            data = data.replace(B'+', B' ')
        data = unquote_to_bytes(bytes(data))
        # any %uXXXX escape that is left is replaced by its 16-bit value in little-endian order
        data = re.sub(
            B'%[uU]([0-9a-fA-F]{4})',
            lambda m: int(m[1], 16).to_bytes(2, 'little'),
            data)
        return data

    def reverse(self, data):
        if self.args.hex:
            # hex mode: every single byte is escaped
            return bytearray(B''.join(B'%%%02X' % byte for byte in data))
        elif self.args.plus:
            def replace(m):
                c = m[0][0]
                return b'+' if c == 0x20 else B'%%%02X' % c
        else:
            def replace(m):
                return B'%%%02X' % m[0][0]
        # The hyphen is escaped so that it is a literal rather than the range from "." to "~",
        # which would exempt most printable characters from escaping. The backslash is listed
        # explicitly because it used to be preserved only as a member of that range.
        return re.sub(RB'[^a-zA-Z0-9_.\-~\\/]', replace, data)

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    if self.args.hex:
        # hex mode: every single byte is escaped
        return bytearray(B''.join(B'%%%02X' % byte for byte in data))
    elif self.args.plus:
        def replace(m):
            c = m[0][0]
            return b'+' if c == 0x20 else B'%%%02X' % c
    else:
        def replace(m):
            return B'%%%02X' % m[0][0]
    # The hyphen is escaped so that it is a literal rather than the range from "." to "~",
    # which would exempt most printable characters from escaping. The backslash is listed
    # explicitly because it used to be preserved only as a member of that range.
    return re.sub(RB'[^a-zA-Z0-9_.\-~\\/]', replace, data)

class urlfix (meta=False, keep=0)

This unit is implemented in refinery.units.misc.urlfix and has the following commandline Interface:

usage: urlfix [-h] [-L] [-Q] [-0] [-v] [-m] [-k]

Removes fragments, query strings, and parameters from input URLs. It also correctly escapes all
characters in the URL path component and normalizes the network location part to lowercase. Note
that URLs without a scheme will not be recognized as valid URLs; chunks that do not look like a
URL will be swallowed and not return any output.

options:
  -m, --meta     Extract the query string parameters as metadata.
  -k, --keep     If specified once, it keeps the URL params and query string. If
                 specified twice, it keeps the URL fragment as well. At this level, the unit
                 still filters out anything that does not parse as a URL.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class urlfix(Unit):
    """
    Removes fragments, query strings, and parameters from input URLs. It also correctly escapes all
    characters in the URL path component and normalizes the network location part to lowercase. Note
    that URLs without a scheme will not be recognized as valid URLs; chunks that do not look like a
    URL will be swallowed and not return any output.
    """
    def __init__(
        self,
        meta: Param[bool, Arg.Switch('-m', help='Extract the query string parameters as metadata.')] = False,
        keep: Param[int, Arg.Counts('-k', help=(
            'If specified once, it keeps the URL params and query string. If specified '
            'twice, it keeps the URL fragment as well. At this level, the unit still filters out '
            'anything that does not parse as a URL.'
        ))] = 0
    ):
        super().__init__(keep=keep, meta=meta)

    def process(self, data):
        """
        Parse the input as a URL and return its normalized form, or None when the parsed URL has
        no scheme or no network location. With the meta option, the query parameters are attached
        to the output as labels.
        """
        def fix(string):
            # Unquoting first prevents escape sequences that are already present from being
            # escaped a second time.
            return quote(unquote(string))
        keep = self.args.keep
        meta = self.args.meta
        parsed = urlparse(data.decode(self.codec))
        if not parsed.scheme or not parsed.netloc:
            return None
        # For repeated keys, the dictionary retains only the last value.
        query_dict = {key: unquote(value) for key, value in parse_qsl(parsed.query)}
        query_string = '&'.join(F'{key}={quote(value)}' for key, value in query_dict.items())
        replacements = dict(
            netloc=parsed.netloc.lower(),
            params=fix(parsed.params),
            path=fix(parsed.path),
            query=query_string,
            fragment=fix(parsed.fragment),
        )
        # keep == 0 strips params, query, and fragment; keep == 1 strips the fragment only;
        # keep >= 2 retains all components.
        if keep < 2:
            replacements.update(fragment='')
            if keep < 1:
                replacements.update(params='', query='')
        url = urlunparse(parsed._replace(**replacements))
        url = url.encode(self.codec)
        if meta:
            url = self.labelled(url, **query_dict)
        return url

class urlguards

This unit is implemented in refinery.units.pattern.urlguards and has the following commandline Interface:

usage: urlguards [-h] [-L] [-Q] [-0] [-v]

Restores the original URLs from their 'protected' versions as generated by Outlook protection and
ProofPoint.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class urlguards(Unit):
    """
    Restores the original URLs from their 'protected' versions as generated by
    Outlook protection and ProofPoint.
    """

    # Maps the letter that follows a "**" token in a ProofPoint V3 URL to the length of the run
    # of replaced characters that it represents; the first letter 'A' stands for a run of 2.
    _PP3RLENC = {
        letter: rl for rl, letter in enumerate(
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
            'abcdefghijklmnopqrstuvwxyz'
            '0123456789-_', 2
        )
    }

    @unguard(r'https?://urldefense(?:\.proofpoint)?\.com/v([12])/url\?([:;/_=!?#&.,\w\%\-\+|]+)')
    def _proofpointV2(self, match):
        # Handles ProofPoint URLs of version 1 and 2; the original URL is the "u" query parameter,
        # which is followed by the parameter "k" for version 1 and "d" or "c" for version 2.
        version = int(match[1])
        self.log_info('proofpoint match:', version)
        argmatch = re.match(
            R'^u=(.+?)&(?:amp;)?{}='.format('k' if version == 1 else '[dc]'),
            match[2],
            flags=re.DOTALL
        )
        if not argmatch:
            self.log_warn('not able to translate unexpected proofpoint format:', match)
            return match[0]
        encoded = argmatch[1]
        if match[1] == '2':
            # Version 2 writes "-" for "%" and "_" for "/".
            encoded = encoded.translate(str.maketrans('-_', '%/'))
        return unescape(unquote(encoded))

    @unguard(r'https?://urldefense(?:\.proofpoint)?\.com/v3/__(.+?)__;(.*?)![-\w!?$]+')
    def _proofpointV3(self, match):
        # The first group is the URL in which replaced characters are marked by "*" (a single
        # character) or "**X" (a run of characters whose length is encoded by the letter X). The
        # second group is the base64 encoding of all replaced characters in order of occurrence.
        data = unquote(match[1])
        cmap = match[2] + '=' * (-len(match[2]) % 4)
        cmap = urlsafe_b64decode(cmap).decode('UTF-8')
        cursor = 0
        marker = 0
        result = ''
        while marker < len(cmap):
            ast = data.find('*', cursor)
            if ast < 0:
                break
            result += data[cursor:ast]
            # A slice is used for the lookahead so that an asterisk at the very end of the data
            # is treated as a single-character token instead of raising an IndexError.
            if data[ast + 1:ast + 2] == '*':
                run = self._PP3RLENC[data[ast + 2]]
                # The run consumes as many characters from the map as its length states; the
                # position within the map has to advance by that amount for later tokens.
                result += cmap[marker:marker + run]
                marker += run
                ast += 2
            else:
                result += cmap[marker]
                marker += 1
            cursor = ast + 1
        self.log_debug(result)
        self.log_debug(data[cursor:])
        return result + data[cursor:]

    @unguard(r'https?://\w+.safelinks\.protection\.outlook\.com/([:;/_=!?#&.,\w\%\-\+|]+)')
    def _outlook(self, match):
        result = match[0]
        self.log_info('outlook match:', result)
        parsed = urlparse(result)
        params = parse_qs(parsed.query)
        try:
            result = unquote(params['url'][0])
        except Exception:
            # Best effort: the matched text is returned unchanged if there is no url parameter.
            pass
        return result

    @unguard(r'https?://outlook.office.com/actions/ei\?u=([:;/_=!?#&.,\w\%\-\+|]+)')
    def _outlook_image_proxy(self, match):
        return unquote(match[1])

    @unguard(r'https?://(?:[\w-]+\.)?trendmicro.com(?::\d+)?/wis/clicktime/v[12]/(?:query|clickthrough)[:;/_=!?#&.,\w\%\-\+|]+')
    def _trendmicro(self, match):
        result = match[0]
        self.log_info('trendmicro match:', result)
        parsed = urlparse(result)
        params = parse_qs(parsed.query)
        try:
            result = unquote(params['url'][0])
        except Exception:
            # Best effort: the matched text is returned unchanged if there is no url parameter.
            pass
        return result

    @unicoded
    def process(self, data: str) -> str:
        # All handlers are applied repeatedly so that nested guards are unwrapped as well; the
        # loop stops once two consecutive passes produce data of the same length.
        newsize, size = 0, len(data)
        while newsize != size:
            for handler in (
                self._proofpointV2,
                self._proofpointV3,
                self._outlook,
                self._outlook_image_proxy,
                self._trendmicro
            ):
                data = handler(data)
            size = newsize
            newsize = len(data)
        return data

class urn (size='N:N', keep=False, sort=False)

This unit is implemented in refinery.units.meta.urn and has the following commandline Interface:

usage: urn [-h] [-L] [-Q] [-0] [-v] [-k] [-s] [a:b]

Treat the chunks in the current frame as items in an urn and produce every possible sequence that
could occur as a sequence of draws. For example, selecting both -k and -s is equivalent to
generating all possible permutations of these chunks.

positional arguments:
  a:b            Generate sequences of length x, where x is in [a:b]. The default value is N:N,
                 where N is the number of chunks in the current frame.

options:
  -k, --keep     Chunks are not returned back to the urn after being drawn.
  -s, --sort     The order of items does not matter; for the output, chunks are sorted according
                 to their original position in the frame.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class urn(Unit):
    """
    Treat the chunks in the current frame as items in an urn and produce every possible sequence
    that could occur as a sequence of draws. For example, selecting both -k and -s is equivalent
    to generating all possible permutations of these chunks.
    """

    def __init__(self,
        size: Param[str, Arg.String(metavar='a:b', help=(
            'Generate sequences of length x, where x is in [a:b]. The default value is {default}, '
            'where N is the number of chunks in the current frame.'))] = 'N:N',
        keep: Param[bool, Arg.Switch('-k', help=(
            'Chunks are not returned back to the urn after being drawn.'))] = False,
        sort: Param[bool, Arg.Switch('-s', help=(
            'The order of items does not matter; for the output, chunks are sorted according to '
            'their original position in the frame.'))] = False
    ):
        super().__init__(size=size, keep=keep, sort=sort)

    def process(self, data: Chunk):
        # The filter method stores one drawn sequence in the temp attribute of each chunk that it
        # emits; the items of that sequence are the output.
        yield from data.temp

    def filter(self, chunks: Iterable[Chunk]):
        """
        Collect all chunks of the frame and emit one chunk for every sequence of draws; the
        sequence is attached to the emitted chunk as its temp attribute.
        """
        it = iter(chunks)
        head = next(it, None)
        if head is None:
            # Nothing to draw from: a bare next() would raise StopIteration here, which surfaces
            # as a RuntimeError because this method is a generator.
            return
        buffer = [bytes(head)]
        buffer.extend(bytes(c) for c in it)
        head = head.copy(meta=True, data=False)
        # Makes the variable N available to the size expression.
        head.meta['N'] = len(buffer)
        size = sliceobj(self.args.size, head)
        a = size.start or 1
        b = size.stop or len(buffer)
        # Ensures that at least the length a is generated, e.g. for the default N:N.
        b = max(b, a + 1)
        c = size.step or 1
        self.log_debug(F'using size [{a}:{b}:{c}]')
        s = 1 if self.args.sort else 0
        k = 1 if self.args.keep else 0
        m = (s << 1) | k
        # NOTE(review): going by the option help texts, -k alone (no replacement, order matters)
        # would correspond to permutations and -k with -s to combinations; the table below maps
        # them the other way around, in agreement with the class docstring. Confirm the intent.
        method = {
            0b00: lambda i, r: product(i, repeat=r),
            0b01: combinations,
            0b10: combinations_with_replacement,
            0b11: permutations
        }[m]
        self.log_info(F'choosing {method.__name__}')
        for n in range(a, b, c):
            self.log_debug(F'generating sequences of length {n}')
            for head.temp in method(buffer, n):
                yield head

class uuenc

This unit is implemented in refinery.units.encoding.uuenc and has the following commandline Interface:

usage: uuenc [-h] [-L] [-Q] [-0] [-v] [-R] [-F]

Unit for uuencode.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class uuenc(Unit):
    """
    Unit for uuencode.
    """
    def process(self, data):
        """
        Decode uuencoded data. The file name from the header is attached as the path label unless
        it is a single hyphen.

        Raises a ValueError when there is no header or a line contains an invalid character, and
        a RefineryPartialResult carrying the decoded prefix when the input ends before the
        terminating "end" line.
        """
        header = re.search(
            B'^begin ([0-7]{3}) (.*?)$', data, flags=re.M)
        if header is None:
            raise ValueError('invalid uu header')
        output = bytearray()
        view = memoryview(data)
        size = len(view)
        # Offsets of all line starts; the first line is skipped below.
        breaks = [m.end() for m in re.finditer(B'^', data, flags=re.M)]
        eol = False
        # Keeps the line index defined for the final error message when no line follows.
        k = 0
        for k, br in enumerate(itertools.islice(breaks, 1, None)):
            if br >= size:
                # An empty line start at the very end of the input: the data is truncated.
                break
            if eol and view[br:br + 3] == b'end':
                path = header[2]
                if path != B'-':
                    output = self.labelled(output, path=path)
                return output
            # The first character of each line encodes the number of bytes in that line.
            count = view[br] - 0x20
            if count not in range(0x41):
                raise ValueError(F'Invalid length encoding 0x{view[br]:02X} in line {k}.')
            # The backtick (0x60) encodes zero, just like the space character.
            count %= 0x40
            cursor = len(output)
            q, r = divmod(count, 3)
            q += int(bool(r))
            end = br + 1 + q * 4
            if end > size:
                # The line announces more characters than the input still contains.
                break
            for b in range(br + 1, end, 4):
                chunk = 0
                for j in range(4):
                    character = view[b + j]
                    if character not in range(0x21, 0x61):
                        raise ValueError(F'Invalid character 0x{character:02X} in line {k}.')
                    chunk = ((character - 0x20) % 0x40) | (chunk << 6)
                output.extend(chunk.to_bytes(3, 'big'))
            # Remove the padding bytes of the last group.
            del output[cursor + count:]
            # A line of length zero has to precede the terminating "end" line.
            eol = count == 0
        raise RefineryPartialResult(F'Data truncated in line {k}', output)

    def reverse(self, data):
        """
        Encode the input with uuencode using lines of up to 45 input bytes each. The file name in
        the header is the last component of the path meta variable, or a hyphen if there is none.
        """
        meta = metavars(data)
        path = meta.get('path', None)
        name = path and pathlib.Path(path).name or '-'
        view = memoryview(data)
        with MemoryFile() as stream:
            stream.write(B'begin 666 ')
            stream.write(name.encode(self.codec))
            for k in range(0, len(view), 45):
                line = view[k:k + 45]
                stream.write_byte(0x0A)
                stream.write_byte(0x20 + len(line))
                for chunk in chunks.unpack(line, 3, bigendian=True, pad=True):
                    for j in range(3, -1, -1):
                        # A zero value is written as a backtick rather than a space.
                        stream.write_byte(0x20 + (((chunk >> j * 6) & 0x3F) or 0x40))
            stream.write(B'\n`\nend\n')
            return stream.getvalue()

    @classmethod
    def handles(cls, data):
        # NOTE(review): falls through and returns None when the data does not start with the
        # header; presumably this signals an undecided result, confirm against the base class.
        if len(data) < 16:
            return False
        if data[:6] == B'begin ':
            return re.fullmatch(B'[0-7]{3}', data[6:9]) is not None

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Encode the input with uuencode using lines of up to 45 input bytes each. The file name in the
    header is the last component of the path meta variable, or a hyphen if there is none.
    """
    path = metavars(data).get('path', None)
    name = pathlib.Path(path).name if path else None
    name = name or '-'
    view = memoryview(data)
    with MemoryFile() as stream:
        stream.write(B'begin 666 ')
        stream.write(name.encode(self.codec))
        for start in range(0, len(view), 45):
            line = view[start:start + 45]
            stream.write_byte(0x0A)
            # The first character of each line encodes the number of input bytes in it.
            stream.write_byte(0x20 + len(line))
            for triple in chunks.unpack(line, 3, bigendian=True, pad=True):
                # Each group of three bytes becomes four characters of six bits each.
                for shift in (18, 12, 6, 0):
                    sextet = (triple >> shift) & 0x3F
                    # A zero value is written as a backtick rather than a space.
                    stream.write_byte(0x20 + (sextet or 0x40))
        stream.write(B'\n`\nend\n')
        return stream.getvalue()

class vaddr (*name, base=None)

This unit is implemented in refinery.units.formats.exe.vaddr and has the following commandline Interface:

usage: vaddr [-h] [-L] [-Q] [-0] [-v] [-R] [-b ADDR] [name ...]

Converts a metadata variable holding a file offset to a virtual address. This unit only works
when the chunk body contains a PE, ELF, or MachO executable. The variable will be substituted in
place. If you would like to retain the original value, it is recommended to use the put unit
first to create a copy of an already existing variable, and then convert the copy.

positional arguments:
  name             The name of a metadata variable holding an integer.

options:
  -b, --base ADDR  Optionally specify a custom base address B.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation.

Expand source code Browse git

class vaddr(Unit):
    """
    Converts a metadata variable holding a file offset to a virtual address. This unit only works when the
    chunk body contains a PE, ELF, or MachO executable. The variable will be substituted in place. If you
    would like to retain the original value, it is recommended to use the `refinery.put` unit first to create
    a copy of an already existing variable, and then convert the copy.
    """

    def __init__(
        self, *name: Param[str, Arg.String(help='The name of a metadata variable holding an integer.')],
        base: Param[int, Arg.Number('-b', metavar='ADDR', help='Optionally specify a custom base address B.')] = None
    ):
        return super().__init__(names=name, base=base)

    def process(self, data):
        # Replaces each named variable, read as a file offset, by the matching virtual address.
        try:
            exe = Executable.Load(data, self.args.base)
        except Exception:
            self.log_warn('unable to parse input as executable; no variable conversion was performed')
        else:
            meta = metavars(data)
            for name in self.args.names:
                offset = meta[name]
                location = exe.location_from_offset(offset)
                meta[name] = location.virtual.position
        return data

    def reverse(self, data):
        # Replaces each named variable, read as a virtual address, by the matching file offset.
        try:
            exe = Executable.Load(data, self.args.base)
        except Exception:
            self.log_warn('unable to parse input as executable; no variable conversion was performed')
        else:
            meta = metavars(data)
            for name in self.args.names:
                address = meta[name]
                location = exe.location_from_address(address)
                meta[name] = location.physical.position
        return data

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Replace each named meta variable, read as a virtual address, by the position of the matching
    physical location in the executable. The data is returned unchanged, with a warning, if it
    cannot be loaded as an executable.
    """
    try:
        exe = Executable.Load(data, self.args.base)
    except Exception:
        self.log_warn('unable to parse input as executable; no variable conversion was performed')
    else:
        meta = metavars(data)
        for name in self.args.names:
            address = meta[name]
            location = exe.location_from_address(address)
            meta[name] = location.physical.position
    return data

class vbapc (raw=False)

This unit is implemented in refinery.units.formats.office.vbapc and has the following commandline Interface:

usage: vbapc [-h] [-L] [-Q] [-0] [-v] [-r]

Extract VBA macro p-code from Office documents. By default, the unit also uses pcode2code to
decompile the disassembled p-code. This unit is specifically useful for macro documents that use
VBA code stomping, i.e. the embedded macro source code is stomped and does not represent the
p-code functionality that the document will actually execute.

options:
  -r, --raw      Return disassembled p-code, do not try to decompile.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class vbapc(Unit):
    """
    Extract VBA macro p-code from Office documents. By default, the unit also uses pcode2code to
    decompile the disassembled p-code. This unit is specifically useful for macro documents that
    use VBA code stomping, i.e. the embedded macro source code is stomped and does not represent
    the p-code functionality that the document will actually execute.
    """
    def __init__(self, raw: Param[bool, Arg.Switch('-r', help='Return disassembled p-code, do not try to decompile.')] = False):
        super().__init__(raw=raw)

    @Unit.Requires('oletools', ['formats', 'office', 'extended'])
    def _pcodedmp():
        with NoLogging(NoLogging.Mode.ALL):
            import pcodedmp.pcodedmp
            return pcodedmp.pcodedmp

    def process(self, data):
        # Stand-in for the options object that is handed to processFile.
        class args:
            disasmOnly = True
            verbose = False

        with io.StringIO() as output:
            with VirtualFileSystem() as vfs:
                document = vfs.new(data)
                self._pcodedmp.processFile(document, args, output)
            disassembly = output.getvalue()

        if self.args.raw:
            return disassembly.encode(self.codec)

        # Imported lazily, only when decompilation is requested.
        from refinery.lib.thirdparty.pcode2code import Parser
        parser = Parser(disassembly)
        parser.parseInput()
        parser.processInput(False)
        decompiled = parser.getOutput()
        # Puts a line break in front of lines that start with Sub or Function.
        decompiled = re.sub(R'(?m)^((?:Sub|Function).*?)$(?!\n[^\s])', r'\n\1', decompiled)
        return decompiled.encode(self.codec)

class vbastr (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.office.vbastr and has the following commandline Interface:

usage: vbastr [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path ...]

Extract VBA macro variables from Office documents. The items are extracted in a directory
hierarchy that specifies their corresponding OLE stream. The stem of their file name is the same
as the variable's name. The variable can define a caption, a control tip text, and a value; the
unit extracts these with the synthesized file extension "cap", "tip", and "val", respectively.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class vbastr(PathExtractorUnit):
    """
    Extract VBA macro variables from Office documents. The items are extracted in a directory
    hierarchy that specifies their corresponding OLE stream. The stem of their file name is the
    same as the variable's name. The variable can define a caption, a control tip text, and a
    value; the unit extracts these with the synthesized file extension "cap", "tip", and "val",
    respectively.
    """
    @PathExtractorUnit.Requires('oletools', ['formats', 'office'])
    def _olevba():
        from oletools import olevba
        return olevba

    def unpack(self, value):
        try:
            parser = self._olevba.VBA_Parser('.', data=bytes(value), relaxed=True)
        except self._olevba.FileOpenError:
            raise ValueError('Input data not recognized by VBA parser')
        # Pairs of synthesized file extension and the form variable key it is read from.
        extensions = (
            ('cap', 'caption'),
            ('tip', 'control_tip_text'),
            ('val', 'value'),
        )
        try:
            for path, _, form in parser.extract_form_strings_extended():
                if not form:
                    continue
                name = _txt(form['name'])
                for ext, key in extensions:
                    content = _bin(form.get(key))
                    if content:
                        yield UnpackResult(F'{path!s}/{name!s}/{name}.{ext}', content)
        except self._olevba.oleform.OleFormParsingError as error:
            from collections import Counter
            self.log_debug(str(error))
            self.log_info('extended form extraction failed with error; falling back to simple method')
            form_strings = list(parser.extract_form_strings())
            # Names that occur more than once receive a running version suffix.
            occurrences = Counter(name for _, name, _ in form_strings)
            dedup = Counter()
            for path, name, string in form_strings:
                if string is not None:
                    if occurrences[name] > 1:
                        dedup[name] += 1
                        name = F'{name!s}.v{dedup[name]}'
                    yield UnpackResult(F'{path!s}/{name!s}.val', _bin(string))

class vigenere (key, alphabet=b'abcdefghijklmnopqrstuvwxyz', operator='add', case_sensitive=False, ignore_unknown=False)

This unit is implemented in refinery.units.crypto.cipher.vigenere and has the following commandline Interface:

usage: vigenere [-h] [-L] [-Q] [-0] [-v] [-R] [-: OP] [-c] [-i] key [alphabet]

Encryption and decryption using the Vigenère-Bellaso polyalphabetic cipher.

positional arguments:
  key                   The encryption key
  alphabet              The alphabet, by default the Latin one is used:
                        "abcdefghijklmnopqrstuvwxyz"

options:
  -:, --operator OP     Choose the vigenere block operation. The default is add, and the
                        available options are: add, sub, xor
  -c, --case-sensitive  Unless this option is set, the key will be case insensitive. Uppercase
                        letters from the input are transformed using the same shift as would be
                        the lowercase variant, but case is retained.
  -i, --ignore-unknown  Unless this option is set, the key stream will be iterated even for
                        letters that are not contained in the alphabet.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation.

Expand source code Browse git

class vigenere(Unit):
    """
    Encryption and decryption using the Vigenère-Bellaso polyalphabetic cipher.
    """

    def __init__(
        self,
        key: Param[buf, Arg(help='The encryption key')],
        alphabet: Param[buf, Arg(
            help='The alphabet, by default the Latin one is used: "{default}"'
        )] = b'abcdefghijklmnopqrstuvwxyz',
        operator: Param[str, Arg.Choice('-:', choices=['add', 'sub', 'xor'], metavar='OP', help=(
            'Choose the vigenere block operation. The default is {default}, and the available options are: {choices}'))] = 'add',
        case_sensitive: Param[bool, Arg.Switch('-c', help=(
            'Unless this option is set, the key will be case insensitive. Uppercase letters from the input are transformed '
            'using the same shift as would be the lowercase variant, but case is retained.'))] = False,
        ignore_unknown: Param[bool, Arg.Switch('-i', help=(
            'Unless this option is set, the key stream will be iterated even '
            'for letters that are not contained in the alphabet.'
        ))] = False
    ):
        if not callable(operator):
            # No helper variable is introduced here on purpose: all local variables are passed
            # on to the parent initializer through vars() below. Since the assignment fails on an
            # unknown name, the error message still has access to the value that was given.
            try:
                operator = {
                    'add': __add__,
                    'sub': __sub__,
                    'xor': __xor__,
                }[operator.lower()]
            except KeyError:
                raise ValueError(F'The value {operator!r} is not valid as an operator.') from None
        self.superinit(super(), **vars())

    def _tabula_recta(self, data, reverse=True):
        """
        Generator that yields the transformed letters of the input string one by one. If reverse
        is set, the inverse of the configured operator is used. Letters that are not part of the
        alphabet are passed through unchanged.

        Raises a ValueError if the key contains letters that are not part of the alphabet, or if
        the lowercased alphabet contains duplicates in case insensitive mode.
        """
        key: str = self.args.key.decode(self.codec)
        alphabet: str = self.args.alphabet.decode(self.codec)
        operator = self.args.operator
        case_sensitive: bool = self.args.case_sensitive
        ignore_unknown: bool = self.args.ignore_unknown
        if not case_sensitive:
            key = key.lower()
            alphabet = alphabet.lower()
            # NOTE(review): duplicates are only rejected in case insensitive mode; confirm
            # whether the check is meant to apply to case sensitive alphabets as well.
            if len(set(alphabet)) != len(alphabet):
                raise ValueError('Duplicate entries detected in alphabet.')
        if not set(key) <= set(alphabet):
            diff = set(key) - set(alphabet)
            diff = ', '.join(diff)
            raise ValueError(F'key contains letters which are not from the given alphabet: {diff}')
        self.log_info(F'using key {key} and alphabet {alphabet}')
        keystream = cycle(key)
        alph_size = len(alphabet)
        if reverse:
            operator = _opeator_inverse[operator]
        for letter in data:
            # In case insensitive mode, uppercase letters are shifted like their lowercase
            # variant and converted back to uppercase afterwards.
            uppercase = not case_sensitive and letter.isupper()
            if uppercase:
                letter = letter.lower()
            try:
                position = alphabet.index(letter)
            except ValueError:
                yield letter
                # Unknown letters consume a key letter unless they are to be ignored.
                if not ignore_unknown:
                    next(keystream)
                continue
            shift = alphabet.index(next(keystream))
            result = alphabet[operator(position, shift) % alph_size]
            yield result.upper() if uppercase else result

    @unicoded
    def process(self, data):
        return ''.join(self._tabula_recta(data, True))

    @unicoded
    def reverse(self, data):
        return ''.join(self._tabula_recta(data, False))

Methods

def reverse(self, data)

Expand source code Browse git

@unicoded
def reverse(self, data):
    """
    Run the input through the tabula recta with the reverse flag disabled and join the
    resulting letters into a single string.
    """
    letters = self._tabula_recta(data, False)
    return ''.join(letters)

class vmemref (*address, take=None, base=None, deref_count=1, deref_depth=2)

This unit is implemented in refinery.units.formats.exe.vmemref and has the following commandline Interface:

usage: vmemref [-h] [-L] [-Q] [-0] [-v] [-t SIZE] [-b ADDR] [-c N] [-d N] [ADDR ...]

The unit expects an executable as input (PE/ELF/MachO) and scans a function at a given virtual
address for memory references. For each memory reference, the unit looks up the corresponding
section and file offset for the reference. It then returns all data from that section starting at
the given offset.

positional arguments:
  ADDR                 Specify the address of a function to scan. If no argument is given, the
                       unit will scan all functions for memory references.

options:
  -t, --take SIZE      Optionally specify the number of bytes to read from each reference; by
                       default, all data until the end of the section is returned.
  -b, --base ADDR      Optionally specify a custom base address B.
  -c, --deref-count N  Optionally specify the number of items to inspect at a discovered memory
                       address as a potential pointer. The default is 1.
  -d, --deref-depth N  Optionally specify the maximum number of times that referenced data is
                       dereferenced as a pointer, potentially leading to another referenced
                       memory location. The default is 2.

generic options:
  -h, --help           Show this help message and exit.
  -L, --lenient        Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet          Disables all log output.
  -0, --devnull        Do not produce any output.
  -v, --verbose        Specify up to two times to increase log level.

Expand source code Browse git

class vmemref(Unit):
    """
    The unit expects an executable as input (PE/ELF/MachO) and scans a function at a given virtual
    address for memory references. For each memory reference, the unit looks up the corresponding
    section and file offset for the reference. It then returns all data from that section starting
    at the given offset.
    """

    # Declares the smda package as a dependency of this unit; process() accesses the returned
    # module as self._smda.
    @Unit.Requires('smda<2.0', ['all'])
    def _smda():
        import datetime
        # datetime.UTC only exists natively since Python 3.11; it is defined here before smda is
        # imported, presumably because smda references it.
        datetime.UTC = datetime.timezone.utc
        import smda
        import smda.Disassembler
        import smda.DisassemblyResult
        return smda

    # Generator: yields every data address referenced from the given function that has not been
    # recorded in `references` yet, following pointers found at those addresses.
    #
    #   exe:                    the loaded executable, used for address lookups and memory reads
    #   function:               the smda function whose instructions are scanned
    #   codes:                  address ranges that are considered code and are never reported
    #   max_dereference_depth:  an address is no longer processed once its visit count in
    #                           `references` exceeds this value
    #   max_dereference_count:  number of consecutive pointer-sized items that are read at each
    #                           address and queued as potential pointers
    #   references:             mapping from address to visit count; it is updated in place and is
    #                           shared with the caller
    def _memory_references(
        self,
        exe: Executable,
        function: SmdaFunction,
        codes: Container[Range],
        max_dereference_depth: int,
        max_dereference_count: int,
        references: dict,
    ):
        def is_valid_data_address(address):
            # An address qualifies as data when it is an integer that is mapped by the executable,
            # is not the offset of an instruction of the scanned function, and is not contained in
            # any of the excluded code ranges.
            if not isinstance(address, int):
                return False
            if address not in exe:
                return False
            if address in instructions:
                return False
            for code in codes:
                if address in code:
                    return False
            return True

        def dereference(address):
            # Read one pointer-sized integer from the executable at the given address.
            return int.from_bytes(exe[address:address + pointer_size], exe.byte_order().value)

        # exe.pointer_size is given in bits; convert to bytes.
        pointer_size = exe.pointer_size // 8

        with NoLogging():
            instructions = {op.offset: op for op in function.getInstructions()}

        for op in instructions.values():
            try:
                with NoLogging():
                    refs = list(op.getDataRefs())
            except Exception:
                # Instructions whose data references cannot be computed are skipped.
                continue
            for address in refs:
                try:
                    address = int(address)
                except Exception:
                    continue
                # Work queue: new candidates are added on the left and taken from the right, so
                # they are processed in the order in which they were discovered.
                addresses = deque([address])
                while addresses:
                    address = addresses.pop()
                    if not is_valid_data_address(address):
                        continue
                    if (count := references.get(address, 0)) > max_dereference_depth:
                        continue
                    elif not count:
                        # Only report an address the first time it is encountered.
                        yield address
                    references[address] = count + 1
                    for _ in range(max_dereference_count):
                        try:
                            point = dereference(address)
                        except Exception:
                            pass
                        else:
                            addresses.appendleft(point)
                        finally:
                            # Advance to the next pointer-sized slot regardless of the outcome.
                            address += pointer_size

    def __init__(
        self,
        *address: Param[int, Arg.Number(metavar='ADDR', help=(
            'Specify the address of a function to scan. If no argument is given, the unit will scan'
            ' all functions for memory references.'))],
        take: Param[int, Arg.Number('-t', metavar='SIZE', help=(
            'Optionally specify the number of bytes to read from each reference; by default, all '
            'data until the end of the section is returned.'))] = None,
        base: Param[int, Arg.Number('-b', metavar='ADDR',
            help='Optionally specify a custom base address B.')] = None,
        deref_count: Param[int, Arg.Number('-c', help=(
            'Optionally specify the number of items to inspect at a discovered memory address as '
            'a potential pointer. The default is {default}.'))] = 1,
        deref_depth: Param[int, Arg.Number('-d', help=(
            'Optionally specify the maximum number of times that referenced data is dereferenced '
            'as a pointer, potentially leading to another referenced memory location. The default '
            'is {default}.'))] = 2,
    ):
        super().__init__(
            address=address,
            take=take,
            base=base,
            deref_count=deref_count,
            deref_depth=deref_depth,
        )

    # Disassembles the input with smda, collects the memory references of the selected functions
    # (or of all functions when no address was given), and yields the file data that starts at
    # each resolved reference.
    def process(self, data):
        smda = self._smda
        take = self.args.take
        exe = Executable.Load(data, self.args.base)
        # Number of hex digits used when formatting addresses (pointer size is given in bits).
        fmt = exe.pointer_size // 4
        addresses = self.args.address

        self.log_info('disassembling and exploring call graph using smda')
        with NoLogging():
            cfg = smda.Disassembler.SmdaConfig()
            cfg.CALCULATE_SCC = False
            cfg.CALCULATE_NESTING = False
            cfg.TIMEOUT = 600
            dsm = smda.Disassembler.Disassembler(cfg)
            _input = data
            # The buffer is converted to an immutable bytes object before it is handed to smda.
            if not isinstance(_input, bytes):
                _input = bytes(data)
            graph = dsm.disassembleUnmappedBuffer(_input)

        self.log_info('collecting code addresses for memory reference exclusion list')
        visits = {}
        avoid = set()

        # The virtual range that contains an exported function symbol is excluded from the
        # reported references.
        for symbol in exe.symbols():
            if not symbol.function:
                continue
            if not symbol.exported:
                continue
            avoid.add(exe.location_from_address(symbol.address).virtual.box)

        if addresses:
            # With explicit addresses, the visit counters are cleared before each function.
            reset = visits.clear
        else:
            # When scanning all functions, the visit counters are shared across functions so that
            # each address is reported only once.
            def reset():
                pass
            self.log_info('scanning executable for functions')
            with NoLogging():
                addresses = [pfn.offset for pfn in graph.getFunctions()]
                addresses.sort()

        for a in addresses:
            reset()
            # Select the known function whose start address is closest to the requested one; on a
            # tie, the function starting below the requested address wins.
            address, function = min(
                graph.xcfg.items(), key=lambda t: (abs(t[0] - a), t[0] >= a))
            self.log_debug(F'scanning function: 0x{address:0{fmt}X}')
            refs = list(self._memory_references(
                exe,
                function,
                avoid,
                self.args.deref_depth,
                self.args.deref_count,
                visits,
            ))
            # Process references from the highest address downwards; each output is cut off where
            # the previously emitted one begins.
            refs.sort(reverse=True)
            last_start = None
            for ref in refs:
                try:
                    box = exe.location_from_address(ref)
                    end = box.physical.box.upper
                    if take is not None:
                        end = min(box.physical.position + take, end)
                    if last_start is not None:
                        end = min(last_start, end)
                    last_start = box.physical.position
                except CompartmentNotFound:
                    self.log_info(F'memory reference could not be resolved: 0x{ref:0{fmt}X}')
                else:
                    yield exe.data[last_start:end]

class vsect (*paths, meta=False, synthetic=False, path=b'path', regex=False, exact=False, fuzzy=0, drop_path=False, join_path=False, list=False)

This unit is implemented in refinery.units.formats.exe.vsect and has the following commandline Interface:

usage: vsect [-h] [-L] [-Q] [-0] [-v] [-m] [-s] [-l] [-j | -d] [-z | -e] [-r] [-P NAME]
             [path ...]

Extract sections/segments from PE, ELF, and MachO executables.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -m, --meta       Populates the metadata variables vaddr and vsize containing the virtual
                   address and size of each section, respectively.
  -s, --synthetic  Include synthesized sections: These represent data regions that are outside
                   the sections as listed by the executable metadata, such as headers and
                   overlays.
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class vsect(PathExtractorUnit):
    """
    Extract sections/segments from PE, ELF, and MachO executables.
    """
    def __init__(
        self, *paths,
        meta: Param[bool, Arg.Switch('-m', help=(
            'Populates the metadata variables vaddr and vsize containing the virtual address and size '
            'of each section, respectively.'))] = False,
        synthetic: Param[bool, Arg.Switch('-s', help=(
            'Include synthesized sections: These represent data regions that are outside the sections '
            'as listed by the executable metadata, such as headers and overlays.'))] = False,
        **keywords
    ):
        super().__init__(
            *paths,
            meta=meta,
            synthetic=synthetic,
            **keywords
        )

    # Yields one UnpackResult per section of the executable. The payload is a zero-copy view into
    # the input covering the physical range of the section.
    def unpack(self, data):
        exe = Executable.Load(data)
        view = memoryview(data)
        for index, section in enumerate(exe.sections()):
            # Synthetic sections are only emitted when explicitly requested.
            if section.synthetic and not self.args.synthetic:
                continue
            physical = section.physical
            lower = physical.lower
            upper = physical.upper
            vaddr = section.virtual.lower
            vsize = len(section.virtual)
            # The file offset is always attached; the virtual address and size only on request.
            info = {'offset': lower}
            if self.args.meta:
                for key, value in (('vaddr', vaddr), ('vsize', vsize)):
                    if value is not None:
                        info[key] = value
            label = section.name
            if not label:
                # Unnamed sections receive a name derived from their virtual address.
                addr = F'{vaddr:0{exe.pointer_size // 4}X}'
                self.log_warn(F'section {index} had no name, synthesizing name from virtual address 0x{addr}')
                label = F'.{addr}'
            yield UnpackResult(label, view[lower:upper], **info)

class vsnip (*addresses, ascii=False, utf16=False, until=b'', base=None)

This unit is implemented in refinery.units.formats.exe.vsnip and has the following commandline Interface:

usage: vsnip [-h] [-L] [-Q] [-0] [-v] [-a | -u | -t B] [-b ADDR] [start:count:align ...]

Extract data from PE, ELF, and MachO files based on virtual offsets.

positional arguments:
  start:count:align  Use Python slice syntax to describe an area of virtual memory to read. If a
                     chunksize is specified, then the unit will always read a multiple of that
                     number of bytes

options:
  -a, --ascii        Read ASCII strings; equivalent to -th:00
  -u, --utf16        Read UTF16 strings; equivalent to -th:0000 (also sets chunksize to 2)
  -t, --until B      Read until sequence B is read.
  -b, --base ADDR    Optionally specify a custom base address B.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class vsnip(Unit):
    """
    Extract data from PE, ELF, and MachO files based on virtual offsets.
    """

    def __init__(
        self, *addresses: Param[slice, Arg.Bounds(metavar='start:count:align', help=(
            'Use Python slice syntax to describe an area of virtual memory to read. If a chunksize is '
            'specified, then the unit will always read a multiple of that number of bytes'))],
        ascii: Param[bool, Arg.Switch('-a', group='END',
            help='Read ASCII strings; equivalent to -th:00')] = False,
        utf16: Param[bool, Arg.Switch('-u', group='END',
            help='Read UTF16 strings; equivalent to -th:0000 (also sets chunksize to 2)')] = False,
        until: Param[buf, Arg.Binary('-t', group='END',
            help='Read until sequence {varname} is read.')] = B'',
        base: Param[int | None, Arg.Number('-b', metavar='ADDR',
            help='Optionally specify a custom base address B.')] = None,
    ):
        # The three ways of specifying a terminator are mutually exclusive.
        chosen = [option for option in (until, utf16, ascii) if option]
        if len(chosen) > 1:
            raise ValueError('Only one of utf16, ascii, and until can be specified.')
        super().__init__(addresses=addresses, utf16=utf16, ascii=ascii, until=until, base=base)

    # For each requested area, yields the file data that starts at the physical offset of the
    # area's virtual start address; each output chunk is labelled with that offset.
    def process(self, data: bytearray):
        args = self.args
        terminator = args.until
        regions = args.addresses
        if args.ascii:
            terminator = B'\0'
        if args.utf16:
            terminator = B'\0\0'
            # UTF-16 strings are read with an alignment of two bytes.
            regions = (slice(r.start, r.stop, 2) for r in regions)

        exe = Executable.Load(data, args.base)

        for region in regions:
            area = MemoryArea(region)
            location = exe.location_from_address(area.start)
            offset = location.physical.position
            limit = location.physical.box.upper
            # Without a terminator, the read extends to the upper bound of the physical range.
            end = limit
            if terminator:
                align = area.align
                end = data.find(terminator, offset)
                while True:
                    # The terminator has to be found between the start offset and the limit.
                    if not offset <= end < limit:
                        raise EndOfStringNotFound
                    # Accept only terminators at an aligned distance from the start offset.
                    if (end - offset) % align == 0:
                        break
                    end = data.find(terminator, end + 1)

            count = area.count
            if count:
                end = min(end, offset + count)

            yield self.labelled(data[offset:end], offset=offset)

class vstack (*address, base=None, arch=Arch.X32, engine=_engine.unicorn, se=False, ic=False, uc=False, registers=False, timeout=0, patch_range=slice(5, None, None), write_range=slice(1, None, None), wait=20, wait_calls=False, skip_calls=0, stack_size=65536, stack_push=None, show_apis=False, show_code=False, show_memory=False, block_size=4096, max_visits=65536, log_writes_in_calls=False, log_stack_addresses=False, log_other_addresses=False, log_zero_overwrites=False, log_stack_cookies=False)

This unit is implemented in refinery.units.formats.exe.vstack and has the following commandline Interface:

usage: vstack [-h] [-L] [-Q] [-0] [-v] [-b Addr] [-a Arch] [-e E | --se | --ic | --uc] [-r]
              [-t N] [-p MIN:MAX] [-n MIN:MAX] [-w N] [-c | -C] [-S N] [-u REG] [-A] [-I] [-M]
              [-B N] [-V N] [-W] [-X] [-Y] [-Z] [-E]
              [a[:end|::size] ...]

The unit emulates instructions at a given address in the input executable (PE/ELF/MachO) and
extracts data patches that are written to memory during emulation. The unit can also be used to
emulate shellcode blobs, in which case it defaults to emulating 32bit x86 instructions.

Emulation is halted as soon as a certain number of instructions have not performed any memory
writes, or when an error occurs. By default, most registers are set to the current location in
the emulated stack. If you want to initialize some of them differently, the -r switch makes the
unit initialize register values from meta variables:

    emit shellcode [| put eax 0x2000 | vstack -r ]

In this pipeline, the eax register is set to 0x2000 before emulation begins.

positional arguments:
  a[:end|::size]             Specify a symbol name or the (virtual) addresses of what to emulate;
                             optionally specify a stop address or a length.

options:
  -b, --base Addr            Optionally specify a custom base address B.
  -a, --arch Arch            Specify for blob inputs: x32, x64, arm32, arm64, mips16, mips32,
                             mips64, ppc32, ppc64, sparc32, sparc64
  -e, --engine E             The emulator engine. The default is unicorn, options are: speakeasy,
                             icicle, unicorn
      --se                   Equivalent to --engine=speakeasy
      --ic                   Equivalent to --engine=icicle
      --uc                   Equivalent to --engine=unicorn
  -r, --registers            Consume register initialization values from the chunk's metadata. If
                             the value is a byte string, the data will be mapped.
  -t, --timeout N            Optionally stop emulating after a given number of instructions.
  -p, --patch-range MIN:MAX  Extract only patches that are in the given range, default is 5:.
  -n, --write-range MIN:MAX  Log only writes whose size is in the given range, default is 1:.
  -w, --wait N               When this many instructions did not write to memory, emulation is
                             halted. The default is 20.
  -c, --wait-calls           Wait indefinitely when inside a function call.
  -C, --skip-calls           Skip function calls entirely. Use twice to treat each call as
                             allocating memory.
  -S, --stack-size N         Optionally specify the stack size. The default is 0x10000.
  -u, --stack-push REG       Push the value of a register to the stack before beginning
                             emulation; implies -r.
  -A, --show-apis            Show API calls in the debug log.
  -I, --show-code            Show all executed instructions in the debug log.
  -M, --show-memory          Show all memory writes in the debug log.
  -B, --block-size N         Standard memory block size for the emulator, 0x1000 by default.
  -V, --max-visits N         Maximum number of times a code address is visited. Default is 65536.
  -W, --log-writes-in-calls  Log writes of values that occur in function calls.
  -X, --log-stack-addresses  Log writes of values that are stack addresses.
  -Y, --log-other-addresses  Log writes of values that are addresses to mapped segments.
  -Z, --log-zero-overwrites  Log writes of zeros to memory that contained nonzero values.
  -E, --log-stack-cookies    Log writes that look like stack cookies.

generic options:
  -h, --help                 Show this help message and exit.
  -L, --lenient              Increase the leniency, allowing partial results and ignoring more
                             errors.
  -Q, --quiet                Disables all log output.
  -0, --devnull              Do not produce any output.
  -v, --verbose              Specify up to two times to increase log level.

Expand source code Browse git

class vstack(Unit):
    """
    The unit emulates instructions at a given address in the input executable (PE/ELF/MachO) and
    extracts data patches that are written to memory during emulation. The unit can also be used
    to emulate shellcode blobs, in which case it defaults to emulating 32bit x86 instructions.

    Emulation is halted as soon as a certain number of instructions have not performed any memory
    writes, or when an error occurs. By default, most registers are set to the current location in
    the emulated stack. If you want to initialize some of them differently, the `-r` switch makes
    the unit initialize register values from meta variables:

        emit shellcode [| put eax 0x2000 | vstack -r ]

    In this pipeline, the eax register is set to `0x2000` before emulation begins.
    """
    def __init__(
        self,
        *address: Param[str, Arg.String(metavar='a[:end|::size]',
            help='Specify a symbol name or the (virtual) addresses of what to emulate; optionally specify a stop address or a length.')],
        base: Param[int | None, Arg.Number('-b', metavar='Addr', help='Optionally specify a custom base address B.')] = None,
        arch: Param[str | Arch, Arg.Option('-a', metavar='Arch', help='Specify for blob inputs: {choices}', choices=Arch)] = Arch.X32,
        engine: Param[str | _engine, Arg.Option('-e', group='EMU', choices=_engine, metavar='E',
            help='The emulator engine. The default is {default}, options are: {choices}')] = _engine.unicorn,
        se: Param[bool, Arg.Switch(group='EMU', help='Equivalent to --engine=speakeasy')] = False,
        ic: Param[bool, Arg.Switch(group='EMU', help='Equivalent to --engine=icicle')] = False,
        uc: Param[bool, Arg.Switch(group='EMU', help='Equivalent to --engine=unicorn')] = False,
        registers: Param[bool, Arg.Switch('-r', help=(
            'Consume register initialization values from the chunk\'s metadata. If the value is a byte string, '
            'the data will be mapped.'))] = False,
        timeout: Param[int, Arg.Number('-t', help='Optionally stop emulating after a given number of instructions.')] = 0,
        patch_range: Param[slice, Arg.Bounds('-p', metavar='MIN:MAX',
            help='Extract only patches that are in the given range, default is {default}.')] = slice(5, None),
        write_range: Param[slice, Arg.Bounds('-n', metavar='MIN:MAX',
            help='Log only writes whose size is in the given range, default is {default}.')] = slice(1, None),
        wait: Param[int, Arg.Number('-w', help=(
            'When this many instructions did not write to memory, emulation is halted. The default is {default}.'))] = 20,
        wait_calls: Param[bool, Arg.Switch('-c', group='CALL',
            help='Wait indefinitely when inside a function call.')] = False,
        skip_calls: Param[int, Arg.Counts('-C', group='CALL',
            help='Skip function calls entirely. Use twice to treat each call as allocating memory.')] = 0,
        stack_size: Param[int, Arg.Number('-S', help='Optionally specify the stack size. The default is 0x{default:X}.')] = 0x10000,
        stack_push: Param[tuple[str] | None, Arg('-u', action='append', metavar='REG',
            help='Push the value of a register to the stack before beginning emulation; implies -r.')] = None,
        show_apis: Param[bool, Arg.Switch('-A', help='Show API calls in the debug log.')] = False,
        show_code: Param[bool, Arg.Switch('-I', help='Show all executed instructions in the debug log.')] = False,
        show_memory: Param[bool, Arg.Switch('-M', help='Show all memory writes in the debug log.')] = False,
        block_size: Param[int, Arg.Number('-B', help='Standard memory block size for the emulator, 0x{default:X} by default.')] = 0x1000,
        max_visits: Param[int, Arg.Number('-V', help='Maximum number of times a code address is visited. Default is {default}.')] = 0x10000,
        log_writes_in_calls: Param[bool, Arg.Switch('-W', help='Log writes of values that occur in function calls.')] = False,
        log_stack_addresses: Param[bool, Arg.Switch('-X', help='Log writes of values that are stack addresses.')] = False,
        log_other_addresses: Param[bool, Arg.Switch('-Y', help='Log writes of values that are addresses to mapped segments.')] = False,
        log_zero_overwrites: Param[bool, Arg.Switch('-Z', help='Log writes of zeros to memory that contained nonzero values.')] = False,
        log_stack_cookies: Param[bool, Arg.Switch('-E', help='Log writes that look like stack cookies.')] = False,
    ):
        # The shorthand switches override the --engine option; at most one of them may be given.
        if sum((se, uc, ic)) > 1:
            raise ValueError('Too many emulators selected.')
        elif se:
            engine = _engine.speakeasy
        elif ic:
            engine = _engine.icicle
        elif uc:
            engine = _engine.unicorn

        super().__init__(
            address=address,
            base=base,
            arch=Arg.AsOption(arch, Arch),
            engine=Arg.AsOption(engine, _engine),
            registers=registers,
            timeout=timeout,
            patch_range=patch_range,
            write_range=write_range,
            wait=wait,
            stack_size=stack_size,
            stack_push=stack_push,
            wait_calls=wait_calls,
            skip_calls=skip_calls,
            block_size=block_size,
            max_visits=max_visits,
            show_apis=show_apis,
            show_code=show_code,
            show_memory=show_memory,
            log_writes_in_calls=log_writes_in_calls,
            log_stack_addresses=log_stack_addresses,
            log_other_addresses=log_other_addresses,
            log_zero_overwrites=log_zero_overwrites,
            log_stack_cookies=log_stack_cookies
        )

    # Emulates the code at each requested address and yields the data recorded by the emulation
    # state: first the synthesized items, then the memory patches that pass the patch range filter
    # and contain at least one nonzero byte.
    def process(self, data: Chunk):
        meta = metavars(data)
        args = self.args

        engine: _engine = args.engine
        flags = Hook.Default | Hook.ApiCall
        self.log_debug(F'attempting to use {engine.name}')

        Emu = EmuFactory(engine.value)

        emu = Emu(
            data,
            args.base,
            args.arch,
            flags,
            args.block_size,
            args.stack_size,
        )

        cfg = EmuConfig(
            args.wait_calls,
            args.skip_calls,
            args.write_range,
            args.wait,
            args.block_size,
            args.stack_size,
            args.max_visits,
            args.log_stack_cookies,
            args.log_writes_in_calls,
            args.log_stack_addresses,
            args.log_other_addresses,
            args.log_zero_overwrites,
            args.show_apis,
            args.show_code,
            args.show_memory,
        )

        register_values: dict[Register, int] = {}
        emu.reset(None)

        if args.registers or args.stack_push:
            # Every meta variable whose name is a register of the emulated architecture is
            # consumed as an initial register value and removed from the metadata.
            for var, value in list(meta.items()):
                try:
                    register = emu.lookup_register(var)
                except LookupError:
                    continue
                meta.discard(var)
                register_values[register] = value

        def parse_address(a: str):
            # Convert one command-line address specification to a slice whose start is the
            # address where emulation begins and whose stop is the (optional) end address.
            try:
                sliced = sliceobj(a, data, intok=True)
            except Exception:
                # Not a slice expression: try the common hexadecimal notations first.
                if m := re.fullmatch('(?i)(?:sub_|fun_|0x)?([A-F0-9]+)H?', a):
                    return slice(int(m[1], 16), None)
                # Otherwise, treat the argument as a symbol name. The candidate list is narrowed
                # by increasingly strict criteria until exactly one symbol remains.
                symbols = list(emu.exe.symbols())
                for criterion in [
                    lambda s: s.get_name().casefold() == a.casefold(),
                    lambda s: s.get_name() == a,
                    lambda s: s.function,
                    lambda s: s.exported,
                ]:
                    symbols = [s for s in symbols if criterion(s)]
                    if len(symbols) == 1:
                        return slice(symbols[0].address, None)
                if len(symbols) > 1:
                    raise RuntimeError(F'there are {len(symbols)} exported function symbols named "{a}", please specify the address')
                else:
                    raise LookupError(F'no symbol with name "{a}" was found')
            else:
                if isinstance(sliced, int):
                    sliced = slice(sliced, None)
                elif sliced.step and sliced.step != 1:
                    # The notation a::size specifies a length; it cannot be combined with an
                    # explicit end address.
                    if sliced.stop is not None:
                        raise RuntimeError(F'invalid emulation range: {a}')
                    sliced = slice(sliced.start, sliced.start + sliced.step, None)
                return sliced

        addresses = [parse_address(a) for a in args.address]

        if not addresses:
            # Without an explicit address, emulation starts at the first symbol that has no name.
            for symbol in emu.exe.symbols():
                if symbol.name is None:
                    addresses.append(slice(symbol.address, None))
                    break

        for cursor in addresses:
            state = EmuState(cfg, cursor.start, emu.exe.pointer_size // 4, stop=cursor.stop)
            emu.reset(state)
            # Push a pointer-sized value with all bits set onto the stack before emulation begins.
            emu.push((1 << emu.exe.pointer_size) - 1)

            # Determine which general purpose registers are not set by the user: all of them are
            # zeroed, the user-specified registers are then marked with a nonzero value, and the
            # general purpose registers that still read zero are recorded in the state.
            for reg in emu.general_purpose_registers():
                emu.set_register(reg, 0)

            for reg in register_values:
                # check if we are tainting a general purpose register
                emu.set_register(reg, 1)

            for reg in emu.general_purpose_registers():
                if emu.get_register(reg) == 0:
                    state.init_registers.append(reg)

            for reg, value in register_values.items():
                if isinstance(value, int):
                    self.log_info(F'setting {reg.name} to integer value 0x{value:X}')
                    emu.set_register(reg, value)
                    continue
                if isinstance(value, str):
                    value = value.encode()
                if isbuffer(value):
                    # Buffers are written to freshly allocated emulator memory and the register
                    # receives the address of that memory.
                    start = emu.malloc(len(value))
                    emu.mem_write(start, bytes(value))
                    emu.set_register(reg, start)
                    self.log_info(F'setting {reg.name} to mapped buffer of size 0x{len(value):X}')
                    continue
                _tn = value.__class__.__name__
                self.log_warn(F'cannot interpret value of type {_tn} for register {reg.name}')

            if push := args.stack_push:
                for reg in push:
                    emu.push_register(reg)

            timeout = args.timeout
            if timeout:
                self.log_info(F'setting timeout of {timeout} steps')
                state.ticks = timeout

            try:
                emu.emulate(
                    emu.base_exe_to_emu(cursor.start),
                    emu.base_exe_to_emu(cursor.stop),
                )
            except EmulationError:
                # An emulation error ends the run; whatever was recorded so far is still emitted.
                pass

            for patch, api in state.synthesized.items():
                chunk = self.labelled(patch, src=api)
                yield chunk

            valid = bounds[args.patch_range]
            for base, patch in state.memory:
                # Skip patches whose size is outside the requested range and patches that consist
                # of zero bytes only.
                if len(patch) not in valid or not any(patch):
                    continue
                self.log_info(F'memory patch at {state.fmt(base)} of size {len(patch)}')
                chunk = self.labelled(patch, src=base)
                yield chunk

class winreg (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.winreg and has the following commandline Interface:

usage: winreg [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path ...]

Extract values from a Windows registry hive or from a registry export (.reg file).

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class winreg(PathExtractorUnit):
    """
    Extract values from a Windows registry hive or from a registry export (.reg file).
    """
    @PathExtractorUnit.Requires('python-registry', ['formats'])
    def _registry():
        # The submodules are imported explicitly because the code below accesses them as
        # attributes of the returned package object.
        import Registry
        import Registry.Registry
        import Registry.RegistryParse
        return Registry

    @staticmethod
    def _walk(patterns: list[PathPattern], key: RegistryKey, *path: str):
        """
        Recursively yield one `UnpackResult` for every value stored at or below `key`. The
        variadic `path` holds the names of all keys from the root down to `key`; a subtree is
        skipped entirely when none of the given patterns can reach its path.
        """
        here = '/'.join(path)
        if not any(p.reach(here) for p in patterns):
            winreg.log_debug(F'pruning search at {here}')
            return
        for value in key.values():
            # The default argument binds the current value to the callback; a plain closure
            # would observe whatever the loop variable holds when it is eventually called.
            def raw(v: RegistryValue = value):
                return v.raw_data()
            vpath = here
            vname = value.name()
            # A value named '(default)' is reported under the path of the key itself.
            if vname != '(default)':
                vpath = F'{vpath}/{vname}'
            yield UnpackResult(vpath, raw)
        for subkey in key.subkeys():
            yield from winreg._walk(patterns, subkey, *path, subkey.name())

    def _unpack_hive(self, data: bytearray):
        """
        Parse the input as a binary registry hive and yield all matching values. Parser
        failures of the registry library are translated to `ParseException`.
        """
        try:
            with MemoryFile(data) as stream:
                root = self._registry.Registry.Registry(stream).root()
                yield from self._walk(self._patterns, root, root.name())
        except self._registry.RegistryParse.ParseException:
            raise ParseException

    def _decode_registry_export(self, data: str):
        """
        Decode the right-hand side of a single value assignment from a .reg file. The text
        before the first colon selects the decoder; anything without a known type prefix is
        decoded as a quoted string. This is a generator: most types yield a single item,
        REG_MULTI_SZ yields one item per string, and types without a decoder yield nothing
        after logging a warning.
        """
        def REG_BINARY(data: str) -> bytes:
            # Everything that is not a hex digit (commas, line continuations, whitespace) is
            # discarded. Both cases are accepted: an uppercase digit must not be mistaken for
            # a separator, as that would silently corrupt the decoded bytes.
            return bytes.fromhex(re.sub('[^a-fA-F0-9]+', '', data))

        def REG_SZ(data: str) -> bytes:
            return data.encode(self.codec) | esc(quoted=True) | bytes

        def REG_EXPAND_SZ(data: str):
            return REG_BINARY(data).decode('UTF-16LE').rstrip('\0').encode(self.codec)

        def REG_MULTI_SZ(data: str):
            # The strings are separated by null characters; empty strings are skipped.
            for string in REG_BINARY(data).decode('UTF-16LE').split('\0'):
                if string:
                    yield string.encode(self.codec)

        def REG_DWORD(data: str):
            value = int(data, 16)
            return F'0x{value:X}'.encode(self.codec)

        def REG_QWORD(data: str):
            value = int.from_bytes(REG_BINARY(data), 'little')
            return F'0x{value:X}'.encode(self.codec)

        class Missing:
            # Placeholder for registry types that are recognized but have no decoder.
            def __init__(self, name: str): self.name = name
            def __str__(self): return self.name

        REG_NONE = REG_EXPAND_SZ
        REG_DWORD_BIG_ENDIAN = Missing('REG_DWORD_BIG_ENDIAN')
        REG_LINK = Missing('REG_LINK')
        REG_RESOURCE_LIST = Missing('REG_RESOURCE_LIST')
        REG_FULL_RESOURCE_DESCRIPTOR = Missing('REG_FULL_RESOURCE_DESCRIPTOR')
        REG_RESOURCE_REQUIREMENTS_LIST = Missing('REG_RESOURCE_REQUIREMENTS_LIST')

        prefix, _, encoded = data.partition(':')

        try:
            decoder = {
                'hex(0)' : REG_NONE,
                'hex(1)' : REG_SZ,
                'hex(2)' : REG_EXPAND_SZ,
                'hex(3)' : REG_BINARY,
                'hex'    : REG_BINARY,
                'hex(4)' : REG_DWORD,
                'dword'  : REG_DWORD,
                'hex(5)' : REG_DWORD_BIG_ENDIAN,
                'hex(6)' : REG_LINK,
                'hex(7)' : REG_MULTI_SZ,
                'hex(8)' : REG_RESOURCE_LIST,
                'hex(9)' : REG_FULL_RESOURCE_DESCRIPTOR,
                'hex(a)' : REG_RESOURCE_REQUIREMENTS_LIST,
                'hex(b)' : REG_QWORD,
            }[prefix]
        except KeyError:
            # No known type prefix: the entire text, including any colon, is string data.
            decoder = REG_SZ
            encoded = data

        if isinstance(decoder, Missing):
            self.log_warn(F'Found registry type {decoder!s}; no decoder implemented.')
            return
        self.log_debug(F'decoding as {decoder.__name__}: {encoded}')
        it = decoder(encoded)
        # Only REG_MULTI_SZ is a generator; every other result is wrapped for uniformity.
        if not inspect.isgenerator(it):
            it = (it,)
        yield from it

    def _unpack_file(self, data: bytearray):
        """
        Parse the input as a registry export (.reg file) and yield one `UnpackResult` for each
        decoded value. Raises `ParseException` when the input does not decode to text whose
        first line starts with the registry editor signature.
        """
        for codec in ('utf16', 'utf-16le', 'utf8'):
            try:
                reg = data.decode(codec).splitlines(keepends=True)
            except UnicodeError:
                continue
            lines = iter(reg)
            # Input without any lines counts as a mismatch. A bare next() would raise
            # StopIteration here, which surfaces as a RuntimeError inside a generator
            # instead of the ParseException that is raised below.
            if next(lines, '').startswith('Windows Registry Editor'):
                break
        else:
            raise ParseException

        def _parse():
            # Parses the text currently buffered in `section`, which holds one key section.
            parser = WinRegFileParser()
            section.seek(0)
            parser.read_file(section)
            for key in parser.sections():
                self.log_debug(key)
                for value in parser[key]:
                    name = next(iter(shlex.split(value)))
                    path = Path(key)
                    # The name '@' denotes the default value, reported under the key path.
                    if name != '@':
                        path = path / Path(name)
                    decoded = list(self._decode_registry_export(parser[key][value]))
                    if len(decoded) == 1:
                        yield UnpackResult(str(path), decoded[0])
                        continue
                    # Any other number of results is emitted with an index suffix.
                    for k, d in enumerate(decoded):
                        yield UnpackResult(F'{path!s}.{k}', d)

        section = io.StringIO()

        for line in lines:
            # A line starting with a bracket opens a new key section: flush the buffered
            # section through the parser before starting to collect the next one.
            if line.lstrip().startswith('['):
                yield from _parse()
                section.seek(0)
                section.truncate(0)
            section.write(line)

        yield from _parse()

    def unpack(self, data):
        # Try the binary hive format first; fall back to the text export format only when
        # the hive parser rejects the input.
        with contextlib.suppress(ParseException):
            yield from self._unpack_hive(data)
            return
        yield from self._unpack_file(data)

    @classmethod
    def handles(cls, data):
        return get_reg_export_type(data) is not None

class wshenc (marker=True)

This unit is implemented in refinery.units.encoding.wshenc and has the following commandline Interface:

usage: wshenc [-h] [-L] [-Q] [-0] [-v] [-R] [-F] [-m]

Windows Scripting Host encoding and decoding of VBScript (VBS/VBE) and JScript (JS/JSE).

options:
  -m, --no-marker  Do not require magic marker when encoding and do not search for marker when
                   decoding.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class wshenc(Unit):
    """
    Windows Scripting Host encoding and decoding of VBScript (VBS/VBE) and JScript (JS/JSE).
    """

    _MARKER_INIT = RB'#@~^BINREF=='
    _MARKER_STOP = RB'BINREF==^#~@'

    # Decoding table: three consecutive entries per encoded byte value, starting at value 9.
    # The entry within each triple is selected through _OFFSETS.
    _CHUNKS = (
        0x57, 0x6E, 0x7B, 0x4A, 0x4C, 0x41, 0x0B, 0x0B, 0x0B, 0x0C, 0x0C, 0x0C, 0x4A, 0x4C, 0x41,
        0x0E, 0x0E, 0x0E, 0x0F, 0x0F, 0x0F, 0x10, 0x10, 0x10, 0x11, 0x11, 0x11, 0x12, 0x12, 0x12,
        0x13, 0x13, 0x13, 0x14, 0x14, 0x14, 0x15, 0x15, 0x15, 0x16, 0x16, 0x16, 0x17, 0x17, 0x17,
        0x18, 0x18, 0x18, 0x19, 0x19, 0x19, 0x1A, 0x1A, 0x1A, 0x1B, 0x1B, 0x1B, 0x1C, 0x1C, 0x1C,
        0x1D, 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F, 0x2E, 0x2D, 0x32, 0x47, 0x75, 0x30,
        0x7A, 0x52, 0x21, 0x56, 0x60, 0x29, 0x42, 0x71, 0x5B, 0x6A, 0x5E, 0x38, 0x2F, 0x49, 0x33,
        0x26, 0x5C, 0x3D, 0x49, 0x62, 0x58, 0x41, 0x7D, 0x3A, 0x34, 0x29, 0x35, 0x32, 0x36, 0x65,
        0x5B, 0x20, 0x39, 0x76, 0x7C, 0x5C, 0x72, 0x7A, 0x56, 0x43, 0x7F, 0x73, 0x38, 0x6B, 0x66,
        0x39, 0x63, 0x4E, 0x70, 0x33, 0x45, 0x45, 0x2B, 0x6B, 0x68, 0x68, 0x62, 0x71, 0x51, 0x59,
        0x4F, 0x66, 0x78, 0x09, 0x76, 0x5E, 0x62, 0x31, 0x7D, 0x44, 0x64, 0x4A, 0x23, 0x54, 0x6D,
        0x75, 0x43, 0x71, 0x4A, 0x4C, 0x41, 0x7E, 0x3A, 0x60, 0x4A, 0x4C, 0x41, 0x5E, 0x7E, 0x53,
        0x40, 0x4C, 0x40, 0x77, 0x45, 0x42, 0x4A, 0x2C, 0x27, 0x61, 0x2A, 0x48, 0x5D, 0x74, 0x72,
        0x22, 0x27, 0x75, 0x4B, 0x37, 0x31, 0x6F, 0x44, 0x37, 0x4E, 0x79, 0x4D, 0x3B, 0x59, 0x52,
        0x4C, 0x2F, 0x22, 0x50, 0x6F, 0x54, 0x67, 0x26, 0x6A, 0x2A, 0x72, 0x47, 0x7D, 0x6A, 0x64,
        0x74, 0x39, 0x2D, 0x54, 0x7B, 0x20, 0x2B, 0x3F, 0x7F, 0x2D, 0x38, 0x2E, 0x2C, 0x77, 0x4C,
        0x30, 0x67, 0x5D, 0x6E, 0x53, 0x7E, 0x6B, 0x47, 0x6C, 0x66, 0x34, 0x6F, 0x35, 0x78, 0x79,
        0x25, 0x5D, 0x74, 0x21, 0x30, 0x43, 0x64, 0x23, 0x26, 0x4D, 0x5A, 0x76, 0x52, 0x5B, 0x25,
        0x63, 0x6C, 0x24, 0x3F, 0x48, 0x2B, 0x7B, 0x55, 0x28, 0x78, 0x70, 0x23, 0x29, 0x69, 0x41,
        0x28, 0x2E, 0x34, 0x73, 0x4C, 0x09, 0x59, 0x21, 0x2A, 0x33, 0x24, 0x44, 0x7F, 0x4E, 0x3F,
        0x6D, 0x50, 0x77, 0x55, 0x09, 0x3B, 0x53, 0x56, 0x55, 0x7C, 0x73, 0x69, 0x3A, 0x35, 0x61,
        0x5F, 0x61, 0x63, 0x65, 0x4B, 0x50, 0x46, 0x58, 0x67, 0x58, 0x3B, 0x51, 0x31, 0x57, 0x49,
        0x69, 0x22, 0x4F, 0x6C, 0x6D, 0x46, 0x5A, 0x4D, 0x68, 0x48, 0x25, 0x7C, 0x27, 0x28, 0x36,
        0x5C, 0x46, 0x70, 0x3D, 0x4A, 0x6E, 0x24, 0x32, 0x7A, 0x79, 0x41, 0x2F, 0x37, 0x3D, 0x5F,
        0x60, 0x5F, 0x4B, 0x51, 0x4F, 0x5A, 0x20, 0x42, 0x2C, 0x36, 0x65, 0x57)
    # Selects one of three substitution alphabets per position; the pattern repeats every
    # 64 positions.
    _OFFSETS = (
        0, 1, 2, 0, 1, 2, 1, 2, 2, 1, 2, 1, 0, 2, 1, 2, 0, 2, 1, 2, 0, 0, 1, 2, 2, 1, 0, 2, 1, 2, 2, 1,
        0, 0, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 2, 0, 2, 1, 0, 2, 1, 2, 0, 0, 1, 2, 2, 0, 0, 1, 2, 0, 2, 1)
    # Encoding table: maps a plain byte to its three possible encodings, indexed by the
    # value from _OFFSETS.
    _ENCODER = {
        0x09 : [0x37, 0x69, 0x64], 0x0B : [0x0B, 0x0B, 0x0B], 0x0C : [0x0C, 0x0C, 0x0C],
        0x0E : [0x0E, 0x0E, 0x0E], 0x0F : [0x0F, 0x0F, 0x0F], 0x10 : [0x10, 0x10, 0x10],
        0x11 : [0x11, 0x11, 0x11], 0x12 : [0x12, 0x12, 0x12], 0x13 : [0x13, 0x13, 0x13],
        0x14 : [0x14, 0x14, 0x14], 0x15 : [0x15, 0x15, 0x15], 0x16 : [0x16, 0x16, 0x16],
        0x17 : [0x17, 0x17, 0x17], 0x18 : [0x18, 0x18, 0x18], 0x19 : [0x19, 0x19, 0x19],
        0x1A : [0x1A, 0x1A, 0x1A], 0x1B : [0x1B, 0x1B, 0x1B], 0x1C : [0x1C, 0x1C, 0x1C],
        0x1D : [0x1D, 0x1D, 0x1D], 0x1E : [0x1E, 0x1E, 0x1E], 0x1F : [0x1F, 0x1F, 0x1F],
        0x20 : [0x7E, 0x2C, 0x50], 0x21 : [0x5A, 0x65, 0x22], 0x22 : [0x45, 0x72, 0x4A],
        0x23 : [0x3A, 0x5B, 0x61], 0x24 : [0x79, 0x66, 0x5E], 0x25 : [0x59, 0x75, 0x5D],
        0x26 : [0x27, 0x4C, 0x5B], 0x27 : [0x76, 0x45, 0x42], 0x28 : [0x63, 0x76, 0x60],
        0x29 : [0x62, 0x2A, 0x23], 0x2A : [0x4D, 0x43, 0x65], 0x2B : [0x51, 0x33, 0x5F],
        0x2C : [0x53, 0x42, 0x7E], 0x2D : [0x52, 0x20, 0x4F], 0x2E : [0x20, 0x63, 0x52],
        0x2F : [0x26, 0x4A, 0x7A], 0x30 : [0x54, 0x5A, 0x21], 0x31 : [0x71, 0x38, 0x46],
        0x32 : [0x2B, 0x79, 0x20], 0x33 : [0x66, 0x32, 0x26], 0x34 : [0x2A, 0x57, 0x63],
        0x35 : [0x58, 0x6C, 0x2A], 0x36 : [0x7F, 0x2B, 0x76], 0x37 : [0x7B, 0x46, 0x47],
        0x38 : [0x30, 0x52, 0x25], 0x39 : [0x31, 0x4F, 0x2C], 0x3A : [0x6C, 0x3D, 0x29],
        0x3B : [0x49, 0x70, 0x69], 0x3D : [0x78, 0x7B, 0x27], 0x3F : [0x5F, 0x51, 0x67],
        0x40 : [0x40, 0x40, 0x40], 0x41 : [0x29, 0x7A, 0x62], 0x42 : [0x24, 0x7E, 0x41], # noqa
        0x43 : [0x2F, 0x3B, 0x5A], 0x44 : [0x39, 0x47, 0x66], 0x45 : [0x33, 0x41, 0x32],
        0x46 : [0x6F, 0x77, 0x73], 0x47 : [0x21, 0x56, 0x4D], 0x48 : [0x75, 0x5F, 0x43],
        0x49 : [0x28, 0x26, 0x71], 0x4A : [0x42, 0x78, 0x39], 0x4B : [0x46, 0x6E, 0x7C],
        0x4C : [0x4A, 0x64, 0x53], 0x4D : [0x5C, 0x74, 0x48], 0x4E : [0x48, 0x67, 0x31],
        0x4F : [0x36, 0x7D, 0x72], 0x50 : [0x4B, 0x68, 0x6E], 0x51 : [0x7D, 0x35, 0x70],
        0x52 : [0x5D, 0x22, 0x49], 0x53 : [0x6A, 0x55, 0x3F], 0x54 : [0x50, 0x3A, 0x4B],
        0x55 : [0x69, 0x60, 0x6A], 0x56 : [0x23, 0x6A, 0x2E], 0x57 : [0x09, 0x71, 0x7F],
        0x58 : [0x70, 0x6F, 0x28], 0x59 : [0x65, 0x49, 0x35], 0x5A : [0x74, 0x5C, 0x7D],
        0x5B : [0x2C, 0x5D, 0x24], 0x5C : [0x77, 0x27, 0x2D], 0x5D : [0x44, 0x59, 0x54],
        0x5E : [0x3F, 0x25, 0x37], 0x5F : [0x6D, 0x7C, 0x7B], 0x60 : [0x7C, 0x23, 0x3D],
        0x61 : [0x43, 0x6D, 0x6C], 0x62 : [0x38, 0x28, 0x34], 0x63 : [0x5E, 0x31, 0x6D],
        0x64 : [0x5B, 0x39, 0x4E], 0x65 : [0x6E, 0x7F, 0x2B], 0x66 : [0x57, 0x36, 0x30],
        0x67 : [0x4C, 0x54, 0x6F], 0x68 : [0x34, 0x34, 0x74], 0x69 : [0x72, 0x62, 0x6B],
        0x6A : [0x25, 0x4E, 0x4C], 0x6B : [0x56, 0x30, 0x33], 0x6C : [0x73, 0x5E, 0x56],
        0x6D : [0x68, 0x73, 0x3A], 0x6E : [0x55, 0x09, 0x78], 0x6F : [0x47, 0x4B, 0x57],
        0x70 : [0x32, 0x61, 0x77], 0x71 : [0x35, 0x24, 0x3B], 0x72 : [0x2E, 0x4D, 0x44],
        0x73 : [0x64, 0x6B, 0x2F], 0x74 : [0x4F, 0x44, 0x59], 0x75 : [0x3B, 0x21, 0x45],
        0x76 : [0x2D, 0x37, 0x5C], 0x77 : [0x41, 0x53, 0x68], 0x78 : [0x61, 0x58, 0x36],
        0x79 : [0x7A, 0x48, 0x58], 0x7A : [0x22, 0x2E, 0x79], 0x7B : [0x60, 0x50, 0x09],
        0x7C : [0x6B, 0x2D, 0x75], 0x7D : [0x4E, 0x29, 0x38], 0x7E : [0x3D, 0x3F, 0x55],
        0x7F : [0x67, 0x2F, 0x51]
    }

    # Bytes that are replaced by a two-byte escape sequence after encoding.
    _ESCAPE = {
        0x40: B'@$',
        0x3C: B'@!',
        0x3E: B'@*',
        0x0D: B'@#',
        0x0A: B'@&',
    }

    # Inverse of _ESCAPE, keyed by the escape sequence.
    _UNESCAPE = {
        B'@$': B'@',
        B'@!': B'<',
        B'@*': B'>',
        B'@#': B'\r',
        B'@&': B'\n',
    }

    def __init__(
        self,
        marker: Param[bool, Arg.Switch('-m', '--no-marker', off=True, help=(
            'Do not require magic marker when encoding and do not search for '
            'marker when decoding.')
        )] = True
    ):
        super().__init__(marker=marker)

    @classmethod
    def _chunk(cls, byte, index):
        # Look up the decoded value of an encoded byte at the given position counter.
        k = byte - 9
        c = cls._CHUNKS[k * 3 : k * 3 + 3]
        return c[cls._OFFSETS[index % 64]]

    def _escape(self, iterable):
        # Replace special bytes by their escape sequences and, unless the marker has been
        # disabled, wrap the result in the start and stop markers.
        if self.args.marker:
            yield from self._MARKER_INIT
        for byte in iterable:
            try:
                yield from self._ESCAPE[byte]
            except KeyError:
                yield byte
        if self.args.marker:
            yield from self._MARKER_STOP

    def _unescape(self, data):
        def unescaper(m): return self._UNESCAPE[m[0]]
        return re.sub(RB'@[$!*#&]', unescaper, data)

    @classmethod
    def _decoded(cls, data):
        # The position counter only advances for bytes below 128; all other bytes pass
        # through unchanged and do not consume a slot of the offset pattern.
        index = -1
        for byte in data:
            if byte < 128:
                index += 1
            if byte == 9 or byte in range(32, 128) and byte not in (60, 62, 64):
                byte = cls._chunk(byte, index)
            yield byte

    @classmethod
    def _encoded(cls, data):
        # The position counter must advance exactly as in _decoded, i.e. only for bytes
        # below 128. Counting every byte would shift the offset pattern after the first
        # byte >= 128 and produce output that this unit can no longer decode. For input
        # without such bytes, the counter equals the plain byte position.
        index = -1
        for byte in data:
            if byte < 128:
                index += 1
            try:
                sequence = cls._ENCODER[byte]
            except KeyError:
                yield byte
            else:
                offset = cls._OFFSETS[index % 0x40]
                yield sequence[offset]

    def reverse(self, data):
        return bytearray(self._escape(self._encoded(data)))

    def process(self, data):
        if self.args.marker:
            match = formats.wshenc.search(data)
            if not match:
                raise ValueError('Encoded script marker was not found.')
            # Strip the 12-byte start and stop markers from the matched data.
            data = match[0][12:-12]
        return bytearray(self._decoded(self._unescape(data)))

    @classmethod
    def handles(cls, data):
        return is_likely_vbe(data)

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Encode the input, escape the encoded bytes, and return the result as a bytearray.
    """
    encoded = self._encoded(data)
    escaped = self._escape(encoded)
    return bytearray(escaped)

class xchacha (key, stateful=False, discard=0, nonce=b'REFINERY', magic=b'', offset=0, rounds=20)

This unit is implemented in refinery.units.crypto.cipher.chacha and has the following commandline Interface:

usage: xchacha [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-d N] [-m MAGIC] [-x N] [-r N] key [nonce]

XChaCha encryption and decryption. The nonce must be 24 bytes long.

positional arguments:
  key                The encryption key.
  nonce              The nonce. Default is the string REFINERY.

options:
  -s, --stateful     Do not reset the key stream while processing the chunks of one frame.
  -d, --discard N    Discard the first N bytes of the keystream, 0 by default.
  -m, --magic MAGIC  The magic constant; depends on the key size by default.
  -x, --offset N     Optionally specify the stream index, default is 0.
  -r, --rounds N     The number of rounds. Has to be an even number. Default is 20.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.
  -R, --reverse      Use the reverse operation.

Expand source code Browse git

class xchacha(LatinCipherUnit):
    """
    XChaCha encryption and decryption. The nonce must be 24 bytes long.
    """
    def keystream(self) -> Iterable[int]:
        # The nonce is split into a little-endian 64-bit integer followed by two 8-byte
        # strings; struct.unpack raises struct.error for any nonce that is not 24 bytes.
        args = self.args
        head, middle, tail = struct.unpack('<Q8s8s', args.nonce)
        # Note the argument order: the middle part is passed before the leading integer.
        cipher = LatinX(
            ChaChaCipher,
            (0, 1, 2, 3, 12, 13, 14, 15),
            args.key,
            middle,
            head,
            tail,
            args.magic,
            args.rounds,
            args.offset,
        )
        yield from cipher

class xfcc (variable='count', relative=False)

This unit is implemented in refinery.units.meta.xfcc and has the following commandline Interface:

usage: xfcc [-h] [-L] [-Q] [-0] [-v] [-r] [variable]

The cross frame chunk count unit! It computes the number of times a chunk occurs across several
frames of input. It consumes all frames at its current level of the frame tree and counts the
number of times each item occurs in each of them. It converts a frame tree of depth 2 into a new
frame tree of depth 2 where the parent of every leaf has this leaf as its only child. The leaves
of this tree have been enriched with a meta variable containing the number of times the
corresponding chunk has occurred in the input frame tree. The variable that stores this
information is scoped at the first layer of this subtree, which means that a frame can be closed
once after invocation of xfcc and the variable remains accessible. This unit can be used to
compute set intersections across frames as follows:

    (1) [| (2) [| dedup | xfcc -r t ]| iff t==1 | (3) ]

A sequence of chunks is emitted at (1), each of which has chunks extracted at (2). It is then
important to use dedup before calling xfcc, since xfcc performs an absolute count. The frame at
(3) contains the intersection of all datasets that were extracted at (2).

positional arguments:
  variable        The variable which is used as the accumulator

options:
  -r, --relative  Normalize the accumulator to a number between 0 and 1.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class xfcc(Unit):
    """
    The cross frame chunk count unit! It computes the number of times a chunk occurs across several frames
    of input. It consumes all frames at its current level of the frame tree and counts the number of times
    each item occurs in each of them. It converts a frame tree of depth 2 into a new frame tree of depth 2
    where the parent of every leaf has this leaf as its only child. The leaves of this tree have been
    enriched with a meta variable containing the number of times the corresponding chunk has occurred in
    the input frame tree. The variable that stores this information is scoped at the first layer of this
    subtree, which means that a frame can be closed once after invocation of xfcc and the variable remains
    accessible. This unit can be used to compute set intersections across frames as follows:

        (1) [| (2) [| dedup | xfcc -r t ]| iff t==1 | (3) ]

    A sequence of chunks is emitted at (1), each of which has chunks extracted at (2). It is then important
    to use dedup before calling xfcc, since xfcc performs an absolute count. The frame at (3) contains the
    intersection of all datasets that were extracted at (2).
    """
    def __init__(
        self,
        variable: Param[str, Arg(help='The variable which is used as the accumulator')] = 'count',
        relative: Param[bool, Arg.Switch('-r', help='Normalize the accumulator to a number between 0 and 1.')] = False
    ):
        super().__init__(variable=variable, relative=relative)
        # Accumulated count per chunk, and the path prefix these counts belong to.
        self._store: dict[Chunk, int] = defaultdict(int)
        self._trunk = None

    def finish(self):
        # Emit every counted chunk once with the accumulator variable attached, then reset.
        name = self.args.variable
        relative = self.args.relative
        tally = self._store
        # The maximum is only needed, and only well-defined, for a non-empty store.
        peak = max(tally.values()) if relative and tally else None
        for position, (chunk, total) in enumerate(tally.items()):
            value = total / peak if relative else total
            chunk.path[-2] = 0
            chunk.path[-1] = position
            chunk.meta[name] = value
            chunk.meta.set_scope(name, chunk.scope - 1)
            yield chunk
        tally.clear()

    def _getcount(self, chunk):
        # A chunk without a usable accumulator variable counts as a single occurrence.
        try:
            return int(chunk.meta[self.args.variable])
        except (AttributeError, KeyError, TypeError):
            return 1

    def filter(self, chunks: Iterable[Chunk]):
        stream = iter(chunks)
        first = next(stream, None)
        if first is None:
            return
        depth = len(first.path)
        if depth < 2:
            # Not nested deeply enough to count across frames: pass everything through.
            self.log_warn(F'the current frame is nested {len(first.path)} layers deep, at least two layers are required.')
            yield first
            yield from stream
            return
        prefix = first.path[:-2]
        if prefix != self._trunk:
            # The path prefix changed: flush what was counted for the previous one.
            yield from self.finish()
            self._trunk = prefix
        tally = self._store
        tally[first] += self._getcount(first)
        for chunk in stream:
            tally[chunk] += self._getcount(chunk)

class xj0 (fmt='', all=False, one=False, raw=False)

This unit is implemented in refinery.units.formats.json and has the following commandline Interface:

usage: xj0 [-h] [-L] [-Q] [-0] [-v] [-a | -x] [-r] [fmt]

Extracts a single field from a JSON document at depth 0. By default, the unit applies a heuristic
to extract remaining fields as metadata: String values are extracted only if they do not exceed
80 characters in length and do not contain any line breaks. Floating-point, integer, boolean
values, and lists of the latter are also extracted.

positional arguments:
  fmt            Format expression for the output chunk; may use previously extracted JSON items
                 as format expressions. By default, the input data is returned.

options:
  -a, --all      Extract all other fields as metadata regardless of length and type.
  -x, --one      Do not extract any other fields as metadata.
  -r, --raw      Disable conversion of JSON strings to binary strings in metadata

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class xj0(Unit):
    """
    Extracts a single field from a JSON document at depth 0. By default, the unit applies a
    heuristic to extract remaining fields as metadata: String values are extracted only if
    they do not exceed 80 characters in length and do not contain any line breaks.
    Floating-point, integer, boolean values, and lists of the latter are also extracted.
    """
    def __init__(
        self,
        fmt: Param[str, Arg.String(help=(
            'Format expression for the output chunk; may use previously extracted JSON items '
            'as format expressions. By default, the input data is returned.'))] = '',
        all: Param[bool, Arg.Switch('-a', group='META',
            help='Extract all other fields as metadata regardless of length and type.')] = False,
        one: Param[bool, Arg.Switch('-x', group='META',
            help='Do not extract any other fields as metadata.')] = False,
        raw: Param[bool, Arg.Switch('-r',
            help='Disable conversion of JSON strings to binary strings in metadata')] = False,
    ):
        super().__init__(fmt=fmt, one=one, raw=raw, all=all)

    def process(self, data: Chunk):
        """
        Parse the chunk as a JSON dictionary, replace the chunk body by the formatted output,
        and attach the accepted top-level items as metadata unless this has been disabled.
        Items that were used by the format expression are not attached. Raises `ValueError`
        when the document is not a JSON dictionary.
        """

        def convert(value, iskey=False):
            # Convert strings to bytes, recursing into dictionaries and lists; numbers and
            # booleans are returned unchanged. Any other value produces None.
            if self.args.raw:
                return value
            if isinstance(value, (float, int, bool)):
                return value
            if isinstance(value, str):
                return value.encode(self.codec)
            if iskey:
                raise TypeError
            if isinstance(value, dict):
                return {convert(k): convert(v) for k, v in value.items()}
            if isinstance(value, list):
                return [convert(k) for k in value]

        def acceptable(key, value, nested=False):
            # Decide whether an item qualifies as metadata; the result is only ever used
            # for its truth value.
            if not is_valid_variable_name(key):
                self.log_info(F'rejecting item with invalid name {key}')
                return None
            if isinstance(value, (float, int, bool)):
                # Numbers and booleans always qualify. Returning the value itself would
                # reject 0, 0.0 and False, as well as every list that contains them.
                return True
            if isinstance(value, dict):
                if not self.args.all:
                    self.log_info(F'rejecting item {key} with dictionary value')
                    return False
                return True
            if isinstance(value, list):
                if nested:
                    self.log_info(F'rejecting item {key} containing a doubly nested list')
                    return False
                return all(acceptable(key, t, True) for t in value)
            if isinstance(value, str):
                if not self.args.all:
                    if len(value) not in range(1, 80):
                        self.log_info(F'rejecting string item {key} because {len(value)} exceeds the length limit')
                        return False
                    if '\n' in value:
                        self.log_info(F'rejecting string item {key} because it contains line breaks')
                        return False
                return True
            return False

        jdoc: dict = json.loads(data)
        if not isinstance(jdoc, dict):
            raise ValueError('The input must be a JSON dictionary.')
        meta = metavars(data)
        args = {k: convert(v) for k, v in jdoc.items() if acceptable(k, v)}
        # The formatter records the names of all items it consumed in this set.
        used = set()
        data[:] = meta.format_bin(self.args.fmt, self.codec, [data], args, used)
        for u in used:
            args.pop(u, None)
        if not self.args.one:
            data.meta.update(args)
        return data

class xjl

This unit is implemented in refinery.units.formats.json and has the following commandline Interface:

usage: xjl [-h] [-L] [-Q] [-0] [-v] [-R]

Returns all JSON elements from a JSON iterable as individual outputs. When reversed, the unit
collects all chunks in the frame and wraps them as a JSON list.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.

Expand source code Browse git

class xjl(Unit):
    """
    Returns all JSON elements from a JSON iterable as individual outputs. When reversed, the unit
    collects all chunks in the frame and wraps them as a JSON list.
    """

    def process(self, data):
        try:
            doc: list | dict = json.loads(data)
        except Exception:
            # Best effort: when the input is not a clean JSON document, carve the JSON
            # data out of it and parse that instead.
            from refinery.units.pattern.carve_json import carve_json
            doc = data | carve_json | json.loads
        # For a dictionary, the values are emitted; any other iterable is emitted as is.
        try:
            it = doc.values()
        except AttributeError:
            it = doc
        for item in it:
            yield json.dumps(item, indent=4).encode(self.codec)

    def reverse(self, data):
        # The list of decoded chunks was attached to the chunk by filter().
        return json.dumps(data.temp).encode(self.codec)

    def filter(self, chunks: Iterable[Chunk]):
        if not self.args.reverse:
            yield from chunks
            # Forward mode passes the chunks through unchanged. Without this return, the
            # collection logic below would run on the same iterable a second time and emit
            # an extra chunk whenever that iterable can be traversed again.
            return

        from refinery.lib.tools import begin

        # Reverse mode: decode all chunks of the frame and attach the resulting list to
        # the first chunk, which is the only one that is passed on.
        if it := begin(chunks):
            head, rest = it
            collected = [head.decode(self.codec)]
            collected.extend(chunk.decode(self.codec) for chunk in rest)
            head.temp = collected
            yield head

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Serialize the list stored in the chunk's `temp` attribute as JSON and encode the
    resulting text with the codec of the unit.
    """
    collected = data.temp
    document = json.dumps(collected)
    return document.encode(self.codec)

class xkey (range=slice(1, 32, None), plaintext=b'', searchpos=slice(0, None, None), alph=False, crib=False, freq=False)

This unit is implemented in refinery.units.misc.xkey and has the following commandline Interface:

usage: xkey [-h] [-L] [-Q] [-0] [-v] [-p B] [-s S:E] [-a] [-c] [-f] [start:end:step]

The unit expects encrypted input which was encrypted byte-wise with a polyalphabetic key. For
both bit-wise and byte-wise addition, it can attempt to determine this key by three methods:

1. Known plaintext cribs: The unit contains a library of file signatures that are expected to
   occur at specific offsets. It uses these to attempt a known-plaintext attack against the input.
   If a key is found that is at most half the size of such a crib, it is returned.
2. Known alphabets: For each given key length, the input is split into slices that would have
   been encrypted with a single byte for keys of that length. Each such slice undergoes a
   character frequency analysis. If the histogram indicates that an alphabet of a small size was
   used (i.e. base64), then the unit attempts to determine the key based on this.
3. Known high frequency glyph: Works if the plaintext contains one letter that occurs with very
   high frequency, i.e. zero padding in PE or ELF files, and the space character in text. Based on
   this assumption, the unit computes the most likely key. This method will work best on
   uncompressed files that were encrypted with a short key.

When no option is set, the unit uses all the above methods by default. When at least one of the
methods is selected, it will attempt only selected methods. When a custom plaintext is given, the
other methods are disabled by default.

positional arguments:
  start:end:step       range of length values to try in Python slice syntax, the default is 1:32.

options:
  -p, --plaintext B    Provide a buffer of known plaintext. Without a search position, this can
                       slow down the key search significantly.
  -s, --searchpos S:E  Only used when a known plaintext buffer is provided; In this case it
                       narrows the search range for the offset of that data to between S and E.
  -a, --alph           Enable search for keys via known encoder alphabets.
  -c, --crib           Enable search for keys via known plaintext cribs.
  -f, --freq           Enable search for keys via frequency analysis.

generic options:
  -h, --help           Show this help message and exit.
  -L, --lenient        Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet          Disables all log output.
  -0, --devnull        Do not produce any output.
  -v, --verbose        Specify up to two times to increase log level.

Expand source code Browse git

class xkey(Unit):
    """
    The unit expects encrypted input which was encrypted byte-wise with a polyalphabetic key. For
    both bit-wise and byte-wise addition, it can attempt to determine this key by three methods:

    1. Known plaintext cribs: The unit contains a library of file signatures that are expected to
       occur at specific offsets. It uses these to attempt a known-plaintext attack against the
       input. If a key is found that is at most half the size of such a crib, it is returned.
    2. Known alphabets: For each given key length, the input is split into slices that would have
       been encrypted with a single byte for keys of that length. Each such slice undergoes a
       character frequency analysis. If the histogram indicates that an alphabet of a small size
       was used (e.g. base64), then the unit attempts to determine the key based on this.
    3. Known high frequency glyph: Works if the plaintext contains one letter that occurs with
       very high frequency, e.g. zero padding in PE or ELF files, or the space character in text.
       Based on this assumption, the unit computes the most likely key. This method will work best
       on uncompressed files that were encrypted with a short key.

    When no option is set, the unit uses all the above methods by default. When at least one of
    the methods is selected, it will attempt only selected methods. When a custom plaintext is given,
    the other methods are disabled by default.
    """

    # Library of known plaintext cribs. The outer key is the range of offsets at which the cribs
    # below it are tested; the inner key is a display name that is only used for logging. Each
    # value is either a plain byte string or a tuple of parts that is expanded into concrete byte
    # strings by `_generate_cribs` (defined outside this class). Note that a parenthesized single
    # literal such as `(B'BZh')` is a plain byte string, not a tuple.
    # Presumably, top-level parts are concatenated in order and a nested tuple (or an `_S(...)`
    # value) lists the alternatives for that position - TODO confirm against `_generate_cribs`.
    # NOTE(review): 'GIF' is a flat tuple of two complete signatures, unlike the other entries
    # where alternatives are nested in an inner tuple; verify that it expands as intended.
    _CRIBS: dict[range, dict[str, bytes | tuple[bytes | tuple[bytes, ...], ...]]] = {
        range(0, 64, 4): {
            'ZIP'           : (B'PK\x03\x04', (B'\x14\x00', B'\x0A\x00'), (B'\x08\x00', B'\x00\x00')),
            'RAR'           : (B'Rar!\x1A\x07', (B'\x01\x00', B'\x00')),
            'ZPAQ'          : (B'\x37\x6B\x53\x74\xA0\x31\x83\xD3\x8C\xB2\x28\xB0\xD3\x7A\x50\x51'),
            'ZSTD'          : (B'\x28\xB5\x2F\xFD'),
            'ZZip'          : (B'7z\xBC\xAF\x27\x1C', (B'\x00\x02', B'\x00\x03', B'\x00\x04')),
            'APLib'         : (B'AP32\x18\0\0\0'),
            'BZip'          : (B'BZh'),
            'LNK'           : (B'L\0\0\0\01\x14\02\0\0\0\0\0\xC0\0\0\0\0\0\0F', (B'', B'\x9B')),
            'DDS'           : (B'\x00\x00\x00\x01Bud1'),
            'ELF'           : (B'\x7FELF'),
            'JavaClass'     : (B'\xCA\xFE\xBA\xBE'),
            'LZIP'          : (B'LZIP'),
            'SZDD'          : (B'SZDD\x88\xF0\x27\x33'),
            'LZMA'          : (B'\x5D\x00\x00\x00'),
            'LZMA/XZ'       : (B'\xFD7zXZ'),
            'LZO'           : (B'\x89\x4c\x5a\x4f\x00\x0d\x0a\x1a\x0a'),
            'MachO/BE'      : (B'\xCA\xFE\xBA\xBE'),
            'MachO/LE'      : (B'\xBE\xBA\xFE\xCA'),
            'MSCF'          : (B'\x0A\x51\xE5\xC0'),
            'OleDocument'   : (B'\xD0\xCF\x11\xE0', (B'', B'\xA1\xB1\x1A\xE1'), (B'', B'\0\0\0\0\0\0\0\0')),
            'PdfDocument'   : (B'%PDF-', _S(B'12'), (B'.'), _S(B'0123456789'), _S(B'\r\n')),
            'SQLite'        : (B'SQLite format 3\0'),
            'GIF'           : (B'GIF87a', B'GIF89a'),
            'PNG'           : (B'\x89PNG\r\n\x1A\n'),
            'DEX'           : (B'dex\n035\0'),
            'JPG'           : (B'\xFF\xD8\xFF', _S(B'\xE0\xE1\xEE'), (B'\x00\x10\x4A\x46\x49\x46\x00\x01', B'')),
            'OneNote'       : (B'\xE4\x52\x5C\x7B\x8C\xD8\xA7\x4D\xAE\xB1\x53\x78\xD0\x29\x96\xD3'),
            'A3xScript'     : (B'\xA3\x48\x4B\xBE\x98\x6C\x4A\xA9\x99\x4C\x53\x0A\x86\xD6\x48\x7DAU3!EA0', _S(B'56')),
            'RTFDocument'   : (B'{\\rtf1', (B'\\adeflang', B'\\ansi', B'')),
            'CallToPop'     : (B'\xE8\0\0\0\0', (
                               B'\x41\x58', B'\x41\x59', B'\x41\x5A', B'\x41\x5B',
                                   B'\x58',     B'\x59',     B'\x5A',     B'\x5B',   # noqa
                                   B'\x5C',     B'\x5D',     B'\x5E',     B'\x5F',   # noqa
                               )),
            'Cert'          : (B'-----BEGIN CERTIFICATE-----'),
            'PrivateKey'    : (B'-----BEGIN PRIVATE KEY-----'),
            'PrivateKeyDSA' : (B'-----BEGIN DSA PRIVATE KEY-----'),
            'PrivateKeyRSA' : (B'-----BEGIN RSA PRIVATE KEY-----'),
            'PrivateKeySSH' : (B'-----BEGIN OPENSSH PRIVATE KEY-----'),
            'PEM'           : (B'-----BEGIN '),
            'PuTTY-Key'     : (B'PuTTY-User-Key-File-', (B'2:', B'3:')),
            'MsAccess'      : (B'\0\01\0\0Standard ', (B'ACE', B'Jet'), B' DB'),
        },
        range(0x10, 0x11): {
            'ASAR'          : (B'{"files":{"'),
        },
        range(0x10): {
            'DocTypeLower'  : (B'<!doctype\x20', (B'', B'html')),
            'DocTypeUpper'  : (B'<!DOCTYPE\x20', (B'', B'HTML')),
            'HTMLLower'     : (B'<html>'),
            'HTMLUpper'     : (B'<HTML>'),
            'XML'           : (B'<?xml version="'),
            'Ace'           : (B'**ACE**'),
        },
        range(0x36, 0x41): {
            'PEStub': (
                B'\0\x0E\x1F\xBA\x0E\x00\xB4\x09\xCD\x21\xB8\x01\x4C\xCD\x21'
                B'This program cannot be run in DOS mode.\r'
            ),
            'PEDelphiStub': (
                B'\0\xBA\x10\x00\x0E\x1F\xB4\x09\xCD\x21\xB8\x01\x4C\xCD\x21\x90\x90'
                B'This program must be run under Win', (B'32', B'64'), B'\x0D\x0A'
            ),
        },
        range(0x48, 0x60): {
            'PEStubMsg'      : (B'This program cannot be run in DOS mode.\r'),
            'PEDelphiStubMsg': (B'This program must be run under Win', (B'32', B'64'), B'\x0D\x0A'),
        },
        range(0xD0, 0xD1): {
            'Tar'           : (B'\x00' * 0x30 + B'ustar', (B'\x20\x20\x00', B'\x00\x30\x30')),
        },
    }

    # Candidate plaintext alphabets for the alphabet attack. `_attack` additionally tries each of
    # them with a trailing space, a trailing line feed, and both.
    _ENC_ALPHABETS = [
        B'0123456789,',
        B'0123456789;',
        B'0123456789ABCDEF',
        B'0123456789abcdef',
        B'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567',
        B'abcdefghijklmnopqrstuvwxyz234567',
        B'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ+/',
        B'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_',
    ]

    # The byte values 0x20 through 0x7F without 0x3C ('<') and 0x3E ('>'), plus the tab character.
    # Set difference binds tighter than union, so the tab is added after the removal.
    _WSH_ALPHABET = bytes(set(range(0x20, 0x80)) - {0x3C, 0x3E} | {0x09})

    # Identifies the attack that produced a result. Note that `crib` has the value 0 and is
    # therefore falsy when tested as a boolean.
    class _rt(enum.IntEnum):
        crib = 0
        alph = 1
        freq = 2

    # A single key candidate produced by `_attack`.
    class _result(NamedTuple):
        # the recovered key bytes
        key: bytes
        # the attack that produced the key
        how: xkey._rt
        # True for XOR, False for byte-wise subtraction; frequency results leave this at None
        xor: bool | None = None
        # only set by the frequency analysis, as a percentage
        score: float = 0.0

    def __init__(
        self,
        range: Param[slice, Arg.Bounds(help=(
            'range of length values to try in Python slice syntax, the default is {default}.'
        ))] = slice(1, 32),
        plaintext: Param[buf, Arg.Binary('-p', help=(
            'Provide a buffer of known plaintext. Without a search position, this can slow '
            'down the key search significantly.'
        ))] = B'',
        searchpos: Param[slice, Arg.Bounds('-s', metavar='S:E', help=(
            'Only used when a known plaintext buffer is provided; In this case it narrows the '
            'search range for the offset of that data to between S and E.'
        ))] = slice(0, None),
        alph: Param[bool, Arg.Switch('-a',
            help='Enable search for keys via known encoder alphabets.')] = False,
        crib: Param[bool, Arg.Switch('-c',
            help='Enable search for keys via known plaintext cribs.')] = False,
        freq: Param[bool, Arg.Switch('-f',
            help='Enable search for keys via frequency analysis.')] = False,
    ):
        # Without any explicit method switch and without a custom plaintext, all three methods
        # are enabled. A custom plaintext on its own leaves all three switches disabled.
        if not any((alph, crib, freq)) and not plaintext:
            alph = crib = freq = True
        super().__init__(
            range=range,
            plaintext=plaintext,
            searchpos=searchpos,
            alph=alph,
            crib=crib,
            freq=freq,
        )

    def process(self, data: bytearray):
        """
        Return the key of the first result that `_attack` yields for the input. When `_attack`
        yields nothing, the loop body never runs and the method implicitly returns `None`.
        """
        for result in self._attack(data):
            out = result.key
            # NOTE(review): `_rt.crib` equals 0, so keys found via a crib are returned without
            # the `method` label while alphabet and frequency results carry it. Confirm whether
            # this is intended.
            if how := result.how:
                out = self.labelled(out, method=how)
            # only the first result is ever used
            return out

    def _attack(self, data: bytearray):
        """
        Generator that yields `_result` candidates in this order: crib results (XOR first, then
        subtraction), alphabet results (XOR first, then subtraction), and finally any frequency
        analysis result.
        """
        bounds: slice = self.args.range
        view = memoryview(data)
        length = len(view)

        # inputs of at most one byte produce no results
        if length <= 1:
            return

        # For inputs of at least 0x100 bytes, the last four bytes are excluded from the analysis.
        # The rationale is not visible in this class.
        if length >= 0x100:
            view = view[:-4]

        # Key lengths never exceed the input length; `stop` is computed from the untrimmed length.
        start = bounds.start or 1
        stop = min(bounds.stop or length, length)

        if (step := bounds.step) is None:
            step = 1
        elif bounds.start is None:
            # With a step but no explicit start, the first tested length is the step itself
            # because `start` is 1 at this point.
            start *= step

        self.log_debug(
            F'received input range [{bounds.start}:{bounds.stop}:{bounds.step}], '
            F'using [{start}:{stop}:{step}]')

        criblist: list[tuple[range, dict[str, bytes | tuple[bytes | tuple[bytes, ...], ...]]]] = []

        if p := self.args.plaintext:
            # A user-provided plaintext is searched first. Without an explicit end position, the
            # last offset is the last one at which the plaintext still fits into the input.
            pos: slice = self.args.searchpos
            end = len(data) - len(p) if pos.stop is None else pos.stop
            criblist.append((range(pos.start or 0, end + 1), {'Plaintext': (p,)}))
        if self.args.crib:
            # expand the built-in crib specifications into concrete byte strings
            for r, byname in self._CRIBS.items():
                compiled = {
                    name: tuple(_generate_cribs(cribs))
                    for name, cribs in byname.items()
                }
                criblist.append((r, compiled))

        if self.args.alph:
            # Alphabets are grouped by their size because the attack selects candidates by the
            # number of distinct byte values that it observes.
            alphabets: dict[int, list[bytes]] | None = {}
            for alphabet in self._ENC_ALPHABETS:
                for suffix in (B'', B'\x20', B'\x0A', B'\x20\x0A'):
                    a = alphabet + suffix
                    alphabets.setdefault(len(a), []).append(a)
            # this assignment replaces any list already stored under the same size
            alphabets[len(self._WSH_ALPHABET)] = [self._WSH_ALPHABET]
        else:
            alphabets = None

        # The crib attack runs even when the crib list is empty, in which case it finds nothing.
        for xor in (True, False):
            if key := self._process_crib(view, xor, criblist):
                yield self._result(key, self._rt.crib, xor)

        # `hist` caches the slices and histograms per key length across both calls below.
        hist = {}
        freq = []

        for xor in (True, False):
            result = self._process_freq(view, (start, stop, step), alphabets, xor, hist)
            if result is None or not result.key:
                continue
            if result.how == self._rt.freq:
                # frequency results are held back so that alphabet results are yielded first
                freq.append(result)
                continue
            yield result

        yield from freq

    def _process_crib(
        self,
        view: memoryview,
        xor: bool,
        criblist: list[tuple[range, dict[str, list[bytes]]]]
    ):
        """
        Run a known-plaintext attack with every crib at every offset in its range. Returns the
        first key found, rotated so that it starts at input offset zero, or `None` implicitly if
        no crib produces a key.
        """
        for offsets, cribs_by_type in criblist:
            for name, cribs in cribs_by_type.items():
                for crib in cribs:
                    cn = len(crib)
                    for offset in offsets:
                        test = view[offset:offset + cn]
                        # skip offsets where the crib would extend past the end of the data
                        if len(test) != cn:
                            continue
                        # candidate key stream: data XOR crib, or data minus crib modulo 256
                        key = strxor(test, crib) if xor else bytes(
                            a - b & 0xFF for a, b in zip(test, crib))
                        # `_cyclic_base` is defined outside this class; presumably it returns the
                        # repeating unit of the key stream, or a falsy value if there is none
                        # that is short enough - TODO confirm.
                        if key := _cyclic_base(key):
                            self.log_info(F'found key via crib {name}:', crib, clip=True)
                            # The key stream was recovered at `offset`; rotate it so that the
                            # returned key is aligned with the start of the input.
                            shift = -offset % len(key)
                            return key[shift:] + key[:shift]

    def _process_freq(
        self,
        view: memoryview,
        bounds: tuple[int, int, int],
        alphabets: dict[int, list[bytes]] | None,
        xor: bool,
        hist: dict[int, tuple[list[bytes], list[Counter]]],
    ):
        """
        Run the alphabet attack and, on the first invocation only, the frequency analysis for all
        key lengths from `start` to `stop` inclusive. The `hist` argument is a cache that is
        filled by this method and shared between invocations. An alphabet hit is returned
        immediately; otherwise the best frequency guess is returned, or `None` implicitly if
        there is none.
        """
        n = len(view)
        start, stop, step = bounds
        score = 0
        guess = None
        # The frequency analysis only runs when the cache is empty on entry. It does not depend
        # on `xor`, so repeating it on later calls would only reproduce the same guess.
        first = not hist

        for keylen in range(start, stop + 1, step):
            try:
                cached = hist[keylen]
            except KeyError:
                # patch j contains all bytes that were encrypted with key byte j
                patches = [view[j::keylen] for j in range(keylen)]
                histograms = [Counter(p) for p in patches]
                hist[keylen] = patches, histograms
            else:
                patches, histograms = cached

            if alphabets is not None:
                # `base` is the most common number of distinct byte values among the patches and
                # `coverage` is the number of patches that have exactly that many.
                hlc = Counter(len(h) for h in histograms)
                base, coverage = hlc.most_common(1)[0]

                # proceed only if more than half of the patches agree and the size is known
                if coverage * 2 > keylen and base in alphabets:
                    self.log_debug(F'solving for potential plaintext alphabet of size 0x{base:02X} at {keylen}')
                    keys: dict[bytes, bytes] = {}
                    for alphabet in alphabets[base]:
                        key = bytearray(keylen)
                        for k, patch in enumerate(patches):
                            # Start with every possible key byte and keep only those that map
                            # each observed ciphertext byte into the alphabet.
                            keybyte = set(range(0x100))
                            for c in patch:
                                # Precedence: `c ^ p & 0xFF` is `c ^ (p & 0xFF)` and
                                # `c - p & 0xFF` is `(c - p) & 0xFF`.
                                keybyte &= (
                                    {c ^ p & 0xFF for p in alphabet}
                                ) if xor else (
                                    {c - p & 0xFF for p in alphabet}
                                )
                                if len(keybyte) == 1:
                                    key[k] = next(iter(keybyte))
                                    break
                            else:
                                # The patch was exhausted without narrowing the candidates down
                                # to exactly one; this alphabet is rejected.
                                key = None
                                break
                        if key is not None:
                            keys[alphabet] = key
                    # the result is only accepted when exactly one alphabet yields a key
                    if len(keys) == 1:
                        self.log_debug(F'discovered plaintext alphabet of size 0x{base:02X} at {keylen}')
                        alphabet, key = keys.popitem()
                        return self._result(bytes(key), self._rt.alph, xor)

            if not first or not self.args.freq:
                continue

            # For each patch, take the most frequent byte value and its count; the raw score is
            # the fraction of all input bytes that are explained by these values.
            # NOTE(review): if `keylen` exceeds `len(view)`, some patches are empty and
            # `most_common(1)[0]` raises an IndexError; this looks reachable when the input was
            # trimmed above and the requested range extends to the full input length.
            _guess = [h.most_common(1)[0] for h in histograms]
            _score = sum(letter_count for _, letter_count in _guess) / n
            # This scaling accounts for the smaller probability of larger keys. No proper statistical analysis has been
            # conducted to derive it; there might be plenty of room for improvement here.
            _score = _score * ((n - keylen) / (n - 1)) ** keylen

            logmsg = F'[{{}}] score {_score * 100:05.2f}% for key length {keylen}'
            if _score > score:
                self.log_info(logmsg.format('+'))
                score = _score
                guess = bytes(value for value, _ in _guess)
            else:
                self.log_debug(logmsg.format(' '))

        if guess is not None:
            return self._result(guess, self._rt.freq, score=score * 100)

class xlmdeobf (extract_only=False, sort_formulas=False, with_ms_excel=False, day=-1, output_formula_format='CELL:[[CELL-ADDR]], [[STATUS]], [[INT-FORMULA]]', extract_formula_format='CELL:[[CELL-ADDR]], [[CELL-FORMULA]], [[CELL-VALUE]]', no_indent=False, start_point='', password='', output_level=0, timeout=0)

This unit is implemented in refinery.units.formats.office.xlmdeobf and has the following commandline Interface:

usage: xlmdeobf [-h] [-L] [-Q] [-0] [-v] [-x] [-s] [-X] [-d N] [-O FMT] [-E FMT] [-I] [-c CELL]
                [-p STR] [-o N] [-t N]

Wrapper around XLMMacroDeobfuscator to decode obfuscated Excel v4.0 (XLM) macros.

options:
  -x, --extract-only        Only extract cells without any emulation.
  -s, --sort-formulas       Sort extracted formulas based on their cell address (implies -x).
  -X, --with-ms-excel       Use MS Excel to process XLS files.
  -d, --day N               Specify the day of month
  -O, --output-format FMT   Specify the format for output formulas (using [[CELL-ADDR]], [[INT-
                            FORMULA]], and [[STATUS]])
  -E, --extract-format FMT  Specify the format for extracted formulas (using [[CELL-ADDR]],
                            [[CELL-FORMULA]], and [[CELL-VALUE]])
  -I, --no-indent           Do not show indent before formulas
  -c, --start-point CELL    Start interpretation from a specific cell address
  -p, --password STR        Password to decrypt the protected document
  -o, --output-level N      Set the level of details to be shown (0:all commands, 1: commands no
                            jump 2:important commands 3:strings in important commands).
  -t, --timeout N           Stop emulation after N seconds (0: not interruption N>0: stop
                            emulation after N seconds)

generic options:
  -h, --help                Show this help message and exit.
  -L, --lenient             Increase the leniency, allowing partial results and ignoring more
                            errors.
  -Q, --quiet               Disables all log output.
  -0, --devnull             Do not produce any output.
  -v, --verbose             Specify up to two times to increase log level.

Expand source code Browse git

class xlmdeobf(Unit):
    """
    Wrapper around XLMMacroDeobfuscator to decode obfuscated Excel v4.0 (XLM) macros.
    """

    def __init__(
        self,
        extract_only: Param[bool, Arg.Switch('-x', help='Only extract cells without any emulation.')] = False,
        sort_formulas: Param[bool, Arg.Switch(
            '-s', '--sort-formulas', help='Sort extracted formulas based on their cell address (implies -x).'
        )] = False,
        with_ms_excel: Param[bool, Arg.Switch('-X', '--with-ms-excel', help='Use MS Excel to process XLS files.')] = False,
        day: Param[int, Arg.Number('-d', '--day', help='Specify the day of month')] = -1,
        output_formula_format: Param[str, Arg.String('-O', '--output-format', metavar='FMT', help=(
            'Specify the format for output formulas '
            '(using [[CELL-ADDR]], [[INT-FORMULA]], and [[STATUS]])'
        ))] = 'CELL:[[CELL-ADDR]], [[STATUS]], [[INT-FORMULA]]',
        extract_formula_format: Param[str, Arg.String('-E', '--extract-format', metavar='FMT', help=(
            'Specify the format for extracted formulas '
            '(using [[CELL-ADDR]], [[CELL-FORMULA]], and [[CELL-VALUE]])'
        ))] = 'CELL:[[CELL-ADDR]], [[CELL-FORMULA]], [[CELL-VALUE]]',
        no_indent: Param[bool, Arg.Switch('-I', '--no-indent', help='Do not show indent before formulas')] = False,
        start_point: Param[str, Arg.String(
            '-c', '--start-point', metavar='CELL', help='Start interpretation from a specific cell address'
        )] = '',
        password: Param[str, Arg.String('-p', '--password', help='Password to decrypt the protected document')] = '',
        output_level: Param[int, Arg.Number('-o', '--output-level', help=(
            'Set the level of details to be shown '
            '(0:all commands, 1: commands no jump 2:important commands 3:strings in important commands).'
        ))] = 0,
        timeout: Param[int, Arg.Number('-t', '--timeout', help=(
            'Stop emulation after N seconds '
            '(0: not interruption N>0: stop emulation after N seconds)'
        ))] = 0,
    ):
        # Sorting implies extraction only. No additional local variable may be introduced in this
        # method because all locals are forwarded to the base class through vars().
        if sort_formulas:
            extract_only = sort_formulas
        self.superinit(super(), **vars())

    @Unit.Requires('XLMMacroDeobfuscator', ['formats', 'office'])
    def _process_file():
        # The library is silenced before its deobfuscator module is imported; all logging is
        # suppressed for the duration of both imports.
        with NoLogging(NoLogging.Mode.ALL):
            from XLMMacroDeobfuscator.configs import settings
            settings.SILENT = True
            from XLMMacroDeobfuscator.deobfuscator import process_file
            return process_file

    def process(self, data: bytearray):
        # Resolve the library entry point first so that a missing dependency is reported before
        # any virtual file is created.
        with VirtualFileSystem() as vfs, NoLogging(NoLogging.Mode.ALL):
            deobfuscate = self._process_file
            args = self.args
            options = dict(
                file=vfs.new(data),
                noninteractive=True,
                return_deobfuscated=True,
                extract_only=args.extract_only,
                silent=True,
                sort_formulas=args.sort_formulas,
                defined_names=False,
                with_ms_excel=args.with_ms_excel,
                start_with_shell=False,
                day=args.day,
                output_formula_format=args.output_formula_format,
                extract_formula_format=args.extract_formula_format,
                no_indent=args.no_indent,
                start_point=args.start_point,
                password=args.password,
                output_level=args.output_level,
                timeout=args.timeout,
            )
            lines = deobfuscate(**options)
        # the deobfuscated formulas are joined into one chunk, one formula per line
        return '\n'.join(lines).encode(self.codec)

class xlxtr (*references)

This unit is implemented in refinery.units.formats.office.xlxtr and has the following commandline Interface:

usage: xlxtr [-h] [-L] [-Q] [-0] [-v] [reference ...]

Extract data from Microsoft Excel documents, both Legacy and new XML type documents. A sheet
reference is of the form B1 or 1.2, both specifying the first cell of the second column. A cell
range can be specified as B1:C12, or 1.2:C12, or 1.2:12.3. Finally, the unit will always refer to
the first sheet in the document and to change this, specify the sheet name or index separated by
a hashtag, i.e. sheet#B1:C12 or 1#B1:C12. Note that indices are 1-based. To get all elements of
one sheet, use sheet#. If parsing a sheet reference fails, the unit will assume that the given
reference specifies a sheet.

positional arguments:
  reference      A sheet reference to be extracted. If no sheet references are given, the unit
                 lists all sheet names.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class xlxtr(_ExcelUnit):
    """
    Extract data from Microsoft Excel documents, both Legacy and new XML type documents. A sheet
    reference is of the form `B1` or `1.2`, both specifying the first cell of the second column.
    A cell range can be specified as `B1:C12`, or `1.2:C12`, or `1.2:12.3`. Finally, the unit will
    always refer to the first sheet in the document and to change this, specify the sheet name or
    index separated by a hashtag, i.e. `sheet#B1:C12` or `1#B1:C12`. Note that indices are
    1-based. To get all elements of one sheet, use `sheet#`. If parsing a sheet reference fails,
    the unit will assume that the given reference specifies a sheet.
    """
    def __init__(
        self,
        *references: Param[SheetReference, Arg(
            metavar='reference',
            type=SheetReference,
            help=(
                'A sheet reference to be extracted. '
                'If no sheet references are given, the unit lists all sheet names.'
            )
        )]
    ):
        # without any reference, fall back to the single wildcard reference
        if not references:
            references = (SheetReference('*'),)
        super().__init__(references=references)

    def process(self, data):
        try:
            workbook = Workbook(data, self)
        except ImportError:
            # a missing dependency is passed through unchanged
            raise
        except Exception as E:
            raise ValueError('Input not recognized as Excel document.') from E
        for ref in self.args.references:
            ref: SheetReference
            for index, name in enumerate(workbook.sheets()):
                if not ref.match(index, name):
                    continue
                try:
                    rows = workbook.get_sheet_data(name)
                except Exception as error:
                    # a sheet that cannot be read is skipped
                    self.log_info(F'error reading sheet {name}:', error)
                    continue
                # rows and columns are numbered starting at 1
                for r, row in enumerate(rows, 1):
                    for c, value in enumerate(row, 1):
                        if (r, c) in ref and value is not None:
                            yield self.labelled(
                                str(value).encode(self.codec),
                                row=r,
                                col=c,
                                ref=_rc2ref(r, c),
                                sheet=name
                            )

class xor (*argument, bigendian=False, blocksize=0)

This unit is implemented in refinery.units.blockwise.xor and has the following commandline Interface:

usage: xor [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] [argument ...]

Form the exclusive or of the input data with the given argument.

positional arguments:
  argument           A single numeric expression which provides the right argument to the
                     operation, where the left argument is each block in the input data. This
                     argument can also contain a sequence of bytes which is then split into
                     blocks of the same size as the input data and used cyclically.

options:
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes. It is chosen, by default, to be the
                     smallest size that can hold the provided argument without loss of precision.
                     For example, passing the value 0x1234 will result in a default block size of
                     2, while passing the value 12 will mean that the default block size is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class xor(BinaryOperationWithAutoBlockAdjustment):
    """
    Form the exclusive or of the input data with the given argument.
    """
    @staticmethod
    def operate(a, b):
        return a ^ b

    @staticmethod
    def inplace(a, b):
        a ^= b

    def _fastblock_fallback(self, data):
        # Fallback for the fast path: build a mask that is exactly as long as the input from
        # the first argument and let the strxor routine combine the two buffers.
        from Cryptodome.Util.strxor import strxor as xor_buffers
        size = len(data)
        it, masked = self._argument_parse_hook(self.args.argument[0])
        stream = self._infinitize_argument(size, it, masked)
        # one block more than fits into the input, so the mask is never too short
        count = size // self.blocksize + 1
        mask = self.unchunk(islice(stream, count))
        # cut the mask down to the input size
        del mask[size:]
        return xor_buffers(data, mask)

    def _fastblock(self, data):
        try:
            return super()._fastblock(data)
        except FastBlockError as fast_block_error:
            try:
                return self._fastblock_fallback(data)
            except Exception:
                # if the fallback fails as well, the original error is reported
                raise fast_block_error

class xsalsa (key, stateful=False, discard=0, nonce=b'REFINERY', magic=b'', offset=0, rounds=20)

This unit is implemented in refinery.units.crypto.cipher.salsa and has the following commandline Interface:

usage: xsalsa [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-d N] [-m MAGIC] [-x N] [-r N] key [nonce]

XSalsa encryption and decryption. The nonce must be 24 bytes long.

positional arguments:
  key                The encryption key.
  nonce              The nonce. Default is the string REFINERY.

options:
  -s, --stateful     Do not reset the key stream while processing the chunks of one frame.
  -d, --discard N    Discard the first N bytes of the keystream, 0 by default.
  -m, --magic MAGIC  The magic constant; depends on the key size by default.
  -x, --offset N     Optionally specify the stream index, default is 0.
  -r, --rounds N     The number of rounds. Has to be an even number. Default is 20.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.
  -R, --reverse      Use the reverse operation.

Expand source code Browse git

class xsalsa(LatinCipherUnit):
    """
    XSalsa encryption and decryption. The nonce must be 24 bytes long.
    """
    def keystream(self) -> Iterable[int]:
        args = self.args
        # The nonce is split into its first 8 bytes, a little-endian unsigned 64-bit integer
        # taken from the next 8 bytes, and its final 8 bytes. Unpacking fails for any nonce
        # that is not exactly 24 bytes long.
        head, middle, tail = struct.unpack('<8sQ8s', args.nonce)
        indices = (0, 5, 10, 15, 6, 7, 8, 9)
        cipher = LatinX(
            SalsaCipher, indices, args.key, head, middle, tail, args.magic, args.rounds, args.offset)
        yield from cipher

class xt (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xt and has the following commandline Interface:

usage: xt [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME]
          [-p PWD]
          [path ...]

This unit generically extracts files from archives. It attempts to identify the archive format
and use the corresponding specific extractor from among the ones implemented in refinery.

This unit is a path extractor which extracts data from a hierarchical structure. Each extracted
item is emitted as a separate chunk and has attached to it a meta variable that contains its path
within the source structure. The positional arguments to the command are patterns that can be
used to filter the extracted items by their path. To view only the paths of all chunks, use the
listing switch:

    emit something | xt --list

Otherwise, extracted items are written to the standard output port and usually require a frame to
properly process. In order to dump all extracted data to disk, the following pipeline can be
used:

    emit something | xt [| dump {path} ]

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xt(ArchiveUnit, docs='{0}{p}{PathExtractorUnit}'):
    """
    This unit generically extracts files from archives. It attempts to identify the archive format
    and use the corresponding specific extractor from among the ones implemented in refinery.
    """
    @classmethod
    def handles(cls, data) -> bool | None:
        """
        Aggregate the verdicts of all handlers for the given data. The result is `True` as soon as
        one handler returns `True`, `None` if no handler returned `True` but at least one returned
        `None`, and `False` otherwise.
        """
        out = False
        for engine in cls.handlers():
            engine_verdict = engine.handles(data)
            if engine_verdict is True:
                return True
            if engine_verdict is None:
                # remember the undecided verdict, but keep looking for a definite match
                out = None
        return out

    @staticmethod
    def handlers():
        """
        Returns all archive handlers supported by the unit.
        """
        # Each handler is imported immediately before it is yielded: since this is a generator,
        # a handler module is only imported once the consumer advances to it. The order of the
        # entries below is the order in which the handlers are tried.
        # units that check fixed offsets
        from refinery.units.formats.archive.xtsql import xtsql        ; yield xtsql     # noqa
        from refinery.units.formats.archive.xttar import xttar        ; yield xttar     # noqa
        from refinery.units.formats.archive.xtiso import xtiso        ; yield xtiso     # noqa
        from refinery.units.formats.archive.xtchm import xtchm        ; yield xtchm     # noqa
        from refinery.units.formats.archive.xtcab import xtcab        ; yield xtcab     # noqa
        from refinery.units.formats.archive.xtace import xtace        ; yield xtace     # noqa
        from refinery.units.formats.archive.xtmacho import xtmacho    ; yield xtmacho   # noqa
        from refinery.units.formats.archive.xtasar import xtasar      ; yield xtasar    # noqa
        from refinery.units.formats.office.xtrtf import xtrtf         ; yield xtrtf     # noqa
        from refinery.units.formats.pdf import xtpdf                  ; yield xtpdf     # noqa
        from refinery.units.formats.winreg import winreg              ; yield winreg    # noqa
        from refinery.units.formats.archive.xtgz import xtgz          ; yield xtgz      # noqa
        from refinery.units.formats.archive.xtcpio import xtcpio      ; yield xtcpio    # noqa
        # units that use fixed offsets + file magic
        from refinery.units.formats.msi import xtmsi                  ; yield xtmsi     # noqa
        # units that search for markers
        from refinery.units.formats.archive.xt7z import xt7z          ; yield xt7z      # noqa
        from refinery.units.formats.archive.xtzip import xtzip        ; yield xtzip     # noqa
        from refinery.units.formats.pe.dotnet.dnsfx import dnsfx      ; yield dnsfx     # noqa
        from refinery.units.formats.archive.xtinno import xtinno      ; yield xtinno    # noqa
        from refinery.units.formats.archive.xtiss import xtiss        ; yield xtiss     # noqa
        from refinery.units.formats.archive.xtnsis import xtnsis      ; yield xtnsis    # noqa
        from refinery.units.formats.archive.xtpyi import xtpyi        ; yield xtpyi     # noqa
        from refinery.units.formats.a3x import a3x                    ; yield a3x       # noqa
        from refinery.units.formats.archive.xtnode import xtnode      ; yield xtnode    # noqa
        from refinery.units.formats.archive.xtzpaq import xtzpaq      ; yield xtzpaq    # noqa
        from refinery.units.formats.email import xtmail               ; yield xtmail    # noqa
        from refinery.units.formats.office.xtone import xtone         ; yield xtone     # noqa
        from refinery.units.formats.office.xtdoc import xtdoc         ; yield xtdoc     # noqa
        # units that implement more complex parsing / searching:
        from refinery.units.formats.archive.xtsim import xtsim        ; yield xtsim     # noqa
        from refinery.units.formats.archive.xtnuitka import xtnuitka  ; yield xtnuitka  # noqa
        # fallbacks that have to be attempted last
        from refinery.units.formats.json import xtjson                ; yield xtjson    # noqa
        from refinery.units.formats.xml import xtxml                  ; yield xtxml     # noqa
        from refinery.units.formats.html import xthtml                ; yield xthtml    # noqa
        from refinery.units.formats.exe.vsect import vsect            ; yield vsect     # noqa

    def unpack(self, data):
        """
        Try the handlers from `handlers()` in order and yield the items they unpack from the
        input data. Handlers whose `handles` verdict is `True` are run right away; handlers whose
        verdict is `None` are collected and only run, in the same order, when no accepted handler
        completed successfully with at least one item. A `ValueError` is raised when no accepted
        handler recorded an error and no fallback succeeded; a `RefineryException` is raised when
        accepted handlers were tried and failed.
        """
        # handlers with an undecided verdict, in the order in which they were encountered
        fallback: list[type[PathExtractorUnit]] = []
        # maps the name of each accepted handler that failed to the exception it raised
        errors = {}
        pos_args = self.args.paths
        # keyword arguments forwarded to the constructor of each handler
        key_args = dict(
            list=self.args.list,
            path=self.args.path,
            date=self.args.date,
            join_path=self.args.join,
            drop_path=self.args.drop,
        )
        # the password and regex options are only forwarded when they are set
        if self.args.pwd:
            key_args.update(pwd=self.args.pwd)
        if self.args.regex:
            key_args.update(regex=self.args.regex)

        class unpacker:
            # Iterable wrapper around one handler. After iteration has finished, the attribute
            # `success` tells whether the handler ran to completion without raising, and `count`
            # holds the number of items that were yielded.
            unit = self  # the enclosing xt instance; used for logging and option lookup

            def __init__(self, handler: type[PathExtractorUnit], fallback: bool):
                self.success = False
                self.handler = handler
                self.fallback = fallback
                self.count = 0

            def __iter__(self):
                handler = self.handler
                if self.fallback:
                    # fallback handlers are run without consulting their verdict again
                    verdict = True
                else:
                    verdict = handler.handles(data)
                if verdict is False:
                    self.unit.log_info(F'rejected: {handler.name}')
                elif verdict is True:
                    if not self.fallback:
                        self.unit.log_info(F'accepted: {handler.name}')
                    try:
                        unit = handler(*pos_args, **key_args)
                        # the handler instance inherits leniency and logging settings
                        unit.args.lenient = self.unit.args.lenient
                        unit.args.quiet = self.unit.args.quiet
                        unit.log_level = self.unit.log_level
                    except TypeError as error:
                        # a TypeError here is treated as the handler not supporting the given
                        # arguments; the handler is skipped without recording an error
                        self.unit.log_debug('handler construction failed:', error)
                        return
                    try:
                        # unless only paths are listed, the data of the first item is requested
                        # before that item is yielded; an exception raised by this request is
                        # handled below like any other unpacking failure
                        test_unpack = not self.unit.args.list
                        for filtered in unit.filter([data]):
                            for item in unit.unpack(filtered):
                                if test_unpack:
                                    item.get_data()
                                    test_unpack = False
                                self.count += 1
                                yield item
                    except Exception as error:
                        # failures of fallback handlers are not recorded
                        if not self.fallback:
                            errors[handler.name] = error
                        if isinstance(error, MultipleArchives):
                            self.unit.log_warn(error)
                        else:
                            # NOTE(review): log_debug() without arguments presumably reports
                            # whether debug logging is enabled; in that case the error is
                            # re-raised instead of being logged - confirm
                            if self.unit.log_debug():
                                raise error
                            self.unit.log_info('handler unpacking failed:', error)
                    else:
                        self.success = True
                elif verdict is None:
                    # undecided: defer this handler to the fallback pass
                    fallback.append(handler)

        # number of items yielded by handlers that completed successfully; items yielded by a
        # handler before it failed are not counted
        extracted = 0

        for handler in self.handlers():
            self.CustomPathSeparator = handler.CustomPathSeparator
            it = unpacker(handler, fallback=False)
            yield from it
            if it.success:
                extracted += it.count
                if extracted != 0:
                    break
                self.log_debug('handler extracted zero items, continuing')

        if extracted > 0:
            return

        self.log_debug('fallback order:', lambda: ', '.join(h.name for h in fallback))

        for handler in fallback:
            it = unpacker(handler, fallback=True)
            yield from it
            # the first fallback handler that completes ends the search, even with zero items
            if it.success:
                return

        if not errors:
            raise ValueError('input data did not match any known archive format')
        for name, error in errors.items():
            self.log_info(F'error when trying to unpack with {name}:', error)
        raise RefineryException('none of the available unpackers could handle this data')

Static methods

def handlers()

Returns all archive handlers supported by the unit.

Expand source code Browse git

@staticmethod
def handlers():
    """
    Returns all archive handlers supported by the unit.
    """
    # Each handler is imported immediately before it is yielded: since this is a generator, a
    # handler module is only imported once the consumer advances to it. The order of the entries
    # below is the order in which the handlers are produced.
    # units that check fixed offsets
    from refinery.units.formats.archive.xtsql import xtsql        ; yield xtsql     # noqa
    from refinery.units.formats.archive.xttar import xttar        ; yield xttar     # noqa
    from refinery.units.formats.archive.xtiso import xtiso        ; yield xtiso     # noqa
    from refinery.units.formats.archive.xtchm import xtchm        ; yield xtchm     # noqa
    from refinery.units.formats.archive.xtcab import xtcab        ; yield xtcab     # noqa
    from refinery.units.formats.archive.xtace import xtace        ; yield xtace     # noqa
    from refinery.units.formats.archive.xtmacho import xtmacho    ; yield xtmacho   # noqa
    from refinery.units.formats.archive.xtasar import xtasar      ; yield xtasar    # noqa
    from refinery.units.formats.office.xtrtf import xtrtf         ; yield xtrtf     # noqa
    from refinery.units.formats.pdf import xtpdf                  ; yield xtpdf     # noqa
    from refinery.units.formats.winreg import winreg              ; yield winreg    # noqa
    from refinery.units.formats.archive.xtgz import xtgz          ; yield xtgz      # noqa
    from refinery.units.formats.archive.xtcpio import xtcpio      ; yield xtcpio    # noqa
    # units that use fixed offsets + file magic
    from refinery.units.formats.msi import xtmsi                  ; yield xtmsi     # noqa
    # units that search for markers
    from refinery.units.formats.archive.xt7z import xt7z          ; yield xt7z      # noqa
    from refinery.units.formats.archive.xtzip import xtzip        ; yield xtzip     # noqa
    from refinery.units.formats.pe.dotnet.dnsfx import dnsfx      ; yield dnsfx     # noqa
    from refinery.units.formats.archive.xtinno import xtinno      ; yield xtinno    # noqa
    from refinery.units.formats.archive.xtiss import xtiss        ; yield xtiss     # noqa
    from refinery.units.formats.archive.xtnsis import xtnsis      ; yield xtnsis    # noqa
    from refinery.units.formats.archive.xtpyi import xtpyi        ; yield xtpyi     # noqa
    from refinery.units.formats.a3x import a3x                    ; yield a3x       # noqa
    from refinery.units.formats.archive.xtnode import xtnode      ; yield xtnode    # noqa
    from refinery.units.formats.archive.xtzpaq import xtzpaq      ; yield xtzpaq    # noqa
    from refinery.units.formats.email import xtmail               ; yield xtmail    # noqa
    from refinery.units.formats.office.xtone import xtone         ; yield xtone     # noqa
    from refinery.units.formats.office.xtdoc import xtdoc         ; yield xtdoc     # noqa
    # units that implement more complex parsing / searching:
    from refinery.units.formats.archive.xtsim import xtsim        ; yield xtsim     # noqa
    from refinery.units.formats.archive.xtnuitka import xtnuitka  ; yield xtnuitka  # noqa
    # fallbacks that have to be attempted last
    from refinery.units.formats.json import xtjson                ; yield xtjson    # noqa
    from refinery.units.formats.xml import xtxml                  ; yield xtxml     # noqa
    from refinery.units.formats.html import xthtml                ; yield xthtml    # noqa
    from refinery.units.formats.exe.vsect import vsect            ; yield vsect     # noqa

class xt7z (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xt7z and has the following command-line interface:

usage: xt7z [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME]
            [-p PWD]
            [path ...]

Extract files from a 7zip archive. This unit is a path extractor which extracts data from a
hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it
a meta variable that contains its path within the source structure. The positional arguments to
the command are patterns that can be used to filter the extracted items by their path. To view
only the paths of all chunks, use the listing switch:

    emit something | xt7z --list

Otherwise, extracted items are written to the standard output port and usually require a frame to
properly process. In order to dump all extracted data to disk, the following pipeline can be
used:

    emit something | xt7z [| dump {path} ]

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xt7z(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extract files from a 7zip archive.
    """
    # NOTE(review): the Requires decorator presumably declares py7zr as an optional dependency
    # that is imported on first access of self._py7zr - confirm against ArchiveUnit.Requires
    @ArchiveUnit.Requires('py7zr', ['arc', 'default', 'extended'])
    def _py7zr():
        import py7zr
        import py7zr.exceptions
        return py7zr

    def unpack(self, data: bytearray):
        """
        Search the input for every occurrence of the 7zip signature and attempt to unpack an
        archive from each such offset. When unpacking from an offset raises Bad7zFile, the next
        occurrence is tried; the search ends after the first offset that does not raise it.
        """
        for match in re.finditer(re.escape(_SIGNATURE), data):
            start = match.start()
            if start != 0:
                self.log_info(F'found a header at offset 0x{start:X}, trying to extract from there.')
            try:
                yield from self._unpack_from(data, start)
            except self._py7zr.Bad7zFile:
                continue
            else:
                break

    def _unpack_from(self, data: bytearray, zp: int = 0):
        """
        Unpack the archive that starts at offset `zp` of `data`. If the unit was given a password
        it is used as is; otherwise, the archive is probed without a password first and then with
        each of the entries in `_COMMON_PASSWORDS`. One packed item is yielded for every archive
        member that is not a directory. Raises a ValueError when the given password leads to a
        Bad7zFile error, when the compression method is not supported, or when none of the probed
        passwords worked.
        """
        def mk7z(**keywords):
            # open the archive on a view of the data that begins at the archive offset
            return self._py7zr.SevenZipFile(MemoryFile(mv[zp:]), **keywords)

        pwd = self.args.pwd
        mv = memoryview(data)
        archive = None

        def test(archive: SevenZipFile):
            # Returns a truthy value if the archive could not be verified. In listing mode, the
            # archive is only listed and never reported as problematic.
            # NOTE(review): testzip() presumably returns a falsy value when all members verify
            # correctly - confirm against the py7zr documentation
            if self.args.list:
                archive.list()
                return False
            return archive.testzip()

        if pwd:
            try:
                archive = mk7z(password=pwd.decode(self.codec))
            except self._py7zr.Bad7zFile:
                # this turns Bad7zFile into a ValueError, so the caller's Bad7zFile handler does
                # not apply when an explicit password was given
                raise ValueError('corrupt archive; the password is likely invalid.')
        else:
            def passwords():
                # None stands for "no password" and is always tried first
                yield None
                yield from self._COMMON_PASSWORDS
            for pwd in passwords():
                if pwd is None:
                    self.log_debug('trying empty password')
                else:
                    self.log_debug(F'trying password: {pwd}')
                try:
                    archive = mk7z(password=pwd)
                    problem = test(archive)
                except self._py7zr.PasswordRequired:
                    problem = True
                except self._py7zr.UnsupportedCompressionMethodError as E:
                    raise ValueError(E.message)
                except self._py7zr.exceptions.InternalError:
                    # ignore internal errors during testzip
                    # (leaving the loop here skips the else clause below; the archive variable
                    # keeps whatever value was last assigned to it)
                    break
                except SystemError:
                    problem = True
                except Exception:
                    # any other error is fatal only for the attempt without a password; with a
                    # candidate password, it counts as a failed attempt
                    if pwd is None:
                        raise
                    problem = True
                if not problem:
                    break
            else:
                # the loop ran through all candidates without reaching a break
                raise ValueError('a password is required and none of the default passwords worked.')

        assert archive is not None
        # the archive object may or may not offer a read method; when it does not, the data is
        # extracted through an _IOFactory instead
        has_read_method = hasattr(archive, 'read')

        for info in archive.list():
            # archive and member name are bound as default arguments so that each extractor
            # refers to its own member when it is called later
            if has_read_method:
                def extract(archive: SevenZipFile = archive, name: str = info.filename):
                    archive.reset()
                    io = archive.read([name])
                    io = io[name]
                    io.seek(0)
                    return io.read()
            else:
                def extract(archive: SevenZipFile = archive, name: str = info.filename):
                    io = _IOFactory()
                    archive.reset()
                    archive.extract(None, [name], factory=io)
                    return io.buffer.getvalue()

            if info.is_directory:
                continue

            yield self._pack(
                info.filename,
                info.creationtime,
                extract,
                crc32=info.crc32,
                uncompressed=info.uncompressed
            )

    @classmethod
    def handles(cls, data) -> bool | None:
        """
        Returns True if the data begins with the 7zip signature, or if it is likely a PE file whose
        data after the end of the PE image starts with the marker `;!@Install` and contains the
        7zip signature at a positive offset that `buffer_offset` finds for the bounds 0 and 0x1000.
        Returns None if the data is not likely a PE file, and also, implicitly, if it is a PE file
        that does not satisfy the second condition.
        """
        if data[:6] == _SIGNATURE:
            return True
        if not is_likely_pe(data):
            return None
        offset = get_pe_size(data)
        memory = memoryview(data)
        # inspect only what follows the PE image
        memory = memory[offset:]
        if memory[:10] == B';!@Install' and buffer_offset(memory, _SIGNATURE, 0, 0x1000) > 0:
            return True

class xtace (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xtace and has the following command-line interface:

usage: xtace [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME]
             [-p PWD]
             [path ...]

Extract files from an ACE archive. This unit is a path extractor which extracts data from a
hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it
a meta variable that contains its path within the source structure. The positional arguments to
the command are patterns that can be used to filter the extracted items by their path. To view
only the paths of all chunks, use the listing switch:

    emit something | xtace --list

Otherwise, extracted items are written to the standard output port and usually require a frame to
properly process. In order to dump all extracted data to disk, the following pipeline can be
used:

    emit something | xtace [| dump {path} ]

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtace(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extract files from an ACE archive.
    """
    def unpack(self, data):
        """
        Open the input as an ACE archive and yield one packed item for each of its members. The
        member comment is attached as additional metadata when it is not empty, and the member
        contents are only read when the item's extractor is invoked.
        """
        archive = acefile.open(MemoryFile(data, output=bytes))
        for entry in archive.getmembers():
            entry: acefile.AceMember
            # only attach a comment when the member actually has one
            extras = {'comment': entry.comment} if entry.comment else {}

            def read(a=archive, m=entry):
                # archive and member are bound as defaults to pin them for this item
                return a.read(m, pwd=self.args.pwd)

            yield self._pack(entry.filename, entry.datetime, read, **extras)

    @classmethod
    def handles(cls, data) -> bool:
        """
        Tell whether the seven bytes at offset 7 of the data equal the ACE marker.
        """
        marker = data[7:14]
        return marker == b'**ACE**'

class xtasar (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xtasar and has the following command-line interface:

usage: xtasar [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME]
              [-p PWD]
              [path ...]

Extract files from Atom Shell Archives (ASAR). These are often used to bundle Electron
application data and resources. This unit is a path extractor which extracts data from a
hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it
a meta variable that contains its path within the source structure. The positional arguments to
the command are patterns that can be used to filter the extracted items by their path. To view
only the paths of all chunks, use the listing switch:

    emit something | xtasar --list

Otherwise, extracted items are written to the standard output port and usually require a frame to
properly process. In order to dump all extracted data to disk, the following pipeline can be
used:

    emit something | xtasar [| dump {path} ]

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtasar(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extract files from Atom Shell Archives (ASAR). These are often used to bundle Electron application
    data and resources.
    """
    def unpack(self, data: bytearray):
        """
        Parse the ASAR header of the input and yield one `UnpackResult` for every directory entry
        that has both an offset and a size. The path of each result is built from the entry names
        leading to it, joined by forward slashes. Entries whose offset or size cannot be converted
        to an integer are skipped with a warning.
        """
        def _unpack(node: JSONDict, *path):
            # Walks one node of the header directory: all children listed under "files" are
            # visited first, then the node itself is emitted if it describes file contents.
            for name, listing in node.get('files', {}).items():
                yield from _unpack(listing, *path, name)
            try:
                raw_offset = node['offset']
                raw_size = node['size']
            except KeyError:
                # nodes without both fields carry no file contents within the archive
                return
            try:
                offset = int(raw_offset) + header.base
                end = int(raw_size) + offset
            except (TypeError, ValueError):
                # int() raises TypeError for unsupported types and ValueError for strings that
                # are not numeric; both mean that the entry is malformed. The raw values are
                # reported so that the message always shows what the header contained.
                self.log_warn(F'unable to convert offset "{raw_offset}" and size "{raw_size}" to integers')
                return
            if not path:
                self.log_warn(F'not processing item at root with offset {offset} and size {raw_size}')
                return
            yield UnpackResult(
                '/'.join(path),
                # the bounds are bound as defaults so that each result slices its own range
                lambda a=offset, b=end: data[a:b],
                offset=offset
            )

        header = AsarHeader(data)
        self.log_debug(F'header read successfully, base offset is {header.base}.')
        yield from _unpack(header.directory)

    @classmethod
    def handles(cls, data) -> bool | None:
        """
        Tell whether the data starts with the four bytes 04 00 00 00 and has the text `{"files"`
        at offset 0x10.
        """
        return data[:4] == b'\04\0\0\0' and data[0x10:0x18] == B'{"files"'

class xtcab (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xtcab and has the following command-line interface:

usage: xtcab [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME]
             [-p PWD]
             [path ...]

Extract files from CAB (cabinet) archives. Multi-volume archives can be extracted if all required
disks are present as chunks within the current frame.

This unit is a path extractor which extracts data from a hierarchical structure. Each extracted
item is emitted as a separate chunk and has attached to it a meta variable that contains its path
within the source structure. The positional arguments to the command are patterns that can be
used to filter the extracted items by their path. To view only the paths of all chunks, use the
listing switch:

    emit something | xtcab --list

Otherwise, extracted items are written to the standard output port and usually require a frame to
properly process. In order to dump all extracted data to disk, the following pipeline can be
used:

    emit something | xtcab [| dump {path} ]

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtcab(ArchiveUnit, docs='{0}{p}{PathExtractorUnit}'):
    """
    Extract files from CAB (cabinet) archives. Multi-volume archives can be extracted if all
    required disks are present as chunks within the current frame.
    """
    def unpack(self, data: Chunk):
        """
        Yield one packed item for every file of the cabinet that `filter` attached to the chunk
        as its `temp` attribute. If the cabinet's file table has more than one entry, each path
        is prefixed with `CABxxxx/`, where `xxxx` is the key of that entry as a hexadecimal
        number.
        """
        arc: Cabinet = data.temp
        arc.check()
        arc.process()
        one = len(arc.files) == 1
        self.log_info(F'processing CAB with {len(arc)} disks')
        for disk_id, files in arc.files.items():
            for file in files:
                path = file.name
                if not one:
                    path = F'CAB{disk_id:04X}/{path}'
                # the file is bound as a default so that each item decompresses its own file
                yield self._pack(path, file.timestamp, lambda f=file: f.decompress())

    def filter(self, chunks):
        """
        Group the incoming chunks into cabinets. The first chunk of each group is yielded, with
        the `Cabinet` that collects the group attached as its `temp` attribute. A group ends as
        soon as its cabinet reports that it needs no more disks; the next chunk then starts a
        new group.
        """
        box = None
        cab = Cabinet()
        for chunk in chunks:
            if box is None:
                box = chunk
                box.temp = cab
            if not cab.needs_more_disks():
                # The current cabinet is complete: emit it and begin a new one with this chunk.
                yield box
                box = chunk
                cab = box.temp = Cabinet()
            # Every chunk is added to the cabinet that is current at this point. This includes
            # the chunk that starts a new cabinet after a completed one; leaving it out would
            # hand an empty cabinet to unpack for that chunk.
            # NOTE(review): assumes that an empty Cabinet reports needs_more_disks() as true,
            # which the handling of the very first chunk relies on as well - confirm
            cab.append(memoryview(chunk))
        if box:
            yield box

    @classmethod
    def handles(cls, data):
        """
        Tell whether the first four bytes of the data equal the CAB disk magic.
        """
        return data[:4] == CabDisk.MAGIC

class xtchm (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.archive.xtchm and has the following command-line interface:

usage: xtchm [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path ...]

Extract files from CHM (Windows Help) files.

This unit is a path extractor which extracts data from a hierarchical structure. Each extracted
item is emitted as a separate chunk and has attached to it a meta variable that contains its path
within the source structure. The positional arguments to the command are patterns that can be
used to filter the extracted items by their path. To view only the paths of all chunks, use the
listing switch:

    emit something | xtchm --list

Otherwise, extracted items are written to the standard output port and usually require a frame to
properly process. In order to dump all extracted data to disk, the following pipeline can be
used:

    emit something | xtchm [| dump {path} ]

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtchm(PathExtractorUnit, docs='{0}{p}{PathExtractorUnit}'):
    """
    Extract files from CHM (Windows Help) files.
    """
    def unpack(self, data):
        """
        Parse the input as a CHM container and yield one result per stored file. Entries
        with a non-positive length and entries below '::DataSpace' are not emitted.
        """
        archive = CHM(memoryview(data))

        self.log_info(F'language: {archive.header.language_name}')
        self.log_info(F'codepage: {archive.header.codepage}')

        for path, record in archive.filesystem.items():
            if record.length <= 0 or path.startswith('::DataSpace'):
                continue
            # The archive and record are bound as defaults so that each result keeps
            # reading its own record, regardless of when the callable is invoked.
            yield UnpackResult(path, lambda chm=archive, record=record: chm.read(record))

    @classmethod
    def handles(cls, data):
        """
        Return whether the first four bytes of the input equal the CHM header magic.
        """
        signature = data[:4]
        return signature == ChmHeader.Magic

class xtcpio (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xtcpio and has the following commandline Interface:

usage: xtcpio [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME]
              [-p PWD]
              [path ...]

Extract files from a CPIO archive. This unit is a path extractor which extracts data from a
hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it
a meta variable that contains its path within the source structure. The positional arguments to
the command are patterns that can be used to filter the extracted items by their path. To view
only the paths of all chunks, use the listing switch:

    emit something | xtcpio --list

Otherwise, extracted items are written to the standard output port and usually require a frame to
properly process. In order to dump all extracted data to disk, the following pipeline can be
used:

    emit something | xtcpio [| dump {path} ]

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtcpio(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extract files from a CPIO archive.
    """
    def unpack(self, data):
        """
        Read consecutive CPIO entries from the input and yield each one as a packed result
        carrying its name, modification time, and data. Reading stops when the reader runs
        out of data or when the entry named 'TRAILER!!!' is encountered.
        """
        reader = StructReader(memoryview(data))
        while True:
            try:
                entry = CPIOEntry(reader)
            except EOF:
                # Running out of input ends the archive without raising an error.
                break
            if entry.name == 'TRAILER!!!':
                break
            yield self._pack(entry.name, entry.mtime, entry.data)

    @classmethod
    def handles(cls, data) -> bool:
        """
        Return whether the input starts with the six ASCII characters 070701.
        """
        signature = data[:6]
        return signature == B'070701'

class xtdoc (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.office.xtdoc and has the following commandline Interface:

usage: xtdoc [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path ...]

Extract files from an OLE document such as a Microsoft Word DOCX file.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtdoc(PathExtractorUnit):
    """
    Extract files from an OLE document such as a Microsoft Word DOCX file.
    """

    @PathExtractorUnit.Requires('olefile', ['formats', 'office', 'extended'])
    def _olefile():
        import olefile
        return olefile

    def unpack(self, data):
        """
        Open the input with olefile and yield every stream together with its path. When
        olefile raises an OSError while opening the input, the data is handed to xtzip
        instead and its results are forwarded.
        """
        with MemoryFile(data) as stream:
            try:
                oledoc = self._olefile.OleFileIO(stream)
            except OSError as error:
                self.log_info(F'error, {error}, treating input as zip file')
                yield from xtzip().unpack(data)
                return
            for parts in oledoc.listdir():
                if not parts:
                    continue
                *parents, leaf = parts
                if not leaf:
                    continue
                # The stream has to be opened under its original name; only the path that
                # is reported to the caller gets rewritten below.
                olestream = oledoc.openstream('/'.join(parts))
                code = ord(leaf[:1])
                if code < 20:
                    # A low-valued leading character is rendered as a bracketed number.
                    leaf = F'[{code:d}]{leaf[1:]}'
                path = convert_msi_name('/'.join((*parents, leaf)))
                self.log_debug('exploring:', path)
                yield UnpackResult(path, olestream.read())

    @classmethod
    def handles(cls, data) -> bool | None:
        """
        Return True for inputs that start with the eight-byte OLE signature; for anything
        else, return the verdict of is_likely_doc.
        """
        has_ole_signature = data[:8] == B'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'
        return True if has_ole_signature else is_likely_doc(data)

class xtea (key, iv=b'', padding=None, mode=None, raw=False, swap=False, rounds=32)

This unit is implemented in refinery.units.crypto.cipher.xtea and has the following commandline Interface:

usage: xtea [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-s] [-k N] key

XTEA encryption and decryption.

positional arguments:
  key              The encryption key.

options:
  -i, --iv IV      Specifies the initialization vector. If none is specified, then a block of
                   zero bytes is used.
  -p, --padding P  Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does
                   nothing. By default, all other algorithms are attempted. In most cases, the
                   data was not correctly decrypted if none of these work.
  -m, --mode M     Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB, OFB,
                   PCBC. By default, the CBC mode is used when an IV is provided, and ECB
                   otherwise.
  -r, --raw        Set the padding to raw; ignored when a padding is specified.
  -s, --swap       Decode blocks as big endian rather than little endian.
  -k, --rounds N   Specify the number of rounds, 32 by default.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation.

Expand source code Browse git

class xtea(TEAUnit, cipher=BlockCipherFactory(XTEA)):
    """
    XTEA encryption and decryption.
    """
    # The class body is intentionally empty: this unit only binds the XTEA cipher, wrapped
    # in a BlockCipherFactory, to the TEAUnit base class via the cipher class keyword.

class xtgz (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xtgz and has the following commandline Interface:

usage: xtgz [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME]
            [-p PWD]
            [path ...]

Extract a file from a GZip archive.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtgz(ArchiveUnit):
    """
    Extract a file from a GZip archive.
    """
    def unpack(self, data: bytearray):
        """
        Parse the GZip header of the input and yield the single contained file. When the
        header carries no file name, one is derived from the 'path' meta variable of the
        input chunk, or the name 'ungz' is used if that variable does not exist.
        """
        archive = GzipHeader(data)
        path = archive.name
        timestamp = archive.mtime
        # A missing or zero timestamp is reported as no date at all.
        date = datetime.fromtimestamp(timestamp) if timestamp else None
        if path is None:
            try:
                source = Path(metavars(data)['path'])
            except KeyError:
                path = 'ungz'
            else:
                self.log_warn(source)
                extension = source.suffix
                if extension.lower() == '.gz':
                    # Strip a trailing .gz to recover the name of the compressed file.
                    source = source.with_suffix('')
                else:
                    source = source.with_suffix(F'{extension}.ungz')
                path = source.as_posix()
        yield self._pack(path, date, archive.data)

    @classmethod
    def handles(cls, data) -> bool:
        """
        Return whether the input starts with the two GZip magic bytes 1F 8B.
        """
        signature = data[:2]
        return signature == B'\x1F\x8B'

class xthtml (*paths, outer=False, attributes=False, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.html and has the following commandline Interface:

usage: xthtml [-h] [-L] [-Q] [-0] [-v] [-F] [-o] [-a] [-l] [-j | -d] [-z | -e] [-r] [-P NAME]
              [path ...]

The unit processes an HTML document and extracts the contents of all elements in the DOM of the
given tag. The main purpose is to extract scripts from HTML documents.

positional arguments:
  path              Wildcard pattern for the path of the item to be extracted. Each item is
                    returned as a separate output of this unit. Paths may contain wildcards; The
                    default argument is a single wildcard, which means that every item will be
                    extracted. If a given path yields no results, the unit performs increasingly
                    fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -o, --outer       Include the HTML tags for an extracted element.
  -a, --attributes  Populate chunk metadata with HTML tag attributes.
  -l, --list        Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path   Join path names with the previously existing one.
  -d, --drop-path   Do not modify the path variable for output chunks.
  -z, --fuzzy       Specify once to add a leading wildcard to each pattern, twice to also add a
                    trailing wildcard.
  -e, --exact       Path patterns never match on substrings.
  -r, --regex       Use regular expressions instead of wildcard patterns.
  -P, --path NAME   Name of the meta variable to receive the extracted path. The default value is
                    "path".

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.
  -F, --iff         Only apply unit if it can handle the input format. Specify twice to drop all
                    other chunks.

Expand source code Browse git

class xthtml(XMLToPathExtractorUnit):
    """
    The unit processes an HTML document and extracts the contents of all elemnts in the DOM of the
    given tag. The main purpose is to extract scripts from HTML documents.
    """
    def __init__(
        self, *paths,
        outer: Param[bool, Arg.Switch('-o', help='Include the HTML tags for an extracted element.')] = False,
        attributes: Param[bool, Arg.Switch('-a', help='Populate chunk metadata with HTML tag attributes.')] = False,
        list=False,
        join_path=False,
        drop_path=False,
        fuzzy=0,
        exact=False,
        regex=False,
        path=b'path',
    ):
        # All options are forwarded unchanged; only the path format is fixed to the tag name.
        super().__init__(
            *paths, outer=outer, attributes=attributes, format='{tag}', path=path,
            list=list, join_path=join_path, drop_path=drop_path,
            fuzzy=fuzzy, exact=exact, regex=regex,
        )

    def unpack(self, data):
        """
        Parse the input as HTML and yield one result for every non-textual node in the tree.
        The path of each result is built from the path components of the node's ancestors.
        """
        try:
            text = data.decode(self.codec)
        except UnicodeDecodeError:
            # latin1 maps every byte to a character, so this decode cannot fail.
            text = data.decode('latin1')

        parser = HTMLTreeParser()
        parser.feed(text)
        top = parser.tos
        top.reindex()

        make_path = self._make_path_builder(metavars(data), top)

        # The parser may stop on a node that was never closed; climb up to the actual root.
        while top.parent:
            self.log_info(F'tag was not closed: {top.tag}')
            top = top.parent

        # Skip wrapper nodes whose only child carries the same tag as the wrapper itself.
        while len(top.children) == 1:
            only, = top.children
            if only.tag != top.tag:
                break
            top = only

        def walk(node: HTMLNode, *parts: str):

            def with_tags(node: HTMLNode = node):
                return node.recover(inner=False).encode(self.codec)

            def without_tags(node: HTMLNode = node):
                return node.recover().encode(self.codec)

            extra = dict(node.attributes) if self.args.attributes else {}

            # The root node is always emitted without its own tags; every other node
            # includes them only when the outer option is set.
            extractor = without_tags if node.root or not self.args.outer else with_tags
            yield UnpackResult('/'.join(parts), extractor, **extra)

            for child in node.children:
                if not child.textual:
                    yield from walk(child, *parts, make_path(child))

        yield from walk(top, make_path(top))

    @classmethod
    def handles(cls, data):
        """
        Return the verdict of is_likely_htm for the input.
        """
        verdict = is_likely_htm(data)
        return verdict

class xtinno (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xtinno and has the following commandline Interface:

usage: xtinno [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME]
              [-p PWD]
              [path ...]

Extract files from InnoSetup archives: This unit is a path extractor which extracts data from a
hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it
a meta variable that contains its path within the source structure. The positional arguments to
the command are patterns that can be used to filter the extracted items by their path. To view
only the paths of all chunks, use the listing switch:

    emit something | xtinno --list

Otherwise, extracted items are written to the standard output port and usually require a frame to
properly process. In order to dump all extracted data to disk, the following pipeline can be
used:

    emit something | xtinno [| dump {path} ]

Note: This unit generates the following synthetic metadata files under the "meta" directory:

- setup.bin contains the raw bytes for the setup metadata
- setup.template contains the raw and unprocessed metadata in JSON format
- setup.json contains the setup metadata with all format fields expanded

Similarly, there are files.bin, files.template, and files.json that contain the metadata of the
archived files. The files that are extracted under the "embedded" directory are usually parts of
the InnoSetup installer and not user data. All archived files are extracted within the directory
named "data".

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtinno(ArchiveUnit, _ps, docs='{0} {PathExtractorUnit}{p}{_ps}'):
    """
    Extract files from InnoSetup archives:
    """
    def unpack(self, data: bytearray):
        # Yields, in this order: the synthetic metadata files below "meta", the installer
        # components below "embedded", and finally one item per archived file.

        # Recursively walks dictionaries and lists. Every string value is passed through the
        # constant expansion of a freshly reset emulator; values of any other type, and
        # strings for which the expansion raises, are returned unchanged.
        def post_process_json(doc):
            if isinstance(doc, dict):
                return {key: post_process_json(val) for key, val in doc.items()}
            if isinstance(doc, list):
                return [post_process_json(entry) for entry in doc]
            if not isinstance(doc, str):
                return doc
            try:
                return inno.emulator.reset().expand_constant(doc)
            except Exception:
                return doc

        inno = InnoArchive(data, self)

        # An empty password argument is treated the same as no password at all.
        password: bytes = self.args.pwd
        password = password.decode(self.codec) if password else None

        if any(file.encrypted for file in inno.files) and password is None:
            self.log_info('some files are password-protected and no password was given')

        # Setup metadata: raw bytes, the unprocessed JSON template, and the JSON document
        # after all strings were passed through post_process_json.
        with BytesAsArrayEncoder as encoder:
            yield self._pack('meta/setup.bin', None, inno.streams.TSetup.data)
            doc = inno.setup_info.json()
            yield self._pack('meta/setup.template', None, encoder.dumps(doc).encode(self.codec))
            doc = post_process_json(doc)
            yield self._pack('meta/setup.json', None, encoder.dumps(doc).encode(self.codec))

        # File metadata: the same three representations as for the setup metadata.
        with BytesAsArrayEncoder as encoder:
            yield self._pack('meta/files.bin', None, inno.streams.TData.data)
            doc = inno.setup_data.json()
            yield self._pack('meta/files.template', None, encoder.dumps(doc).encode(self.codec))
            doc = post_process_json(doc)
            yield self._pack('meta/files.json', None, encoder.dumps(doc).encode(self.codec))

        # The uninstaller is passed as a callable rather than as data; presumably the
        # stream is then only read when the item is actually requested (confirm in _pack).
        def _uninstaller(i=inno):
            return i.read_stream(i.streams.Uninstaller)
        yield self._pack('embedded/uninstaller.exe', None, _uninstaller)

        if license := inno.setup_info.Header.get_license():
            yield self._pack('embedded/license.rtf', None, license.encode(self.codec))

        # The script is emitted both as raw bytes and as a disassembly listing; the
        # disassembly is wrapped in a callable like the uninstaller above.
        if script := inno.setup_info.Header.get_script():
            yield self._pack('embedded/script.bin', None, script)
            yield self._pack('embedded/script.ps', None,
                lambda i=inno: i.ifps.disassembly().encode(self.codec))

        # The file extension of each embedded binary is derived from its contents.
        if dll := inno.setup_info.get_decompress_dll():
            yield self._pack(F'embedded/decompress.{magic(dll).extension}', None, dll)

        if dll := inno.setup_info.get_decryption_dll():
            yield self._pack(F'embedded/decryption.{magic(dll).extension}', None, dll)

        for size, images in (
            ('small', inno.setup_info.get_wizard_images_small()),
            ('large', inno.setup_info.get_wizard_images_large()),
        ):
            # Images are numbered starting at 1 and zero-padded to the number of decimal
            # digits in len(images) + 1.
            _formatting = len(str(len(images) + 1))
            for k, img in enumerate(images, 1):
                yield self._pack(F'embedded/images/{size}{k:0{_formatting}d}.{magic(img).extension}', None, img)

        for file in inno.files:
            # Files flagged as duplicates are not emitted.
            if file.dupe:
                continue

            # Reader for a single archived file. Archive, file, and password are bound as
            # default arguments so that each callable keeps the values of its own iteration.
            def _read(inno=inno, file=file, pwd=password):
                if pwd is None:
                    inno.guess_password(10)
                # With any leniency, the file is read without the additional check.
                if self.leniency > 0:
                    return inno.read_file(file, pwd)
                try:
                    return inno.read_file_and_check(file, pwd)
                except InvalidPassword:
                    # Password errors are passed on unchanged.
                    raise
                except Exception as E:
                    # Any other failure is reported with a hint on how to skip the check.
                    raise ValueError(F'{E!s} [ignore this check with -L]') from E

            # The tags list contains the names of all setup file flags set for this file.
            yield self._pack(file.path, file.date, _read,
                tags=[t.name for t in SetupFileFlags if t & file.tags])

    @classmethod
    def handles(cls, data):
        # Format detection is delegated entirely to is_inno_setup.
        return is_inno_setup(data)

class xtiso (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', fs='auto')

This unit is implemented in refinery.units.formats.archive.xtiso and has the following commandline Interface:

usage: xtiso [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME]
             [-s TYPE]
             [path ...]

Extract files from an ISO archive. This unit is a path extractor which extracts data from a
hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it
a meta variable that contains its path within the source structure. The positional arguments to
the command are patterns that can be used to filter the extracted items by their path. To view
only the paths of all chunks, use the listing switch:

    emit something | xtiso --list

Otherwise, extracted items are written to the standard output port and usually require a frame to
properly process. In order to dump all extracted data to disk, the following pipeline can be
used:

    emit something | xtiso [| dump {path} ]

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -s, --fs TYPE    Specify a file system (udf, joliet, rr, iso, auto) extension to use. The
                   default setting auto will automatically detect the first of the other
                   available options and use it.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtiso(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extract files from a ISO archive.
    """
    def __init__(
        self,
        *paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False,
        path=b'path', date=b'date',
        fs: Param[str, Arg.Choice('-s', metavar='TYPE', choices=_ISO_FILE_SYSTEMS, help=(
            'Specify a file system ({choices}) extension to use. The default setting {default} will automatically '
            'detect the first of the other available options and use it.'))] = 'auto'
    ):
        # The command line parser already restricts the choices; this check covers the case
        # where the unit is instantiated from Python code with an arbitrary value.
        if fs not in _ISO_FILE_SYSTEMS:
            raise ValueError(F'invalid file system {fs}: must be udf, joliet, rr, iso, or auto.')
        super().__init__(
            *paths,
            list=list,
            join_path=join_path,
            drop_path=drop_path,
            fuzzy=fuzzy,
            exact=exact,
            regex=regex,
            path=path,
            date=date,
            fs=fs
        )

    @ArchiveUnit.Requires('pycdlib', ['arc', 'default', 'extended'])
    def _pycdlib():
        import pycdlib
        import pycdlib.dates

        # Replacement for the volume descriptor date parser: the last three bytes of the
        # date string are overwritten with b'00\0' before the original parser is invoked.
        def fixed_parse(self, datestr):
            datestr = datestr[:-3] + b'00\0'
            return original_parse(self, datestr)

        original_parse = pycdlib.dates.VolumeDescriptorDate.parse
        pycdlib.dates.VolumeDescriptorDate.parse = fixed_parse
        return pycdlib

    @staticmethod
    def _strip_revision(name: str):
        """
        Remove a trailing revision such as ';1' from the given name. The name is returned
        unchanged when the text after the first semicolon is not purely numeric.
        """
        base, split, revision = name.partition(';')
        return base if split and revision.isdigit() else name

    def unpack(self, data):
        """
        Open the input with pycdlib and yield one packed result per file in the selected
        file system. The file contents are read lazily through a callable.

        A record whose date fields cannot be represented as a datetime is emitted without
        a date instead of aborting the extraction of all remaining files.
        """
        if not self.handles(data):
            self.log_warn('The data does not look like an ISO file.')
        with MemoryFile(data, output=bytes) as stream:
            iso = self._pycdlib.PyCdlib()
            iso.open_fp(stream)
            fs = self.args.fs
            if fs != 'auto':
                mkfacade = {
                    'iso'    : iso.get_iso9660_facade,
                    'udf'    : iso.get_udf_facade,
                    'joliet' : iso.get_joliet_facade,
                    'rr'     : iso.get_rock_ridge_facade,
                }
                facade = mkfacade[fs]()
            # Automatic detection prefers udf over joliet over rock ridge over plain iso.
            elif iso.has_udf():
                self.log_info('using format: udf')
                facade = iso.get_udf_facade()
            elif iso.has_joliet():
                self.log_info('using format: joliet')
                facade = iso.get_joliet_facade()
            elif iso.has_rock_ridge():
                self.log_info('using format: rr')
                facade = iso.get_rock_ridge_facade()
            else:
                self.log_info('using format: iso')
                facade = iso.get_iso9660_facade()

            for root, _, files in facade.walk('/'):
                root = root.rstrip('/')
                for name in files:
                    name = name.lstrip('/')
                    path = F'{root}/{name}'
                    try:
                        info = facade.get_record(path)
                        date = info.date
                    except Exception:
                        info = None
                        date = None
                    else:
                        try:
                            date = datetime.datetime(
                                date.years_since_1900 + 1900,
                                date.month,
                                date.day_of_month,
                                date.hour,
                                date.minute,
                                date.second,
                                # The offset is stored in units of 15 minutes.
                                tzinfo=datetime.timezone(datetime.timedelta(minutes=15 * date.gmtoffset))
                            )
                        except (ValueError, OverflowError) as error:
                            # Zeroed or corrupted date fields (month 0, day 0, an offset of
                            # 24 hours or more) are rejected by datetime; the file itself
                            # can still be extracted, so only the date is dropped.
                            self.log_info(F'ignoring invalid date for {path}: {error}')
                            date = None

                    # The record and path are bound as defaults so that each callable reads
                    # the file of its own iteration.
                    def extract(info=info, path=path):
                        if info:
                            # Pre-size the buffer when the record length is known.
                            buffer = MemoryFile(bytearray(info.data_length))
                        else:
                            buffer = MemoryFile(bytearray())
                        facade.get_file_from_iso_fp(buffer, path)
                        return buffer.getvalue()

                    yield self._pack(self._strip_revision(path), date, extract)

    @classmethod
    def handles(cls, data) -> bool:
        """
        Return whether the identifier CD001 occurs at one of the offsets 0x8001, 0x8801,
        or 0x9001, i.e. one byte into one of three consecutive 2048-byte blocks that start
        at offset 0x8000.
        """
        return any(data[k] == B'CD001' for k in (
            slice(0x8001, 0x8006),
            slice(0x8801, 0x8806),
            slice(0x9001, 0x9006),
        ))

class xtiss (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xtiss and has the following commandline Interface:

usage: xtiss [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME]
             [-p PWD]
             [path ...]

Extracts files from Install Shield Setup files. This unit is a path extractor which extracts data
from a hierarchical structure. Each extracted item is emitted as a separate chunk and has
attached to it a meta variable that contains its path within the source structure. The positional
arguments to the command are patterns that can be used to filter the extracted items by their
path. To view only the paths of all chunks, use the listing switch:

    emit something | xtiss --list

Otherwise, extracted items are written to the standard output port and usually require a frame to
properly process. In order to dump all extracted data to disk, the following pipeline can be
used:

    emit something | xtiss [| dump {path} ]

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtiss(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extracts files from Install Shield Setup files.
    """
    def unpack(self, data: bytearray):
        # The archive is taken to start at the right-most occurrence of any known magic value;
        # all bytes in front of it are removed from the buffer in place.
        start = max(map(data.rfind, ISSReader.MAGIC))
        if start < 0:
            raise ValueError('ISS magic not found.')
        del data[:start]

        reader = ISSReader(data)
        total = reader.iss_archive_header()

        self.log_info(F'archive contains {total} files according to header')

        # The header only provides a count; the reader is asked for that many files in sequence.
        for _ in range(total):
            name, contents = reader.iss_file()
            yield self._pack(name, None, contents)

    @classmethod
    def handles(cls, data) -> bool | None:
        # Only buffers that begin with MZ and contain one of the magic values are accepted.
        if data[:2] == B'MZ':
            for magic in ISSReader.MAGIC:
                if buffer_contains(data, magic):
                    return True
        return False

class xtjson (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.json and has the following commandline Interface:

usage: xtjson [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path ...]

Extract values from a JSON document.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtjson(PathExtractorUnit):
    """
    Extract values from a JSON document.
    """
    CustomPathSeparator = '.'

    def unpack(self, data):
        separator = self.CustomPathSeparator

        def crawl(path, cursor):
            # Descend into dictionaries (by key) and lists (by index) first; the container itself
            # is reported only after all of its children. The root has an empty path and is
            # therefore never reported.
            if isinstance(cursor, dict):
                children = cursor.items()
            elif isinstance(cursor, list):
                children = enumerate(cursor)
            else:
                children = ()
            for key, value in children:
                yield from crawl(F'{path}{separator}{key}', value)
            if path:
                yield path, cursor, cursor.__class__.__name__

        # Input that is already a parsed container is used as is; anything else is parsed.
        document = data if isinstance(data, (dict, list)) else json.loads(data)

        for path, item, typename in crawl('', document):
            def extract(item=item):
                # Containers are serialized back to indented JSON, scalars are stringified.
                if isinstance(item, (list, dict)):
                    text = json.dumps(item, indent=4)
                else:
                    text = str(item)
                # latin1 is attempted first, utf8 is the fallback for anything it cannot encode.
                try:
                    return text.encode('latin1')
                except UnicodeEncodeError:
                    return text.encode('utf8')

            yield UnpackResult(path, extract, type=typename)

    @classmethod
    def handles(cls, data) -> bool | None:
        return is_likely_json(data)

class xtmacho (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xtmacho and has the following commandline Interface:

usage: xtmacho [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME]
               [-p PWD]
               [path ...]

Extract the individual executables from a MachO universal binary (sometimes called a MachO fat
file).

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtmacho(ArchiveUnit):
    """
    Extract the individual executables from a MachO universal binary (sometimes called a MachO fat file).
    """
    _SIGNATURE_BE = B'\xCA\xFE\xBA\xBE'
    _SIGNATURE_LE = B'\xBE\xBA\xFE\xCA'

    def unpack(self, data: bytearray):
        """
        Read the four-byte signature to decide on the byte order, then read the entry count and
        emit one item per FatArch record, named after its CPU type.

        Raises ValueError when the first four bytes match neither known signature.
        """
        view = memoryview(data)
        signature = bytes(view[:4])
        byte_order = {
            self._SIGNATURE_BE: True,
            self._SIGNATURE_LE: False,
        }
        # Only the signature lookup is guarded, so that a KeyError from anywhere else is not
        # misreported as an invalid magic.
        try:
            bigendian = byte_order[signature]
        except KeyError as KE:
            raise ValueError('Not a MachO universal binary; invalid magic header bytes.') from KE
        reader = StructReader(view, bigendian=bigendian)
        # Skip the signature; the entry count follows directly after it.
        reader.seekset(4)
        count = reader.u32()
        self.log_info(F'reading {count} embedded executables')
        for _ in range(count):
            fa = FatArch(reader)
            self.log_info(F'reading item of size 0x{len(fa.data):08X}, arch {fa.cputype.name}')
            yield self._pack(fa.cputype.name, None, fa.data)

    @classmethod
    def handles(cls, data) -> bool:
        """
        Return whether the first four bytes of the data equal one of the two known signatures.
        """
        return data[:4] in (
            cls._SIGNATURE_BE,
            cls._SIGNATURE_LE,
        )

class xtmagtape

This unit is implemented in refinery.units.formats.archive.xtmagtape and has the following commandline Interface:

usage: xtmagtape [-h] [-L] [-Q] [-0] [-v]

Extract files from SIMH magtape files.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class xtmagtape(Unit):
    """
    Extract files from SIMH magtape files.
    """
    def process(self, data: bytearray):
        reader = StructReader(data)

        for record in itertools.count():
            collected = MemoryFile()

            for part in itertools.count():
                # Each chunk starts with four bytes: they are remembered verbatim for the footer
                # comparison and also parsed as a 24-bit size followed by a one-byte marker.
                try:
                    head = reader.peek(4)
                    size = reader.read_integer(24)
                    mark = reader.read_byte()
                except EOFError:
                    self.log_info('end of file while reading chunk header, terminating')
                    return
                # An all-zero header ends the current record. When it is the very first header
                # of a record, extraction stops altogether.
                if not any(head):
                    if part == 0:
                        return
                    break
                if mark != 0:
                    self.log_warn(F'error code 0x{mark:02X} in record {record}.{part}')
                collected.write(reader.read(size))
                # The header is expected to be repeated after the payload. If it is not found
                # right away, a single padding byte is tolerated at odd reader positions.
                if reader.peek(4) != head:
                    if not (reader.tell() % 2 and reader.peek(5)[1:] == head):
                        raise ValueError('Invalid footer, data is corrupted.')
                    padding = reader.read_byte()
                    if padding != 0:
                        self.log_info(F'nonzero padding byte in record {record}.{part}')
                reader.seekrel(4)

            yield collected.getvalue()

class xtmail (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.email and has the following commandline Interface:

usage: xtmail [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path ...]

Extract files and body from EMail messages. The unit supports both the Outlook message format and
regular MIME documents.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtmail(PathExtractorUnit):
    """
    Extract files and body from EMail messages. The unit supports both the Outlook message format
    and regular MIME documents.
    """
    def _get_headparts(self, head: Iterable[tuple[str, str]]):
        """
        Emit the synthetic items headers.txt (one "key: value" line per header) and headers.json
        (headers grouped by name). When every Received header starts with "from ", the JSON
        document also receives a ReceivedTrace list of source/target pairs in reversed order.
        """
        def normalize_spaces(value: str):
            return ''.join(re.sub(R'\A\s+', '\x20', t) for t in value.splitlines(False))

        # The header pairs are consumed twice (grouping below and the headers.txt callback), so
        # they are materialized once to also support one-shot iterables.
        head = list(head)

        _headers: dict[str, list[str]] = {}
        for key, value in head:
            _headers.setdefault(key, []).append(mimewords.convert(normalize_spaces(value)))
        headers = {
            key: value[0] if len(value) == 1 else [t for t in value if t]
            for key, value in _headers.items()}

        yield UnpackResult('headers.txt',
            lambda h=head: '\n'.join(F'{k}: {v}' for k, v in h).encode(self.codec))

        # A header that occurs exactly once is stored as a plain string rather than a list.
        # Without this normalization, the loop below would iterate over single characters and
        # never produce a trace for messages with only one Received header.
        received_lines = headers.get('Received', [])
        if isinstance(received_lines, str):
            received_lines = [received_lines]

        received = []

        for recv in received_lines:
            if not recv.startswith('from '):
                # One unexpected line invalidates the whole trace.
                received = None
                break
            recv = recv[5:]
            src, _, rest = recv.partition(' by ')
            dst, _, rest = rest.partition(' with ')
            received.append({
                'Source': src.partition('\x20')[0],
                'Target': dst.partition('\x20')[0],
            })

        if received:
            received.reverse()
            headers['ReceivedTrace'] = received

        yield UnpackResult('headers.json',
            lambda jsn=headers: json.dumps(jsn, indent=4).encode(self.codec))

    @PathExtractorUnit.Requires('extract-msg', ['formats', 'office', 'default', 'extended'])
    def _extract_msg():
        import extract_msg.enums
        return extract_msg

    def _get_parts_outlook(self, data):
        """
        Parse the input as an Outlook message and emit the header items, the message bodies, and
        all attachments, including the bodies of attached messages.
        """
        def ensure_bytes(data: bytes | str | None):
            if data is None:
                return B''
            elif isinstance(data, str):
                return data.encode(self.codec)
            else:
                return data

        def make_message(name, msg: Message):
            # One item is emitted per detected body type; any error while reading a body results
            # in empty output for that item.
            bodies = msg.detectedBodies
            BT = self._extract_msg.enums.BodyTypes
            if bodies & BT.HTML:
                def htm(msg=msg):
                    with NoLogging():
                        try:
                            return ensure_bytes(msg.htmlBody)
                        except Exception:
                            return B''
                yield UnpackResult(F'{name}.htm', htm)
            if bodies & BT.PLAIN:
                def txt(msg=msg):
                    with NoLogging():
                        try:
                            return ensure_bytes(msg.body)
                        except Exception:
                            return B''
                yield UnpackResult(F'{name}.txt', txt)
            if bodies & BT.RTF:
                def rtf(msg=msg):
                    with NoLogging():
                        try:
                            return ensure_bytes(msg.rtfBody)
                        except Exception:
                            return B''
                yield UnpackResult(F'{name}.rtf', rtf)

        msgcount = 0

        with NoLogging():
            class ForgivingMessage(self._extract_msg.Message):
                """
                If parsing the input bytes fails early, the "__open" private attribute may not
                yet exist. This hack prevents an exception from occurring in the destructor.
                """
                def __getattr__(self, key: str):
                    if key.endswith('_open'):
                        return False
                    raise AttributeError(key)
            msg = ForgivingMessage(bytes(data))

        header = dict(msg.header)

        # Values exposed as message properties take precedence over the raw header entries.
        if x := msg.date:
            header['Date'] = email.utils.format_datetime(x)
        if x := msg.sender:
            header['From'] = x
        if x := msg.to:
            header['To'] = x
        if x := msg.cc:
            header['Cc'] = x
        if x := msg.bcc:
            header['Bcc'] = x
        if x := msg.messageId:
            header['Message-Id'] = x
        if x := msg.subject:
            header['Subject'] = x

        # Strip whitespace and null characters; headers that end up empty are removed.
        for key, val in list(header.items()):
            if val := val.strip().replace('\0', ''):
                header[key] = val
            else:
                del header[key]

        yield from self._get_headparts(header.items())
        yield from make_message('body', msg)

        def attachments(msg):
            # Recursively walk attachments; anything that is not of type 'data' is searched for
            # nested attachments.
            for attachment in getattr(msg, 'attachments', ()):
                yield attachment
                if attachment.type == 'data':
                    continue
                yield from attachments(attachment.data)

        for attachment in attachments(msg):
            at = attachment.type
            if at is self._extract_msg.enums.AttachmentType.MSG:
                msgcount += 1
                yield from make_message(F'attachments/msg_{msgcount:d}', attachment.data)
                continue
            if not isbuffer(attachment.data):
                self.log_warn(F'unknown attachment of type {at}, please report this!')
                continue
            path = attachment.longFilename or attachment.shortFilename
            path = path.rstrip('\0')
            yield UnpackResult(F'attachments/{path}', attachment.data)

    @PathExtractorUnit.Requires('chardet', ['default', 'extended'])
    def _chardet():
        import chardet
        return chardet

    def _get_parts_regular(self, data: bytes):
        """
        Parse the input as a plaintext (MIME) message and emit the header items followed by one
        item per message part that has content.

        Raises ValueError when the input cannot be decoded with the detected encoding.
        """
        try:
            info = self._chardet.detect(data)
            msg = data.decode(str(info['encoding']))
        except (UnicodeDecodeError, LookupError):
            # LookupError covers the case where no usable encoding name was detected; decoding
            # with an unknown codec name would otherwise escape as an unrelated error.
            raise ValueError('This is not a plaintext email message.')
        else:
            msg = Parser().parsestr(msg)

        yield from self._get_headparts(msg.items())

        for k, part in enumerate(msg.walk()):
            path = part.get_filename()
            error_message = None
            result = None
            if path is None:
                extension = file_extension(part.get_content_type(), 'txt')
                path = F'body.{extension}'
            else:
                path = path | mimewords | str
                path = F'attachments/{path}'
            try:
                payload = part.get_payload(decode=True)
                if payload is None or isinstance(payload, bytes):
                    result = payload
                else:
                    raise TypeError
            except Exception as decode_error:
                # Regular decoding failed: fall back to the raw payload and try to carve a
                # base64 blob out of it.
                try:
                    payload = part.get_payload(decode=False)
                except Exception as raw_error:
                    error_message = str(raw_error)
                else:
                    from refinery.units.pattern.carve import carve
                    self.log_warn(F'manually decoding part {k}, data might be corrupted: {path}')
                    if isinstance(payload, str):
                        payload = payload.encode('latin1')
                    if payload := asbuffer(payload):
                        # A default is required here: an exhausted iterator would raise
                        # StopIteration, which surfaces as a RuntimeError inside a generator.
                        result = next(
                            payload | carve('b64', stripspace=True, single=True, decode=True), None)
                    if result is None:
                        error_message = str(decode_error)
            if not result:
                if error_message is not None:
                    self.log_warn(F'could not get content of message part {k}: {error_message!s}')
                continue
            yield UnpackResult(path, result)

    def unpack(self, data):
        # Compound document files are treated as Outlook messages, everything else as MIME.
        if data[:len(CDFv2_MARKER)] == CDFv2_MARKER:
            yield from self._get_parts_outlook(data)
        else:
            yield from self._get_parts_regular(data)

    @classmethod
    def handles(cls, data) -> bool:
        return is_likely_email(data)

class xtmsi (*paths, list=False, path=b'path', join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, nocab=False)

This unit is implemented in refinery.units.formats.msi and has the following commandline Interface:

usage: xtmsi [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-P NAME] [-j | -d] [-z | -e] [-r] [-N]
             [path ...]

Extract files and metadata from Microsoft Installer (MSI) archives. The synthetic file
MsiTables.json contains parsed MSI table information, similar to the output of the Orca tool.
Binary streams are placed in a virtual folder called "Binary", and extracted scripts from custom
actions are separately extracted in a virtual folder named "Action".

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -N, --nocab      Do not list and extract embedded CAB archives.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtmsi(xtdoc):
    """
    Extract files and metadata from Microsoft Installer (MSI) archives. The synthetic file {FN} contains
    parsed MSI table information, similar to the output of the Orca tool. Binary streams are placed in a
    virtual folder called "Binary", and extracted scripts from custom actions are separately extracted in
    a virtual folder named "Action".
    """

    _SYNTHETIC_STREAMS_FILENAME = 'MsiTables.json'
    _SYNTHETIC_STREAMS_TOPLEVEL = 'MsiTables'

    # https://learn.microsoft.com/en-us/windows/win32/msi/summary-list-of-all-custom-action-types
    _CUSTOM_ACTION_TYPES = {
        0x01: 'DLL file stored in a Binary table stream.',
        0x02: 'EXE file stored in a Binary table stream.',
        0x05: 'JScript file stored in a Binary table stream.',
        0x06: 'VBScript file stored in a Binary table stream.',
        0x11: 'DLL file that is installed with a product.',
        0x12: 'EXE file that is installed with a product.',
        0x13: 'Displays a specified error message and returns failure, terminating the installation.',
        0x15: 'JScript file that is installed with a product.',
        0x16: 'VBScript file that is installed with a product.',
        0x22: 'EXE file having a path referencing a directory.',
        0x23: 'Directory set with formatted text.',
        0x25: 'JScript text stored in this sequence table.',
        0x26: 'VBScript text stored in this sequence table.',
        0x32: 'EXE file having a path specified by a property value.',
        0x33: 'Property set with formatted text.',
        0x35: 'JScript text specified by a property value.',
        0x36: 'VBScript text specified by a property value.',
    }

    def __init__(
            self, *paths,
            list=False, path=b'path', join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False,
            nocab: Param[bool, Arg.Switch('-N', help='Do not list and extract embedded CAB archives.')] = False, **kw,
    ):
        # Every argument, including the nocab switch declared in this signature, is handed to the
        # parent class initializer by keyword without modification.
        super().__init__(
            *paths,
            list=list,
            path=path,
            join_path=join_path,
            drop_path=drop_path,
            nocab=nocab,
            fuzzy=fuzzy,
            exact=exact,
            regex=regex,
            **kw,
        )

    def unpack(self, data):
        streams = {result.path: result for result in super().unpack(data)}

        def stream(name: str):
            return streams.pop(name).get_data()

        def column_formats(table: dict[str, MSITableColumnInfo]) -> str:
            return ''.join(v.struct_format for v in table.values())

        def stream_to_rows(data: buf, row_format: str):
            row_size = struct.calcsize(F'<{row_format}')
            row_count = int(len(data) / row_size)
            reader = StructReader(data)
            columns = [reader.read_struct(F'<{sc * row_count}') for sc in row_format]
            for i in range(row_count):
                yield [c[i] for c in columns]

        tables: dict[str, dict[str, MSITableColumnInfo]] = collections.defaultdict(collections.OrderedDict)
        strings = MSIStringData(stream('!_StringData'), stream('!_StringPool'))

        for tbl_name_id, col_number, col_name_id, col_attributes in stream_to_rows(stream('!_Columns'), 'HHHH'):
            tbl_name = strings.ref(tbl_name_id)
            col_name = strings.ref(col_name_id)
            tables[tbl_name][col_name] = MSITableColumnInfo(col_number, col_attributes)

        table_names_given = {strings.ref(k) for k in chunks.unpack(stream('!_Tables'), 2, False)}
        table_names_known = set(tables)

        for name in table_names_known - table_names_given:
            self.log_warn(F'table name known but not given: {name}')
        for name in table_names_given - table_names_known:
            self.log_warn(F'table name given but not known: {name}')

        class ScriptItem(NamedTuple):
            row_index: int
            extension: str | None

        processed_table_data: dict[str, list[dict[str, str]]] = {}
        tbl_properties: dict[str, str] = {}
        tbl_files: dict[str, str] = {}
        tbl_components: dict[str, str] = {}
        postprocessing: list[ScriptItem] = []

        def format_string(string: str):
            # https://learn.microsoft.com/en-us/windows/win32/msi/formatted
            def _replace(match: re.Match[str]):
                _replace.done = False
                prefix, name = match.groups()
                if not prefix:
                    tbl = tbl_properties
                elif prefix in '%':
                    name = name.rstrip('%').upper()
                    return F'%{name}%'
                elif prefix in '!#':
                    tbl = tbl_files
                elif prefix in '$':
                    tbl = tbl_components
                else:
                    raise ValueError
                return tbl.get(name, '')
            while True:
                _replace.done = True
                string = re.sub(R'''(?x)
                    \[             # open square bracket
                      (?![~\\])    # not followed by escapes
                      ([%$!#]?)    # any of the valid prefix characters
                      ([^[\]{}]+)  # no brackets or braces
                    \]''', _replace, string)
                if _replace.done:
                    break
            string = re.sub(r'\[\\(.)\]', r'\1', string)
            string = string.replace('[~]', '\0')
            return string

        for table_name, table in tables.items():
            stream_name = F'!{table_name}'
            if stream_name not in streams:
                continue
            processed = []
            info = list(table.values())
            keys = list(table.keys())
            temp = [k.strip('_') for k in keys]
            if len(set(keys)) == len(set(temp)):
                keys = temp
            for r, row in enumerate(stream_to_rows(stream(stream_name), column_formats(table))):
                values = []
                for index, value in enumerate(row):
                    vt = info[index].type
                    if vt is MsiType.Long:
                        if value != 0:
                            value -= 0x80000000
                    elif vt is MsiType.Short:
                        if value != 0:
                            value -= 0x8000
                    elif value in strings:
                        value = strings.ref(value)
                    elif not info[index].is_integer:
                        value = ''
                    values.append(value)
                if table_name == 'Property':
                    tbl_properties[values[0]] = values[1]
                if table_name == 'File':
                    tbl_properties[values[0]] = values[2]
                if table_name == 'Component':
                    tbl_properties[values[0]] = F'%{values[2]}%'
                entry = dict(zip(keys, values))
                einfo = {t: i for t, i in zip(keys, info)}
                if table_name == 'MsiFileHash':
                    entry['Hash'] = struct.pack(
                        '<IIII',
                        row[2] ^ 0x80000000,
                        row[3] ^ 0x80000000,
                        row[4] ^ 0x80000000,
                        row[5] ^ 0x80000000,
                    ).hex()
                if table_name == 'CustomAction':
                    code = row[1] & 0x3F
                    try:
                        entry['Comment'] = self._CUSTOM_ACTION_TYPES[code]
                    except LookupError:
                        pass
                    t = einfo.get('Target')
                    c = {0x25: 'js', 0x26: 'vbs', 0x33: None}
                    if code in c and t and not t.is_integer:
                        postprocessing.append(ScriptItem(r, c[code]))
                processed.append(entry)
            if processed:
                processed_table_data[table_name] = processed

        ca = processed_table_data.get('CustomAction', None)
        for item in postprocessing:
            entry = ca[item.row_index]
            try:
                path: str = entry['Action']
                data: str = entry['Target']
            except KeyError:
                continue
            root = F'Action/{path}'
            if item.extension:
                path = F'{root}.{item.extension}'
                streams[path] = UnpackResult(path, data.encode(self.codec))
                continue
            data = format_string(data)
            parts = [part.partition('\x02') for part in data.split('\x01')]
            if not all(part[1] == '\x02' for part in parts):
                continue
            for name, _, script in parts:
                if not name.lower().startswith('script'):
                    continue
                if not script:
                    continue
                path = F'{root}.{name}'
                streams[path] = UnpackResult(path, script.encode(self.codec))

        for ignored_stream in [
            'SummaryInformation',
            'DocumentSummaryInformation',
            'DigitalSignature',
            'MsiDigitalSignatureEx'
        ]:
            if r := streams.pop(F'[5]{ignored_stream}', None):
                r.path = F'Meta/{ignored_stream}'
                yield r

        inconsistencies = 0
        w1 = len(str(len(strings)))
        w2 = len(str(max(max(strings.computed_ref_count), max(strings.provided_ref_count))))
        for k in range(len(strings)):
            c = strings.computed_ref_count[k]
            p = strings.provided_ref_count[k]
            if c != p and not self.log_debug(F'string {k:0{w1}d} reference count computed={c:0{w2}d} provided={p:0{w2}d}'):
                inconsistencies += 1
        if inconsistencies:
            self.log_info(F'found {inconsistencies} incorrect string reference counts')

        def fix_msi_path(path: str):
            prefix, dot, name = path.partition('.')
            if dot == '.' and prefix in processed_table_data:
                path = F'{prefix}/{name}'
            return path

        if self.args.nocab:
            cabs = {}
        else:
            def _iscab(path):
                return media_info and any(item.get('Cabinet', '') == F'#{path}' for item in media_info)
            media_info: list[JSONDict] = processed_table_data.get('Media', [])
            cabs: dict[str, UnpackResult] = {
                path: item for path, item in streams.items() if _iscab(path)}
            for cab in cabs:
                self.log_info(F'found cab file: {cab}')
        if cabs:
            file_names: dict[str, JSONDict] = {}

            for file_info in processed_table_data.get('File', []):
                try:
                    src_name = file_info['File']
                    dst_name = file_info['FileName']
                except KeyError:
                    continue
                _, _, long = dst_name.partition('|')
                dst_name = long or dst_name
                file_names[src_name] = dst_name

            for path, cab in cabs.items():
                try:
                    _cabinet = Cabinet(cab.get_data())
                    unpacked = _cabinet.process().get_files()
                except Exception as e:
                    self.log_info(F'unable to extract embedded cab file: {e!s}')
                    continue
                base, dot, ext = path.rpartition('.')
                if dot == '.' and ext.lower() == 'cab':
                    path = base
                else:
                    del streams[path]
                    cab.path = F'{path}.cab'
                    streams[cab.path] = cab
                for result in unpacked:
                    sub_path = file_names.get(result.name, result.name)
                    sub_path = self._get_path_separator().join((path, sub_path))
                    streams[sub_path] = UnpackResult(sub_path, lambda r=result: r.decompress())

        streams = {fix_msi_path(path): item for path, item in streams.items()}
        ds = UnpackResult(self._SYNTHETIC_STREAMS_FILENAME,
                json.dumps(processed_table_data, indent=4).encode(self.codec))
        streams[ds.path] = ds

        converter = csv()
        for key, data in processed_table_data.items():
            sk = key.strip('_')
            if sk not in processed_table_data:
                key = sk
            try:
                tbl = UnpackResult(F'{self._SYNTHETIC_STREAMS_TOPLEVEL}/{key}.csv', converter.json_to_csv(data))
            except Exception:
                continue
            streams[tbl.path] = tbl

        for path in sorted(streams):
            streams[path].path = path
            yield streams[path]

    @classmethod
    def handles(cls, data):
        """
        Delegate the format check to `is_likely_msi` and return its verdict unchanged.
        """
        verdict = is_likely_msi(data)
        return verdict

class xtnode (*paths, entry=False, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date')

This unit is implemented in refinery.units.formats.archive.xtnode and has the following command-line interface:

usage: xtnode [-h] [-L] [-Q] [-0] [-v] [-F] [-u] [-l] [-j | -d] [-z | -e] [-r] [-P NAME]
              [-D NAME]
              [path ...]

Extracts and decompiles files from compiled Node.js applications. Supports both nexe and pkg, two
utilities that are commonly used to generate stand-alone executables.

This unit is a path extractor which extracts data from a hierarchical structure. Each extracted
item is emitted as a separate chunk and has attached to it a meta variable that contains its path
within the source structure. The positional arguments to the command are patterns that can be
used to filter the extracted items by their path. To view only the paths of all chunks, use the
listing switch:

    emit something | xtnode --list

Otherwise, extracted items are written to the standard output port and usually require a frame to
properly process. In order to dump all extracted data to disk, the following pipeline can be
used:

    emit something | xtnode [| dump {path} ]

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -u, --entry      Only extract the entry point.
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtnode(ArchiveUnit, docs='{0}{p}{PathExtractorUnit}'):
    """
    Extracts and decompiles files from compiled Node.js applications. Supports both nexe and pkg, two
    utilities that are commonly used to generate stand-alone executables.
    """

    # Byte sequence that identifies a nexe bundle; two 8-byte size fields are read right after it.
    _NEXE_SENTINEL = B'<nexe~~sentinel>'
    # Names of the quoted numeric assignments that give position and size of the pkg payload.
    _PKG_PAYLOAD_P = B'PAYLOAD_POSITION'
    _PKG_PAYLOAD_S = B'PAYLOAD_SIZE'
    # Names of the quoted numeric assignments that give position and size of the pkg prelude.
    _PKG_PRELUDE_P = B'PRELUDE_POSITION'
    _PKG_PRELUDE_S = B'PRELUDE_SIZE'
    # Marker inside the pkg prelude; the JSON file table is read from the "{" that follows it.
    _PKG_COMMON_JS = B'sourceMappingURL=common.js.map'

    def __init__(
        self, *paths, entry: Param[bool, Arg.Switch('-u', help='Only extract the entry point.')] = False,
        list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False,
        path=b'path', date=b'date',
    ):
        super().__init__(*paths, entry=entry,
            list=list, join_path=join_path, drop_path=drop_path, fuzzy=fuzzy, exact=exact, regex=regex,
            path=path, date=date)

    def unpack(self, data: buf) -> Iterable[UnpackResult]:
        """
        Unpack the input as a nexe bundle if the nexe sentinel is present; otherwise unpack it as
        a pkg bundle if all pkg markers are present. Input that matches neither produces no output.
        """
        if self._is_nexe(data):
            self.log_info('unpacking as nexe')
            yield from self._unpack_nexe(data)
            return
        if self._is_pkg(data):
            self.log_info('unpacking as pkg')
            yield from self._unpack_pkg(data)
            return

    def _unpack_nexe(self, data: buf):
        """
        Yield the files of a nexe bundle.

        For every occurrence of the sentinel, the 16 bytes after it are read as two f64 values:
        the size of a code section and the size of a blob section. Both sections are expected to
        end where the sentinel starts, code first and blob second. The code is searched for
        `process.__nexe =` assignments whose JSON value maps each path to an (offset, length)
        pair inside the blob. When the entry switch is set and an entry point was identified,
        only that path is emitted.
        """
        try:
            ep = re.compile(
                RB"entry\s*=\s*path\.resolve\(path\.dirname\(process\.execPath\),\s*(%s)\)" % formats.string)
            # Unpacking into a 1-tuple fails unless the pattern matches exactly once.
            ep, = ep.finditer(data)
        except Exception:
            ep = None
            self.log_info('could not identify entry point')
        else:
            ep = ep.group(1) | esc(quoted=True) | str
            self.log_info(F'entry point: {ep}')
        view = memoryview(data)
        for marker in re.finditer(re.escape(self._NEXE_SENTINEL), data):
            end = marker.end() + 16
            sizes = data[marker.end():end]
            if sizes.startswith(b"')"):
                # Presumably the sentinel as a quoted string inside the loader script rather
                # than a real payload marker; there are no size fields to read here.
                continue
            try:
                reader = StructReader(sizes)
                code_size = int(reader.f64())
                blob_size = int(reader.f64())
            except (EOFError, ValueError, OverflowError):
                # Fewer than 16 bytes after the sentinel, or the doubles are NaN or infinite.
                self.log_debug(F'found marker at 0x{marker.start():X}, but failed to read sizes')
                continue
            start = marker.start() - code_size - blob_size
            if code_size < 0 or blob_size < 0 or start < 0:
                # A negative start would be interpreted as an index relative to the end of the
                # buffer and silently select unrelated data.
                self.log_debug(F'found marker at 0x{marker.start():X}, but the sizes are out of bounds')
                continue
            try:
                reader = StructReader(view[start:end])
                code = reader.read_exactly(code_size)
                blob = reader.read_exactly(blob_size)
            except EOFError:
                self.log_debug(F'found marker at 0x{marker.start():X}, but failed to read data')
                continue
            else:
                self.log_debug(F'found marker at 0x{marker.start():X}, data start at {start:X}')
            for assignment in re.finditer(RB'process\.__nexe\s*=', code):
                table = JSONReader(code[assignment.end():]).read_json()
                if len(table) == 1:
                    # A single top-level item is treated as a wrapper around the file table.
                    _, table = table.popitem()
                for path, (offset, length) in table.items():
                    stop = offset + length
                    if ep and self.args.entry and path != ep:
                        continue
                    yield UnpackResult(path, blob[offset:stop])

    def _unpack_pkg(self, data: buf):
        """
        Yield the files of a pkg bundle.

        Payload and prelude are located through the quoted numeric assignments named by the
        `_PKG_*` constants. The prelude is parsed for a JSON file table, the entry point, and a
        table of links. Paths are shortened by a prefix derived from the first path in the file
        table, linked files are added under their link destination, and each file's content is
        taken from the payload. Raises `ValueError` if payload, prelude, the common.js marker, or
        the file table cannot be obtained.
        """
        def _extract_coordinates(*v: bytes):
            for name in v:
                pattern = name + BR'''\s{0,3}=\s{0,3}(['"])([\s\d]+)\1'''
                # Fails unless the assignment occurs exactly once in the input.
                value, = re.finditer(pattern, data)
                yield int(value.group(2).decode('utf8').strip(), 0)

        def _extract_data(*v: bytes):
            try:
                offset, length = _extract_coordinates(*v)
            except Exception:
                return None
            return data[offset:offset + length]

        payload = _extract_data(self._PKG_PAYLOAD_P, self._PKG_PAYLOAD_S)
        if not payload:
            raise ValueError('unable to extract payload')
        prelude = _extract_data(self._PKG_PRELUDE_P, self._PKG_PRELUDE_S)
        if not prelude:
            raise ValueError('unable to extract prelude')
        mapping = re.search(re.escape(self._PKG_COMMON_JS) + BR'\s*\},\s*\{', prelude)
        if not mapping:
            raise ValueError('unable to find common.js mapping')

        # The pattern ends with "{"; step back one byte so that the reader starts on it.
        reader = JSONReader(prelude[mapping.end() - 1:])

        files: dict[str, dict] = reader.read_json()

        if files is None:
            raise ValueError('failed to read file list')

        entry = reader.skip_comma().read_string()
        links = reader.skip_comma().read_json()

        # Further values follow in the prelude; they are currently not read:
        # _unknown1 = reader.skip_comma().read_json()
        # _unknown2 = reader.skip_comma().read_terminated_array(B')').strip()

        root = next(iter(files))
        skip = 0
        view = memoryview(payload)

        # Grow a prefix of the first path until some path no longer starts with it; the number
        # of characters before that point is removed from every emitted path.
        for k in range(len(root) + 1):
            test = root[:k].rstrip('/').rstrip('\\')
            if not all(path.startswith(test) for path in files):
                root = test[:-1]
                skip = k - 1
                break

        entry = entry[skip:]
        self.log_info(F'detected root directory {root}, entry point is {entry}')

        for src, dst in links.items():
            new_files = {}
            self.log_info('link src:', src[skip:])
            self.log_info('link dst:', dst[skip:])
            for path, location in files.items():
                if not path.startswith(src):
                    continue
                new_path = dst + path[len(src):]
                new_files[new_path] = location
                self.log_debug('synthesizing linked file:', new_path)
            files.update(new_files)

        for path, location in files.items():
            path = path[skip:]
            if entry and self.args.entry and path != entry:
                continue
            # Use a dedicated name instead of rebinding the `data` parameter, which the nested
            # helper functions above close over.
            content = None
            for kind, (offset, length) in location.items():
                stop = offset + length
                if kind == '3':  # metadata
                    continue
                if kind == '2':  # unknown
                    continue
                # Compare against the exact keys; a substring test on '01' would also accept
                # the empty string and '01'.
                if kind in ('0', '1'):
                    content = view[offset:stop]
            if content is not None:
                yield UnpackResult(path, content)

    @classmethod
    def _is_nexe(cls, data: buf) -> bool:
        """
        Return whether the nexe sentinel occurs in the data.
        """
        return cls._NEXE_SENTINEL in data

    @classmethod
    def _is_pkg(cls, data: buf) -> bool:
        """
        Return whether all five pkg marker strings occur in the data.
        """
        markers = (
            cls._PKG_PAYLOAD_P,
            cls._PKG_PAYLOAD_S,
            cls._PKG_PRELUDE_P,
            cls._PKG_PRELUDE_S,
            cls._PKG_COMMON_JS,
        )
        return all(marker in data for marker in markers)

    @classmethod
    def handles(cls, data: buf) -> bool | None:
        """
        Return whether the data is recognized as either a nexe or a pkg bundle.
        """
        return cls._is_nexe(data) or cls._is_pkg(data)

class xtnsis (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xtnsis and has the following command-line interface:

usage: xtnsis [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME]
              [-p PWD]
              [path ...]

Extract files from NSIS archives. This unit is a path extractor which extracts data from a
hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it
a meta variable that contains its path within the source structure. The positional arguments to
the command are patterns that can be used to filter the extracted items by their path. To view
only the paths of all chunks, use the listing switch:

    emit something | xtnsis --list

Otherwise, extracted items are written to the standard output port and usually require a frame to
properly process. In order to dump all extracted data to disk, the following pipeline can be
used:

    emit something | xtnsis [| dump {path} ]

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtnsis(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extract files from NSIS archives.
    """

    @classmethod
    def _find_archive_offset(cls, data: bytearray, before: int = -1, flawmax=2):
        """
        Search the data for an NSIS archive signature and return the offset of the archive, which
        is taken to start 4 bytes before the signature. Returns `None` when nothing was found.

        Each entry of `NSArchive.MAGICS` is tried first unmodified and then with up to `flawmax`
        of its bytes replaced by a wildcard. For each signature, the last candidate offset that
        is a multiple of 0x200 is returned immediately. If no signature yields such an aligned
        offset, the first candidate of the first signature that matched at all is returned as a
        best guess. When `before` is non-negative, only offsets strictly below it are considered.
        """
        def signatures(*magics):
            for changes in range(flawmax + 1):
                for magic in magics:
                    if not changes:
                        yield 0, magic
                        continue
                    # The order of wildcard positions is irrelevant, so combinations produce
                    # every distinct signature exactly once.
                    for positions in itertools.combinations(range(len(magic)), r=changes):
                        signature = bytearray(magic)
                        for p in positions:
                            # 0x2E is the dot, which matches any byte under re.DOTALL
                            signature[p] = 0x2E
                        yield changes, bytes(signature)
        best_guess = None
        search_space = memoryview(data)
        for flaws, sig in signatures(*NSArchive.MAGICS):
            if flaws > 1:
                # Signatures with more than one wildcard are only searched near the start.
                search_space = search_space[:0x20_000]
            candidates = (m.start() - 4 for m in re.finditer(sig, search_space, flags=re.DOTALL))
            # A negative offset cannot be the start of an archive. Returning one would also make
            # the caller pass a negative `before`, which disables the filter below.
            matches = [c for c in candidates if c >= 0 and (before < 0 or c < before)]
            matches.reverse()
            archive = next((match for match in matches if match % 0x200 == 0), None)
            # Offset 0 is a valid result, so compare against None rather than testing truth.
            if archive is None:
                if matches and best_guess is None:
                    best_guess = matches[-1]
                continue
            msg = F'Archive signature was found at offset 0x{archive:X}'
            if flaws > 0:
                msg = F'{msg}; it has {flaws} imperfections and was likely modified'
            cls.log_info(F'{msg}.')
            return archive
        if best_guess is not None:
            cls.log_info(F'A signature was found at offset 0x{best_guess:08X}; it is not properly aligned.')
            return best_guess
        return None

    def unpack(self, data):
        """
        Locate an NSIS archive in the data and yield its items, followed by the raw header data
        as `setup.bin` and the archive script as `setup.nsis`.

        When parsing fails at a candidate offset, the search is repeated for offsets below it.
        Once no candidate is left, the most recent parsing error is raised, or a `ValueError` if
        no candidate was ever found.
        """
        memory = memoryview(data)
        before = -1
        _error = None
        while True:
            offset = self._find_archive_offset(data, before)
            if offset is None:
                _error = _error or ValueError('Unable to find an NSIS archive marker.')
                raise _error
            try:
                arc = NSArchive(memory[offset:])
            except Exception as e:
                _error = e
                before = offset
            else:
                break

        def info():
            yield F'{arc.header.type.name} archive'
            yield F'compression type {arc.method.value}'
            yield F'mystery value 0x{arc.header.unknown_value:X}'
            yield 'solid archive' if arc.solid else 'fragmented archive'
            yield '64-bit header' if arc.header.is64bit else '32-bit header'
            yield 'unicode' if arc.header.unicode else 'ascii'

        self.log_info(', '.join(info()))

        for item in arc.header.items:
            # The default argument binds the current item; the callable is invoked later.
            yield self._pack(item.path, item.mtime, lambda i=item: arc._extract_item(i).data)

        yield self._pack('setup.bin', None, arc.header_data)
        yield self._pack('setup.nsis', None, arc.script.encode(self.codec))

    @classmethod
    def handles(cls, data) -> bool:
        """
        Return whether any of the unmodified NSIS magic sequences occurs in the data.
        """
        return any(buffer_contains(data, magic) for magic in NSArchive.MAGICS)

class xtnuitka (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.archive.xtnuitka and has the following command-line interface:

usage: xtnuitka [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path ...]

Extracts files packed by Nuitka using the --onefile option.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtnuitka(PathExtractorUnit):
    """
    Extracts files packed by Nuitka using the --onefile option.
    """
    # Two-byte sequence that a payload is expected to start with.
    _MAGIC = B'KA'

    @PathExtractorUnit.Requires('pyzstd', ['arc'])
    def _pyzstd():
        import pyzstd
        return pyzstd

    def unpack(self, data: buf) -> Iterable[UnpackResult]:
        """
        Yield every file stored in the payload. Input that starts with `MZ` is searched for
        payload candidates via `_pe_candidates`; any other input is parsed as the payload itself.
        A warning is logged for a truncated payload and for a payload with an unexpected magic.
        """
        class NuitkaData(Struct):
            # Gives the parser access to the unit's `_pyzstd` dependency.
            unit = self

            def __init__(self, reader: StructReader):
                # Layout: 2 bytes of magic, 1 byte compression flag, then the list of entries,
                # which is decompressed with zstd first when the flag is set.
                self.magic = reader.read_exactly(2)
                self.compression_flag = reader.read_exactly(1)
                if self.compressed:
                    zd = self.unit._pyzstd.ZstdDecompressor()
                    reader = StructReader(zd.decompress(reader.read()))
                self.files = {}
                self.truncated = False
                # Each entry is a UTF-16 path, a 64-bit size, and that many bytes of content;
                # an empty path ends the list.
                while not reader.eof:
                    path = reader.read_w_string('utf-16')
                    if not path:
                        break
                    size = reader.u64()
                    data = reader.read(size)
                    if len(data) == size:
                        self.files[path] = data
                    else:
                        self.truncated = True

            @property
            def compressed(self):
                return self.compression_flag == b'Y'

        if data.startswith(b'MZ'):
            arcs = list(self._pe_candidates(data))
        else:
            arcs = [data]

        for arc in arcs:
            archive = NuitkaData(arc)
            if archive.truncated:
                self.log_warn('the archive is truncated')
            if archive.magic != self._MAGIC:
                self.log_warn('the archive data does not start with the correct magic sequence')
            for path, data in archive.files.items():
                yield UnpackResult(path, data)

    @classmethod
    def handles(cls, data: buf) -> bool | None:
        """
        Return whether the data looks like a Nuitka payload. Input that starts with `MZ` is
        accepted if `_pe_candidates` yields at least one candidate; any other input is accepted
        if it starts with the payload magic.
        """
        if data[:2] == b'MZ':
            try:
                next(cls._pe_candidates(data))
            except StopIteration:
                return False
            # A candidate was found: answer explicitly instead of falling through to None.
            return True
        return data[:2] == cls._MAGIC

    @classmethod
    def _pe_candidates(cls, data: buf):
        """
        Yield every blob that starts with the payload magic: first the output of `peoverlay`,
        then each chunk produced by `perc`.
        """

        from refinery.units.formats.pe.peoverlay import peoverlay
        blob = data | peoverlay | bytearray
        if blob.startswith(cls._MAGIC):
            yield blob

        from refinery.units.formats.pe.perc import perc
        for blob in data | perc:
            if blob.startswith(cls._MAGIC):
                yield blob

class xtone (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.office.xtone and has the following command-line interface:

usage: xtone [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path ...]

Extract embedded files from OneNote documents.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtone(PathExtractorUnit):
    """
    Extract embedded files from OneNote documents.
    """
    @PathExtractorUnit.Requires('pyonenote', ['formats', 'office', 'extended'])
    def _pyOneNote():
        import pyOneNote
        import pyOneNote.OneDocument
        return pyOneNote.OneDocument

    def unpack(self, data: bytearray):
        """
        Parse the input with pyOneNote and yield one result per embedded file. Each result is
        named after the file's key in the document, followed by its extension. When the file
        record has no extension, one is derived from the content and prefixed with a dot.
        """
        with MemoryFile(memoryview(data)) as source:
            document = self._pyOneNote.OneDocment(source)
        embedded = document.get_files()
        for identifier, record in embedded.items():
            content = record['content']
            try:
                suffix = record['extension']
            except KeyError:
                magic = get_cached_file_magic_info(content)
                suffix = F'.{magic.extension}'
            name = F'{identifier}{suffix}'
            yield UnpackResult(name, content)

    @classmethod
    def handles(cls, data) -> bool | None:
        """
        Return whether the fixed 16-byte signature below occurs anywhere in the data.
        """
        signature = br'\xE4\x52\x5C\x7B\x8C\xD8\xA7\x4D\xAE\xB1\x53\x78\xD0\x29\x96\xD3'
        found = re.search(signature, data)
        return found is not None

class xtp (*pattern, filter=0, min=1, max=None, len=None, stripspace=False, duplicates=False, longest=False, take=None)

This unit is implemented in refinery.units.pattern.xtp and has the following command-line interface:

usage: xtp [-h] [-L] [-Q] [-0] [-v] [-f] [-n N] [-m N] [-e N] [-x] [-r] [-l] [-t K] [pattern ...]

Extract Patterns: Uses regular expressions to extract indicators from the input data and
optionally filters these results heuristically. The unit is designed to extract indicators such
as domain names and IP addresses, see below for a complete list. To extract data formats such as
hex-encoded data, use carve.

positional arguments:
  pattern           Choose the pattern to extract. The unit uses ('hostname', 'url', 'email') by
                    default. Use an asterisk character to select all available patterns. The
                    available patterns are: domain, email, guid, date, ipv4, ipv6, md5, sha1,
                    sha256, hostname, socket, subdomain, url, btc, pem, xmr, path-terse, path,
                    winpath, nixpath, environment-variable

options:
  -f, --filter      If this setting is enabled, the xtp unit will attempt to reduce the number of
                    false positives by certain crude heuristics. Specify multiple times to make
                    the filtering more aggressive.
  -n, --min N       Matches must have length at least N.
  -m, --max N       Matches must have length at most N.
  -e, --len N       Matches must be of length N.
  -x, --stripspace  Strip all whitespace from input data.
  -r, --duplicates  Yield every (transformed) Match, even when it was found before.
  -l, --longest     Pick longer results first. The output will be sorted by length unless the
                    --take option is specified, in which case the longest K results will be
                    returned in order of appearance.
  -t, --take K      Return only the first K occurrences in order of appearance. If --longest is
                    specified, the K longest results will be returned in order of appearance
                    within the input.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class xtp(PatternExtractor):
    """
    Extract Patterns: Uses regular expressions to extract indicators from the input data and
    optionally filters these results heuristically. The unit is designed to extract indicators
    such as domain names and IP addresses, see below for a complete list. To extract data
    formats such as hex-encoded data, use `refinery.carve`.
    """

    def __init__(
        self,
        *pattern: Param[str, Arg.String('pattern',
            default=(
                indicators.hostname.name,
                indicators.url.name,
                indicators.email.name,
            ), help=(
                'Choose the pattern to extract. The unit uses {{default}} by default. Use an '
                'asterisk character to select all available patterns. The available patterns '
                'are: {}'.format(', '.join(p.display for p in indicators))
            )
        )],
        filter: Param[int, Arg.Counts('-f', help=(
            'If this setting is enabled, the xtp unit will attempt to reduce the number '
            'of false positives by certain crude heuristics. Specify multiple times to '
            'make the filtering more aggressive.')
        )] = 0,
        min=1, max=None, len=None, stripspace=False, duplicates=False, longest=False, take=None
    ):
        # This call has to remain the first statement: vars() forwards the locals that exist at
        # this point, so no additional local variable may be defined before it.
        self.superinit(super(), **vars(), ascii=True, utf16=True)

        # Every given name is treated as a wildcard mask for the display names of the known
        # indicators. The matching indicators are de-duplicated with dict.fromkeys rather than
        # a set because a dictionary preserves insertion order: The order of the alternatives
        # in the combined expression decides which pattern wins when several of them match at
        # the same offset, and the iteration order of a set is not guaranteed to be the same
        # from one interpreter run to the next.
        patterns = dict.fromkeys(
            p for name in pattern for p in indicators if fnmatch(p.display, name)
        )
        # if indicators.hostname in patterns:
        #     patterns.remove(indicators.hostname)
        #     patterns.add(indicators.ipv4)
        #     patterns.add(indicators.domain)
        # Each indicator becomes a named group; the name of the group that matched is later used
        # to determine which indicator type was found.
        patterns = [F'(?P<{p.name}>{p.value})' for p in patterns]
        if not patterns:
            raise RefineryCriticalException('The given mask does not match any known indicator pattern.')
        joined = '|'.join(patterns)
        self.args.pattern = re.compile(joined.encode(self.codec), flags=re.DOTALL)
        self.args.filter = filter

    # All ASCII letters as a byte string; used to skip leading non-letter bytes of an email match.
    _ALPHABETIC = ascii_letters.encode('ASCII')

    # Maps a host name to the filter level starting at which indicators for this host, or any
    # of its subdomains, are excluded from the output: A host with level N is dropped as soon
    # as the filter option was specified at least N times.
    _LEGITIMATE_HOSTS = {
        'acm.org'                 : 1,
        'adobe.com'               : 1,
        'aka.ms'                  : 1,
        'android.com'             : 1,
        'apache.org'              : 1,
        'apple.com'               : 1,
        'archive.org'             : 2,
        'azure.com'               : 1,
        'baidu.com'               : 2,
        'bootstrapcdn.com'        : 2,
        'cdnjs.cloudflare.com'    : 4,
        'comodo.net'              : 1,
        'comodoca.com'            : 1,
        'curl.haxx.se'            : 1,
        'curl.se'                 : 1,
        'digicert.com'            : 1,
        'dublincore.org'          : 1,
        'example.com'             : 1,
        'facebook.com'            : 4,
        'fontawesome.com'         : 1,
        'github.com'              : 3,
        'globalsign.com'          : 1,
        'globalsign.net'          : 1,
        'godaddy.com'             : 1,
        'golang.org'              : 1,
        'google.com'              : 4,
        'googleapis.com'          : 5,
        'googleusercontent.com'   : 5,
        'gov'                     : 2,
        'gstatic.com'             : 2,
        'iana.org'                : 1,
        'ietf.org'                : 1,
        'intel.com'               : 1,
        'jquery.com'              : 1,
        'jsdelivr.net'            : 2,
        'libssh.org'              : 1,
        'live.com'                : 1,
        'microsoft.com'           : 1,
        'mozilla.org'             : 1,
        'msdn.com'                : 1,
        'msn.com'                 : 1,
        'newtonsoft.com'          : 3, # json.net
        'nuget.org'               : 3,
        'office.com'              : 1,
        'office365.com'           : 2,
        'openssl.org'             : 1,
        'openssh.com'             : 1,
        'openxmlformats.org'      : 1,
        'oracle.com'              : 1,
        'purl.org'                : 1,
        'python.org'              : 1,
        'readthedocs.io'          : 1,
        'schema.org'              : 2,
        'sectigo.com'             : 1,
        'skype.com'               : 1,
        'sourceforge.net'         : 4,
        'stackoverflow.com'       : 1,
        'sun.com'                 : 1,
        'sway-cdn.com'            : 1,
        'sway-extensions.com'     : 1,
        'symantec.com'            : 1,
        'symauth.com'             : 1,
        'symcb.com'               : 1,
        'symcd.com'               : 1,
        'sysinternals.com'        : 3,
        'thawte.com'              : 1,
        'unicode.org'             : 2,
        'usertrust.com'           : 1,
        'verisign.com'            : 1,
        'w3.org'                  : 1,
        'wikipedia.org'           : 1,
        'wolfram.com'             : 1,
        'xml.org'                 : 1,
        'xmlsoap.org'             : 1,
        'yahoo.com'               : 1,
    }

    # These suffixes are registered like hosts with level 4: Any host that equals one of them
    # or ends with a dot followed by one of them is excluded at filter level 4 and above. They
    # look like common file extensions and similar tokens (presumably to suppress file names
    # that are mistaken for domains). Note that the loop variable remains in the class namespace
    # after the loop has finished.
    for _ext in [
        'build',
        'data',
        'do',
        'help',
        'java',
        'md',
        'mov',
        'name',
        'py',
        'so',
        'sys',
        'zip',
    ]:
        _LEGITIMATE_HOSTS[_ext] = 4

    # Lower-case host names that are rejected by the host check regardless of the filter level;
    # the entries look like script and framework identifiers rather than real domains.
    _DOMAIN_WHITELIST = {
        'system.net',
        'wscript.shell',
    }

    # Maps the integer value of an opening quote or bracket byte to the corresponding closing
    # byte string. If a match is directly preceded by one of the keys, it is cut off at the
    # first occurrence of the associated closing byte.
    _BRACKETING = {
        B"'"[0]: B"'",
        B'"'[0]: B'"',
        B'('[0]: B')',
        B'{'[0]: B'}',
        B'['[0]: B']',
        B'<'[0]: B'>',
    }

    def _check_host(self, host: str, text: str):
        """
        Check whether an indicator with the given `host` should be kept. The method returns `False`
        when the lower-cased host is one of the forcefully ignored domains, or when it equals or is
        a subdomain of a known legitimate host whose level does not exceed the current filter
        level. In all other cases, the return value is `True`. The `text` is the full indicator and
        is only used for log output.
        """
        lowered = host.lower()
        if lowered in self._DOMAIN_WHITELIST:
            self.log_info(F'excluding indicator because domain {lowered} is forcefully ignored: {text}')
            return False
        for known, level in self._LEGITIMATE_HOSTS.items():
            # entries above the current filter level are not active
            if self.args.filter < level:
                continue
            # only exact matches and proper subdomains are covered by an entry
            if lowered != known and not lowered.endswith(F'.{known}'):
                continue
            self.log_info(F'excluding indicator because domain {lowered} is whitelisted: {text}', clip=True)
            self.log_debug(F'reduce level below {level} to allow, current level is {self.args.filter}')
            return False
        return True

    def _check_match(self, data: memoryview | bytearray, pos: int, name: str, value: bytes):
        """
        Apply the heuristic false-positive filters to a single regular expression match:

        - `data` is the buffer that was searched,
        - `pos` is the offset of the match within `data`,
        - `name` is the name of the indicator pattern that produced the match,
        - `value` contains the matched bytes.

        The return value is the (possibly trimmed) match if it is accepted, and `None` if one of
        the heuristics excludes it. When no filter level is set, a match is only trimmed at a
        closing quote or bracket and never excluded.
        """
        # The byte that directly precedes the match. A match at offset zero has no predecessor;
        # indexing with -1 would wrap around and inspect the last byte of the buffer instead.
        before = data[pos - 1] if pos > 0 else None
        term = self._BRACKETING.get(before, None)
        text = value.decode(self.codec)
        if term:
            # The match directly follows an opening quote or bracket: Cut it off at the closing
            # counterpart. A separate variable is used for the cut position because pos has to
            # remain the offset into the input data for the context checks below.
            end = value.find(term)
            if end > 0:
                value = value[:end]
        if not self.args.filter:
            return value
        if name == indicators.hostname.name:
            # A host name match is reclassified as one of the more specific indicator types.
            if all(part.isdigit() for part in value.split(B'.')):
                name = indicators.ipv4.name
            elif B'.' not in value:
                name = indicators.ipv6.name
            else:
                name = indicators.domain.name
        if name == indicators.ipv4.name:
            ocets = [int(x) for x in value.split(B'.')]
            if ocets.count(0) >= 3:
                self.log_info(F'excluding ipv4 because it contains many zeros: {text}')
                return None
            if self.args.filter > 2 and sum(ocets) < 10:
                self.log_info(F'excluding ipv4 because of low value ocets: {text}')
                return None
            if ocets[0] <= 5 * self.args.filter:
                # Dotted quads with a small leading number are often version numbers: Inspect the
                # surrounding data for a corresponding keyword. The start of each window is
                # clamped so that a match close to the start of the buffer does not produce a
                # negative start index, which would select a window relative to the end of the
                # buffer. The strided windows are clamped such that their parity is preserved.
                for area in (
                    bytes(data[max(0, pos - 20) : pos + 20]),
                    bytes(data[max(0, pos * 2 - 40) : pos * 2 + 40 : 2]),
                    bytes(data[max(1, pos * 2 - 41) : pos * 2 + 39 : 2]),
                ):
                    check = area.lower()
                    if B'version' in check or b'build' in check:
                        self.log_info(F'excluding ipv4 because it might be a version: {text}')
                        return None
            small_ocet_count = sum(1 for ocet in ocets if ocet < 10)
            if small_ocet_count > max(0, 4 - self.args.filter):
                self.log_info(F'excluding ipv4 because it has too many small ocets: {text}')
                return None
            ip = ip_address(text)
            if not ip.is_global:
                if self.args.filter >= 3 or not ip.is_private:
                    self.log_info(F'excluding ipv4 because it is not global: {text}')
                    return None
        elif name in {
            indicators.url.name,
            indicators.socket.name,
            indicators.hostname.name,
            indicators.domain.name,
            indicators.subdomain.name
        }:
            if self.args.filter >= 2:
                if LetterWeights.IOC(value) < 0.6:
                    self.log_info(F'excluding indicator because with low score: {text}', clip=True)
                    return None
                if name != indicators.url.name and len(value) > 0x100:
                    self.log_info(F'excluding indicator because it is too long: {text}', clip=True)
                    return None
            # Indicators without a scheme receive a dummy scheme so that urlparse is able to
            # isolate the network location.
            ioc = text
            if '://' not in ioc:
                ioc = F'tcp://{ioc}'
            parts = urlparse(ioc)
            host, _, _ = parts.netloc.partition(':')
            if not self._check_host(host, text):
                return None
            if name == indicators.url.name:
                # Strip leading characters from the URL if its scheme merely ends with one of the
                # well-known schemes.
                scheme = parts.scheme.lower()
                for p in ('http', 'https', 'ftp', 'file', 'mailto'):
                    if scheme.endswith(p):
                        start = scheme.find(p)
                        value = value[start:]
                        break
            if name in {
                indicators.hostname.name,
                indicators.domain.name,
                indicators.subdomain.name
            }:
                # A host that directly follows a slash or backslash is likely part of a path.
                if before is not None and before in b'/\\' and self.args.filter >= 2:
                    return None
                hostparts = host.split('.')
                if self.args.filter >= 2:
                    if not all(p.isdigit() for p in hostparts) and all(len(p) < 4 for p in hostparts):
                        self.log_info(F'excluding host with too many short parts: {text}')
                        return None
                if self.args.filter >= 3:
                    if len(hostparts) <= sum(3 for p in hostparts if p != p.lower() and p != p.upper()):
                        self.log_info(F'excluding host with too many mixed case parts: {text}')
                        return None
                # These heuristics attempt to filter out member access to variables in
                # scripts which can be mistaken for domains because of the TLD inflation
                # we've had.
                uppercase = sum(1 for c in host if c.isalpha() and c.upper() == c)
                lowercase = sum(1 for c in host if c.isalpha() and c.lower() == c)
                if lowercase and uppercase:
                    caseratio = uppercase / lowercase
                    if 0.1 < caseratio < 0.9:
                        self.log_info(F'excluding indicator with too much uppercase letters: {text}')
                        return None
                if all(x.isidentifier() for x in hostparts):
                    if len(hostparts) == 2 and hostparts[0] in ('this', 'self'):
                        self.log_info(F'excluding host that looks like a code snippet: {text}')
                        return None
                    if len(hostparts[-2]) < 3:
                        self.log_info(F'excluding host with too short root domain name: {text}')
                        return None
                    if any(x.startswith('_') for x in hostparts):
                        self.log_info(F'excluding host with underscores: {text}')
                        return None
                    if len(hostparts[-1]) > 3:
                        # Count the distinct dotted names in the input that start with the same
                        # prefix; several of them indicate member access on a common object.
                        prefix = '.'.join(hostparts[:-1])
                        seen_before = len(set(re.findall(
                            fR'{prefix}(?:\.\w+)+'.encode('ascii'), data)))
                        if seen_before > 2:
                            self.log_debug(F'excluding indicator that was already seen: {text}')
                            return None
        elif name == indicators.email.name:
            _, _, host = value.partition(B'@')
            host = host.decode(self.codec)
            if not self._check_host(host, text):
                return None
            # Skip leading non-letter bytes and require at least 3 bytes before the at sign.
            at = value.find(B'@')
            ix = 0
            while value[ix] not in self._ALPHABETIC:
                ix += 1
            return None if at - ix < 3 else value[ix:]
        elif name in (
            indicators.path.name,
            indicators.winpath.name,
            indicators.nixpath.name,
        ):
            if len(value) < 8:
                self.log_info(F'excluding path because it is too short: {text}')
                return None
            if len(value) > 16 and len(re.findall(RB'\\x\d\d', value)) > len(value) // 10:
                self.log_info(F'excluding long path containign hex: {text}', clip=True)
                return None
            path_string = text
            try:
                path = Path(path_string)
            except Exception as E:
                # The path object does not exist when the constructor fails, so the error
                # message has to refer to the string that could not be parsed.
                self.log_debug(F'error parsing path "{path_string}": {E!s}')
                return None
            path_likeness = sum(v for v, x in [
                (1, path.suffix),
                (1, path_string.startswith('/')),
                (2, path_string.startswith('%')),
                (2, path_string.startswith('\\\\')),
                (2, path_string[1:3] == ':\\'),
            ] if x)
            # NOTE(review): The left side is at least 2 and the right side is at most 2, so this
            # condition can never be satisfied as written; the intended threshold should be
            # confirmed before changing it.
            if 2 + path_likeness < min(self.args.filter, 2):
                self.log_info(F'excluding long path because it has no characteristic parts: {text}')
                return None
            bad_parts = 0
            all_parts = len(path.parts)
            if self.args.filter >= 1:
                date_likeness = sum(1
                    for t in ['yyyy', 'yy', 'mm', 'dd', 'hh', 'ss']
                    if t in path.parts or t.upper() in path.parts)
                if len(value) < 20 and date_likeness >= all_parts - 1:
                    self.log_info(F'excluding path that looks like a date format: {text}', clip=True)
                    return None
            if self.args.filter >= 2:
                for k, part in enumerate(path.parts):
                    if not k:
                        # The first part is not scored when it is a drive letter, a name that is
                        # enclosed in percent signs, or a single character.
                        drive, colon, slash = part.partition(':')
                        if colon and len(drive) == 1 and len(slash) <= 1:
                            continue
                        if part[0] == part[~0] == '%':
                            continue
                        if len(part) == 1:
                            continue
                    if (
                        LetterWeights.Path(part) < 0.5 + (min(self.args.filter, 4) * 0.1)
                        or (self.args.filter >= 2 and LetterWeights.Path(part[:1]) < 0.5)
                    ):
                        bad_parts += 1
                        self.log_debug(F'bad part {k + 1} in path: {part}')
            # The tolerated share of bad parts is halved with each filter level from 2 to 4.
            for filter_limit in (2, 3, 4):
                bad_ratio = 2 ** (filter_limit - 1)
                if self.args.filter >= filter_limit and bad_parts * bad_ratio >= all_parts:
                    self.log_info(F'excluding path with bad parts: {text}', clip=True)
                    return None
        return value

    def process(self, data):
        """
        Search the input for all selected indicator patterns and yield the matches that pass the
        heuristic checks; each output is labelled with the name of the pattern that produced it.
        """
        # values that were rejected once are not evaluated a second time
        rejected = set()

        def check(match: re.Match):
            # the first named group that participated in the match identifies the indicator
            hit = next((
                (name, value) for name, value in match.groupdict().items() if value is not None
            ), None)
            if hit is None:
                raise RefineryCriticalException('Received empty match.')
            name, value = hit
            if value in rejected:
                return None
            result = self._check_match(match.string, match.start(), name, value)
            if result is None:
                rejected.add(value)
                return None
            return self.labelled(result, pattern=name)

        yield from self.matches_filtered(memoryview(data), self.args.pattern, check)

class xtpdf (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.pdf and has the following commandline Interface:

usage: xtpdf [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME]
             [-p PWD]
             [path ...]

Extract objects from PDF documents.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtpdf(ArchiveUnit):
    """
    Extract objects from PDF documents.
    """
    # @ArchiveUnit.Requires('pypdf>=3.1.0')
    # def _pypdf2():
    #     import pypdf
    #     import pypdf.generic
    #     return pypdf

    # Accessor for the pikepdf module, declared with the requirement string 'pikepdf<=9.5'. The
    # remaining decorator argument presumably lists the installation groups that provide the
    # package; this should be verified against the definition of the Requires decorator.
    @ArchiveUnit.Requires('pikepdf<=9.5', ['formats', 'default', 'extended'])
    def _pikepdf():
        # the import is performed inside the function body rather than at module level
        import pikepdf
        return pikepdf

    @ArchiveUnit.Requires('pymupdf', ['formats', 'default', 'extended'])
    def _mupdf():
        # Both environment variables are pointed at the null device before pymupdf is imported,
        # so that they are already in place when the library is loaded.
        import os
        sink = F'path:{os.devnull}'
        os.environ['PYMUPDF_MESSAGE'] = sink
        os.environ['PYMUPDF_LOG'] = sink
        import pymupdf
        return pymupdf

    def _walk_pypdf2(self, blob, memo: set[int] | None = None, *path):
        """
        Recursively walk a PDF object tree using the pypdf object model and yield an unpack result
        for every stream, byte string, and text string that is encountered. The `memo` set holds
        the identities of the objects that were already visited, and `path` contains the path
        components that lead to `blob`.

        NOTE(review): This method reads the library from `self._pypdf2`, but the definition of
        that accessor is commented out in this class; the method can therefore presumably not be
        used in its current state.
        """
        lib = self._pypdf2

        # Resolve indirect references until an actual object is reached or resolution fails.
        while isinstance(blob, lib.generic.IndirectObject):
            try:
                blob = blob.get_object()
            except Exception:
                break
        # Objects are tracked by identity to prevent infinite recursion on cyclic references.
        if memo is None:
            memo = {id(blob)}
        elif id(blob) in memo:
            return
        else:
            memo.add(id(blob))
        # If the object has the entries /F and /EF (looks like a file specification with an
        # embedded file), continue with the embedded object and use the decoded file name as the
        # last path component.
        try:
            name = blob['/F']
            blob = blob['/EF']['/F']
        except Exception:
            pass
        else:
            def unhex(match):
                return bytes.fromhex(match[1]).decode('latin1')
            # decode escape sequences of the form #XX in the name
            name = re.sub('#([0-9a-fA-F]{2})', unhex, name)
            path = *path[:-1], F'/{name}'
        # Objects that provide a get_data method are emitted with a callback, so the data is only
        # read when the callback is invoked. The closure resolves get_data at call time, which is
        # why it can be defined before the attribute lookup below.
        try:
            def extract():
                with NoLogging():
                    return get_data()
            if TYPE_CHECKING:
                blob = cast(EncodedStreamObject, blob)
            get_data = blob.get_data
        except AttributeError:
            pass
        else:
            yield UnpackResult(''.join(path), extract, kind='object')
            return

        if isinstance(blob, lib.generic.ByteStringObject):
            yield UnpackResult(''.join(path), blob, kind='bytes')
            return
        if isinstance(blob, lib.generic.TextStringObject):
            yield UnpackResult(''.join(path), blob.encode(self.codec), kind='string')
            return

        # ByteStringObject can no longer occur at this point because it was handled above.
        if isinstance(blob, (
            lib.generic.BooleanObject,
            lib.generic.ByteStringObject,
            lib.generic.FloatObject,
            lib.generic.NameObject,
            lib.generic.NullObject,
            lib.generic.NumberObject,
            lib.generic.RectangleObject,
        )):
            # unhandled PDF objects
            return

        if isinstance(blob, lib.generic.TreeObject):
            blob = list(blob)

        pdf = lib.generic.PdfObject

        if isinstance(blob, list):
            # A list of even length that alternates between strings and PDF objects is treated
            # as a sequence of key-value pairs; all other lists are walked by index.
            if (
                len(blob) % 2 == 0
                and all(isinstance(key, str) for key in islice(iter(blob), 0, None, 2))
                and all(isinstance(key, pdf) for key in islice(iter(blob), 1, None, 2))
            ):
                blob = dict(zip(*([iter(blob)] * 2)))
            else:
                for key, value in enumerate(blob):
                    yield from self._walk_pypdf2(value, memo, *path, F'/{key}')
                return

        if not isdict(blob):
            return

        assert isinstance(blob, dict)

        # Descend into all entries with string keys; each key becomes a path component with a
        # leading slash.
        for key, value in blob.items():
            if not isinstance(key, str):
                continue
            if not key.startswith('/'):
                key = F'/{key}'
            yield from self._walk_pypdf2(value, memo, *path, key)

    def _walk_pike(self, blob: Object, memo: list[Object] | None = None, *keys):
        """
        Recursively walk a PDF object tree using pikepdf and yield unpack results. The `memo` list
        contains all objects that were already visited, and `keys` holds the path components that
        lead to `blob`; the path of each result is the concatenation of these components.

        - Dictionaries are descended into. A dictionary that has at least one scalar entry is
          additionally emitted as a JSON document.
        - For arrays, only the elements are visited.
        - For all other objects, the stream buffer is emitted if one can be obtained; otherwise,
          strings are emitted as bytes and remaining objects as JSON.
        """
        # Visited objects are tracked to prevent infinite recursion on cyclic references.
        if memo is None:
            memo = [blob]
        elif blob in memo:
            return
        else:
            memo.append(blob)

        # If the object has the entries /F and /EF (looks like a file specification with an
        # embedded file), continue with the embedded object and append the decoded file name to
        # the path. Any failure means that the object does not have this layout.
        try:
            name = blob['/F']
            blob = blob['/EF']['/F']
        except Exception:
            pass
        else:
            def unhex(match):
                return bytes.fromhex(match[1]).decode('latin1')
            # decode escape sequences of the form #XX in the name
            name = re.sub('#([0-9a-fA-F]{2})', unhex, str(name))
            keys = *keys, F'/{name}'

        pike = self._pikepdf
        meta = {}
        path = ''.join(keys)

        if isinstance(blob, pike.Dictionary):
            nested = {}
            for key, value in blob.items():
                if isinstance(value, pike.Name):
                    value = str(value)
                if isinstance(value, (int, float, str, bool)):
                    # scalar entries are not emitted as separate items
                    key = key.lstrip('/')
                    meta[key] = value
                    continue
                nested[key] = value
            for key, value in nested.items():
                yield from self._walk_pike(value, memo, *keys, key)
            if meta:
                yield UnpackResult(path, blob.to_json(dereference=True))
                return
        elif isinstance(blob, pike.Array):
            for key, value in enumerate(iter(blob)):
                if isinstance(value, pike.Object):
                    yield from self._walk_pike(value, memo, *keys, F'/{key}')
            return

        # Prefer the decoded stream contents and fall back to the raw stream contents.
        try:
            buffer = blob.get_stream_buffer()
        except Exception:
            try:
                buffer = blob.get_raw_stream_buffer()
            except Exception:
                buffer = None
        if buffer:
            yield UnpackResult(path, bytearray(buffer))
        elif isinstance(blob, pike.String):
            yield UnpackResult(path, bytes(blob))
        elif isinstance(blob, pike.Object):
            yield UnpackResult(path, blob.to_json())

    def unpack(self, data):
        """
        Extract the contents of a PDF document. The raw object tree is walked with pikepdf and
        emitted below the path `raw`. If PyMuPDF is able to open the document, the document
        metadata as well as the text and the images of each page are emitted below the path
        `parsed`. A `ValueError` is raised when PyMuPDF reports that the document is encrypted
        and no password was specified.
        """
        # The password is determined before PyMuPDF is involved: It is also required by pikepdf
        # and must not be discarded when PyMuPDF is unavailable or fails to open the document.
        password = self.args.pwd or None

        try:
            mu = self._mupdf.open(stream=data, filetype='pdf')
        except Exception:
            # PyMuPDF is optional here; the raw object tree can still be extracted without it.
            mu = None
        else:
            if password:
                if mu.is_encrypted:
                    mu.authenticate(password)
                else:
                    self.log_warn('This PDF document is not protected; ignoring password argument.')
                    password = ''
            elif mu.is_encrypted:
                raise ValueError('This PDF is password protected.')

        # Errors from pikepdf are propagated to the caller. A fallback based on pypdf exists in
        # the form of _walk_pypdf2, but it is currently not in use.
        with MemoryFile(data, output=bytes) as stream, NoLogging():
            pdf = self._pikepdf.open(stream, password=(password or ''))
            yield from self._walk_pike(pdf.trailer, None, 'raw')

        if mu is None:
            return

        # only metadata entries with a non-empty value are exported
        if (md := mu.metadata) and (md := {k: v for k, v in md.items() if v}):
            md = json.dumps(md, indent=4)
            yield UnpackResult('parsed/meta.json', md.encode(self.codec))

        for k in range(len(mu)):
            with NoLogging(NoLogging.Mode.ALL):
                try:
                    page: Page = mu[k]
                    text = page.get_textpage()
                except Exception:
                    # pages that cannot be parsed are skipped
                    continue
            yield UnpackResult(F'parsed/page{k}.html', text.extractHTML().encode(self.codec))
            yield UnpackResult(F'parsed/page{k}.json', text.extractJSON().encode(self.codec))
            yield UnpackResult(F'parsed/page{k}.txt', text.extractText().encode(self.codec))
            for j, image in enumerate(page.get_images(), 1):
                # the first entry of each image record is used as the xref for the extraction
                xref = image[0]
                base = mu.extract_image(xref)
                image_data = base['image']
                info = get_cached_file_magic_info(image_data)
                yield UnpackResult(F'parsed/page{k}/img{j}.{info.extension}', image_data)

    @classmethod
    def handles(cls, data) -> bool | None:
        """
        Returns whether the input begins with the PDF header signature.
        """
        magic = B'%PDF-'
        return data[:len(magic)] == magic

class xtpyi (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', decompile=False, user_code=False, unmarshal=0)

This unit is implemented in refinery.units.formats.archive.xtpyi and has the following commandline Interface:

usage: xtpyi [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-c]
             [-u | -y]
             [path ...]

Extracts and decompiles files from a Python Installer (aka PyInstaller) archive. This unit is a
path extractor which extracts data from a hierarchical structure. Each extracted item is emitted
as a separate chunk and has attached to it a meta variable that contains its path within the
source structure. The positional arguments to the command are patterns that can be used to filter
the extracted items by their path. To view only the paths of all chunks, use the listing switch:

    emit something | xtpyi --list

Otherwise, extracted items are written to the standard output port and usually require a frame to
properly process. In order to dump all extracted data to disk, the following pipeline can be
used:

    emit something | xtpyi [| dump {path} ]

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -c, --decompile  Attempt to decompile PYC files.
  -u, --user-code  Extract only source code files from the root of the archive. These usually
                   implement the actual domain logic. This implies the --decompile option.
  -y, --unmarshal  (DANGEROUS) Unmarshal embedded PYZ archives. Warning: Maliciously crafted
                   packages can potentially exploit this to execute code. It is advised to only
                   use this option inside an isolated environment. Specify twice to decompile
                   unmarshalled Python bytecode.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtpyi(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extracts and decompiles files from a Python Installer (aka PyInstaller) archive.
    """
    # The first parameters up to and including date are forwarded unchanged to the parent class;
    # this unit adds the options decompile, user_code, and unmarshal. The options user_code and
    # unmarshal are both assigned to the argument group FILTER.
    def __init__(
        self, *paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False,
        path=b'path', date=b'date',
        decompile: Param[bool, Arg.Switch('-c', help='Attempt to decompile PYC files.')] = False,
        user_code: Param[bool, Arg.Switch('-u', group='FILTER', help=(
            'Extract only source code files from the root of the archive. These usually implement '
            'the actual domain logic. This implies the --decompile option.'))] = False,
        unmarshal: Param[int, Arg('-y', action='count', group='FILTER', help=(
            '(DANGEROUS) Unmarshal embedded PYZ archives. Warning: Maliciously crafted packages can '
            'potentially exploit this to execute code. It is advised to only use this option inside '
            'an isolated environment. Specify twice to decompile unmarshalled Python bytecode.'
        ))] = 0
    ):
        # All arguments are passed to the parent constructor as explicit keyword arguments.
        super().__init__(
            *paths,
            list=list,
            join_path=join_path,
            drop_path=drop_path,
            fuzzy=fuzzy,
            exact=exact,
            regex=regex,
            path=path,
            date=date,
            decompile=decompile,
            unmarshal=unmarshal,
            user_code=user_code,
        )

    def unpack(self, data):
        """
        Locate the PyInstaller archive epilogue in the input and yield the files of the archive.
        The last occurrence of the magic signature is used as the location of the epilogue. When
        the user code option is set, only files of the user code type are returned and files whose
        name starts with `pyiboot` are skipped. A `LookupError` is raised when the signature does
        not occur in the input.
        """
        view = memoryview(data)
        signature = re.escape(PyInstallerArchiveEpilogue.MagicSignature)
        positions = [hit.start() for hit in re.finditer(signature, view)]
        # the unmarshal option is a counter; values above 2 are treated like 2
        mode = Unmarshal(min(2, int(self.args.unmarshal)))
        self.log_debug(F'unmarshal mode: {mode.name}')
        if not positions:
            raise LookupError('unable to find PyInstaller signature')
        if len(positions) > 2:
            # first position is expected to be the sentinel value in the unpacker stub
            width = max(len(F'{offset:X}') for offset in positions)
            for position in positions:
                self.log_info(F'magic signature found at offset 0x{position:0{width}X}')
            self.log_warn(F'found {len(positions) - 1} potential PyInstaller epilogue markers; using last one.')
        decompile = self.args.decompile
        # the file type that counts as user code depends on whether decompilation is enabled
        if decompile:
            uc_target = PiType.USERCODE
        else:
            uc_target = PiType.SOURCE
        archive = PyInstallerArchiveEpilogue(view, positions[-1], mode, decompile)
        for name, file in archive.files.items():
            if self.args.user_code and (file.type != uc_target or name.startswith('pyiboot')):
                continue
            yield self._pack(name, None, file.data, type=file.type.name)

    @classmethod
    def handles(cls, data: buf) -> bool | None:
        """
        Returns whether the PyInstaller magic signature occurs anywhere in the input.
        """
        signature = PyInstallerArchiveEpilogue.MagicSignature
        return signature in data

class xtrtf (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.office.xtrtf and has the following commandline Interface:

usage: xtrtf [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path ...]

Extract embedded objects in RTF documents.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; the
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtrtf(PathExtractorUnit):
    """
    Extract embedded objects in RTF documents.
    """
    @PathExtractorUnit.Requires('oletools', ['formats', 'office', 'extended'])
    def _oletools():
        import oletools
        import oletools.oleobj
        import oletools.rtfobj
        return oletools

    def unpack(self, data):
        """
        Parse the input with the oletools RTF object parser and emit one result per object that
        it reports. Objects without a file name receive a zero-padded `carve` name. Information
        about OLE objects and OLE packages is attached to each result as metadata.
        """
        oletools = self._oletools
        parser = oletools.rtfobj.RtfObjParser(data)
        parser.parse()
        # number of decimal digits required to print the object count
        width = len(str(len(parser.objects)))
        for index, item in enumerate(parser.objects):
            item: RtfObject
            meta = {}
            if item.is_ole:
                ole_object = oletools.oleobj.OleObject
                if item.format_id == ole_object.TYPE_EMBEDDED:
                    meta['ole_type'] = 'EMBEDDED'
                elif item.format_id == ole_object.TYPE_LINKED:
                    meta['ole_type'] = 'LINKED'
                if item.is_package:
                    meta['src_path'] = item.src_path
                    meta['tmp_path'] = item.temp_path
                if item.clsid is not None:
                    meta['ole_info'] = item.clsid_desc
                    meta['ole_guid'] = item.clsid
                meta['ole_name'] = item.class_name
            # prefer the most specific payload: package data over OLE data over raw data
            payload = item.rawdata
            if item.oledata:
                payload = item.oledata
                offset = item.rawdata.find(payload)
                if offset > 0:
                    meta['raw_header'] = item.rawdata[:offset]
                if item.olepkgdata:
                    payload = item.olepkgdata
                    offset = item.oledata.find(payload)
                    if offset >= 0:
                        meta['ole_header'] = item.oledata[:offset]
            name = item.filename
            if not name:
                name = F'carve{index:0{width}}.bin'
            yield UnpackResult(name, payload, **meta)

    @classmethod
    def handles(cls, data) -> bool:
        """
        Report whether the RTF signature occurs at the start of the input, optionally preceded
        by up to 500 whitespace characters.
        """
        import re
        prefix = memoryview(data)[:505]
        return re.search(bR'^\s{0,500}\{\\rtf', prefix) is not None

class xtsim (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xtsim and has the following commandline Interface:

usage: xtsim [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME]
             [-p PWD]
             [path ...]

Extract files from Smart Install Maker (SIM) executables. This unit is a path extractor which
extracts data from a hierarchical structure. Each extracted item is emitted as a separate chunk
and has attached to it a meta variable that contains its path within the source structure. The
positional arguments to the command are patterns that can be used to filter the extracted items
by their path. To view only the paths of all chunks, use the listing switch:

    emit something | xtsim --list

Otherwise, extracted items are written to the standard output port and usually require a frame to
properly process. In order to dump all extracted data to disk, the following pipeline can be
used:

    emit something | xtsim [| dump {path} ]

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; the
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtsim(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extract files from Smart Install Maker (SIM) executables.
    """

    # Runtime files located directly below `$inst` are renamed according to this table before
    # they are emitted below the `runtime/` folder.
    _RUNTIME_MAPPING = {
        '4.tmp'  : 'header.png',
        '5.tmp'  : 'wizard.bmp',
        '6.tmp'  : 'background.bmp',
        '7.tmp'  : 'folder.png',
        '8.tmp'  : 'group.png',
        '9.tmp'  : 'password.png',
        '15.tmp' : 'license1.rtf',
        '16.tmp' : 'information.rtf',
        '20.tmp' : 'license2.rtf',
    }

    # Placeholder tokens that can occur in decoded strings. Each token is replaced by a dollar
    # sign followed by the symbolic name that is given here.
    _DIRECTORY_MASKS = {
        '@$&%01': 'ProgramFiles',
        '@$&%02': 'WindowsDir',
        '@$&%03': 'SystemDir',
        '@$&%04': 'InstallPath',
        '@$&%05': 'TempDir',
        '@$&%06': 'Desktop',
        '@$&%07': 'QuickLaunch',
        '@$&%08': 'ProgramsDir',
        '@$&%09': 'StartMenu',
        '@$&%10': 'MyDocuments',
        '@$&%11': 'Favorites',
        '@$&%12': 'SendTo',
        '@$&%13': 'UserProfile',
        '@$&%14': 'StartUp',
        '@$&%15': 'FontsDir',
        '@$&%16': 'CommonFiles',
        '@$&%17': 'SystemDrive',
        '@$&%18': 'CurrentDirectory',
        '@$&%20': 'UserName',
        '@$&%21': 'Language',
        '@$&%22': 'ComputerName',
        '@$&%26': 'AppData',
        '@$&%27': 'CommonAppData',
        '@$&%28': 'CommonDesktop',
        '@$&%29': 'CommonDocuments',
        '@$&%30': 'CommonFavourites',
        '@$&%31': 'CommonPrograms',
        '@$&%32': 'CommonStartMenu',
        '@$&%33': 'CommonStartup',
        '@$&%34': 'Templates',
        '@$&%35': 'CommonTemplates',
        '@$&%36': 'ProgramFiles64',
    }

    def unpack(self, data):
        """
        Parse the three sections of a SIM archive (strings, runtime, content) and emit:

        - `setup.json`, which contains the decoded header strings, string tables and messages,
        - the runtime files below `runtime/`,
        - the installed files below `data/` (stored) or `content/` (cabinet).

        Nothing is emitted when no SIM offsets structure can be located in the input.
        """
        mem = memoryview(data)
        sim = self.get_offsets(data)

        if sim is None:
            return B''

        strings = StructReader(mem[sim.strings_offset:sim.runtime_offset])
        runtime = StructReader(mem[sim.runtime_offset:sim.content_offset])
        content = StructReader(mem[sim.content_offset:sim.archive_end])

        header = [strings.read_c_string() for _ in range(sim.nr_of_strings)]
        tables: dict[str, list[list[str | int]]] = {}
        unknown_tables: dict[str, list[list[str | int]]] = {}

        def sc(k: int):
            # header strings are still raw at this point; entry k is interpreted as a number
            return int(header[k])

        # Each entry is (cells per row, index of the header string holding the row count, name).
        # The tables follow the header strings in exactly this order; unnamed tables are kept
        # separately under a key derived from their header index.
        for size, index, name in [
            (4, 98, None),
            (7, 50, 'registry'),    # (2=HKLM/1=HKCU,key)
            (3, 96, None),
            (2, 31, 'fonts'),
            (8, 54, 'shortcuts'),   # (?,0=Menu/1=Desktop,filename,target_path,comment,icon_path1,icon_path2)
            (3, 67, 'filenames'),
            (2, 93, None),
            (6, 40, 'install'),     #
            (6, 25, 'uninstall'),
            (6, 24, 'ini'),         # 34991da998ece07d4a941394c6630ce74955fb4800e5915f6766180d12a8dc61
            (2, 45, None),
            (2, 20, None),
            (4, 26, 'languages'),
        ]:
            count = sc(index)
            if not count:
                continue
            table = [[
                strings.read_c_string() for _ in range(size)
            ] for _ in range(count)]
            if name is None:
                unknown_tables[F'T{index}'] = table
            else:
                tables[name] = table

        unknown_marker = strings.read_c_string()

        # one row of messages per language; header string 57 holds the row length
        language_count = sc(26)
        message_matrix = [[
            strings.read_c_string() for _ in range(sc(57))
        ] for _ in range(language_count)]

        len_chunks = sc(117)
        chunk_size = sc(95)
        chunk_rest = sc(118)

        def check_empty_reader(r: StructReader, name: str):
            # warn about any data that was left unparsed in a section
            if _c := r.remaining_bytes:
                self.log_warn(F'{name} reader had 0x{_c:08X} bytes remaining:', r.peek(), clip=True)

        check_empty_reader(strings, 'strings')

        # The first language determines the codec for all header strings and tables. When its
        # third cell is not numeric, the language identifier is guessed by comparing the second
        # cell against the values in LCID.
        lngid = tables['languages'][0]
        if not lngid[2].isdigit():
            lname: bytes = lngid[1]
            lname = lname.decode('latin1')
            lngid = max(LCID, key=lambda k: longest_common_substring(LCID[k], lname))
        else:
            lngid = int(lngid[2])

        codec = DEFAULT_CODEPAGE.get(lngid, 'latin1')

        def decode(cell: bytes, codec: str):
            """
            Decode a raw cell with the given codec, falling back to latin1. Decimal strings are
            returned as integers, empty strings as None, and directory placeholders in all other
            strings are substituted.
            """
            try:
                cell = cell.decode(codec)
            except UnicodeDecodeError:
                cell = cell.decode('latin1')
                self.log_debug('failed to decode string:', cell, clip=True)
            # str.isdigit is also true for characters such as superscript digits, which int()
            # rejects; str.isdecimal is true only for characters that int() can convert.
            if cell.isdecimal():
                return int(cell)
            if not cell:
                return None
            for key, val in self._DIRECTORY_MASKS.items():
                cell = cell.replace(key, F'${val}')
            return cell

        header[:] = [decode(s, codec) for s in header]

        for t in (tables, unknown_tables):
            for name, table in t.items():
                for row in table:
                    row[:] = [decode(cell, codec) for cell in row]

        messages = {}

        # The language rows are decoded by now, so a numeric third cell is already an integer.
        for array, lng in zip(message_matrix, tables['languages']):
            lng_codec = DEFAULT_CODEPAGE.get(lng[2], 'latin1')
            messages[lng[1]] = [decode(cell, lng_codec) for cell in array]

        tables['messages'] = messages
        tables['header'] = header

        if unknown_tables:
            tables['unknown_tables'] = unknown_tables
        if unknown_marker:
            tables['unknown_marker'] = decode(unknown_marker, codec)

        yield self._pack('setup.json', None,
            json.dumps(tables, indent=4).encode(self.codec))

        def runtime_path(name: str):
            # files directly below $inst receive a descriptive name when one is known
            root, backslash, temp = name.rpartition('\\')
            if backslash and root == '$inst' and (t := self._RUNTIME_MAPPING.get(temp)):
                name = t
            return F'runtime/{name}'

        if sim.runtime_is_cab:
            runtime_cab = Cabinet(runtime.read(), no_magic=True)
            for file in runtime_cab.process().get_files():
                yield self._pack(runtime_path(file.name), file.timestamp, lambda f=file: f.decompress())
        else:
            # stored runtime files: name and decimal size as C strings, followed by the data
            for _ in range(sim.nr_of_runtime):
                name = decode(runtime.read_c_string(), codec)
                path = runtime_path(name)
                size = int(runtime.read_c_string())
                yield self._pack(path, None, runtime.read(size))
            check_empty_reader(runtime, 'runtime')

        def no_abs_path(p: str):
            # replace a leading drive specification like C:\ by a symbolic $DriveC folder
            drive, d, rest = p.partition(':\\')
            if d and len(drive) == 1:
                return F'$Drive{drive.upper()}\\{rest}'
            return p

        if len_chunks + chunk_rest == 0:
            # no cabinet chunks: files are stored one after another, each preceded by six
            # 32-bit fields of which only the second one (the size) is understood
            for file in tables['filenames']:
                path = no_abs_path(file[1])
                content.u32() # unknown
                size = content.u32()
                content.u32() # unknown
                content.u32() # unknown
                content.u32() # unknown
                content.u32() # unknown
                yield self._pack(F'data/{path}', None, content.read(size))
        else:
            # the content is a cabinet that was split into equally sized chunks plus a rest
            content_cab = Cabinet(no_magic=True)
            content_cab.extend(content.read(chunk_size) for _ in range(len_chunks))
            if chunk_rest > 0:
                content_cab.append(content.read(chunk_rest))
            for file in content_cab.process().get_files():
                # cabinet entries are named by their index into the filenames table; fall back
                # to the cabinet name when that lookup is not possible
                try:
                    path = tables['filenames'][int(file.name)][1]
                except Exception:
                    path = file.name
                path = F'content/{no_abs_path(path)}'
                yield self._pack(path, file.timestamp, lambda f=file: f.decompress())

        check_empty_reader(content, 'content')

    @classmethod
    def get_offsets(cls, data: bytes | bytearray) -> SIMOffsets | None:
        """
        Locate and parse the 0x24 byte offsets structure of a SIM archive. The structure is first
        expected at the very end of the input. If that fails, the input is searched for embedded
        executables that are followed by the SIM name marker, and for an offsets structure whose
        first field equals the distance between the two. Returns None when nothing is found.
        """
        if len(data) < 0x1000:
            return None

        def sane(offsets: SIMOffsets):
            # signature must match, all offsets must be plausible and correctly ordered
            if offsets.sim_signature != _SIGBYTE:
                return False
            for offset in (
                offsets.strings_offset,
                offsets.runtime_offset,
                offsets.content_offset,
            ):
                if offset not in range(0x1000, 0x100000000):
                    return False
            if offsets.strings_offset >= offsets.runtime_offset:
                return False
            return offsets.content_offset >= offsets.runtime_offset + offsets.runtime_length

        end = len(data) - 0x24
        offsets = SIMOffsets(end, *struct.unpack('<QQQQ?BBB', data[end:]))

        if sane(offsets):
            pos = offsets.strings_offset
            end = pos + len(_SIMNAME)
            if data[pos:end] == _SIMNAME:
                return offsets
            # the offsets may be relative to a different base; use the last name marker
            pos = data.rfind(_SIMNAME)
            if pos > 0:
                return offsets.rebase(pos)

        view = memoryview(data)

        # The patterns below are matched against binary data: without DOTALL, the dot would not
        # match a line feed byte and valid structures containing 0x0A would be missed.
        for stub in re.finditer(rb'MZ.{78}This program must be run under Win', data, flags=re.DOTALL):
            pos_zero = stub.start()
            pos_data = data.find(_SIMNAME, pos_zero)
            if pos_data < 0:
                continue
            # 8 bytes strings offset + 27 arbitrary bytes + signature byte = 0x24 bytes
            pattern = re.escape((pos_data - pos_zero).to_bytes(8, 'little')) + B'.{27}\\xF1'
            # check every candidate, an earlier match can be a coincidence that is not sane
            for match in re.finditer(pattern, view[pos_zero:], flags=re.DOTALL):
                offsets = SIMOffsets(match.start(), *struct.unpack('<QQQQ?BBB', match[0]))
                if sane(offsets):
                    return offsets.rebase(pos_zero)

        return None

    @classmethod
    def handles(cls, data) -> bool | None:
        """
        Report whether a SIM offsets structure can be located in the input. No verdict (None) is
        given for inputs that are neither `bytes` nor `bytearray`.
        """
        if isinstance(data, (bytes, bytearray)):
            return cls.get_offsets(data) is not None

Static methods

def get_offsets(data)

class xtsql (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.archive.xtsql and has the following commandline Interface:

usage: xtsql [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path ...]

Extract files from SQLite3 databases.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; the
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtsql(PathExtractorUnit):
    """
    Extract files from SQLite3 databases.
    """
    def unpack(self, data: bytearray):
        """
        Load the input as an in-memory SQLite database and emit its contents at four levels:
        the whole database (`db`), each table (`db/TABLE`), each row (`db/TABLE/INDEX`) as JSON,
        and each cell that is not NULL (`db/TABLE/INDEX/COLUMN`) as raw data.

        Raises `NotImplementedError` on Python versions before 3.11 and `ValueError` when the
        master table contains a statement for a table that does not look like `CREATE TABLE`.
        """
        def _json(obj):
            with BytesAsStringEncoder as encoder:
                return encoder.dumps(obj).encode(self.codec)

        if sys.version_info[:2] < (3, 11):
            raise NotImplementedError(F'python 3.11 is required to use {self.__class__.__name__}.')

        result: dict[str, list[dict[str, int | float | str | bytes]]] = {}
        database = sqlite3.connect(':memory:')

        try:
            # return text columns as bytes so that no decoding errors can occur
            database.text_factory = bytes
            database.deserialize(data)
            cursor = database.cursor()

            listing: list[tuple[bytes, bytes]] = cursor.execute(
                "SELECT name, sql FROM sqlite_master WHERE type='table';").fetchall()

            for tbl, spec in listing:
                table = tbl.decode('utf8')
                result[table] = t = []
                # sanity check: the statement must be CREATE TABLE <name> (<columns>...
                ct, _tbl, columns = spec.partition(tbl)
                ct = ct.rstrip(B'"')
                columns = columns.lstrip(B'"')
                columns = columns.strip()
                columns, _, _ = columns.rpartition(B')')
                if (
                    tbl != _tbl
                    or ct.strip().upper().split() != [B'CREATE', B'TABLE']
                    or not columns.startswith(B'(')
                ):
                    raise ValueError(F'Unexpeted SQL statement for {table} in master table: {spec}')
                # Quote the identifier: table names may contain spaces or be SQL keywords.
                quoted = '"{}"'.format(table.replace('"', '""'))
                cursor.execute(F'SELECT * FROM {quoted}')
                # Column names are taken from the cursor rather than from the CREATE TABLE text;
                # parsing that text breaks for untyped columns, quoted names, and for types
                # that contain commas.
                names = [column[0] for column in cursor.description]
                for row in cursor.fetchall():
                    t.append(dict(zip(names, row)))
        finally:
            # all rows have been fetched into memory; the connection is no longer required
            database.close()

        yield UnpackResult('db', functools.partial(_json, result))

        for table, rows in result.items():

            yield UnpackResult(F'db/{table}', functools.partial(_json, rows))

            for k, row in enumerate(rows):

                root = F'db/{table}/{k}'
                yield UnpackResult(root, functools.partial(_json, row))

                for name, value in row.items():
                    path = F'{root}/{name}'
                    if value is None:
                        continue
                    # numbers are emitted in their decimal string representation
                    if isinstance(value, (int, float)):
                        value = str(value)
                    if isinstance(value, str):
                        value = value.encode(self.codec)
                    if isinstance(value, bytes):
                        yield UnpackResult(path, value)

    @classmethod
    def handles(cls, data):
        """
        Report whether the input starts with the SQLite 3 file header string.
        """
        return memoryview(data)[:15] == B'SQLite format 3'

class xttar (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xttar and has the following commandline Interface:

usage: xttar [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME]
             [-p PWD]
             [path ...]

Extract files from a Tar archive. This unit is a path extractor which extracts data from a
hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it
a meta variable that contains its path within the source structure. The positional arguments to
the command are patterns that can be used to filter the extracted items by their path. To view
only the paths of all chunks, use the listing switch:

    emit something | xttar --list

Otherwise, extracted items are written to the standard output port and usually require a frame to
properly process. In order to dump all extracted data to disk, the following pipeline can be
used:

    emit something | xttar [| dump {path} ]

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; the
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xttar(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extract files from a Tar archive.
    """
    def unpack(self, data: bytearray):
        """
        Open the input as a tar archive and emit every regular file in it, along with its
        modification time. When the input cannot be opened directly, parsing is attempted
        again from the position 257 bytes before the first `ustar` marker.
        """
        with MemoryFile(data) as stream:
            try:
                archive = tarfile.open(fileobj=stream)
            except Exception:
                # the marker is located 257 bytes into a tar header
                header_offset = data.find(B'ustar') - 257
                if header_offset < 0:
                    raise
                stream.seek(header_offset)
                archive = tarfile.open(fileobj=stream)
            regular_files = (member for member in archive.getmembers() if member.isfile())
            for member in regular_files:
                reader = archive.extractfile(member)
                if reader is None:
                    continue
                modified = datetime.datetime.fromtimestamp(member.mtime)
                yield self._pack(member.name, modified, lambda r=reader: r.read())

    @classmethod
    def handles(cls, data) -> bool:
        """
        Report whether the `ustar` marker is present at offset 257 of the input.
        """
        marker = data[257:262]
        return marker == B'ustar'

class xtvba (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.office.xtvba and has the following commandline Interface:

usage: xtvba [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path ...]

Extract VBA macro code from Office documents.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; the
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtvba(PathExtractorUnit):
    """
    Extract VBA macro code from Office documents.
    """
    @PathExtractorUnit.Requires('oletools', ['formats', 'office', 'extended'])
    def _olevba():
        with NoLogging(NoLogging.Mode.ALL):
            import oletools.olevba
            return oletools.olevba

    def unpack(self, data):
        """
        Parse the input with the olevba parser and emit the source code of every macro that it
        reports, using the stream path as the path of the result.

        Raises `ValueError` when the parser does not recognize the input.
        """
        # The parser requires a file name even when the data is passed directly. A random
        # sentinel is used so that results which only echo this name can be told apart.
        sentinel = str(uuid4())
        try:
            parser = self._olevba.VBA_Parser(sentinel, data=bytes(data), relaxed=True)
        except self._olevba.FileOpenError as error:
            raise ValueError('Input data not recognized by VBA parser') from error
        for p1, stream_path, p2, code in parser.extract_all_macros():
            code: str
            if not stream_path:
                # skip results that have no stream path and merely carry the sentinel name
                if p1 == sentinel:
                    continue
                if p2 == sentinel:
                    continue
            yield UnpackResult(stream_path, code.encode(self.codec))

    @classmethod
    def handles(cls, data):
        """
        Report whether the input looks like a document that can contain VBA macros: an OLE
        compound file, a ZIP container with a VBA project part, or an XML document that uses
        one of the known Office namespaces.
        """
        if data[:8] == b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1':
            return True
        if data[:2] == B'PK':
            # The VBA project part is stored as xl/vbaProject.bin in Excel workbooks, but as
            # word/vbaProject.bin and ppt/vbaProject.bin in Word and PowerPoint documents.
            # Part names are not compressed within ZIP files, so the raw search finds them all.
            return buffer_contains(data, B'vbaProject.bin')
        return any(buffer_contains(data, ns) for ns in [
            b'http://schemas.microsoft.com/office/word/2003/wordml',
            b'http://schemas.microsoft.com/office/2006/xmlPackage',
        ])

class xtw (stripspace=False, duplicates=False, longest=False, take=None)

This unit is implemented in refinery.units.pattern.xtw and has the following commandline Interface:

usage: xtw [-h] [-L] [-Q] [-0] [-v] [-x] [-r] [-l] [-t K]

Extract Wallets: Extracts anything that looks like a cryptocurrency wallet address. This works
similar to the xtp unit.

options:
  -x, --stripspace  Strip all whitespace from input data.
  -r, --duplicates  Yield every (transformed) Match, even when it was found before.
  -l, --longest     Pick longer results first. The output will be sorted by length unless the
                    --take option is specified, in which case the longest K results will be
                    returned in order of appearance.
  -t, --take K      Return only the first K occurrences in order of appearance. If --longest is
                    specified, the K longest results will be returned in order of appearance
                    within the input.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class xtw(PatternExtractor):
    """
    Extract Wallets: Extracts anything that looks like a cryptocurrency wallet address.
    This works similar to the `refinery.xtp` unit.
    """

    def __init__(self, stripspace=False, duplicates=False, longest=False, take=None):
        # vars() forwards all arguments of this initializer; do not define locals before it
        self.superinit(super(), **vars(), ascii=True, utf16=True)

    def process(self, data):
        """
        Search the input for all known wallet address patterns and yield the matches, each one
        labelled with the name of the pattern that produced it.
        """
        # one named group per wallet type, so the type of a match can be recovered later
        alternatives = [FR'(?P<{wallet.name}>\b{wallet.value}\b)' for wallet in wallets]
        pattern = '|'.join(alternatives)
        pattern = FR'\b{pattern}\b'.encode('latin1')

        def check(match: re.Match[bytes]):
            # the first group that participated in the match identifies the wallet type
            hit = next((
                (kind, text) for kind, text in match.groupdict().items() if text is not None
            ), None)
            if hit is None:
                raise RefineryCriticalException('Received empty match.')
            kind, text = hit
            return self.labelled(text, kind=kind)

        yield from self.matches_filtered(memoryview(data), pattern, check)

class xtxml (*paths, format=None, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.xml and has the following commandline Interface:

usage: xtxml [-h] [-L] [-Q] [-0] [-v] [-F] [-f F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME]
             [path ...]

Extract values from an XML document.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; the
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -f, --format F   A format expression to be applied for computing the path of an item. This must
                   use metadata that is available on the item. The current tag can be accessed as
                   {tag}. If no format is specified, the unit attempts to derive a good attribute
                   from the XML tree to use for generating paths.
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtxml(XMLToPathExtractorUnit):
    """
    Extract values from an XML document.
    """
    def unpack(self, data):
        # Parse the document first; the path builder is derived from the meta
        # variables of the input chunk and the parsed root node.
        document = xml.parse(data.strip())
        variables = metavars(data)
        make_path = self._make_path_builder(variables, document)

        def visit(element: xml.XMLNode, *trail: str):
            # Emits one result for the element itself and then descends into its
            # children, so parents are always produced before their descendants.

            def extract(node: xml.XMLNode = element):
                # The default argument pins the element this callback belongs to.
                if node.children:
                    # Nodes with children are serialized and piped through ppxml.
                    with MemoryFile() as buffer:
                        node.write(buffer)
                        return bytes(buffer.getvalue() | ppxml)
                # Nodes without children produce their encoded text content.
                return node.content.encode(self.codec)

            metadata = {}
            for name, value in element.attributes.items():
                key = self._normalize_key(name)
                metadata[key] = self._normalize_val(value)

            # A single unusable key causes every key to receive an underscore prefix.
            if any(not is_valid_variable_name(key) for key in metadata):
                metadata = {F'_{key}': value for key, value in metadata.items()}

            yield UnpackResult('/'.join(trail), extract, **metadata)

            for child in element.children:
                yield from visit(child, *trail, make_path(child))

        yield from visit(document, make_path(document))

    @classmethod
    def handles(cls, data):
        # Format detection is delegated entirely to the XML heuristic.
        return is_likely_xml(data)

class xtxs (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.office.xtxs and has the following commandline Interface:

usage: xtxs [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path ...]

Extract data from Microsoft Access Databases.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtxs(PathExtractorUnit):
    """
    Extract data from Microsoft Access Databases.
    """

    @PathExtractorUnit.Requires('access-parser', ['formats', 'office', 'extended'])
    def _access_parser():
        import access_parser
        return access_parser

    def unpack(self, data):
        # The parser is constructed from a file path, so the input is exposed through
        # a virtual file system while the parser object is being created.
        with VirtualFileSystem() as vfs:
            handle = vfs.new(data, 'accdb')
            database = self._access_parser.AccessParser(handle.path)

        for table_name in database.catalog:
            with NoLogging():
                columns = database.parse_table(table_name)
            if not columns:
                continue
            # Columns may differ in length; iterate up to the longest one and skip
            # the cells that are missing in shorter columns.
            row_count = max(map(len, columns.values()))
            for row in range(row_count):
                for header, cells in columns.items():
                    try:
                        value = cells[row]
                    except IndexError:
                        continue
                    if value is None:
                        continue
                    # Numbers become text, text becomes bytes; anything that is not
                    # bytes after these conversions is not emitted.
                    if isinstance(value, (int, float)):
                        value = str(value)
                    if isinstance(value, str):
                        value = value.encode(self.codec)
                    if not isinstance(value, bytes):
                        continue
                    yield UnpackResult(F'{table_name}/{row}/{header}', value)

    @classmethod
    def handles(cls, data) -> bool | None:
        # Only a positive answer is ever given; unknown headers yield None.
        header = data[:19]
        for magic in (
            b'\0\01\0\0Standard ACE DB',
            b'\0\01\0\0Standard Jet DB',
        ):
            if header == magic:
                return True
        return None

class xtzip (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xtzip and has the following commandline Interface:

usage: xtzip [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME]
             [-p PWD]
             [path ...]

Extract files from a Zip archive. This unit is a path extractor which extracts data from a
hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it
a meta variable that contains its path within the source structure. The positional arguments to
the command are patterns that can be used to filter the extracted items by their path. To view
only the paths of all chunks, use the listing switch:

    emit something | xtzip --list

Otherwise, extracted items are written to the standard output port and usually require a frame to
properly process. In order to dump all extracted data to disk, the following pipeline can be
used:

    emit something | xtzip [| dump {path} ]

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtzip(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extract files from a Zip archive.
    """
    # chardet is only imported inside the function body; it is used in unpack to guess
    # the encoding of file names that are not flagged as UTF-8.
    @ArchiveUnit.Requires('chardet', ['default', 'extended'])
    def _chardet():
        import chardet
        return chardet

    # pyzipper is only imported inside the function body; unpack switches to its
    # AESZipFile when the standard library raises NotImplementedError.
    @ArchiveUnit.Requires('pyzipper', ['arc', 'default', 'extended'])
    def _pyzipper():
        import pyzipper
        return pyzipper

    @classmethod
    def _carver(cls):
        # NOTE(review): presumably used by the base class to carve embedded archives
        # out of surrounding data; confirm against ArchiveUnit.
        return carve_zip

    def unpack(self, data: bytearray):
        from zipfile import BadZipFile, ZipFile, ZipInfo

        def password_invalid(password: bytes | None):
            # Returns True when reading from the archive fails with a RuntimeError
            # that mentions a password, and False when no such error occurred.
            # The function may replace the enclosing `archive` with a pyzipper
            # instance, which is why both variables are declared nonlocal.
            nonlocal archive, fallback
            if password:
                archive.setpassword(password)
            try:
                archive.testzip()
                # Probe the smallest files first so that a wrong password is detected
                # with as little decompression work as possible.
                files = (t for t in archive.infolist() if t.filename and not t.is_dir())
                files = sorted(files, key=lambda info: info.file_size)
                for info in files:
                    self.log_debug('testing password against:', info.filename)
                    try:
                        with archive.open(info.filename, "r") as test:
                            while test.read(1024):
                                pass
                    except BadZipFile:
                        # A damaged member says nothing about the password; try the next.
                        continue
                    else:
                        # One member that can be read completely is sufficient.
                        break
            except NotImplementedError:
                # Raised for unsupported compression methods. Retry exactly once with
                # pyzipper; if that already happened, the error is propagated.
                if fallback:
                    raise
                self.log_debug('compression method unsupported, switching to pyzipper')
                archive = self._pyzipper.AESZipFile(MemoryFile(data))
                fallback = True
                return password_invalid(password)
            except RuntimeError as E:
                # Only runtime errors that mention a password count as a rejection.
                if 'password' not in str(E):
                    raise
                return True
            else:
                if password:
                    self.log_debug('using password:', password)
                return False

        password = bytes(self.args.pwd)
        fallback = False
        archive = ZipFile(MemoryFile(data))
        passwords = [password]

        # Without a user-supplied password, a list of common passwords is attempted.
        if not password:
            passwords.extend(p.encode(self.codec) for p in self._COMMON_PASSWORDS)
        # Stop at the first password that is not rejected. If all of them are rejected,
        # extraction still proceeds and each item fails individually inside xt.
        for p in passwords:
            if not password_invalid(p):
                break

        for info in archive.infolist():
            # The default arguments bind the current archive and entry to the callback,
            # which is handed to _pack below rather than being called here.
            def xt(archive: ZipFile = archive, info: ZipInfo = info, data=memoryview(data)):
                try:
                    return archive.read(info.filename)
                except RuntimeError as E:
                    if 'password' not in str(E):
                        raise
                    # Offer the raw record data starting at the entry's header offset
                    # as a partial result.
                    msg = 'invalid password; use -L to extract raw encrypted data'
                    rec = _FileRecord(data[info.header_offset:])
                    raise RefineryPartialResult(msg, rec.data) from E

            # Directory entries are skipped; entries with an empty name are kept.
            if info.filename:
                if info.is_dir():
                    continue

            # courtesy of https://stackoverflow.com/a/37773438/9130824
            # When the UTF-8 flag is not set, the name is converted back to bytes via
            # code page 437 and decoded again with a guessed encoding.
            filename = info.filename
            if info.flag_bits & ZIP_FILENAME_UTF8_FLAG == 0:
                filename_bytes = filename.encode('437')
                try:
                    guessed_encoding = self._chardet.detect(filename_bytes)['encoding']
                except ImportError:
                    guessed_encoding = None
                # Fall back to cp1252 when chardet is unavailable or has no guess.
                guessed_encoding = guessed_encoding or 'cp1252'
                filename = filename_bytes.decode(guessed_encoding, 'replace')

            try:
                date = datetime(*info.date_time)
            except Exception as e:
                # An invalid timestamp is not fatal; the item is emitted without a date.
                self.log_info(F'{e!s} - unable to determine date from tuple {info.date_time} for: {filename}')
                date = None

            yield self._pack(filename, date, xt)

    @classmethod
    def handles(cls, data):
        # Data that starts with one of these PK signatures is accepted right away.
        if data[:4] in (
            B'PK\x03\x04',
            B'PK\x07\x08',
        ):
            return True
        # Everything else is only considered further if it looks like a PE file.
        if not is_likely_pe(data):
            return False
        memory = memoryview(data)
        # Search the last 0x400 bytes for the end of central directory signature.
        if 0 <= buffer_offset(memory[-0x400:], ZipEndOfCentralDirectory.SIGNATURE):
            return True
        pe = lief.load_pe_fast(data)
        offset = get_pe_size(pe)
        # Accept a PK signature within the first 0x1000 bytes after the computed PE size.
        if 0 <= buffer_offset(memory[offset:], B'PK\x03\x04') < 0x1000:
            return True
        if not pe.has_debug:
            return False
        # Last resort: check CodeView PDB paths for the strings 'sfxzip32' and 'WinRAR'.
        # If nothing matches, the method falls through and returns None.
        for entry in pe.debug:
            if not isinstance(entry, lief.PE.CodeViewPDB):
                continue
            path = entry.filename
            if not isinstance(path, str):
                path = codecs.decode(path, 'latin1')
            if 'sfxzip32' in path and 'WinRAR' in path:
                return True

class xtzpaq (*paths, index=False, pwd=b'', date=b'date', path=b'path', regex=False, exact=False, fuzzy=0, drop_path=False, join_path=False, list=False)

This unit is implemented in refinery.units.formats.archive.xtzpaq and has the following commandline Interface:

usage: xtzpaq [-h] [-L] [-Q] [-0] [-v] [-F] [-i] [-l] [-j | -d] [-z | -e] [-r] [-P NAME]
              [-D NAME] [-p PWD]
              [path ...]

Extract files from a ZPAQ archive. This unit is a path extractor which extracts data from a
hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it
a meta variable that contains its path within the source structure. The positional arguments to
the command are patterns that can be used to filter the extracted items by their path. To view
only the paths of all chunks, use the listing switch:

    emit something | xtzpaq --list

Otherwise, extracted items are written to the standard output port and usually require a frame to
properly process. In order to dump all extracted data to disk, the following pipeline can be
used:

    emit something | xtzpaq [| dump {path} ]

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

options:
  -i, --index      Archive is an index (no d-blocks).
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names with the previously existing one.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each pattern, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -F, --iff        Only apply unit if it can handle the input format. Specify twice to drop all
                   other chunks.

Expand source code Browse git

class xtzpaq(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extract files from a ZPAQ archive.
    """

    # Byte sequence that input data must start with for handles() to accept it.
    _MAGIC = B'7kSt\xA01\x83\xD3\x8C\xB2\x28\xB0\xD3zPQ'

    def __init__(
        self, *paths,
        index: Param[bool, Arg.Switch('-i', help='Archive is an index (no d-blocks).')] = False,
        **more
    ):
        # Sanity check: the array type codes must have the expected item sizes on this
        # platform; otherwise the unit refuses to be constructed.
        for _code, _size in {
            _TCU32: 4,
            _TCI32: 4,
            _TCU16: 2,
            _TCI16: 2,
        }.items():
            _item_size = array(_code).itemsize
            if _item_size == _size:
                continue
            raise RuntimeError(
                F'Expected array type "{_code}" to have entries of size {_size}, but the API '
                F'reports a size of {_item_size}.')

        super().__init__(*paths, index=index, **more)

    @classmethod
    def handles(cls, data) -> bool | None:
        # The input is accepted exactly when it starts with the magic byte sequence.
        return data[:len(cls._MAGIC)] == cls._MAGIC

    def unpack(self, data: bytearray):
        def mkdate(date) -> datetime:
            # Converts a decimal timestamp of the form YYYYMMDDHHMMSS (given as an
            # integer or a digit string) into a datetime by splitting off digit pairs.
            date = int(date)
            year = date // 1000000 // 10000
            month = date // 100000000 % 100
            day = date // 1000000 % 100
            hour = date // 10000 % 100
            minute = date // 100 % 100
            second = date % 100
            return datetime(year, month, day, hour, minute, second, 0)

        @dataclass
        class DT:
            # One file entry read from an i block: decimal date, attribute bits,
            # file name, and the list of fragment IDs that make up the contents.
            date: int = 0
            attr: int = 0
            name: str = ""
            frag: list[int] = field(default_factory=list)

            @property
            def dt(self) -> datetime | None:
                # Entries with a non-positive date have no timestamp (None is returned).
                if self.date > 0:
                    return mkdate(self.date)

        # TODO: implement password-protected archives
        # key = self.args.pwd
        index = self.args.index
        bsize: dict[int, int] = {}  # frag ID -> d block compressed size
        dt: dict[str, DT] = {}      # filename -> date, attr, frags
        frag: list[bytes] = []      # ID -> hash[20] size[4] data
        csize = 0                   # expected offset of next non d block
        streaming = False
        journaling = False

        done = False
        dc = Decompressor(data)
        src = dc.dec.src
        offset = 0

        # Outer loop: one iteration per block. Inner loop: one iteration per segment,
        # which ends when no further filename can be read from the current block.
        while not done and dc.read_block():
            while not done:
                filename = dc.read_filename()
                if filename is None:
                    break
                self.log_info('reading file', filename)
                comment = dc.read_comment()
                jsize = 0
                # A comment ending in "jDC\x01" marks a journaling segment; the leading
                # decimal digits of the comment are the expected uncompressed size.
                # Journaling and streaming segments must not be mixed in one archive.
                if comment and len(comment) >= 4 and comment[-4:] == "jDC\x01":
                    num = re.search('^\\d+', comment)
                    if not num:
                        raise RuntimeError('missing size in comment')
                    jsize = int(num[0])
                    if streaming:
                        raise RuntimeError('journaling block after streaming one')
                    journaling = True
                    self.log_info('archive type is journaling')
                else:
                    if journaling:
                        raise RuntimeError('streaming block after journaling one')
                    if index:
                        raise RuntimeError('streaming block in index')
                    streaming = True
                    self.log_info('archive type is streaming')

                # Test journaling filename. The format must be
                # jDC[YYYYMMDDHHMMSS][t][NNNNNNNNNN]
                # where YYYYMMDDHHMMSS is the date, t is the type {c,d,h,i}, and
                # NNNNNNNNNN is the 10 digit first fragment ID for types c,d,h.
                # They must be in ascending lexicographical order.

                frag_id = 0
                block_type = None

                if journaling:
                    if len(filename) != 28:
                        raise RuntimeError('filename size not 28')
                    if filename[:3] != 'jDC':
                        raise RuntimeError('filename not jDC')
                    block_type = filename[17]
                    if block_type not in 'cdhi':
                        raise RuntimeError('type not c,d,h,i')
                    # The date is only validated here; the parsed value is discarded.
                    try:
                        mkdate(filename[3:17])
                    except Exception as E:
                        raise RuntimeError('invalid date') from E
                    frag_id = int(filename[18:28])
                    if not 1 <= frag_id <= 4294967295:
                        raise RuntimeError('fragment ID out of range')

                # Decompress the segment into memory while computing its SHA1 digest.
                seg = MemoryFile(size_limit=jsize)
                dc.set_output(seg)
                sha1 = hashlib.sha1()
                dc.set_hasher(sha1)
                dc.decompress_data()

                if journaling and len(seg) != jsize:
                    raise RuntimeError('incomplete output')

                # A segment may end without a checksum; if one is present, it must match.
                checksum = dc.read_segment_end()
                if checksum is None:
                    self.log_debug('no checksum')
                elif checksum != sha1.digest():
                    raise RuntimeError('SHA1 mismatch')

                # check csize at first non-d block
                if csize and block_type and block_type in 'chi':
                    if csize != offset:
                        raise RuntimeError(F'csize={csize} does not point to offset={offset}')
                    csize = 0

                # get csize from c block
                seglen = len(seg)
                seg = StructReader(seg.getvalue())
                if block_type == 'c':
                    if seglen < 8:
                        raise RuntimeError("c block too small")
                    csize = seg.u64()
                    offset = src.tell() + 1
                    self.log_debug(F'csize={csize} at offset={offset}')
                    # The top bit of csize being set is treated as an unfinished
                    # transaction, which ends processing after this segment.
                    if csize >> 63:
                        self.log_warn('incomplete transaction at end of archive')
                        done = True
                    elif index and csize != 0:
                        raise RuntimeError('nonzero csize in index')
                    # Set csize to expected offset of first non d block
                    # assuming 1 more byte for unread end of block marker.
                    csize += offset

                if block_type == 'd':
                    if index:
                        raise RuntimeError('d block in index')
                    bsize[frag_id] = src.tell() + 1 - offset  # compressed size
                    self.log_debug(F' {bsize[frag_id]} -> {len(seg)}')
                    # Test frag size list at end. The format is f[id..id+n-1] fid n
                    # where fid may be id or 0. sizes must sum to the rest of block.
                    if seglen < 8:
                        raise RuntimeError('d block too small')
                    seg.seekset(-8)
                    fid = seg.u32() or frag_id
                    n = seg.u32()
                    if fid != frag_id:
                        raise RuntimeError('missing ID')
                    if n > (seglen - 8) // 4:
                        raise RuntimeError('frag list too big')
                    fragsum = 0  # computed sum of frag sizes
                    seg.seekset(-4 * (n + 2))
                    for _ in range(n):
                        fragsum += seg.u32()
                    if fragsum + n * 4 + 8 != seglen:
                        raise RuntimeError('bad frag size list')
                    # Save frag hashes and sizes. For output, save data too.
                    # The reader is positioned at the start of the size list, while
                    # `buffer` is consumed from the front to obtain the fragment data.
                    seg.seekset(fragsum)
                    buffer = seg.getvalue()
                    assert seg.remaining_bytes == n * 4 + 8
                    for i in range(n):
                        # Grow the fragment table with empty placeholders as needed.
                        while len(frag) <= frag_id + i:
                            frag.append(B'')
                        if frag[frag_id + i]:
                            raise RuntimeError('duplicate frag ID')
                        f = seg.u32()
                        h = hashlib.sha1(buffer[:f]).digest()
                        frag[frag_id + i] = h + f.to_bytes(4, 'little') + buffer[:f]
                        buffer = buffer[f:]

                    # After all fragments are consumed, only the size list and the
                    # trailing (fid, n) pair may remain.
                    assert len(buffer) == n * 4 + 8
                    assert seg.remaining_bytes == 8

                # Test and save h block. Format is: bsize (sha1[20] size)...
                # where bsize is the compressed size of the d block with the same id,
                # and each size corresponds to a fragment in that block. The list
                # must match the list in the d block if present.

                if block_type == 'h':
                    if seglen % 24 != 4:
                        raise RuntimeError('bad h block size')
                    b = seg.u32()
                    self.log_debug(F'[{frag_id}..{frag_id + seglen // 24}[ {b}')
                    fragsum = 0 # uncompressed size of all frags
                    for i in range(seglen // 24):
                        fd = seg.read(24)
                        if index:
                            # In an index there is no d block, so the 24-byte record
                            # (hash and size) is all that is stored for the fragment.
                            while len(frag) <= frag_id + i:
                                frag.append(B'')
                            if frag[frag_id + i]:
                                raise RuntimeError('data in index')
                            frag[frag_id + i] = fd
                        elif frag_id + i >= len(frag) or len(frag[frag_id + i]) < 24:
                            raise RuntimeError('no matching d block')
                        elif frag[frag_id + i][:24] != fd:
                            raise RuntimeError('frag size or hash mismatch')
                        fragsum += int.from_bytes(fd[20:24], 'little')

                # Test i blocks and save files to extract. Format is:
                #   date filename 0 na attr[0..na) ni ptr[0..ni)   (to update)
                #   0    filename                                  (to delete)
                # Date is 64 bits in YYYYMMDDHHMMSS format.

                if block_type == 'i':
                    while not seg.eof:
                        f = DT(seg.u64())
                        f.name = seg.read_c_string('utf8')
                        if f.date > 0:
                            na = seg.u32()
                            if na > 65535:
                                raise ValueError('attr size > 65535')
                            f.attr = seg.read_integer(na * 8)
                            ni = seg.u32()
                            for i in range(ni):
                                a = seg.u32()
                                f.frag.append(a)
                                # Fragment references are not validated for an index.
                                if index:
                                    continue
                                elif not 1 <= a < len(frag):
                                    raise RuntimeError('frag ID out of range')
                                elif not frag[a]:
                                    raise LookupError('missing frag data')
                        # A later entry with the same name replaces an earlier one;
                        # entries with date 0 are skipped when files are emitted below.
                        dt[f.name] = f

                # Streaming segments are emitted immediately, without a date.
                if streaming:
                    yield self._pack(filename, None, seg.getvalue())

            offset = src.tell()

        self.log_debug(F'{offset} bytes of archive tested')

        if not journaling:
            return

        # Journaling archives: assemble each remaining file from its fragments.
        for name, f in dt.items():
            if not f.date:
                continue
            # Expected size according to the 4-byte little-endian size field that
            # follows the 20-byte hash in each fragment record.
            size = sum(
                int.from_bytes(frag[fp][20:24], 'little')
                for fp in f.frag
                if 0 < fp < len(frag) and len(frag[fp]) >= 24
            )
            out = MemoryFile()
            for fp in f.frag:
                if fp < len(frag):
                    # Skip the 24-byte record header (hash and size) of the fragment.
                    out.write(memoryview(frag[fp])[24:])
            # A size mismatch is only reported; the data is emitted regardless.
            if len(out) != size:
                self.log_warn('invalid size during unpacking')
            yield self._pack(name, f.dt, out.getvalue())

class xxh (seed=0, text=False)

This unit is implemented in refinery.units.crypto.hash.xxhash and has the following commandline Interface:

usage: xxh [-h] [-L] [-Q] [-0] [-v] [-t] [seed]

Implements the xxHash hashing algorithm.

positional arguments:
  seed           specify the seed value; the default is 0

options:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class xxh(HashUnit):
    """
    Implements the xxHash hashing algorithm.
    """
    def __init__(
        self,
        seed: Param[int, Arg.Number(metavar='seed', help='specify the seed value; the default is {default}')] = 0,
        text=False
    ):
        # The text switch is passed on positionally; the seed is stored as an argument.
        super().__init__(text, seed=seed)

    def _algorithm(self, data):
        # Hash the input with the configured seed and return the raw digest.
        hasher = xxhash(data, self.args.seed)
        return hasher.digest()

class xxtea (key, iv=b'', padding=None, mode=None, raw=False, swap=False, block_size=1)

This unit is implemented in refinery.units.crypto.cipher.xxtea and has the following commandline Interface:

usage: xxtea [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-s] [-b N] key

positional arguments:
  key                 The encryption key.

options:
  -i, --iv IV         Specifies the initialization vector. If none is specified, then a block of
                      zero bytes is used.
  -p, --padding P     Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                      does nothing. By default, all other algorithms are attempted. In most
                      cases, the data was not correctly decrypted if none of these work.
  -m, --mode M        Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB,
                      OFB, PCBC. By default, the CBC mode is used when an IV is provided, and
                      ECB otherwise.
  -r, --raw           Set the padding to raw; ignored when a padding is specified.
  -s, --swap          Decode blocks as big endian rather than little endian.
  -b, --block-size N  Cipher block size in 32-bit words. The default value 1 implies that the
                      input is treated as a single block, which is common behaviour of many
                      implementations.

generic options:
  -h, --help          Show this help message and exit.
  -L, --lenient       Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet         Disables all log output.
  -0, --devnull       Do not produce any output.
  -v, --verbose       Specify up to two times to increase log level.
  -R, --reverse       Use the reverse operation.

Expand source code Browse git

class xxtea(TEAUnit, cipher=BlockCipherFactory(XXTEA)):

    # Block size in bytes; recomputed by _prepare_block before every operation.
    block_size: int = 4

    def __init__(
        self, key, iv=b'', padding=None, mode=None, raw=False, swap=False,
        block_size: Param[int, Arg.Number('-b', help=(
            'Cipher block size in 32-bit words. The default value {default} implies that the input '
            'is treated as a single block, which is common behaviour of many implementations.'))] = 1
    ):
        super().__init__(
            key,
            iv=iv,
            padding=padding,
            mode=mode,
            raw=raw,
            swap=swap,
            block_size=block_size,
        )

    def _prepare_block(self, data: bytes):
        # The argument is given in 32-bit words. A value of at most 1 selects the
        # single-block mode, where the block spans the whole input, rounded up to
        # a full word.
        words = self.args.block_size
        if words <= 1:
            words = -(-len(data) // 4)
        self.block_size = words * 4

    def encrypt(self, data: bytes) -> bytes:
        # The block size depends on the input and must be set before encrypting.
        self._prepare_block(data)
        ciphertext = super().encrypt(data)
        return ciphertext

    def decrypt(self, data: bytes) -> bytes:
        # The block size depends on the input and must be set before decrypting.
        self._prepare_block(data)
        plaintext = super().decrypt(data)
        return plaintext

    def _new_cipher(self, **optionals) -> CipherInterface:
        # Forward the byte order switch and the current block size to the cipher.
        return StandardBlockCipherUnit._new_cipher(
            self,
            big_endian=self.args.swap,
            block_size=self.block_size,
            **optionals
        )

class z85

This unit is implemented in refinery.units.encoding.z85 and has the following commandline Interface:

usage: z85 [-h] [-L] [-Q] [-0] [-v] [-R] [-F]

Z85 encoding and decoding, an alternative variant of Base85 with a different alphabet. This
variant derives its name from the developer, ZeroMQ.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class z85(Unit):
    """
    Z85 encoding and decoding, an alternative variant of Base85 with a different alphabet.
    This variant derives its name from the developer, ZeroMQ.
    """
    def reverse(self, data):
        # Encode with the standard library's Base85 codec, then run the result through the
        # encoding translation table.
        standard = base64.b85encode(data)
        return standard.translate(_z85_encode_translation)

    def process(self, data: bytearray):
        # Run the input through the decoding translation table first, so that the standard
        # library's Base85 decoder can be used on the result.
        standard = data.translate(_z85_decode_translation)
        return base64.b85decode(standard)

    @classmethod
    def handles(cls, data):
        # The unit claims the input only if the z85 pattern matches all of it.
        from refinery.lib.patterns import formats
        pattern = formats.z85s.value.bin
        matched = pattern.fullmatch(data)
        return matched is not None

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Encode `data` with the standard library's Base85 codec and pass the result through the
    `_z85_encode_translation` table.
    """
    standard = base64.b85encode(data)
    return standard.translate(_z85_encode_translation)

class zl (level=9, window=15, zlib_header=False, gzip_header=False)

This unit is implemented in refinery.units.compression.zl and has the following command-line interface:

usage: zl [-h] [-L] [-Q] [-0] [-v] [-R] [-F] [-l N] [-w N] [-z | -g]

ZLib compression and decompression.

options:
  -l, --level N      Specify a compression level between 0 and 9.
  -w, --window N     Manually specify the window size between 8 and 15.
  -z, --zlib-header  Use a ZLIB header.
  -g, --gzip-header  Use a GZIP header.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.
  -R, --reverse      Use the reverse operation.
  -F, --iff          Only apply unit if it can handle the input format. Specify twice to drop all
                     other chunks.

Expand source code Browse git

class zl(Unit):
    """
    ZLib compression and decompression.
    """

    def __init__(
        self,
        level: Param[int, Arg.Number('-l', bound=(0, 0X9), help='Specify a compression level between 0 and 9.')] = 9,
        window: Param[int, Arg.Number('-w', bound=(8, 0XF), help='Manually specify the window size between 8 and 15.')] = 15,
        zlib_header: Param[bool, Arg.Switch('-z', group='MODE', help='Use a ZLIB header.')] = False,
        gzip_header: Param[bool, Arg.Switch('-g', group='MODE', help='Use a GZIP header.')] = False
    ):
        # The two header switches are mutually exclusive; this check also covers construction
        # from Python code, where both keyword arguments could be passed as True.
        if zlib_header and gzip_header:
            raise ValueError('You can only specify one header type (ZLIB or GZIP).')
        return super().__init__(level=level, window=window, zlib_header=zlib_header, gzip_header=gzip_header)

    def _decompress_data(self, data, mode: int, step: int):
        """
        Decompress a single stream from `data` with a `zlib.decompressobj` created for the given
        `mode`, feeding it at most `step` bytes per call. Returns a pair consisting of the
        decompressed output and a memoryview of the input that remains. When zlib fails before
        any output was collected, the `zlib.error` propagates unchanged; when it fails after
        some output was collected, a `RefineryPartialResult` carrying that output is raised.
        """
        zl = zlib.decompressobj(mode)
        memory = memoryview(data)
        result = bytearray()
        while not zl.eof:
            # Never hand more than `step` bytes to the decompressor at once.
            read = min(step, len(memory))
            try:
                chunk = zl.decompress(memory[:read])
            except zlib.error as e:
                # Nothing decompressed yet: let the caller see the original zlib error.
                if not result:
                    raise
                raise RefineryPartialResult(exception_to_string(e), result) from e
            else:
                result.extend(chunk)
                # Bytes reported in `unused_data` were passed in but lie beyond the end of the
                # compressed stream, so they do not count as consumed.
                consumed = read - len(zl.unused_data)
                # Stop when the input is exhausted or when no progress was made.
                if not memory or consumed == 0:
                    break
                memory = memory[consumed:]
        return result, memory

    def process(self, data):
        """
        Decompress `data`, which may contain several concatenated streams, and yield the output
        of each stream. Several zlib modes are tried for the first stream; once one succeeds,
        only that mode is used for the following streams. Raises the first recorded error (or a
        `ValueError`) when no stream could be decompressed at all, and a `RefineryPartialResult`
        containing the leftover bytes when input remains after the last decompressed stream.
        """
        # Mode values are zlib `wbits` settings: `window | 0x20` enables automatic detection of
        # a zlib or gzip header, a negative window selects a raw deflate stream. The order in
        # which these two are tried depends on whether the input looks like it has a header or
        # whether a header type was requested explicitly.
        if data[0] == 0x78 or data[0:2] == B'\x1F\x8B' or self.args.zlib_header or self.args.gzip_header:
            modes = [self.args.window | 0x20, -self.args.window]
        else:
            modes = [-self.args.window, self.args.window | 0x20]
        # Fallbacks: gzip only (`0x10 | window`), and 0, which lets zlib take the window size
        # from the zlib header.
        modes.extend([0x10 | self.args.window, 0])
        view = memoryview(data)
        rest = view
        # With leniency enabled the input is fed in 32-byte steps instead of all at once;
        # presumably so that more output is collected before an error occurs (TODO confirm).
        step = 32 if self.leniency > 0 else len(data)
        count = 0
        error = None
        for k in itertools.count(1):
            error = None
            for mode in modes:
                msg = F'decompressing chunk {k} with mode {mode & 0xFF:02X}'
                try:
                    out, rest = self._decompress_data(view, mode, step)
                    yield out
                except Exception as e:
                    self.log_info(F'{msg} failed: {e!s}')
                    # Remember only the first failure of this round.
                    error = error or e
                else:
                    self.log_info(F'{msg} ok, remaining data:', rest, clip=True)
                    count += 1
                    error = None
                    # Stick to the mode that worked for all subsequent streams.
                    modes = [mode]
                    break
            # Stop on failure, when all input was used, or when nothing was consumed.
            if error or not rest or len(rest) == len(view):
                break
            if len(rest) > len(view):
                raise RuntimeError('Decompressor returned more tail data than input data.')
            view = rest
        if count <= 0:
            raise error or ValueError('Could not detect any zlib stream.')
        if rest:
            from refinery.lib.meta import SizeInt
            size = SizeInt(len(rest))
            raise RefineryPartialResult(F'{size!r} excess data after compressed stream', rest)

    def reverse(self, data):
        """
        Compress `data` at the configured level. The output is a raw deflate stream unless a
        zlib or gzip header was requested.
        """
        # Negative window: raw deflate without any header.
        mode = -self.args.window
        if self.args.zlib_header:
            # Positive window: zlib header.
            mode = -mode
        if self.args.gzip_header:
            # Positive window with bit 0x10 set: gzip header.
            mode = -mode | 0x10
        self.log_debug(F'using mode {mode:+2d} for compression')
        zl = zlib.compressobj(self.args.level, zlib.DEFLATED, mode)
        zz = zl.compress(data)
        return zz + zl.flush(zlib.Z_FINISH)

    @classmethod
    def handles(cls, data):
        # Returns True when the first two bytes equal one of the signatures below. Otherwise
        # the method falls off the end and returns None, not False.
        for sig in (
            B'\x1F\x8B',  # gzip header
            B'\x78\x01',  # zlib low compression
            B'\x78\x9C',  # zlib medium compression
            B'\x78\xDA',  # zlib high compression
        ):
            if data[:2] == sig:
                return True

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Compress `data` at the configured level. The output is a raw deflate stream unless a zlib
    or gzip header was requested through the unit's arguments.
    """
    options = self.args
    # A negative window size selects a raw deflate stream without header.
    mode = -options.window
    if options.zlib_header:
        # Positive window size: emit a zlib header.
        mode = -mode
    if options.gzip_header:
        # Positive window size with bit 0x10 set: emit a gzip header.
        mode = -mode | 0x10
    self.log_debug(F'using mode {mode:+2d} for compression')
    compressor = zlib.compressobj(options.level, zlib.DEFLATED, mode)
    body = compressor.compress(data)
    tail = compressor.flush(zlib.Z_FINISH)
    return body + tail

class zstd

This unit is implemented in refinery.units.compression.zstd and has the following command-line interface:

usage: zstd [-h] [-L] [-Q] [-0] [-v] [-R] [-F]

ZStandard (ZSTD) compression and decompression.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Increase the leniency, allowing partial results and ignoring more errors.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation.
  -F, --iff      Only apply unit if it can handle the input format. Specify twice to drop all
                 other chunks.

Expand source code Browse git

class zstd(Unit):
    """
    ZStandard (ZSTD) compression and decompression.
    """
    @Unit.Requires('pyzstd', ['all'])
    def _pyzstd():
        # The import lives inside this function so that pyzstd is only imported when it runs.
        import pyzstd
        return pyzstd

    def process(self, data):
        # Decompress in one go; if the decompressor still asks for input afterwards, the
        # output obtained so far is reported as a partial result.
        decompressor = self._pyzstd.ZstdDecompressor()
        output = decompressor.decompress(data)
        if not decompressor.needs_input:
            return output
        raise RefineryPartialResult('Incomplete ZSTD stream.', output)

    def reverse(self, data):
        # Compressed body followed by whatever the compressor still holds back until flushed.
        compressor = self._pyzstd.ZstdCompressor()
        body = compressor.compress(data)
        tail = compressor.flush()
        return body + tail

    @classmethod
    def handles(cls, data) -> bool:
        # The input is claimed when it starts with these four signature bytes.
        magic = B'\x28\xB5\x2F\xFD'
        return data[:4] == magic

Methods

def reverse(self, data)

Expand source code Browse git

def reverse(self, data):
    """
    Compress `data` with a fresh `ZstdCompressor` and return the compressed bytes followed by
    the data produced when the compressor is flushed.
    """
    compressor = self._pyzstd.ZstdCompressor()
    body = compressor.compress(data)
    tail = compressor.flush()
    return body + tail