Module `refinery`

        __     __  High Octane Triage Analysis          __
        ||    _||______ __       __________     _____   ||
        ||    \||___   \__| ____/   ______/___ / ____\  ||
========||=====||  | __/  |/    \  /==|  / __ \   __\===]|
        '======||  |   \  |   |  \_  _| \  ___/|  |     ||
               ||____  /__|___|__/  / |  \____]|  |     ||
===============''====\/=========/  /==|__|=====|__|======'
                               \  /
                                \/

This is the binary refinery package documentation; see GitHub and PyPi for more information.

The package refinery exports all Units which are of type Entry; this marker implies that the unit exposes a shell command. The command line interface for each of these units is given below, this is the same text as would be available by executing the command with the -h or --help option. The documentation for this module only lists the classes that correspond to exported refinery units, but for convenience, the refinery module also exports the classes Unit and Arg.

To better understand how the command line parameters are parsed, it is also recommended to study the module documentation of the following library modules, as their content is relevant for how the various Units can be combined.

refinery.lib.frame: framing syntax for working on lists of binary chunks
refinery.lib.argformats: the multibin syntax for refinery arguments
refinery.lib.meta: defining and using metadata variables within frames
refinery.units: writing custom units, add command-line arguments, and how to use refinery units within Python code.

Expand source code Browse git

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
R"""
    ----------------------------------------------------------
            __     __  High Octane Triage Analysis          __
            ||    _||______ __       __________     _____   ||
            ||    \||___   \__| ____/   ______/___ / ____\  ||
    ========||=====||  | __/  |/    \  /==|  / __ \   __\===]|
            '======||  |   \  |   |  \_  _| \  ___/|  |     ||
                   ||____  /__|___|__/  / |  \____]|  |     ||
    ===============''====\/=========/  /==|__|=====|__|======'
                                   \  /
                                    \/

This is the binary refinery package documentation; see
 [GitHub](https://github.com/binref/refinery/) and
 [PyPi](https://pypi.org/project/binary-refinery/)
for more information.

The package `refinery` exports all `refinery.units.Unit`s which are of type `refinery.units.Entry`;
this marker implies that the unit exposes a shell command. The command line interface for each of
these units is given below, this is the same text as would be available by executing the command
with the `-h` or `--help` option. The documentation for this module only lists the classes that
correspond to exported refinery units, but for convenience, the `refinery` module also exports the
classes `refinery.units.Unit` and `refinery.units.Arg`.

To better understand how the command line parameters are parsed, it is also recommended to study
the module documentation of the following library modules, as their content is relevant for how the
various `refinery.units.Unit`s can be combined.

1. `refinery.lib.frame`: framing syntax for working on lists of binary chunks
2. `refinery.lib.argformats`: the multibin syntax for refinery arguments
3. `refinery.lib.meta`: defining and using metadata variables within frames
4. `refinery.units`: writing custom units, add command-line arguments, and how to use refinery
   units within Python code.
"""
__version__ = '0.6.38'
__distribution__ = 'binary-refinery'

from typing import Dict, List, Optional, Type
from importlib import resources
from datetime import datetime
from threading import RLock

import pickle

from refinery.units import Arg, Unit


def _singleton(cls):
    return cls()


@_singleton
class __unit_loader__:
    """
    Every unit can be imported from the refinery base module. The import is performed on demand to
    reduce import times. The library ships with a pickled dictionary that maps unit names to their
    corresponding module path. This data is expected to be stored as `__init__.pkl` in the package
    directory.
    """
    units: Dict[str, str]
    cache: Dict[str, Type[Unit]]
    _lock: RLock = RLock()

    def __init__(self):
        with resources.path(__name__, '__init__.py') as current_file:
            # This is an annoying hack to allow this to work when __init__.pkl does not
            # yet exist during setup. Starting with Python 3.9, we could use the slightly
            # less awkward: resources.files(__name__).joinpath('__init__.pkl')
            self.path = current_file.parent / '__init__.pkl'
        self.reloading = False
        self.loaded = False
        self.units = {}
        self.cache = {}
        self.last_reload = datetime(1985, 8, 5)
        self.load()

    def __enter__(self):
        return self._lock.__enter__()

    def __exit__(self, et, ev, tb):
        return self._lock.__exit__(et, ev, tb)

    def load(self):
        try:
            self.units = pickle.load(self.path.open('rb'))
        except (FileNotFoundError, EOFError):
            self.reload()
        else:
            self.loaded = True

    def clear(self):
        self.loaded = False
        self.units.clear()
        self.cache.clear()

    def save(self):
        try:
            pickle.dump(self.units, self.path.open('wb'))
        except Exception:
            pass
        else:
            self.loaded = True

    def reload(self):
        if not self.reloading:
            from refinery.lib.loader import get_all_entry_points
            self.reloading = True
            self.clear()
            for executable in get_all_entry_points():
                name = executable.__qualname__
                self.units[name] = executable.__module__
                self.cache[name] = executable
            self.save()
            self.reloading = False

    def resolve(self, name) -> Optional[Unit]:
        if not self.loaded:
            self.load()
        try:
            module_path = self.units[name]
            module = __import__(module_path, None, None, [name])
            entry = getattr(module, name)
            self.cache[name] = entry
            return entry
        except (KeyError, ModuleNotFoundError):
            return None


@_singleton
class __pdoc__(dict):
    def __init__(self, *a, **kw):
        super().__init__()
        self._loaded = False

    def _strip_globals(self, hlp: str):
        def _strip(lines):
            triggered = False
            for line in lines:
                if triggered:
                    if line.lstrip() != line:
                        continue
                    triggered = False
                if line.lower().startswith('global options:'):
                    triggered = True
                    continue
                yield line
        return ''.join(_strip(hlp.splitlines(keepends=True)))

    def _load(self):
        if self._loaded:
            return
        from .explore import get_help_string
        self['Unit'] = False
        self['Arg'] = False
        with __unit_loader__:
            for name in __unit_loader__.units:
                unit = __unit_loader__.resolve(name)
                for base in unit.mro():
                    try:
                        abstractmethods: List[str] = base.__abstractmethods__
                    except AttributeError:
                        break
                    for method in abstractmethods:
                        if method.startswith('_'):
                            continue
                        at = getattr(unit, method, None)
                        bt = getattr(unit.mro()[1], method, None)
                        if at and at is not bt:
                            self[F'{name}.{method}'] = False
                hlp = get_help_string(unit, width=97)
                hlp = hlp.replace('\x60', '')
                hlp = self._strip_globals(hlp).strip()
                hlp = (
                    F'This unit is implemented in `{unit.__module__}` and has the following '
                    F'commandline Interface:\n```text\n{hlp}\n```'
                )
                self[name] = hlp
        self._loaded = True

    def items(self):
        self._load()
        return super().items()


__all__ = sorted(__unit_loader__.units, key=lambda x: x.lower()) + [
    Unit.__name__, Arg.__name__, '__unit_loader__', '__pdoc__']


def load(name) -> Optional[Unit]:
    with __unit_loader__:
        return __unit_loader__.resolve(name)


def __getattr__(name):
    with __unit_loader__:
        unit = __unit_loader__.resolve(name)
    if unit is None:
        raise AttributeError(name)
    return unit


def __dir__():
    return __all__

Sub-modules

refinery.data: This module contains data resources.
refinery.explore: A commandline script to search for binary refinery units based on keywords.
refinery.lib: Library functions used by various refinery units.
refinery.shell: Shell-Like Unit Interface …
refinery.units: This package contains all refinery units. To write an executable refinery unit, it is sufficient to write a class that inherits from …

Units

class a3x (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.a3x and has the following commandline Interface:

usage: a3x [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]]

Extracts embedded resources from compiled AutoIt scripts and decompiles the embedded script
bytecode. The unit also works on compiled AutoIt executables.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class a3x(PathExtractorUnit):
    """
    Extracts embedded resources from compiled AutoIt scripts and decompiles the embedded script
    bytecode. The unit also works on compiled AutoIt executables.
    """

    def unpack(self, data: bytearray):
        view = memoryview(data)
        cursor = 0
        errors: Dict[int, Exception] = {}
        script_count = 0
        truncated: Set[A3xRecord] = set()
        intact: Set[A3xRecord] = set()

        def _package(records: Iterable[A3xRecord]) -> Generator[UnpackResult, None, None]:
            for k, record in enumerate(records, 1):
                self.log_info(F'record {k} type:', record.type)
                self.log_info(F'record {k} path:', record.src_path)
                if record.path is None:
                    continue
                yield UnpackResult(
                    record.path,
                    record.extract,
                    srcpath=record.src_path,
                    created=record.created.isoformat(' ', 'seconds'),
                    written=record.written.isoformat(' ', 'seconds'),
                )

        while cursor < len(view):
            self.log_debug(F'searching at offset 0x{cursor:08X}')
            nc = data.find(A3xScript.MAGIC, cursor)
            if nc >= 0:
                cursor = nc
            else:
                rp = data.find(A3xRecord.MAGIC, cursor) - A3xScript.WIDTH
                if rp <= cursor:
                    break
                cursor = rp
            try:
                script = A3xScript(view[cursor:])
            except Exception as E:
                errors[cursor] = E
                cursor += 1
                continue
            else:
                valid = script.has_valid_magic()
                if valid:
                    _m = 'correct'
                else:
                    _m = 'invalid'
                if not script.body:
                    cursor += A3xScript.WIDTH
                    if not script.has_valid_magic():
                        cursor += len(A3xRecord.MAGIC)
                    continue
                if script.truncated:
                    _a = 'truncated'
                    truncated.update(script.body)
                else:
                    script_count += 1
                    _a = 'intact'
                    intact.update(script.body)
                self.log_info(
                    F'{_a} script of type', script.type,
                    F'and length 0x{len(script):08X}',
                    F'with {len(script.body)} records and {_m} magic:',
                    script.magic
                )
                cursor += len(script)
                if script.truncated:
                    if not script.has_valid_magic():
                        cursor += len(A3xRecord.MAGIC)
                    continue

            yield from _package(script.body)

        remaining = truncated - intact
        if remaining:
            self.log_warn('emitting records from truncated scripts')
            yield from _package(remaining)
            return
        elif truncated:
            self.log_debug('good news: intact scripts contained all records from truncated scripts')
        if script_count == 0:
            error = None
            for offset, error in errors.items():
                self.log_warn(F'error at offset 0x{offset:08X}:', error)
            if error:
                raise error

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        return A3xScript.MAGIC in data or A3xRecord.MAGIC in data

class add (argument, bigendian=False, blocksize=None)

This unit is implemented in refinery.units.blockwise.add and has the following commandline Interface:

usage: add [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] argument

Add the given argument to each block.

positional arguments:
  argument           A single numeric expression which provides the right argument to the
                     operation, where the left argument is each block in the input data. This
                     argument can also contain a sequence of bytes which is then split into
                     blocks of the same size as the input data and used cyclically.

optional arguments:
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes, default is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class add(BinaryOperationWithAutoBlockAdjustment):
    """
    Add the given argument to each block.
    """
    @staticmethod
    def operate(a, b): return a + b
    @staticmethod
    def inplace(a, b): a += b

class adler32 (text=False)

This unit is implemented in refinery.units.crypto.hash.checksums and has the following commandline Interface:

usage: adler32 [-h] [-L] [-Q] [-0] [-v] [-t]

Returns the Adler32 Hash of the input data.

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class adler32(HashUnit):
    """
    Returns the Adler32 Hash of the input data.
    """
    def _algorithm(self, data: bytes) -> bytes:
        return struct.pack('>I', zlib.adler32(data))

class aes (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)

This unit is implemented in refinery.units.crypto.cipher.aes and has the following commandline Interface:

usage: aes [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-M N] [-A N] key

AES encryption and decryption.

positional arguments:
  key                   The encryption key.

optional arguments:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CCM, CFB, CTR,
                        EAX, ECB, GCM, OCB, OFB. By default, the CBC mode is used when an IV is
                        is provided, and ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of bits into which data is segmented. It must be a
                        multiple of 8. The default of 0 means that the block size will be used as
                        the segment size.
  -M, --mac-len N       Only for EAX, GCM, OCB, and CCM: Length of the authentication tag, in
                        bytes.
  -A, --assoc-len N     Only for CCM: Length of the associated data. If not specified, all
                        associated data is buffered internally.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Allow partial results as output.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation; Specify twice to normalize (first decode, then
                        encode).

Expand source code Browse git

class aes(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(AES)):
    """
    AES encryption and decryption.
    """
    pass

class alu (operator, *argument, seed=0, prologue=None, epilogue=None, inc=False, dec=False, cbc=False, bigendian=False, blocksize=None, precision=None)

This unit is implemented in refinery.units.blockwise.alu and has the following commandline Interface:

usage: alu [-h] [-L] [-Q] [-0] [-v] [-s SEED] [-p E] [-e E | -I | -D | -X] [-E] [-B N] [-P N]
           operator [argument [argument ...]]

The arithmetic-logical unit. It allows you to specify a custom Python expression where the
following variables are allowed:

- the variable A: same as V[0]
- the variable B: current block
- the variable N: number of bytes in the input
- the variable K: current index in the input
- the variable S: the internal state value
- the variable V: the vector of arguments
- the variable I: function that casts to a signed int in current precision
- the variable U: function that casts to unsigned int in current precision
- the variable R: function; R(x,4) rotates x by 4 to the right
- the variable L: function; L(x,4) rotates x by 4 to the left
- the variable M: function; M(x,8) picks the lower 8 bits of x
- the variable X: function that negates the bits of the input

(The rotation operations are interpreted as shifts when arbitrary precision is used.)

Each block of the input is replaced by the value of this expression. Additionally, it is possible
to specify prologue and epilogue expressions which are used to update the state variable S before
and after the update of each block, respectively.

positional arguments:
  operator           A Python expression defining the operation.
  argument           A single numeric expression which provides the right argument to the
                     operation, where the left argument is each block in the input data. This
                     argument can also contain a sequence of bytes which is then split into
                     blocks of the same size as the input data and used cyclically.

optional arguments:
  -s, --seed SEED    Optional seed value for the state variable S. The default is zero. This can
                     be an expression involving the variable N.
  -p, --prologue E   Optional expression with which the state variable S is updated before a
                     block is operated on.
  -e, --epilogue E   Optional expression with which the state variable S is updated after a block
                     was operated on.
  -I, --inc          equivalent to --epilogue=S+1
  -D, --dec          equivalent to --epilogue=S-1
  -X, --cbc          equivalent to --epilogue=(B)
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes, default is 1.
  -P, --precision N  The size of the variables used for computing the result. By default, this is
                     equal to the block size. The value may be zero, indicating that arbitrary
                     precision is required.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class alu(ArithmeticUnit):
    """
    The arithmetic-logical unit. It allows you to specify a custom Python expression where the following
    variables are allowed:

    - the variable `A`: same as `V[0]`
    - the variable `B`: current block
    - the variable `N`: number of bytes in the input
    - the variable `K`: current index in the input
    - the variable `S`: the internal state value
    - the variable `V`: the vector of arguments
    - the variable `I`: function that casts to a signed int in current precision
    - the variable `U`: function that casts to unsigned int in current precision
    - the variable `R`: function; `R(x,4)` rotates x by 4 to the right
    - the variable `L`: function; `L(x,4)` rotates x by 4 to the left
    - the variable `M`: function; `M(x,8)` picks the lower 8 bits of x
    - the variable `X`: function that negates the bits of the input

    (The rotation operations are interpreted as shifts when arbitrary precision is used.)

    Each block of the input is replaced by the value of this expression. Additionally, it is possible to
    specify prologue and epilogue expressions which are used to update the state variable `S` before and
    after the update of each block, respectively.
    """

    @staticmethod
    def _parse_op(definition, default=None):
        """
        An argparse type which uses the `refinery.lib.argformats.PythonExpression` parser to parse the
        expressions that can be passed to `refinery.alu`. Essentially, these are Python expressions which can
        contain variables `B`, `A`, `S`, and `V`.
        """
        if not definition:
            if default is None:
                raise ValueError('No definition given')
            definition = default
        return PythonExpression(definition, *'IBASMNVRLX', all_variables_allowed=True)

    def __init__(
        self, operator: Arg(type=str, help='A Python expression defining the operation.'), *argument,
        seed: Arg('-s', type=str, help=(
            'Optional seed value for the state variable S. The default is zero. This can be an expression '
            'involving the variable N.')) = 0,
        prologue: Arg('-p', type=str, metavar='E', help=(
            'Optional expression with which the state variable S is updated before a block is operated on.')) = None,
        epilogue: Arg('-e', type=str, metavar='E', group='EPI', help=(
            'Optional expression with which the state variable S is updated after a block was operated on.')) = None,
        inc: Arg('-I', group='EPI', help='equivalent to --epilogue=S+1') = False,
        dec: Arg('-D', group='EPI', help='equivalent to --epilogue=S-1') = False,
        cbc: Arg('-X', group='EPI', help='equivalent to --epilogue=(B)') = False,
        bigendian=False, blocksize=None, precision=None
    ):
        for flag, flag_is_set, expression in [
            ('--cbc', cbc, '(B)'),
            ('--inc', inc, 'S+1'),
            ('--dec', dec, 'S-1'),
        ]:
            if flag_is_set:
                if epilogue is not None:
                    raise ValueError(
                        F'Ambiguous specification; epilogue was already set to {epilogue} '
                        F'when {flag} was parsed.'
                    )
                epilogue = expression

        self._index = IndexCounter()

        super().__init__(
            self._index,
            *argument,
            bigendian=bigendian,
            blocksize=blocksize,
            precision=precision,
            seed=seed,
            operator=self._parse_op(operator),
            prologue=self._parse_op(prologue, 'S'),
            epilogue=self._parse_op(epilogue, 'S'),
        )

    @property
    def _is_ecb(self):
        return not self.args.epilogue and not self.args.prologue

    def _fastblock(self, _):
        raise FastBlockError

    def process(self, data):
        context = dict(metavars(data))
        seed = self.args.seed
        if isinstance(seed, str):
            seed = PythonExpression(seed, 'N', constants=metavars(data))
        if callable(seed):
            seed = seed(context, N=len(data))
        self._index.init(self.fmask)
        prologue = self.args.prologue.expression
        epilogue = self.args.epilogue.expression
        operator = self.args.operator.expression
        fbits = self.fbits
        fmask = self.fmask

        def cast_unsigned(n) -> int:
            return int(n) & fmask

        def cast_signed(n) -> int:
            n = int(n) & fmask
            if n >> (fbits - 1):
                return -((~n + 1) & fmask)
            else:
                return n

        if fbits is INF:
            def rotate_r(n, k): return n >> k
            def rotate_l(n, k): return n << k
        else:
            def rotate_r(n, k): return (n >> k) | (n << (fbits - k)) & fmask
            def rotate_l(n, k): return (n << k) | (n >> (fbits - k)) & fmask

        def negate_bits(n):
            return n ^ fmask

        def mask_to_bits(x, b):
            return x & ((1 << b) - 1)

        context.update(
            N=len(data),
            S=seed,
            I=cast_signed,
            U=cast_unsigned,
            R=rotate_r,
            L=rotate_l,
            X=negate_bits,
            M=mask_to_bits,
        )

        def operate(block, index, *args):
            context.update(K=index, B=block, V=args)
            if args:
                context['A'] = args[0]
            context['S'] = eval(prologue, None, context)
            context['B'] = eval(operator, None, context)
            context['S'] = eval(epilogue, None, context)
            return context['B']

        placeholder = self.operate
        self.operate = operate
        result = super().process(data)
        self.operate = placeholder
        return result

    @staticmethod
    def operate(block, index, *args):
        raise RuntimeError('This operate method cannot be called.')

    def inplace(self, block, *args) -> None:
        super().inplace(block, *args)

class aplib

This unit is implemented in refinery.units.compression.ap and has the following commandline Interface:

usage: aplib [-h] [-L] [-Q] [-0] [-v] [-R]

APLib compression and decompression.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class aplib(Unit):
    """
    APLib compression and decompression.
    """

    def reverse(self, buf):
        return compressor(buf).compress()

    def process(self, buf):
        return decompressor(buf).decompress()

class asm (mode='x32', *, count=None, until=None, no_address=False, no_hexdump=False)

This unit is implemented in refinery.units.sinks.asm and has the following commandline Interface:

usage: asm [-h] [-L] [-Q] [-0] [-v] [-c N] [-u STR] [-A] [-H] [[x32|x64|..]]

Disassembles the input data using capstone and produces a human-readable disassembly listing. It
internally uses the opc unit for this, which is an alternative option if you are looking for more
programmatic disassembly.

positional arguments:
  [x32|x64|..]      Machine code architecture, default is x32. Select from the following list:
                    x16, x32, x64, ppc32, ppc64, mips32, mips64.

optional arguments:
  -c, --count N     Maximum number of bytes to disassemble, infinite by default.
  -u, --until STR   Disassemble until the given string appears among the disassembly.
  -A, --no-address  Disable address display.
  -H, --no-hexdump  Disable opcodes hexdump.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Allow partial results as output.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class asm(opc):
    """
    Disassembles the input data using capstone and produces a human-readable disassembly listing.
    It internally uses the `refinery.opc` unit for this, which is an alternative option if you are
    looking for more programmatic disassembly.
    """
    def __init__(
        self, mode='x32', *, count=None, until=None,
        no_address: Arg.Switch('-A', help='Disable address display.') = False,
        no_hexdump: Arg.Switch('-H', help='Disable opcodes hexdump.') = False,
    ):
        super().__init__(
            mode=mode,
            nvar='_name',
            avar='_addr',
            ovar='_arg',
            count=count,
            until=until,
            no_address=no_address,
            no_hexdump=no_hexdump,
        )

    def process(self, data):
        insns = list(super().process(data))
        if not insns:
            return

        no_address = self.args.no_address
        no_hexdump = self.args.no_hexdump

        def _hl(x): return len(hex(x))

        args_width = max(len(insn['_args']) for insn in insns)
        memo_width = max(len(insn['_name']) for insn in insns)
        addr_width = max(_hl(insn['_addr']) for insn in insns)

        if no_address:
            addr_width = 0
            memo_width = memo_width + 2

        max_data_bytes_count = max(len(c) for c in insns)

        padding = addr_width + memo_width + args_width + 10
        metrics_opc = HexDumpMetrics(max_data_bytes_count, padding=padding)

        for insn in insns:
            hd = one(hexdump(insn, metrics_opc))
            name = insn.meta.pop('_name')
            args = insn.meta.pop('_args')
            addr = insn.meta.pop('_addr')
            msg = F' {name:<{memo_width}}  {args:<{args_width}}'
            if not no_hexdump:
                msg = F'{msg}  ; {hd}'
            if not no_address:
                msg = F'{addr:0{addr_width}X}: {msg}'
            yield msg.encode(self.codec)

class atbash

This unit is implemented in refinery.units.encoding.atbash and has the following commandline Interface:

usage: atbash [-h] [-L] [-Q] [-0] [-v] [-R]

https://en.wikipedia.org/wiki/Atbash Atbash encoding and decoding. Fairly useless in the 21st
century, except for picking out crypto nerds.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class atbash(Unit):
    """
    https://en.wikipedia.org/wiki/Atbash
    Atbash encoding and decoding. Fairly useless in the 21st century, except
    for picking out crypto nerds.
    """

    def process(self, data: bytearray):
        uc = range(B'A'[0], B'Z'[0] + 1)
        lc = range(B'a'[0], B'z'[0] + 1)
        for k, letter in enumerate(data):
            if letter in uc:
                data[k] = uc[~uc.index(letter)]
                continue
            if letter in lc:
                data[k] = lc[~lc.index(letter)]
                continue
        return data

    reverse = process

class autoxor (range=slice(1, 32, None))

This unit is implemented in refinery.units.misc.autoxor and has the following commandline Interface:

usage: autoxor [-h] [-L] [-Q] [-0] [-v] [start:end:step]

Assumes a XOR-encoded input and automatically attempts to find the correct XOR key. The method is
based on the assumption that the plaintext input contains one letter that occurs with a much
higher frequency than all other letters; this is the case for the null byte in PEs, and also for
the space character in many text files.

positional arguments:
  start:end:step  range of length values to try in Python slice syntax, the default is 1:32.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Allow partial results as output.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class autoxor(xkey):
    """
    Assumes a XOR-encoded input and automatically attempts to find the correct XOR key. The method
    is based on the assumption that the plaintext input contains one letter that occurs with a much
    higher frequency than all other letters; this is the case for the null byte in PEs, and also
    for the space character in many text files.
    """
    def process(self, data: bytearray):
        key = super().process(data)
        bin, = data | xor(key)
        txt, = bin | xor(0x20)
        if re.fullmatch(BR'[\s!-~]+', txt):
            key = bytes(key | xor(0x20))
            bin = txt
        return self.labelled(bin, key=key)

class b32

This unit is implemented in refinery.units.encoding.b32 and has the following commandline Interface:

usage: b32 [-h] [-L] [-Q] [-0] [-v] [-R]

Base32 encoding and decoding.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class b32(Unit):
    """
    Base32 encoding and decoding.
    """
    def reverse(self, data):
        return base64.b32encode(data)

    def process(self, data: bytearray):
        before_padding = 0
        for before_padding in range(len(data), 0, -1):
            if data[before_padding - 1:before_padding] != B'=':
                break
        padding_size = -before_padding % 8
        missing = before_padding + padding_size - len(data)
        if missing > 0:
            self.log_info(F'detected incorrect padding: added {missing} padding characters')
            data.extend(B'=' * missing)
        if missing < 0:
            self.log_info(F'detected incorrect padding: removed {-missing} padding characters')
            data[padding_size + before_padding:] = []
        return base64.b32decode(data, casefold=True)

class b58

This unit is implemented in refinery.units.encoding.b58 and has the following commandline Interface:

usage: b58 [-h] [-L] [-Q] [-0] [-v] [-R]

Base58 encoding and decoding. It is famously used as an encoding in Bitcoin addresses because the
alphabet omits digits and letters that look similar.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class b58(base):
    """
    Base58 encoding and decoding. It is famously used as an encoding in Bitcoin addresses
    because the alphabet omits digits and letters that look similar.
    """
    def __init__(self):
        super().__init__(b'123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz')

class b64 (urlsafe=False)

This unit is implemented in refinery.units.encoding.b64 and has the following commandline Interface:

usage: b64 [-h] [-L] [-Q] [-0] [-v] [-R] [-u]

Base64 encoding and decoding.

optional arguments:
  -u, --urlsafe  use URL-safe alphabet

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class b64(Unit):
    """
    Base64 encoding and decoding.
    """
    def __init__(self, urlsafe: Arg.Switch('-u', help='use URL-safe alphabet') = False):
        super().__init__(urlsafe=urlsafe)

    def reverse(self, data):
        altchars = None
        if self.args.urlsafe:
            altchars = B'-_'
        return base64.b64encode(data, altchars=altchars)

    def process(self, data: bytearray):
        if not data:
            return data
        if len(data) == 1:
            raise ValueError('single byte can not be base64-decoded.')
        data.extend(B'===')
        altchars = None
        if (B'-' in data or B'_' in data) and (B'+' not in data and B'/' not in data) or self.args.urlsafe:
            altchars = B'-_'
        return base64.b64decode(data, altchars=altchars)

    @classmethod
    def handles(self, data: bytearray) -> bool:
        from refinery.lib.patterns import formats
        if not formats.b64space.fullmatch(data):
            return False
        return len(set(data)) in range(60, 67)

class b85

This unit is implemented in refinery.units.encoding.b85 and has the following commandline Interface:

usage: b85 [-h] [-L] [-Q] [-0] [-v] [-R]

Base85 encoding and decoding.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class b85(Unit):
    """
    Base85 encoding and decoding.
    """
    def reverse(self, data):
        return base64.b85encode(data)

    def process(self, data):
        if re.search(BR'\s', data) is not None:
            data = re.sub(BR'\s+', B'', data)
        return base64.b85decode(data)

    @classmethod
    def handles(self, data: bytearray):
        from refinery.lib.patterns import formats
        return formats.b85space.fullmatch(data)

class base (base=0, strip_padding=False, little_endian=False, strict_digits=False)

This unit is implemented in refinery.units.encoding.base and has the following commandline Interface:

usage: base [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-e] [-d] [base|alphabet]

Encodes and decodes integers in arbitrary base.

positional arguments:
  base|alphabet        Either the base to be used or an alphabet. If an explicit alphabet is
                       given, its length determines the base. The default base 0 treats the input
                       as a Python integer literal. If a numeric base is given, digits from the
                       alphabet "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" are used.

optional arguments:
  -s, --strip-padding  Do not add leading zeros to the output.
  -e, --little-endian  Use little endian byte order instead of big endian.
  -d, --strict-digits  Check that all input digits are part of the alphabet.

generic options:
  -h, --help           Show this help message and exit.
  -L, --lenient        Allow partial results as output.
  -Q, --quiet          Disables all log output.
  -0, --devnull        Do not produce any output.
  -v, --verbose        Specify up to two times to increase log level.
  -R, --reverse        Use the reverse operation; Specify twice to normalize (first decode, then
                       encode).

Expand source code Browse git

class base(Unit):
    """
    Encodes and decodes integers in arbitrary base.
    """
    def __init__(
        self,
        base: Arg(type=numseq, metavar='base|alphabet', help=(
            R'Either the base to be used or an alphabet. If an explicit alphabet is given, its length '
            R'determines the base. The default base 0 treats the input as a Python integer literal. If '
            F'a numeric base is given, digits from the alphabet "{_DEFAULT_ALPH_STR}" are used. ')) = 0,
        strip_padding: Arg.Switch('-s', help='Do not add leading zeros to the output.') = False,
        little_endian: Arg.Switch('-e', help='Use little endian byte order instead of big endian.') = False,
        strict_digits: Arg.Switch('-d', help='Check that all input digits are part of the alphabet.') = False,
    ):
        super().__init__(
            base=base,
            strip_padding=strip_padding,
            little_endian=little_endian,
            strict_digits=strict_digits,
        )

    @property
    def _args(self):
        base = self.args.base
        if isinstance(base, int):
            if not base:
                return 0, B''
            if base in _LARGER_ALPHABETS:
                return base, _LARGER_ALPHABETS[base]
            if base not in range(2, len(_DEFAULT_ALPHABET) + 1):
                raise ValueError(F'base may only be an integer between 2 and {len(_DEFAULT_ALPHABET)}')
            return base, _DEFAULT_ALPHABET[:base]
        if len(set(base)) != len(base):
            raise ValueError('the given alphabet contains duplicate letters')
        return len(base), bytearray(base)

    @property
    def byteorder(self):
        return 'little' if self.args.little_endian else 'big'

    def reverse(self, data):
        base, alphabet = self._args
        self.log_info('using byte order', self.byteorder)
        number = int.from_bytes(data, byteorder=self.byteorder)

        if base == 0:
            return B'0x%X' % number
        if base > len(alphabet):
            raise ValueError(F'Only {len(alphabet)} available; not enough to encode base {base}')

        data_bits = len(data) * 8
        base_bits = math.log2(base)
        result = bytearray()

        while data_bits >= 1:
            number, k = divmod(number, base)
            result.append(alphabet[k])
            if not number and self.args.strip_padding:
                break
            data_bits -= base_bits

        result.reverse()
        return result

    def process(self, data: bytearray):
        base, alphabet = self._args
        if base and base != 64 and not self.args.strict_digits:
            check = set(alphabet)
            index = 0
            it = iter(data)
            for b in it:
                if b not in check:
                    break
                index += 1
            for b in it:
                if b in check:
                    data[index] = b
                    index += 1
            self.log_info(F'stripped {len(data) - index} invalid digits from input data')
            del data[index:]
        if len(alphabet) <= len(_DEFAULT_ALPHABET):
            defaults = _DEFAULT_ALPHABET[:base]
            if alphabet != defaults:
                self.log_info('translating input data to a default alphabet for faster conversion')
                data_translated = data.translate(bytes.maketrans(alphabet, defaults))
                result = int(data_translated, base)
            else:
                result = int(data, base)
        elif len(alphabet) == 64:
            import base64
            _b64_alphabet = _LARGER_ALPHABETS[64]
            if alphabet != _b64_alphabet:
                data = data.translate(bytes.maketrans(alphabet, _b64_alphabet))
            return base64.b64decode(data + b'===', validate=self.args.strict_digits)
        elif len(alphabet) == 85:
            import base64
            _b85_alphabet = _LARGER_ALPHABETS[85]
            if alphabet != _b85_alphabet:
                data = data.translate(bytes.maketrans(alphabet, _b85_alphabet))
            return base64.b85decode(data)
        else:
            self.log_warn('very long alphabet, unable to use built-ins; reverting to (slow) fallback.')
            result = 0
            lookup = {digit: k for k, digit in enumerate(alphabet)}
            for digit in data:
                result *= base
                result += lookup[digit]
        if not base or self.args.strip_padding:
            size, rest = divmod(result.bit_length(), 8)
            size += int(bool(rest))
        else:
            size = (len(data) - 1 + alphabet.index(data[0]) / base) * math.log2(base) / 8
            size = math.ceil(size)
        return result.to_bytes(size, byteorder=self.byteorder)

class bat (keep_all=False, keep_comment=False, keep_definitions=False, keep_echo=False)

This unit is implemented in refinery.units.formats.bat and has the following commandline Interface:

usage: bat [-h] [-L] [-Q] [-0] [-v] [-a] [-c] [-d] [-e]

Deobfuscates batch files, based on the batch deobfuscator by DissectMalware. The input script is
interpreted, variables are substituted for previously defined values, including commonly defined
operating system environment variables. Variable definitions that are later evaluated are removed
from the script, as are all echo commands and comments.

optional arguments:
  -a, --keep-all          Do not strip anything after deobfuscation.
  -c, --keep-comment      Do not strip comments from the script.
  -d, --keep-definitions  Do not strip variable definitions.
  -e, --keep-echo         Do not strip echo calls in the script.

generic options:
  -h, --help              Show this help message and exit.
  -L, --lenient           Allow partial results as output.
  -Q, --quiet             Disables all log output.
  -0, --devnull           Do not produce any output.
  -v, --verbose           Specify up to two times to increase log level.

Expand source code Browse git

class bat(Unit):
    """
    Deobfuscates batch files, based on the batch deobfuscator by DissectMalware. The input script
    is interpreted, variables are substituted for previously defined values, including commonly
    defined operating system environment variables. Variable definitions that are later evaluated
    are removed from the script, as are all echo commands and comments.
    """
    def __init__(
        self,
        keep_all         : Unit.Arg.Switch('-a', help='Do not strip anything after deobfuscation.') = False,
        keep_comment     : Unit.Arg.Switch('-c', help='Do not strip comments from the script.') = False,
        keep_definitions : Unit.Arg.Switch('-d', help='Do not strip variable definitions.') = False,
        keep_echo        : Unit.Arg.Switch('-e', help='Do not strip echo calls in the script.') = False,
    ): ...

    @unicoded
    def process(self, data: str) -> str:
        mode = STRIP.ALL
        if self.args.keep_all:
            mode = STRIP.NONE
        elif self.args.keep_comment:
            mode ^= STRIP.COMMENT
        elif self.args.keep_definitions:
            mode ^= STRIP.DEFINITION
        elif self.args.keep_echo:
            mode ^= STRIP.ECHO
        return BatchDeobfuscator().deobfuscate(data, mode)

class bitrev (bigendian=False, blocksize=None)

This unit is implemented in refinery.units.blockwise.bitrev and has the following commandline Interface:

usage: bitrev [-h] [-L] [-Q] [-0] [-v] [-E] [-B N]

Reverse the bits of every block. Any excess bytes at the end of the input that are not an integer
multiple of the block size are ignored.

optional arguments:
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes, default is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class bitrev(UnaryOperation):
    """
    Reverse the bits of every block. Any excess bytes at the end of the input that are not
    an integer multiple of the block size are ignored.
    """
    @staticmethod
    def operate(arg):
        raise RuntimeError('operate was called before the unit was initialized')

    def __init__(self, bigendian=False, blocksize=None):
        """
        Unreadable bit reversal operations due to:
        https://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64BitsDiv
        https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
        """
        super().__init__(bigendian=bigendian, blocksize=blocksize, _truncate=1)

        if self.bytestream:
            def operate(v):
                return ((v * 0x202020202) & 0x10884422010) % 1023
        elif self.blocksize in (2, 4, 8):
            def operate(v):
                s = self.fbits
                m = self.fmask
                w = v
                while s > 1:
                    s >>= 1
                    m = m ^ (m << s)
                    w = ((w << s) & ~m) | ((w >> s) & m)
                return w
        else:
            def operate(v):
                w = v & 0
                for s in range(self.fbits):
                    w |= ((v >> s) & 1) << (self.fbits - s - 1)
                return w
        self.operate = operate

class bitsnip (slices=[slice(0, 1, None)], bigendian=False, blocksize=None)

This unit is implemented in refinery.units.blockwise.bitsnip and has the following commandline Interface:

usage: bitsnip [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] [slices [slices ...]]

Pick a certain range of bits from each block of the input. The extracted ranges of bits are
concatenated. Leftover bits that do not form at least one full byte are discarded. Bits are
indexed from least significant at index 0 to most significant in each block. When the unit
operates in big endian mode, the internal bit buffer is shifted left in each step and new bits
are inserted as the least significant portion. Conversely, in default (little endian) mode, newly
extracted bits are added as the now most significant ones. After concatenating all bit slices
into a large integer, this integer is converted into a byte string according to the given byte
ordering.

positional arguments:
  slices             Specify start:stop:size, where size can be used to pad or truncate the
                     extracted bits. If size is omitted, it defaults to (stop-start). If no slice
                     is specified, it defaults to 0, which corresponds to 0:1:1, i.e. extracting
                     the lowest bit.

optional arguments:
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes, default is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class bitsnip(BlockTransformationBase):
    """
    Pick a certain range of bits from each block of the input. The extracted ranges of bits are
    concatenated. Leftover bits that do not form at least one full byte are discarded. Bits are
    indexed from least significant at index 0 to most significant in each block. When the unit
    operates in big endian mode, the internal bit buffer is shifted left in each step and new bits
    are inserted as the least significant portion. Conversely, in default (little endian) mode,
    newly extracted bits are added as the now most significant ones. After concatenating all bit
    slices into a large integer, this integer is converted into a byte string according to the
    given byte ordering.
    """
    def __init__(
        self, slices: Arg(help=(
            'Specify start:stop:size, where size can be used to pad or truncate the extracted '
            'bits. If size is omitted, it defaults to (stop-start). If no slice is specified, '
            'it defaults to 0, which corresponds to 0:1:1, i.e. extracting the lowest bit.')
        ) = [slice(0, 1)],
        bigendian=False, blocksize=None
    ):
        super().__init__(slices=slices, bigendian=bigendian, blocksize=blocksize)

    def process(self, data: bytearray):
        bitsnip_data = 0
        bitsnip_size = 0
        slices: List[Tuple[int, int, int]] = []
        maxbits = 8 * self.blocksize
        args: Iterable[slice] = iter(self.args.slices)
        bigendian: bool = self.args.bigendian

        for s in args:
            start = s.start
            stop = s.stop
            if start is None:
                start = 0
            if stop is None:
                stop = maxbits
            elif stop > maxbits:
                raise ValueError(F'the selection {start}:{stop} is out of bounds for the block size {self.blocksize}')
            if start >= stop:
                continue
            size = stop - start
            mask = (1 << size) - 1
            size = s.step or size
            slices.append((start, mask, size))

        for item in self.chunk(data):
            for shift, mask, size in slices:
                bits = (item >> shift) & mask
                if bigendian:
                    bitsnip_data <<= size
                    bitsnip_data |= bits
                else:
                    bitsnip_data |= bits << bitsnip_size
                bitsnip_size += size

        length, remainder = divmod(bitsnip_size, 8)

        if remainder != 0:
            self.log_info(F'discarding {bitsnip_size % 8} bits')
            if bigendian:
                bitsnip_data >>= remainder
            else:
                bitsnip_data &= (1 << (8 * length)) - 1

        if bigendian:
            return bitsnip_data.to_bytes(length, 'big')
        else:
            return bitsnip_data.to_bytes(length, 'little')

class blabla (key, nonce=b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', rounds=10, discard=0, stateful=False)

This unit is implemented in refinery.units.crypto.cipher.blabla and has the following commandline Interface:

usage: blabla [-h] [-L] [-Q] [-0] [-v] [-R] [-r N] [-d N] [-s] key [nonce]

Implements the BlaBla cipher, a 256-bit stream cipher designed by Jean-Philippe Aumasson. It is
similar to ChaCha in design but operates on 64-bit blocks.

positional arguments:
  key              The encryption key.
  nonce            The 16-byte nonce. The default are 16 null bytes.

optional arguments:
  -r, --rounds N   The number of rounds, default is 10.
  -d, --discard N  Discard the first N bytes of the keystream, 0 by default.
  -s, --stateful   Do not reset the key stream while processing the chunks of one frame.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation; Specify twice to normalize (first decode, then
                   encode).

Expand source code Browse git

class blabla(StreamCipherUnit):
    """
    Implements the BlaBla cipher, a 256-bit stream cipher designed by Jean-Philippe Aumasson. It
    is similar to ChaCha in design but operates on 64-bit blocks.
    """
    key_size = {32}

    def __init__(
        self, key,
        nonce: Arg(help='The 16-byte nonce. The default are 16 null bytes.') = bytes(16),
        rounds: Arg.Number('-r', help='The number of rounds, default is {default}.') = 10,
        discard=0, stateful=False
    ):
        super().__init__(key=key, nonce=nonce, rounds=rounds, discard=discard, stateful=stateful)

    def keystream(self):
        r = self.args.rounds
        n = self.args.nonce
        k = struct.unpack('<4Q', self.args.key)

        try:
            n = struct.unpack('<2Q', n)
        except Exception:
            raise ValueError(F'The given nonce has invalid length of {len(n)}, it must be 16 bytes in size.')

        q = [
            0x6170786593810fab,  # 0x0
            0x3320646ec7398aee,  # 0x1
            0x79622d3217318274,  # 0x2
            0x6b206574babadada,  # 0x3
            *k,                  # 0x4 .. 0x7
            0x2ae36e593e46ad5f,  # 0x8
            0xb68f143029225fc9,  # 0x9
            0x8da1e08468303aa6,  # 0xA
            0xa48a209acd50a4a7,  # 0xB
            0x7fdc12f23f90778c,  # 0xC
            1,                   # 0xD
            *n                   # 0xE .. 0xF
        ]
        while True:
            v = [*q]
            for _ in range(r):
                for a, b, c, d in [
                    (0x0, 0x4, 0x8, 0xC),
                    (0x1, 0x5, 0x9, 0xD),
                    (0x2, 0x6, 0xA, 0xE),
                    (0x3, 0x7, 0xB, 0xF),
                    (0x0, 0x5, 0xA, 0xF),
                    (0x1, 0x6, 0xB, 0xC),
                    (0x2, 0x7, 0x8, 0xD),
                    (0x3, 0x4, 0x9, 0xE),
                ]:
                    v[a] = v[a] + v[b] & _M64
                    v[d] = rotr64(v[d] ^ v[a], 32)
                    v[c] = v[c] + v[d] & _M64
                    v[b] = rotr64(v[b] ^ v[c], 24)
                    v[a] = v[a] + v[b] & _M64
                    v[d] = rotr64(v[d] ^ v[a], 16)
                    v[c] = v[c] + v[d] & _M64
                    v[b] = rotr64(v[b] ^ v[c], 63)
            v = [x + y & _M64 for x, y in zip(q, v)]
            q[0xD] += 1
            yield from struct.pack('<16Q', *v)

class blk224 (text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: blk224 [-h] [-L] [-Q] [-0] [-v] [-t]

Returns the BLK224 hash of the input data.

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class blk224(HashUnit):
    """
    Returns the BLK224 hash of the input data.
    """
    def _algorithm(self, data):
        return hashlib.blake2b(data, digest_size=28)

class blk256 (text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: blk256 [-h] [-L] [-Q] [-0] [-v] [-t]

Returns the BLK256 hash of the input data.

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class blk256(HashUnit):
    """
    Returns the BLK256 hash of the input data.
    """
    def _algorithm(self, data):
        return hashlib.blake2b(data, digest_size=32)

class blk384 (text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: blk384 [-h] [-L] [-Q] [-0] [-v] [-t]

Returns the BLK384 hash of the input data.

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class blk384(HashUnit):
    """
    Returns the BLK384 hash of the input data.
    """
    def _algorithm(self, data):
        return hashlib.blake2b(data, digest_size=48)

class blk512 (text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: blk512 [-h] [-L] [-Q] [-0] [-v] [-t]

Returns the BLK512 hash of the input data.

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class blk512(HashUnit):
    """
    Returns the BLK512 hash of the input data.
    """
    def _algorithm(self, data):
        return hashlib.blake2b(data, digest_size=64)

class blowfish (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)

This unit is implemented in refinery.units.crypto.cipher.blowfish and has the following commandline Interface:

usage: blowfish [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-M N] key

Blowfish encryption and decryption.

positional arguments:
  key                   The encryption key.

optional arguments:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, EAX,
                        ECB, OFB. By default, the CBC mode is used when an IV is is provided, and
                        ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of bits into which data is segmented. It must be a
                        multiple of 8. The default of 0 means that the block size will be used as
                        the segment size.
  -M, --mac-len N       Only for EAX, GCM, OCB, and CCM: Length of the authentication tag, in
                        bytes.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Allow partial results as output.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation; Specify twice to normalize (first decode, then
                        encode).

Expand source code Browse git

class blowfish(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(Blowfish)):
    """
    Blowfish encryption and decryption.
    """
    pass

class blz

This unit is implemented in refinery.units.compression.blz and has the following commandline Interface:

usage: blz [-h] [-L] [-Q] [-0] [-v] [-R]

BriefLZ compression and decompression. The compression algorithm uses a pure Python suffix tree
implementation: It requires a lot of time & memory.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class blz(Unit):
    """
    BriefLZ compression and decompression. The compression algorithm uses a pure Python suffix tree
    implementation: It requires a lot of time & memory.
    """
    def _begin(self, data):
        self._src = StructReader(memoryview(data))
        self._dst = MemoryFile(bytearray())
        return self

    def _reset(self):
        self._src.seek(0)
        self._dst.seek(0)
        self._dst.truncate()
        return self

    def _decompress(self):
        (
            signature,
            version,
            src_count,
            src_crc32,
            dst_count,
            dst_crc32,
        ) = self._src.read_struct('>6L')
        if signature != 0x626C7A1A:
            raise ValueError(F'Invalid BriefLZ signature: {signature:08X}, should be 626C7A1A.')
        if version > 10:
            raise ValueError(F'Invalid version number {version}, should be less than 10.')
        self.log_debug(F'signature: 0x{signature:08X} V{version}')
        self.log_debug(F'src count: 0x{src_count:08X}')
        self.log_debug(F'src crc32: 0x{src_crc32:08X}')
        self.log_debug(F'dst count: 0x{dst_count:08X}')
        self.log_debug(F'dst crc32: 0x{dst_crc32:08X}')
        src = self._src.getbuffer()
        src = src[24:24 + src_count]
        if len(src) < src_count:
            self.log_warn(F'Only {len(src)} bytes in buffer, but header annoucned a length of {src_count}.')
        if src_crc32:
            check = zlib.crc32(src)
            if check != src_crc32:
                self.log_warn(F'Invalid source data CRC {check:08X}, should be {src_crc32:08X}.')
        dst = self._decompress_chunk(dst_count)
        if not dst_crc32:
            return dst
        check = zlib.crc32(dst)
        if check != dst_crc32:
            self.log_warn(F'Invalid result data CRC {check:08X}, should be {dst_crc32:08X}.')
        return dst

    def _decompress_modded(self):
        self._src.seekrel(8)
        total_size = self._src.u64()
        chunk_size = self._src.u64()
        remaining = total_size
        self.log_debug(F'total size: 0x{total_size:016X}')
        self.log_debug(F'chunk size: 0x{chunk_size:016X}')
        while remaining > chunk_size:
            self._decompress_chunk(chunk_size)
            remaining -= chunk_size
        return self._decompress_chunk(remaining)

    def _decompress_chunk(self, size=None):
        bitcount = 0
        bitstore = 0
        decompressed = 1

        def readbit():
            nonlocal bitcount, bitstore
            if not bitcount:
                bitstore = int.from_bytes(self._src.read_exactly(2), 'little')
                bitcount = 0xF
            else:
                bitcount = bitcount - 1
            return (bitstore >> bitcount) & 1

        def readint():
            result = 2 + readbit()
            while readbit():
                result <<= 1
                result += readbit()
            return result

        self._dst.write(self._src.read_exactly(1))

        try:
            while not size or decompressed < size:
                if readbit():
                    length = readint() + 2
                    sector = readint() - 2
                    offset = self._src.read_byte() + 1
                    delta = offset + 0x100 * sector
                    available = self._dst.tell()
                    if delta not in range(available + 1):
                        raise RefineryPartialResult(
                            F'Requested rewind by 0x{delta:08X} bytes with only 0x{available:08X} bytes in output buffer.',
                            partial=self._dst.getvalue())
                    quotient, remainder = divmod(length, delta)
                    replay = memoryview(self._dst.getbuffer())
                    replay = bytes(replay[-delta:] if quotient else replay[-delta:length - delta])
                    replay = quotient * replay + replay[:remainder]
                    self._dst.write(replay)
                    decompressed += length
                else:
                    self._dst.write(self._src.read_exactly(1))
                    decompressed += 1
        except EOF as E:
            raise RefineryPartialResult(str(E), partial=self._dst.getbuffer())
        dst = self._dst.getbuffer()
        if decompressed < size:
            raise RefineryPartialResult(
                F'Attempted to decompress {size} bytes, got only {len(dst)}.', dst)
        if decompressed > size:
            raise RuntimeError('Decompressed buffer contained more bytes than expected.')
        return dst

    def _compress(self):
        from refinery.lib.suffixtree import SuffixTree

        try:
            self.log_info('computing suffix tree')
            tree = SuffixTree(self._src.getbuffer())
        except Exception:
            raise

        bitstore = 0  # The bit stream to be written
        bitcount = 0  # The number of bits in the bit stream
        buffer = MemoryFile(bytearray())

        # Write empty header and first byte of source
        self._dst.write(bytearray(24))
        self._dst.write(self._src.read_exactly(1))

        def writeint(n: int) -> None:
            """
            Write an integer to the bit stream.
            """
            nonlocal bitstore, bitcount
            nbits = n.bit_length()
            if nbits < 2:
                raise ValueError
            # The highest bit is implicitly assumed:
            n ^= 1 << (nbits - 1)
            remaining = nbits - 2
            while remaining:
                remaining -= 1
                bitstore <<= 2
                bitcount += 2
                bitstore |= ((n >> remaining) & 3) | 1
            bitstore <<= 2
            bitcount += 2
            bitstore |= (n & 1) << 1

        src = self._src.getbuffer()
        remaining = len(src) - 1
        self.log_info('compressing data')

        while True:
            cursor = len(src) - remaining
            rest = src[cursor:]
            if bitcount >= 0x10:
                block_count, bitcount = divmod(bitcount, 0x10)
                info_channel = bitstore >> bitcount
                bitstore = info_channel << bitcount ^ bitstore
                # The decompressor will read bits from top to bottom, and each 16 bit block has to be
                # little-endian encoded. The bit stream is encoded top to bottom bit in the bitstore
                # variable, and by encoding it as a big endian integer, the stream is in the correct
                # order. However, we need to swap adjacent bytes to achieve little endian encoding for
                # each of the blocks:
                info_channel = bytearray(info_channel.to_bytes(block_count * 2, 'big'))
                for k in range(block_count):
                    k0 = 2 * k + 0
                    k1 = 2 * k + 1
                    info_channel[k0], info_channel[k1] = info_channel[k1], info_channel[k0]
                info_channel = memoryview(info_channel)
                data_channel = memoryview(buffer.getbuffer())
                self._dst.write(info_channel[:2])
                self._dst.write(data_channel[:-1])
                self._dst.write(info_channel[2:])
                data_channel = bytes(data_channel[-1:])
                buffer.truncate(0)
                store = buffer if bitcount else self._dst
                store.write(data_channel)
            if remaining + bitcount < 0x10:
                buffer = buffer.getbuffer()
                if rest or buffer:
                    bitstore <<= 0x10 - bitcount
                    self._dst.write(bitstore.to_bytes(2, 'little'))
                    self._dst.write(buffer)
                    self._dst.write(rest)
                elif bitcount:
                    raise RuntimeError('Bitbuffer Overflow')
                break
            node = tree.root
            length = 0
            offset = 0
            sector = None
            while node.children and length < len(rest):
                for child in node.children.values():
                    if tree.data[child.start] == rest[length]:
                        node = child
                        break
                if node.start >= cursor:
                    break
                offset = node.start - length
                length = node.end + 1 - offset
            length = min(remaining, length)
            if length >= 4:
                sector, offset = divmod(cursor - offset - 1, 0x100)
            bitcount += 1
            bitstore <<= 1
            if sector is None:
                buffer.write(rest[:1])
                remaining -= 1
                continue
            bitstore |= 1
            buffer.write(bytes((offset,)))
            writeint(length - 2)
            writeint(sector + 2)
            remaining -= length

        self._dst.seek(24)
        dst = self._dst.peek()
        self._dst.seek(0)
        self._dst.write(struct.pack('>6L', 0x626C7A1A, 1, len(dst), zlib.crc32(dst), len(src), zlib.crc32(src)))
        return self._dst.getbuffer()

    def process(self, data):
        self._begin(data)
        partial = None
        try:
            return self._decompress()
        except ValueError as error:
            if isinstance(error, RefineryPartialResult):
                partial = error
            self.log_warn(F'Reverting to modified BriefLZ after decompression error: {error!s}')
            self._reset()

        try:
            return self._decompress_modded()
        except RefineryPartialResult:
            raise
        except Exception as error:
            if not partial:
                raise
            raise partial from error

    def reverse(self, data):
        return self._begin(data)._compress()

class bruteforce (name, length=slice(1, None, None), format=None, alphabet=None, pattern=None, printable=False, digits=False, identifier=False, letters=False)

This unit is implemented in refinery.units.strings.bruteforce and has the following commandline Interface:

usage: bruteforce [-h] [-L] [-Q] [-0] [-v] [-a B | -r REGEX | -p | -d | -i | -l]
                  name [length] [format]

Generates all possible combinations of letters in a given alphabet. For each generated string,
one copy of each input chunk is generated and populated with a meta variable containing that
string. This can be used for simple brute forcing checks.

positional arguments:
  name                 Name of the meta variable to be populated.
  length               Specifies the range of characters to brute force, default is 1:.
  format               Optional format expression for the output string. The format sequence
                       "{0}" is the current brute force string, the sequence "{1}" represents the
                       input data.

optional arguments:
  -a, --alphabet B     The alphabet from which to choose the letters. Entire byte range by
                       default.
  -r, --pattern REGEX  Provide a regular expression pattern to define the alphabet.
  -p, --printable      Equivalent to --pattern=[\s\x20-\x7E]
  -d, --digits         Equivalent to --pattern=\d
  -i, --identifier     Equivalent to --pattern=\w
  -l, --letters        Equivalent to --pattern=[a-zA-Z]

generic options:
  -h, --help           Show this help message and exit.
  -L, --lenient        Allow partial results as output.
  -Q, --quiet          Disables all log output.
  -0, --devnull        Do not produce any output.
  -v, --verbose        Specify up to two times to increase log level.

Expand source code Browse git

class bruteforce(Unit):
    """
    Generates all possible combinations of letters in a given alphabet. For each generated string,
    one copy of each input chunk is generated and populated with a meta variable containing that
    string. This can be used for simple brute forcing checks.
    """
    def __init__(
        self,
        name  : Arg.String(help='Name of the meta variable to be populated.'),
        length: Arg.Bounds(metavar='length', help=(
            'Specifies the range of characters to brute force, default is {default}.'
        )) = slice(1, None),
        format: Arg.String(help=(
            'Optional format expression for the output string. The format sequence "{0}" is the '
            'current brute force string, the sequence "{1}" represents the input data.'
        )) = None,
        alphabet  : Arg.Binary('-a', group='ALPH', help=(
            'The alphabet from which to choose the letters. Entire byte range by default.'
        )) = None,
        pattern   : Arg.RegExp('-r', group='ALPH',
            help='Provide a regular expression pattern to define the alphabet.') = None,
        printable : Arg.Switch('-p', group='ALPH',
            help='Equivalent to --pattern=[\\s\\x20-\\x7E]') = False,
        digits    : Arg.Switch('-d', group='ALPH',
            help='Equivalent to --pattern=\\d') = False,
        identifier: Arg.Switch('-i', group='ALPH',
            help='Equivalent to --pattern=\\w') = False,
        letters   : Arg.Switch('-l', group='ALPH',
            help='Equivalent to --pattern=[a-zA-Z]') = False,
    ):
        options = sum(1 for x in [printable, digits, identifier, letters] if x)

        if options > 1 or options and pattern:
            raise ValueError('Invalid selection.')

        if printable:
            pattern = b'[\\s\\x20-\\x7E]'
        if digits:
            pattern = b'\\d'
        if identifier:
            pattern = b'\\w'
        if letters:
            pattern = b'[a-zA-Z]'

        super().__init__(
            name=name,
            length=length,
            format=format,
            alphabet=alphabet,
            pattern=pattern,
        )

    def _alphabet(self) -> bytes:
        alphabet = self.args.alphabet
        if alphabet:
            return alphabet
        alphabet = bytes(range(0x100))
        pattern = self.args.pattern
        if not pattern:
            return alphabet
        alphabet = B''.join(re.findall(pattern, alphabet, flags=re.DOTALL))
        if alphabet:
            return alphabet
        raise ValueError(F'Invalid regular expression: {pattern}')

    def process(self, data: bytearray):
        format_spec: str = self.args.format
        meta = metavars(data)
        name = self.args.name
        kwargs = {name: None}

        for length in integers_of_slice(self.args.length):
            self.log_info(F'generating {length} digits')
            if not length or length <= 0:
                raise ValueError(F'Unable to brute force {length} characters.')
            for string in itertools.product(self._alphabet(), repeat=length):
                string = bytes(string)
                if format_spec:
                    string = meta.format_bin(format_spec, self.codec, [string, data])
                kwargs[name] = string
                yield self.labelled(data, **kwargs)

class byteswap (size=4)

This unit is implemented in refinery.units.blockwise.byteswap and has the following commandline Interface:

usage: byteswap [-h] [-L] [-Q] [-0] [-v] [N]

Reverses the order of bytes in each block. Excess bytes that are not an integer multiple of the
block size are discarded.

positional arguments:
  N              the block size in bytes; the default is 4.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class byteswap(UnaryOperation):
    """
    Reverses the order of bytes in each block. Excess bytes that are not an integer multiple of the block
    size are discarded.
    """
    def __init__(self, size: Arg.Number(help='the block size in bytes; the default is {default}.') = 4):
        super().__init__(blocksize=size, _truncate=2)

    def inplace(self, block: ndarray) -> None:
        block.byteswap(True)

    operate = NotImplemented

    def process(self, data):
        try:
            return self._fastblock(data)
        except FastBlockError:
            b = self.blocksize
            n = len(data)
            m = n - n % b
            v = memoryview(data)
            if b == 1:
                self.log_warn('running this unit with a block size of 1 does not have any effect')
                return data
            for k in range(0, m, b):
                _end = k and k - 1 or None
                data[k : k + b] = v[k + b - 1:_end:-1]
            if m < n:
                del v
                del data[m:]
            return data

class bz2 (level=9)

This unit is implemented in refinery.units.compression.bz2 and has the following commandline Interface:

usage: bz2 [-h] [-L] [-Q] [-0] [-v] [-R] [-l LEVEL]

BZip2 compression and decompression.

optional arguments:
  -l, --level LEVEL  compression level preset between 1 and 9

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.
  -R, --reverse      Use the reverse operation; Specify twice to normalize (first decode, then
                     encode).

Expand source code Browse git

class bz2(Unit):
    """
    BZip2 compression and decompression.
    """
    def __init__(self, level: Arg('-l', type=number[1:9], help='compression level preset between 1 and 9') = 9):
        super().__init__(level=level)

    def process(self, data):
        return bz2_.decompress(data)

    def reverse(self, data):
        return bz2_.compress(data, self.args.level)

    @classmethod
    def handles(self, data: bytearray):
        return data[:3] == B'BZh'

class camellia (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)

This unit is implemented in refinery.units.crypto.cipher.camellia and has the following commandline Interface:

usage: camellia [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] key

Camellia encryption and decryption.

positional arguments:
  key                   The encryption key.

optional arguments:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB,
                        OFB, PCBC. By default, the CBC mode is used when an IV is is provided,
                        and ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of bits into which data is segmented. It must be a
                        multiple of 8. The default of 0 means that the block size will be used as
                        the segment size.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Allow partial results as output.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation; Specify twice to normalize (first decode, then
                        encode).

Expand source code Browse git

class camellia(StandardBlockCipherUnit, cipher=BlockCipherFactory(Camellia)):
    """
    Camellia encryption and decryption.
    """
    pass

class carve (format, unique=False, decode=False, single=False, min=1, max=None, len=None, stripspace=False, longest=False, take=None, utf16=True, ascii=True)

This unit is implemented in refinery.units.pattern.carve and has the following commandline Interface:

usage: carve [-h] [-L] [-Q] [-0] [-v] [-q] [-d] [-s] [-n N] [-m N] [-e N] [-x] [-l] [-t N]
             [-a | -u]
             format

Extracts patches of data in particular formats from the input.

positional arguments:
  format            Specify one of the following formats: integer, float, number, string,
                    multiline-string, cmdstr, ps1str, vbastr, vbaint, printable, urlquote,
                    urlquote-coarse, urlquote-narrow, intarray, numarray, word, letters, wshenc,
                    alphanumeric, b32, b64, b85, b64any, b64url, hex, uppercase-hex, spaced-hex,
                    spaced-b64, spaced-b85, utf8, hexdump, hexarray, uuencode

optional arguments:
  -q, --unique      Yield every match only once.
  -d, --decode      Automatically decode known patterns.
  -s, --single      Only get the biggest match; equivalent to -qlt1
  -n, --min N       Matches must have length at least N.
  -m, --max N       Matches must have length at most N.
  -e, --len N       Matches must be of length N.
  -x, --stripspace  Strip all whitespace from input data.
  -l, --longest     Sort results by length.
  -t, --take N      Return only the first N occurrences in order of appearance.
  -a, --no-utf16    Search for ASCII encoded patterns only.
  -u, --no-ascii    Search for UTF16 encoded patterns only.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Allow partial results as output.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class carve(PatternExtractor):
    """
    Extracts patches of data in particular formats from the input.
    """
    def __init__(
        self, format: Arg.Choice(choices=[p.display for p in formats], metavar='format',
            help='Specify one of the following formats: {choices}'),
        unique: Arg.Switch('-q', help='Yield every match only once.') = False,
        decode: Arg.Switch('-d', help='Automatically decode known patterns.') = False,
        single: Arg.Switch('-s', help='Only get the biggest match; equivalent to -qlt1') = False,
        min=1, max=None, len=None,
        stripspace=False, longest=False, take=None, utf16=True, ascii=True
    ):
        if single:
            take = 1
            longest = True
            unique = True
        super().__init__(
            min=min,
            max=max,
            len=len,
            stripspace=stripspace,
            duplicates=not unique,
            longest=longest,
            take=take,
            ascii=ascii,
            utf16=utf16,
            format=formats.from_dashname(format)
        )
        if not decode:
            decoder = NotImplemented
        elif self.args.format in (formats.multiline_string, formats.string):
            from ..encoding.esc import esc
            decoder = esc(unicode=True, quoted=True)
        elif self.args.format is formats.integer:
            from ..encoding.base import base
            decoder = base()
        elif self.args.format in (formats.uppercase_hex, formats.spaced_hex, formats.hex):
            from ..encoding.hex import hex
            decoder = hex()
        elif self.args.format is formats.hexdump:
            from ..formats.hexload import hexload
            decoder = hexload()
        elif self.args.format is formats.intarray:
            from ..blockwise.pack import pack
            decoder = pack()
        elif self.args.format in (formats.b64, formats.b64any, formats.spaced_b64):
            from ..encoding.b64 import b64
            decoder = b64()
        elif self.args.format in (formats.b85, formats.spaced_b85):
            from ..encoding.b85 import b85
            decoder = b85()
        elif self.args.format is formats.b64url:
            from ..encoding.b64 import b64
            decoder = b64(urlsafe=True)
        elif self.args.format is formats.b32:
            from ..encoding.b32 import b32
            decoder = b32()
        elif self.args.format is formats.ps1str:
            from ..encoding.ps1str import ps1str
            decoder = ps1str()
        elif self.args.format is formats.vbastr:
            from ..encoding.ps1str import ps1str
            decoder = ps1str()
        elif self.args.format is formats.hexarray:
            from ..blockwise.pack import pack
            decoder = pack(0x10)
        elif self.args.format is formats.wshenc:
            from ..encoding.wshenc import wshenc
            decoder = wshenc()
        elif self.args.format is formats.uuencode:
            from ..encoding.uuenc import uuenc
            decoder = uuenc()
        elif self.args.format in (
            formats.urlquote,
            formats.urlquote_coarse,
            formats.urlquote_narrow,
        ):
            from ..encoding.url import url
            decoder = url()
        else:
            decoder = NotImplemented
        self.decoder = decoder

    def process(self, data):
        it = iter(self.matches_filtered(memoryview(data), self.args.format.value.bin_compiled))
        if self.decoder is NotImplemented:
            yield from it
        for chunk in it:
            try:
                yield self.decoder(chunk)
            except Exception as E:
                self.log_info(F'decoder failure: {E!s}')

class carve_7z

This unit is implemented in refinery.units.pattern.carve_7z and has the following commandline Interface:

usage: carve-7z [-h] [-L] [-Q] [-0] [-v]

Extracts anything from the input data that looks like a 7zip archive file.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class carve_7z(Unit):
    """
    Extracts anything from the input data that looks like a 7zip archive file.
    """
    @Unit.Requires('py7zr', 'arc', 'default', 'extended')
    def _py7zr():
        import py7zr
        return py7zr

    HEADER_SIGNATURE = B'7z\xBC\xAF\x27\x1C'

    def process(self, data: bytearray):
        cursor = 0
        mv = memoryview(data)
        while True:
            start = data.find(self.HEADER_SIGNATURE, cursor)
            if start < cursor:
                break
            self.log_debug(F'found header at offset: 0x{start:08X}')
            try:
                mf = MemoryFileRecorder(mv[start:])
                self.log_debug('attempting to read archive')
                archive = self._py7zr.SevenZipFile(mf)
                self.log_debug('attempting to test archive')
                success = archive.test() is not False
            except ImportError:
                raise
            except Exception as error:
                self.log_debug('parsing archive failed:', error)
                success = False
            if success:
                self.log_info(F'identified archive of size 0x{mf.max_cursor:08X} at offset 0x{start:08X}')
                cursor = start + mf.max_cursor
                yield self.labelled(mv[start:cursor], offset=start)
            else:
                cursor = start + 5

class carve_json (dictonly=False)

This unit is implemented in refinery.units.pattern.carve_json and has the following commandline Interface:

usage: carve-json [-h] [-L] [-Q] [-0] [-v] [-d]

Extracts anything from the input data that looks like JSON.

optional arguments:
  -d, --dictonly  only extract JSON dictionaries, do not extract lists.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Allow partial results as output.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class carve_json(Unit):
    """
    Extracts anything from the input data that looks like JSON.
    """
    def __init__(self, dictonly: Arg.Switch('-d', help='only extract JSON dictionaries, do not extract lists.') = False):
        super().__init__(dictonly=dictonly)

    def process(self, data):
        for start, chunk in JSONCarver(data, dictonly=self.args.dictonly):
            yield self.labelled(chunk, offset=start)

class carve_lnk

This unit is implemented in refinery.units.pattern.carve_lnk and has the following commandline Interface:

usage: carve-lnk [-h] [-L] [-Q] [-0] [-v]

Extracts anything from the input data that looks like a Windows shortcut (i.e. an LNK file)

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class carve_lnk(Unit):
    """
    Extracts anything from the input data that looks like a Windows shortcut (i.e. an LNK file)
    """

    @Unit.Requires('LnkParse3>=1.4.0', 'formats', 'extended')
    def _LnkParse3():
        import LnkParse3
        import LnkParse3.extra_factory
        return LnkParse3

    def process(self, data: bytearray):
        pos = 0
        mem = memoryview(data)
        sig = B'\x4C\x00\x00\x00\x01\x14\x02\x00'
        lnk = self._LnkParse3

        while True:
            pos = data.find(sig, pos)
            if pos < 0:
                break
            try:
                parsed = lnk.lnk_file(indata=mem[pos:])
            except Exception:
                pos += 1
                continue
            end = pos + parsed.header.size() + parsed.string_data.size()
            if parsed.has_target_id_list():
                end += parsed.targets.size()
            if parsed.has_link_info() and not parsed.force_no_link_info():
                with suppress(AttributeError):
                    end += parsed.info.size()
            with NoLogging():
                while end < len(mem):
                    extra = lnk.extra_factory.ExtraFactory(mem[end:])
                    try:
                        ec = extra.extra_class()
                    except Exception:
                        break
                    if ec is None:
                        break
                    if 'UNKNOWN' in ec().name():
                        break
                    end += extra.item_size()

            terminal_block = mem[end:end + 4]
            if terminal_block != B'\0\0\0\0':
                self.log_warn(F'detected LNK at offset 0x{pos:X}, but size calculation did not end on a terminal block')
                continue
            else:
                end += 4
            yield self.labelled(mem[pos:end], offset=pos)
            pos = end

class carve_pe (*paths, list=False, join_path=False, drop_path=False, path=b'name', recursive=False, keep_root=False, memdump=False, fileinfo=False)

This unit is implemented in refinery.units.pattern.carve_pe and has the following commandline Interface:

usage: carve-pe [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-P NAME] [-r] [-k] [-m] [-f]
                [path [path ...]]

Extracts anything from the input data that looks like a Portable Executable (PE) file.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "name".
  -r, --recursive  Extract PE files that are contained in already extracted PEs.
  -k, --keep-root  If the input chunk is itself a PE, include it as an output chunk.
  -m, --memdump    Use the virtual memory layout of a PE file to calculate its size.
  -f, --fileinfo   Use the PE meta information to deduce a file name meta variable.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class carve_pe(PathExtractorUnit):
    """
    Extracts anything from the input data that looks like a Portable
    Executable (PE) file.
    """
    def __init__(
        self, *paths, list=False, join_path=False, drop_path=False, path=b'name',
        recursive: Arg.Switch('-r', help='Extract PE files that are contained in already extracted PEs.') = False,
        keep_root: Arg.Switch('-k', help='If the input chunk is itself a PE, include it as an output chunk.') = False,
        memdump  : Arg.Switch('-m', help='Use the virtual memory layout of a PE file to calculate its size.') = False,
        fileinfo : Arg.Switch('-f', help='Use the PE meta information to deduce a file name meta variable.') = False
    ):
        super().__init__(
            *paths,
            list=list,
            join_path=join_path,
            drop_path=drop_path,
            path=path,
            recursive=recursive,
            keep_root=keep_root,
            memdump=memdump,
            fileinfo=fileinfo,
        )

    def unpack(self, data):
        cursor = 0
        mv = memoryview(data)

        while True:
            offset = data.find(B'MZ', cursor)
            if offset < cursor: break
            cursor = offset + 2
            ntoffset = mv[offset + 0x3C:offset + 0x3E]
            if len(ntoffset) < 2:
                return
            ntoffset, = unpack('H', ntoffset)
            if mv[offset + ntoffset:offset + ntoffset + 2] != B'PE':
                self.log_debug(F'invalid NT header signature for candidate at 0x{offset:08X}')
                continue
            try:
                pe = PE(data=data[offset:], fast_load=True)
            except PEFormatError as err:
                self.log_debug(F'parsing of PE header at 0x{offset:08X} failed:', err)
                continue

            pesize = get_pe_size(pe, memdump=self.args.memdump)
            pedata = mv[offset:offset + pesize]
            info = {}
            if self.args.fileinfo:
                pe_meta_parser = pemeta()
                try:
                    info = pe_meta_parser.parse_version(pe) or {}
                except Exception as error:
                    self.log_warn(F'Unable to obtain file information: {error!s}')
                try:
                    info.update(pe_meta_parser.parse_header(pe) or {})
                except Exception:
                    pass
            try:
                path = info['OriginalFilename']
            except KeyError:
                try:
                    path = info['ExportName']
                except KeyError:
                    extension = 'exe' if pe.is_exe() else 'dll' if pe.is_dll() else 'sys'
                    path = F'carve-0x{offset:08X}.{extension}'

            if offset > 0 or self.args.keep_root:
                yield UnpackResult(path, pedata, offset=offset)
                self.log_info(F'extracted PE file of size 0x{pesize:08X} from 0x{offset:08X}')
            else:
                self.log_info(F'ignored root file of size 0x{pesize:08X} from 0x{offset:08X}')
                continue

            if not offset or self.args.recursive:
                cursor += pe.OPTIONAL_HEADER.SizeOfHeaders
            else:
                cursor += pesize - 2

class carve_rtf

This unit is implemented in refinery.units.pattern.carve_rtf and has the following commandline Interface:

usage: carve-rtf [-h] [-L] [-Q] [-0] [-v]

Extracts anything from the input data that looks like an RTF document.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class carve_rtf(Unit):
    """
    Extracts anything from the input data that looks like an RTF document.
    """

    def process(self, data: bytearray):
        pos = 0
        mem = memoryview(data)
        sig = re.escape(b'{\\rtf')

        while True:
            match = re.search(sig, mem[pos:], flags=re.IGNORECASE)
            if match is None:
                break
            pos = pos + match.start()
            end = pos + 1
            depth = 1
            while depth and end < len(mem):
                if mem[end] == 0x7B:  # {
                    depth += 1
                if mem[end] == 0x7D:  # }
                    depth -= 1
                end += 1
            if depth > 0:
                break
            yield self.labelled(mem[pos:end], offset=pos)
            pos = end

class carve_xml

This unit is implemented in refinery.units.pattern.carve_xml and has the following commandline Interface:

usage: carve-xml [-h] [-L] [-Q] [-0] [-v]

Extracts anything from the input data that looks like XML.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class carve_xml(Unit):
    """
    Extracts anything from the input data that looks like XML.
    """

    def process(self, data):
        for offset, chunk in XMLCarver(data):
            yield self.labelled(chunk, offset=offset)

class carve_zip

This unit is implemented in refinery.units.pattern.carve_zip and has the following commandline Interface:

usage: carve-zip [-h] [-L] [-Q] [-0] [-v]

Extracts anything from the input data that looks like a zip archive file.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class carve_zip(Unit):
    """
    Extracts anything from the input data that looks like a zip archive file.
    """

    def process(self, data: bytearray):
        end = len(data)
        mem = memoryview(data)
        rev = []
        while True:
            end = data.rfind(ZipEndOfCentralDirectory.SIGNATURE, 0, end)
            if end < 0:
                break
            try:
                end_marker = ZipEndOfCentralDirectory(mem[end:])
            except ValueError as e:
                self.log_info(F'error parsing end of central directory at 0x{end:X}: {e!s}')
                continue
            else:
                self.log_info(F'successfully parsed end of central directory at 0x{end:X}')
            start = end - end_marker.directory_size
            shift = start - end_marker.directory_offset
            if start < 0:
                self.log_debug('end of central directory size is invalid')
                continue
            try:
                central_directory = ZipCentralDirectory(mem[start:])
            except ValueError:
                self.log_debug('computed location of central directory is invalid')
                end = end - len(ZipEndOfCentralDirectory.SIGNATURE)
                continue
            start = central_directory.header_offset + shift
            if mem[start:start + 4] not in (B'PK\x03\x04', B'\0\0\0\0'):
                # SFX payloads seem to have a nulled header, so we permit this.
                self.log_debug('computed start of ZIP archive does not have the correct signature bytes')
                continue
            rev.append((start, end + len(end_marker)))
            end = start
        for start, end in reversed(rev):
            zip = mem[start:end + len(end_marker)]
            yield self.labelled(zip, offset=start)

class cast (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)

This unit is implemented in refinery.units.crypto.cipher.cast and has the following commandline Interface:

usage: cast [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-M N] key

CAST encryption and decryption.

positional arguments:
  key                   The encryption key.

optional arguments:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, EAX,
                        ECB, OFB. By default, the CBC mode is used when an IV is is provided, and
                        ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of bits into which data is segmented. It must be a
                        multiple of 8. The default of 0 means that the block size will be used as
                        the segment size.
  -M, --mac-len N       Only for EAX, GCM, OCB, and CCM: Length of the authentication tag, in
                        bytes.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Allow partial results as output.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation; Specify twice to normalize (first decode, then
                        encode).

Expand source code Browse git

class cast(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(CAST)):
    """
    CAST encryption and decryption.
    """
    pass

class cca (data)

This unit is implemented in refinery.units.strings.cca and has the following commandline Interface:

usage: cca [-h] [-L] [-Q] [-0] [-v] data

Short for ConCatAppend: This unit concatenates the input data with its argument by appending the
latter to the former. See also ccp for the unit that prepends instead.

positional arguments:
  data           Binary string to be appended to the input.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class cca(Unit):
    """
    Short for ConCatAppend: This unit concatenates the input data with its argument by
    appending the latter to the former. See also `refinery.ccp` for the unit that prepends
    instead.
    """

    def __init__(self, data: Arg(help='Binary string to be appended to the input.')):
        super().__init__(data=data)

    def process(self, data: bytearray):
        data.extend(self.args.data)
        return data

class ccp (data)

This unit is implemented in refinery.units.strings.ccp and has the following commandline Interface:

usage: ccp [-h] [-L] [-Q] [-0] [-v] data

Short for ConCatPrepend: This unit concatenates the input data with its argument by prepending
the latter to the former. See also cca for the unit that appends instead.

positional arguments:
  data           Binary string to be prepended to the input.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class ccp(Unit):
    """
    Short for ConCatPrepend: This unit concatenates the input data with its argument by
    prepending the latter to the former. See also `refinery.cca` for the unit that appends
    instead.
    """

    def __init__(self, data: Arg(help='Binary string to be prepended to the input.')):
        super().__init__(data=data)

    def process(self, data: bytearray):
        data[:0] = self.args.data
        return data

class cfmt (*formats, variable=None, separator=' ', multiplex=False, binary=False)

This unit is implemented in refinery.units.strings.cfmt and has the following commandline Interface:

usage: cfmt [-h] [-L] [-Q] [-0] [-v] [-n NAME] [-s S | -m] [-b] [format [format ...]]

Stands for "Convert to ForMaT": Transform a given chunk by applying a format string operation.
The positional format string placeholder {} will be replaced by the incoming data, named
placeholders have to exist as meta variables in the current chunk. For example, the following
pipeline can be used to print all files in a given directory with their corresponding SHA-256
hash:

    ef ** [| sha256 -t | cfmt {} {path} ]]

By default, format string arguments are simply joined along a space character to form a single
format string.

positional arguments:
  format               Format strings.

optional arguments:
  -n, --variable NAME  Store the formatted string in a meta variable.
  -s, --separator S    Separator to insert between format strings. The default is a space
                       character.
  -m, --multiplex      Do not join the format strings along the separator, generate one output
                       for each.
  -b, --binary         Use the binary formatter instead of the string formatter.

generic options:
  -h, --help           Show this help message and exit.
  -L, --lenient        Allow partial results as output.
  -Q, --quiet          Disables all log output.
  -0, --devnull        Do not produce any output.
  -v, --verbose        Specify up to two times to increase log level.

Expand source code Browse git

class cfmt(Unit):
    """
    Stands for "Convert to ForMaT": Transform a given chunk by applying a format string operation.
    The positional format string placeholder `{}` will be replaced by the incoming data, named
    placeholders have to exist as meta variables in the current chunk. For example, the following
    pipeline can be used to print all files in a given directory with their corresponding SHA-256
    hash:

        ef ** [| sha256 -t | cfmt {} {path} ]]

    By default, format string arguments are simply joined along a space character to form a single
    format string.
    """

    def __init__(
        self,
        *formats : Arg(help='Format strings.', type=str, metavar='format'),
        variable : Arg('-n', type=str, metavar='NAME', help='Store the formatted string in a meta variable.') = None,
        separator: Arg('-s', group='SEP', metavar='S',
            help='Separator to insert between format strings. The default is a space character.') = ' ',
        multiplex: Arg.Switch('-m', group='SEP',
            help='Do not join the format strings along the separator, generate one output for each.') = False,
        binary   : Arg.Switch('-b', help='Use the binary formatter instead of the string formatter.') = False,
    ):
        def fixfmt(fmt):
            if not isinstance(fmt, str):
                fmt = fmt.decode(self.codec)
            return decode(encode(fmt, 'latin-1', 'backslashreplace'), 'unicode-escape')
        formats = [fixfmt(f) for f in formats]
        if not multiplex:
            formats = [fixfmt(separator).join(formats)]
        super().__init__(formats=formats, variable=variable, binary=binary)

    def process(self, data):
        meta = metavars(data)
        meta.ghost = True
        args = [data]
        variable = self.args.variable
        if self.args.binary:
            formatter = partial(meta.format_bin, codec=self.codec, args=args)
        else:
            def formatter(spec):
                return meta.format_str(spec, self.codec, args, escaped=True).encode(self.codec)
        for spec in self.args.formats:
            result = formatter(spec)
            if variable is not None:
                result = self.labelled(data, **{variable: result})
            yield result

class chacha (key, stateful=False, discard=0, nonce=b'REFINERY', magic=b'', offset=0, rounds=20)

This unit is implemented in refinery.units.crypto.cipher.chacha and has the following commandline Interface:

usage: chacha [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-d N] [-m MAGIC] [-x N] [-r N] key [nonce]

ChaCha encryption and decryption. The nonce must be 8 bytes long as currently, only the original
Bernstein algorithm is implemented. When 64 bytes are provided as the key, this data is
interpreted as the initial state box and all other parameters are ignored.

positional arguments:
  key                The encryption key.
  nonce              The nonce. Default is the string REFINERY.

optional arguments:
  -s, --stateful     Do not reset the key stream while processing the chunks of one frame.
  -d, --discard N    Discard the first N bytes of the keystream, 0 by default.
  -m, --magic MAGIC  The magic constant; depends on the key size by default.
  -x, --offset N     Optionally specify the stream index, default is 0.
  -r, --rounds N     The number of rounds. Has to be an even number.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.
  -R, --reverse      Use the reverse operation; Specify twice to normalize (first decode, then
                     encode).

Expand source code Browse git

class chacha(LatinCipherUnit):
    """
    ChaCha encryption and decryption. The nonce must be 8 bytes long as currently, only the
    original Bernstein algorithm is implemented. When 64 bytes are provided as the key, this
    data is interpreted as the initial state box and all other parameters are ignored.
    """
    def keystream(self) -> Iterable[int]:
        key = self.args.key
        if len(key) == 64:
            it = ChaChaCipher.FromState(key)
        else:
            it = ChaChaCipher(
                key,
                self.args.nonce,
                self.args.magic,
                self.args.rounds,
                self.args.offset,
            )
        yield from it

class chacha20 (key, nonce=b'REFINERY')

This unit is implemented in refinery.units.crypto.cipher.chacha and has the following commandline Interface:

usage: chacha20 [-h] [-L] [-Q] [-0] [-v] [-R] key [nonce]

ChaCha20 and XChaCha20 encryption and decryption. For ChaCha20, the IV (nonce) must be 8 or 12
bytes long; for XChaCha20, choose an IV which is 24 bytes long. Invoking this unit for ChaCha20
is functionally equivalent to chacha with 20 rounds, but this unit uses the PyCryptodome library
C implementation rather than the pure Python implementation used by chacha.

positional arguments:
  key            The encryption key.
  nonce          The nonce. Default is the string REFINERY.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class chacha20(LatinCipherStandardUnit, cipher=PyCryptoFactoryWrapper(ChaCha20)):
    """
    ChaCha20 and XChaCha20 encryption and decryption. For ChaCha20, the IV (nonce) must
    be 8 or 12 bytes long; for XChaCha20, choose an IV which is 24 bytes long. Invoking
    this unit for ChaCha20 is functionally equivalent to `refinery.chacha` with 20 rounds,
    but this unit uses the PyCryptodome library C implementation rather than the pure
    Python implementation used by `refinery.chacha`.
    """
    pass

class chaskey (key, iv=b'', padding=None, mode=None, raw=False, rounds=12, swap=False, *, assoc_len=0, mac_len=0, segment_size=0, little_endian=False)

This unit is implemented in refinery.units.crypto.cipher.chaskey and has the following commandline Interface:

usage: chaskey [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-k N] [-s] [-e] [-S N]
               key

This implements a block cipher based on the Chaskey algorithm. No subkeys are computed and the
default Chaskey operation is performed on all blocks. Notably, the Donut framework uses Chaskey
with 16 rounds and in CTR mode.

positional arguments:
  key                   The encryption key.

optional arguments:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB,
                        OFB, PCBC. By default, the CBC mode is used when an IV is is provided,
                        and ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -k, --rounds N        Number of rounds to use, the default is 12
  -s, --swap            Use big endian byte order for all blocks.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of bits into which data is segmented. It must be a
                        multiple of 8. The default of 0 means that the block size will be used as
                        the segment size.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Allow partial results as output.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation; Specify twice to normalize (first decode, then
                        encode).

Expand source code Browse git

class chaskey(StandardBlockCipherUnit, cipher=BlockCipherFactory(Chaskey)):
    """
    This implements a block cipher based on the Chaskey algorithm. No subkeys are computed and the
    default Chaskey operation is performed on all blocks. Notably, the Donut framework uses Chaskey
    with 16 rounds and in CTR mode.
    """
    def __init__(
        self, key, iv=b'', padding=None, mode=None, raw=False,
        rounds: Arg.Number('-k', help='Number of rounds to use, the default is {default}') = _R,
        swap: Arg.Switch('-s', help='Use big endian byte order for all blocks.') = False,
        **more
    ):
        super().__init__(key, iv, padding=padding, mode=mode, raw=raw, rounds=rounds, swap=swap, **more)

    def _new_cipher(self, **optionals) -> CipherInterface:
        return super()._new_cipher(
            swap=self.args.swap,
            rounds=self.args.rounds,
            **optionals
        )

class chop (size, step=None, truncate=False)

This unit is implemented in refinery.units.meta.chop and has the following commandline Interface:

usage: chop [-h] [-L] [-Q] [-0] [-v] [-t] N [N]

Reinterprets the input as a sequence of equally sized chunks and outputs this sequence.

positional arguments:
  N               Chop data into chunks of this size
  N               Optionally specify a step size (which is equal to the size by default) which
                  indicates the number of bytes by which the cursor will be increased after
                  extracting a chunk.

optional arguments:
  -t, --truncate  Truncate possible excess bytes at the end of the input, by default they are
                  appended as a single chunk.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Allow partial results as output.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class chop(Unit):
    """
    Reinterprets the input as a sequence of equally sized chunks and outputs this sequence.
    """

    def __init__(
        self,
        size: Arg.Number('size', help='Chop data into chunks of this size'),
        step: Arg.Number('step', help=(
            'Optionally specify a step size (which is equal to the size by default) which indicates the number of bytes by '
            'which the cursor will be increased after extracting a chunk.')) = None,
        truncate: Arg.Switch('-t', help=(
            'Truncate possible excess bytes at the end of the input, by default they are appended as a single chunk.')) = False,
    ):
        return super().__init__(size=size, step=step, truncate=truncate)

    def process(self, data):
        view = memoryview(data)
        size = self.args.size
        step = self.args.step
        if size < 1:
            raise ValueError('The chunk size has to be a positive integer value.')
        yield from splitchunks(view, size, step, self.args.truncate)

class clower

This unit is implemented in refinery.units.strings.clower and has the following commandline Interface:

usage: clower [-h] [-L] [-Q] [-0] [-v]

Stands for "Convert to LOWER case"; The unit simply converts all latin alphabet chacters in the
input to lowercase.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class clower(Unit):
    """
    Stands for "Convert to LOWER case"; The unit simply converts all latin alphabet chacters in the
    input to lowercase.
    """
    def process(self, data):
        return data.lower()

class cm (invert=False, all=False, reset=False, size=False, ext=False, entropy=False, ic=False, magic=False, sha1=False, sha256=False, crc32=False, md5=False, hashes=False, *names)

This unit is implemented in refinery.units.meta.cm and has the following commandline Interface:

usage: cm [-h] [-L] [-Q] [-0] [-v] [-x | -a] [-r] [-S] [-F] [-E] [-C] [-M] [-1] [-2] [-3] [-5]
          [-H]
          [name [name ...]]

The Common Meta variables unit populates the set of meta variables of the current chunk with
commonly used metadata. The unit has no effect outside a frame.

positional arguments:
  name           A variable name that can include the common properties: mime, ext, magic, size,
                 entropy, ic, crc32, sha1, sha256, sha512, md5. If none is given, the size
                 variable is populated. For most of these, an optional argument is available that
                 can be used as a shorthand:

optional arguments:
  -x, --invert   populate only options that have not been specified
  -a, --all      populate all options
  -r, --reset    discard all meta variables that were not explicitly specified
  -S, --size     size of the chunk
  -F, --ext      guess file extension
  -E, --entropy  compute data entropy
  -C, --ic       compute the index of coincidence
  -M, --magic    compute file magic
  -1, --sha1     compute hash: SHA-1
  -2, --sha256   compute hash: SHA-256
  -3, --crc32    compute hash: CRC32
  -5, --md5      compute hash: MD5
  -H, --hashes   compute all common hashes

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class cm(Unit):
    """
    The Common Meta variables unit populates the set of meta variables of the current chunk with commonly
    used metadata. The unit has no effect outside a frame.
    """
    def __init__(
        self,
        invert  : Arg.Switch('-x', group='ALL', help='populate only options that have not been specified') = False,
        all     : Arg.Switch('-a', group='ALL', help='populate all options') = False,
        reset   : Arg.Switch('-r', help='discard all meta variables that were not explicitly specified') = False,
        size    : Arg.Switch('-S', help='size of the chunk') = False,
        ext     : Arg.Switch('-F', help='guess file extension') = False,
        entropy : Arg.Switch('-E', help='compute data entropy') = False,
        ic      : Arg.Switch('-C', help='compute the index of coincidence') = False,
        magic   : Arg.Switch('-M', help='compute file magic') = False,
        sha1    : Arg.Switch('-1', help='compute hash: SHA-1') = False,
        sha256  : Arg.Switch('-2', help='compute hash: SHA-256') = False,
        crc32   : Arg.Switch('-3', help='compute hash: CRC32') = False,
        md5     : Arg.Switch('-5', help='compute hash: MD5') = False,
        hashes  : Arg.Switch('-H', help='compute all common hashes') = False,
        *names  : Arg(metavar='name', help=(
            F'A variable name that can include the common properties: {_COMMON_PROPERTIES_LIST}.'
            R' If none is given, the size variable is populated. For most of these, an optional '
            R'argument is available that can be used as a shorthand:'))
    ):
        def stringify(name):
            if isinstance(name, str):
                return name
            return name.decode(self.codec)

        names = {stringify(name) for name in names}
        if hashes:
            md5 = sha256 = sha1 = crc32 = True
        if size:
            names.add('size')
        if ext:
            names.add('ext')
        if entropy:
            names.add('entropy')
        if ic:
            names.add('ic')
        if magic:
            names.add('magic')
        if sha1:
            names.add('sha1')
        if sha256:
            names.add('sha256')
        if crc32:
            names.add('crc32')
        if md5:
            names.add('md5')
        if not names and not reset:
            names.add('size')
        if all:
            if invert:
                raise ValueError('invert and all are both enabled, resulting in empty configuration.')
            names = set(LazyMetaOracle.derivations)
        elif invert:
            names = set(LazyMetaOracle.derivations) - names
        super().__init__(names=names, reset=reset)

    def process(self, data):
        return data

    def filter(self, chunks):
        names = self.args.names
        reset = self.args.reset
        for chunk in chunks:
            chunk: Chunk
            if not chunk.visible:
                yield chunk
                continue
            meta = metavars(chunk)
            if reset:
                chunk.meta.clear()
            for name in names:
                chunk[name] = meta[name]
            yield chunk

class couple (*commandline, buffer=False, noerror=False, timeout=0.0)

This unit is implemented in refinery.units.misc.couple and has the following commandline Interface:

usage: couple [-h] [-L] [-Q] [-0] [-v] [-b] [-e] [-t T] ...

Turns any command into a refinery unit. Data is processed by feeding it to the standard input of
a process spawned from the given command line, and then reading the standard output of that
process as the result of the operation. The main purpose of this unit is to allow using the
syntax from frame with other command line tools. By default, the couple unit streams the output
from the executed command as individual outputs, but the buffer option can be set to buffer all
output of a single execution. The format string expression {} or {0} can be used as one of the
arguments passed to the external command to represent the incoming data. In this case, the data
will not be sent to the standard input device of the new process.

positional arguments:
  (all remaining)  All remaining command line tokens form an arbitrary command line to be
                   executed. Use format string syntax to insert meta variables and incoming data
                   chunks.

optional arguments:
  -b, --buffer     Buffer the command output for one execution rather than streaming it.
  -e, --noerror    do not merge stdin and stderr; stderr will only be output if -v is also
                   specified.
  -t, --timeout T  Set an execution timeout as a floating point number in seconds, there is none
                   by default.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class couple(Unit):
    """
    Turns any command into a refinery unit. Data is processed by feeding it to the standard input of a process spawned from
    the given command line, and then reading the standard output of that process as the result of the operation. The main
    purpose of this unit is to allow using the syntax from `refinery.lib.frame` with other command line tools. By default,
    the `refinery.couple` unit streams the output from the executed command as individual outputs, but the `buffer` option
    can be set to buffer all output of a single execution. The format string expression `{}` or `{0}` can be used as one of
    the arguments passed to the external command to represent the incoming data. In this case, the data will not be sent
    to the standard input device of the new process.
    """

    _JOIN_TIME = 0.1

    def __init__(
        self, *commandline : Arg(nargs='...', type=str, metavar='(all remaining)', help=(
            'All remaining command line tokens form an arbitrary command line to be executed. Use format string syntax '
            'to insert meta variables and incoming data chunks.')),
        buffer: Arg.Switch('-b', help='Buffer the command output for one execution rather than streaming it.') = False,
        noerror: Arg('-e', help='do not merge stdin and stderr; stderr will only be output if -v is also specified.') = False,
        timeout: Arg('-t', metavar='T',
            help='Set an execution timeout as a floating point number in seconds, there is none by default.') = 0.0
    ):
        if not commandline:
            raise ValueError('you need to provide a command line.')
        super().__init__(commandline=commandline, noerror=noerror, buffer=buffer, timeout=timeout)

    def process(self, data):
        def shlexjoin():
            import shlex
            return ' '.join(shlex.quote(cmd) for cmd in commandline)

        meta = metavars(data)
        meta.ghost = True
        used = set()
        commandline = [
            meta.format(cmd, self.codec, [data], None, False, used=used)
            for cmd in self.args.commandline
        ]

        if 0 in used:
            self.log_info('input used as command-line argument; sending no input to process stdin')
            data = None

        self.log_debug(shlexjoin)

        posix = 'posix' in sys.builtin_module_names
        process = Popen(commandline,
            stdin=PIPE, stdout=PIPE, stderr=PIPE, shell=False, close_fds=posix)

        if self.args.buffer and not self.args.timeout:
            out, err = process.communicate(data)
            for line in err.splitlines():
                self.log_info(line)
            yield out
            return

        import io
        from threading import Thread, Event
        from queue import Queue, Empty
        from time import process_time, sleep

        start = 0
        result = None

        qerr = Queue()
        qout = Queue()
        done = Event()

        def adapter(stream, queue: Queue, event: Event):
            while not event.is_set():
                out = stream.read1()
                if out: queue.put(out)
                else: break
            stream.close()

        recvout = Thread(target=adapter, args=(process.stdout, qout, done), daemon=True)
        recverr = Thread(target=adapter, args=(process.stderr, qerr, done), daemon=True)

        recvout.start()
        recverr.start()

        if data:
            process.stdin.write(data)
        process.stdin.close()
        start = process_time()

        if self.args.buffer or self.args.timeout:
            result = io.BytesIO()

        def queue_read(q: Queue):
            try: return q.get_nowait()
            except Empty: return None

        errbuf = io.BytesIO()

        while True:
            out = queue_read(qout)
            err = None

            if self.args.noerror:
                err = queue_read(qerr)
            else:
                out = out or queue_read(qerr)

            if err and self.log_info():
                errbuf.write(err)
                errbuf.seek(0)
                lines = errbuf.readlines()
                errbuf.seek(0)
                errbuf.truncate()
                if lines:
                    if not (done.is_set() or lines[~0].endswith(B'\n')):
                        errbuf.write(lines.pop())
                    for line in lines:
                        msg = line.rstrip(B'\n')
                        if msg: self.log_info(msg)
            if out:
                if self.args.buffer or self.args.timeout:
                    result.write(out)
                if not self.args.buffer:
                    yield out

            if done.is_set():
                if recverr.is_alive():
                    self.log_warn('stderr receiver thread zombied')
                if recvout.is_alive():
                    self.log_warn('stdout receiver thread zombied')
                break
            elif not err and not out and process.poll() is not None:
                recverr.join(self._JOIN_TIME)
                recvout.join(self._JOIN_TIME)
                done.set()
            elif self.args.timeout:
                if process_time() - start > self.args.timeout:
                    self.log_info('terminating process after timeout expired')
                    done.set()
                    process.terminate()
                    for wait in range(4):
                        if process.poll() is not None:
                            break
                        sleep(self._JOIN_TIME)
                    else:
                        self.log_warn('process termination may have failed')
                    recverr.join(self._JOIN_TIME)
                    recvout.join(self._JOIN_TIME)
                    if not len(result.getbuffer()):
                        result = RuntimeError('timeout reached, process had no output')
                    else:
                        result = RefineryPartialResult(
                            'timeout reached, returning all collected output',
                            partial=result.getvalue())

        if isinstance(result, Exception):
            raise result
        elif self.args.buffer:
            yield result.getvalue()

class cp1252

This unit is implemented in refinery.units.encoding.cp1252 and has the following commandline Interface:

usage: cp1252 [-h] [-L] [-Q] [-0] [-v] [-R]

Encodes and decodes Windows CP 1252 (aka Latin1) encoded string data.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class cp1252(Unit):
    """
    Encodes and decodes Windows CP 1252 (aka Latin1) encoded string data.
    """

    def process(self, data):
        return data.decode(self.codec).encode('cp1252')

    def reverse(self, data):
        return data.decode('cp1252').encode(self.codec)

class crc32 (text=False)

This unit is implemented in refinery.units.crypto.hash.checksums and has the following commandline Interface:

usage: crc32 [-h] [-L] [-Q] [-0] [-v] [-t]

Returns the CRC32 Hash of the input data.

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class crc32(HashUnit):
    """
    Returns the CRC32 Hash of the input data.
    """
    def _algorithm(self, data: bytes) -> bytes:
        return struct.pack('>I', zlib.crc32(data))

class csv (quote=b'"', delim=b',')

This unit is implemented in refinery.units.formats.csv and has the following commandline Interface:

usage: csv [-h] [-L] [-Q] [-0] [-v] [-R] [-q QUOTE] [-d DELIM]

Extracts the rows of a CSV document with header and converts them into JSON chunks.

optional arguments:
  -q, --quote QUOTE  Specify the quote character, the default is a double quote.
  -d, --delim DELIM  Specify the delimiter, the default is a single comma.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.
  -R, --reverse      Use the reverse operation; Specify twice to normalize (first decode, then
                     encode).

Expand source code Browse git

class csv(Unit):
    """
    Extracts the rows of a CSV document with header and converts them into JSON chunks.
    """
    def __init__(
        self,
        quote: Unit.Arg('-q', help='Specify the quote character, the default is a double quote.') = B'"',
        delim: Unit.Arg('-d', help='Specify the delimiter, the default is a single comma.') = B','
    ):
        super().__init__(quote=quote, delim=delim)

    def reverse(self, data: bytearray):
        quote = self.args.quote.decode(self.codec)
        delim = self.args.delim.decode(self.codec)

        try:
            table: List[Dict[str, Any]] = json.loads(data)
        except Exception:
            table: List[Dict[str, Any]] = [json.loads(line) for line in data.splitlines()]

        if not isinstance(table, list):
            raise ValueError('Input must be a JSON list.')

        keys = {}
        # A dictionary is used here over a set because dictionaries remember insertion order.
        # When feeding the unit a sequence of JSON objects, the user would likely expect the
        # column order in the resulting CSV to derive from the entry oder in the JSON data.

        for row in table:
            for key in row:
                if not isinstance(key, str):
                    continue
                keys[key] = None

        keys = list(keys)
        out = MemoryFile()

        with io.TextIOWrapper(out, self.codec, newline='') as stream:
            writer = _csv.writer(stream, quotechar=quote, delimiter=delim, skipinitialspace=True)
            for row in table:
                writer.writerow([str(row.get(key, '')) for key in keys])
            return out.getvalue()

    def process(self, data):
        quote = self.args.quote.decode(self.codec)
        delim = self.args.delim.decode(self.codec)

        def convert(field: str):
            if field.isdigit() and not field.startswith('0'):
                return int(field)
            date = isodate(field)
            if date is not None:
                return date.isoformat(' ', 'seconds')
            return field

        with io.TextIOWrapper(MemoryFile(data), self.codec) as stream:
            rows = _csv.reader(stream, quotechar=quote, delimiter=delim, skipinitialspace=True)
            keys = next(rows)
            for row in rows:
                out = {key: convert(value) for key, value in zip(keys, row)}
                yield json.dumps(out, indent=4).encode(self.codec)

class cswap

This unit is implemented in refinery.units.strings.cswap and has the following commandline Interface:

usage: cswap [-h] [-L] [-Q] [-0] [-v]

Swap the case of the input string; all lowercase letters are turned into their uppercase variant
and vice-versa.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class cswap(Unit):
    """
    Swap the case of the input string; all lowercase letters are turned into their uppercase
    variant and vice-versa.
    """
    def process(self, data: bytearray):
        lcase = bytes(range(B'a'[0], B'z'[0] + 1))
        ucase = bytes(range(B'A'[0], B'Z'[0] + 1))
        delta = lcase[0] - ucase[0]
        for k, letter in enumerate(data):
            if letter in ucase:
                data[k] += delta
            elif letter in lcase:
                data[k] -= delta
        return data

class cull

This unit is implemented in refinery.units.meta.cull and has the following commandline Interface:

usage: cull [-h] [-L] [-Q] [-0] [-v]

Remove all chunks from the current frame if they are not visible. Chunks can become invisible by
exclusion through iff, iffp, iffs, iffx, or scope.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class cull(Unit):
    """
    Remove all chunks from the current `refinery.lib.frame` if they are not visible. Chunks can become invisible
    by exclusion through `refinery.iff`, `refinery.iffp`, `refinery.iffs`, `refinery.iffx`, or `refinery.scope`.
    """
    def filter(self, chunks: Iterable[Chunk]):
        for chunk in chunks:
            if chunk.visible:
                yield chunk

class cupper

This unit is implemented in refinery.units.strings.cupper and has the following commandline Interface:

usage: cupper [-h] [-L] [-Q] [-0] [-v]

Stands for "Convert to UPPER case"; The unit simply converts all latin alphabet chacters in the
input to uppercase.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class cupper(Unit):
    """
    Stands for "Convert to UPPER case"; The unit simply converts all latin alphabet chacters in the
    input to uppercase.
    """
    def process(self, data):
        return data.upper()

class datefix (format='%Y-%m-%d %H:%M:%S', dos=False)

This unit is implemented in refinery.units.misc.datefix and has the following commandline Interface:

usage: datefix [-h] [-L] [-Q] [-0] [-v] [-d] [format]

Parses all kinds of date formats and unifies them into the same format.

positional arguments:
  format         Specify the output format as a strftime-like string, using ISO by default.

optional arguments:
  -d, --dos      Parse timestamps in DOS rather than Unix format.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class datefix(Unit):
    """
    Parses all kinds of date formats and unifies them into the same format.
    """

    _FORMATS = [
        '%B %dth %Y %H:%M:%S (UTC)',  # November 27th 2019 17:37:02 (UTC)
        '%B %dnd %Y %H:%M:%S (UTC)',  # November 22nd 2019 17:37:02 (UTC)
        '%B %dst %Y %H:%M:%S (UTC)',  # November 21st 2019 17:37:02 (UTC)
        '%Y-%m-%dT%H:%M:%S',          # 2010-03-15T06:27:50
        '%Y-%m-%d %H:%M:%S',          # iso (2010-03-15 06:27:50.000000)
        '%Y-%m-%d %H:%M:%SZ%f',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%SZ%f',
        '%a %b %d %Y %H:%M:%S',       # Thu Apr 24 2014 12:32:21
        '%m/%d/%Y %H:%M:%S',
        '%m/%d/%Y',
    ]

    _TIMEZONE_REGEXES = [re_compile(p) for p in [
        R'([+-])(\d{2})(\d{2})$',           # Thu Apr 24 2014 12:32:21 GMT-0700
        R'([+-])(\d{2}):(\d{2})$',          # 2017:09:11 23:47:22+02:00
        R'GMT([+-])(\d{2})(\d{2}) \(.+\)$'  # Thu Apr 24 2014 12:32:21 GMT-0700 (PDT)
    ]]

    def __init__(
        self,
        format: Arg(help='Specify the output format as a strftime-like string, using ISO by default.') = '%Y-%m-%d %H:%M:%S',
        dos: Arg('-d', help='Parse timestamps in DOS rather than Unix format.') = False
    ):
        super().__init__(format=format, dos=dos)

    @staticmethod
    def dostime(stamp: int) -> datetime:
        """
        Parses a given DOS timestamp into a datetime object.
        """
        d, t = stamp >> 16, stamp & 0xFFFF
        s = (t & 0x1F) << 1

        return datetime(
            year   = ((d & 0xFE00) >> 0x9) + 1980,  # noqa
            month  = ((d & 0x01E0) >> 0x5),         # noqa
            day    = ((d & 0x001F) >> 0x0),         # noqa
            hour   = ((t & 0xF800) >> 0xB),         # noqa
            minute = ((t & 0x07E0) >> 0x5),         # noqa
            second = 59 if s == 60 else s,          # noqa
        )

    def _format(self, dt: datetime) -> str:
        return dt.strftime(self.args.format)

    def _extract_timezone(self, data):
        for r in self._TIMEZONE_REGEXES:
            m = r.search(data)
            if not m:
                continue
            pm = m[1]
            td = timedelta(
                hours=int(m[2]), minutes=int(m[3]))
            if pm == '-':
                td = -td
            return data[:-len(m[0])].strip(), td

        return data, None

    @linewise
    def process(self, data: str) -> str:
        data = data.strip()

        # replace colons (i.e. for exiftool dates: 2017:01:01)
        if len(data) > 10 and data[4] == ':' and data[7] == ':':
            data = F'{data[0:4]}-{data[5:7]}-{data[8:]}'

        # strips Z at end (i.e. 20171022055144Z)
        if data.endswith('Z'):
            data = data[:-1]

        if data.startswith('0x'):
            try:
                data = str(int(data, 16))
            except Exception:
                pass

        # parses timestamps and dates without much format
        if data.isdigit():
            time_stamp = int(data)
            if len(data) > 14:
                raise Exception('cannot parse all-numeric string as date: %s' % data)
            elif len(data) == 14:
                # i.e. 20111020193727
                return self._format(datetime.strptime(data, '%Y%m%d%H%M%S'))
            elif len(data) == 13:
                # i.e. 1458016535000
                time_stamp //= 1000
                data = data[:-3]
            if self.args.dos:
                return self._format(self.dostime(time_stamp))
            else:
                return self._format(date_from_timestamp(time_stamp))

        data, time_delta = self._extract_timezone(data)

        for f in self._FORMATS:
            try:
                dt = datetime.strptime(data, f)
            except ValueError:
                continue
            return self._format(dt if time_delta is None else dt - time_delta)

        return data

Static methods

def dostime(stamp)

Parses a given DOS timestamp into a datetime object.

Expand source code Browse git

@staticmethod
def dostime(stamp: int) -> datetime:
    """
    Parses a given DOS timestamp into a datetime object.
    """
    d, t = stamp >> 16, stamp & 0xFFFF
    s = (t & 0x1F) << 1

    return datetime(
        year   = ((d & 0xFE00) >> 0x9) + 1980,  # noqa
        month  = ((d & 0x01E0) >> 0x5),         # noqa
        day    = ((d & 0x001F) >> 0x0),         # noqa
        hour   = ((t & 0xF800) >> 0xB),         # noqa
        minute = ((t & 0x07E0) >> 0x5),         # noqa
        second = 59 if s == 60 else s,          # noqa
    )

class decompress (prepend=True, tolerance=12, max_ratio=1, min_ratio=0.0001)

This unit is implemented in refinery.units.compression.decompress and has the following commandline Interface:

usage: decompress [-h] [-L] [-Q] [-0] [-v] [-P] [-t N] [-m R] [-n R]

Attempts all available decompression units against the input and returns the output of the first
successful one. If none succeeds, the data is returned unaltered. The process is heavily biased
against LZNT1 decompression due to a large tendency for LZNT1 false positives.

optional arguments:
  -P, --no-prepend   By default, if decompression fails, the unit attempts to prefix the data
                     with all possible values of a single byte and decompress the result. This
                     behavior can be disabled with this flag.
  -t, --tolerance N  Maximum number of bytes to strip from the beginning of the data; The default
                     value is 12.
  -m, --max-ratio R  To determine whether a decompression algorithm was successful, the ratio of
                     compressed size to decompressed size may at most be as large as this number,
                     a floating point value R; default value is 1.
  -n, --min-ratio R  Require that compression ratios must be at least as large as R. This is a
                     "too good to be true" heuristic against algorithms like lznt1 that can
                     produce false positives. The default is 0.0001.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class decompress(Unit):
    """
    Attempts all available decompression units against the input and returns
    the output of the first successful one. If none succeeds, the data is
    returned unaltered. The process is heavily biased against LZNT1 decompression
    due to a large tendency for LZNT1 false positives.
    """
    def __init__(
        self,
        prepend: Arg.Switch('-P', '--no-prepend', off=True, help=(
            'By default, if decompression fails, the unit attempts to prefix '
            'the data with all possible values of a single byte and decompress '
            'the result. This behavior can be disabled with this flag.')
        ) = True,
        tolerance: Arg.Number('-t', help=(
            'Maximum number of bytes to strip from the beginning of the data; '
            'The default value is 12.')
        ) = 12,
        max_ratio: Arg('-m', metavar='R', help=(
            'To determine whether a decompression algorithm was successful, the '
            'ratio of compressed size to decompressed size may at most be as large '
            'as this number, a floating point value R; default value is {default}.')
        ) = 1,
        min_ratio: Arg('-n', metavar='R', help=(
            'Require that compression ratios must be at least as large as R. This '
            'is a "too good to be true" heuristic against algorithms like lznt1 '
            'that can produce false positives. The default is {default}.')
        ) = 0.0001,
    ):
        if min_ratio <= 0:
            raise ValueError('The compression factor must be nonnegative.')
        super().__init__(
            tolerance=tolerance,
            prepend=prepend,
            min_ratio=min_ratio,
            max_ratio=max_ratio
        )
        self.engines: List[Unit] = [
            engine.assemble() for engine in [
                szdd, zl, lzma, aplib, qlz, lzf, lzw, jcalg, bz2, blz, lzjb, lz4, lzo, lznt1, nrv2e, nrv2d, nrv2b]
        ]
        for engine in self.engines:
            engine.log_detach()

    def process(self, data):

        data = memoryview(data)

        class Decompression(NamedTuple):
            engine: Unit
            result: Optional[ByteString] = None
            cutoff: int = 0
            prefix: Optional[int] = None
            failed: bool = False

            def __str__(self):
                status = 'partial' if self.failed else 'success'
                prefix = self.prefix
                if prefix is not None:
                    prefix = F'0x{prefix:02X}'
                return F'prefix={prefix}, cutoff=0x{self.cutoff:02X}, [{status}] engine={self.engine.name}'

            def __len__(self):
                return len(self.result)

            @property
            def ratio(self):
                if not self.result:
                    return INF
                return len(data) / len(self.result)

            @property
            def unmodified(self):
                return self.cutoff == 0
                return self.prefix is None and self.cutoff == 0

            @property
            def method(self):
                return self.engine.name

        if self.args.prepend:
            buffer = bytearray(1 + len(data))
            buffer[1:] = data

        best_only_success: Optional[Decompression] = None
        best_with_failure: Optional[Decompression] = None

        def decompress(engine: Unit, cutoff: int = 0, prefix: Optional[int] = None):
            ingest = data[cutoff:]
            failed = True
            if prefix is not None:
                buffer[0] = prefix
                ingest = buffer
            if engine.handles(ingest) is False:
                return Decompression(engine, None, cutoff, prefix)
            try:
                result = engine.process(ingest)
            except RefineryPartialResult as pr:
                result = pr.partial
            except Exception:
                result = None
            else:
                failed = False
            return Decompression(engine, result, cutoff, prefix, failed)

        def update(new: Decompression, best: Optional[Decompression] = None, discard_if_too_good=False) -> Decompression:
            ratio = new.ratio
            if ratio > self.args.max_ratio:
                return best
            if ratio < self.args.min_ratio:
                return best
            prefix = new.prefix
            if prefix is not None:
                prefix = F'0x{prefix:02X}'
            r = 1 if new.unmodified and best and not best.unmodified else 0.95
            if not best or len(new) < len(best):
                q = 0
            else:
                q = ratio / best.ratio
            if q < r:
                if best and discard_if_too_good:
                    if q < 0.5:
                        return best
                    if new.failed:
                        return best
                self.log_info(lambda: F'obtained {ratio * 100:07.4f}% compression ratio [q={q:07.4f}] with: {new!s}')
                return new
            else:
                self.log_debug(F'obtained {ratio * 100:07.4f}% compression ratio [q={q:07.4f}] with: {new!s}')
                return best

        for engine in self.engines:
            self.log_debug(F'attempting engine: {engine.name}')
            careful = isinstance(engine, (lznt1, lzf, lzjb))
            for t in range(self.args.tolerance):
                if best_only_success and careful and t > 0:
                    break
                dc = decompress(engine, t)
                if not dc.failed:
                    best_only_success = update(dc, best_only_success, careful)
                else:
                    best_with_failure = update(dc, best_with_failure, careful)
            if self.args.prepend and not best_only_success:
                for p in range(0x100):
                    dc = decompress(engine, 0, p)
                    if not dc.failed:
                        best_only_success = update(dc, best_only_success, careful)
                    else:
                        best_with_failure = update(dc, best_with_failure, careful)

        if best_only_success is not None:
            return self.labelled(best_only_success.result, method=best_only_success.method)
        if best_with_failure is not None:
            self.log_info('the only decompression with result returned only a partial result.')
            return self.labelled(best_with_failure.result, method=best_with_failure.method)
        self.log_warn('no compression engine worked, returning original data.')
        return data

class dedup (count=False)

This unit is implemented in refinery.units.meta.dedup and has the following commandline Interface:

usage: dedup [-h] [-L] [-Q] [-0] [-v] [-c]

Deduplicates a sequence of multiple inputs. The deduplication is limited to the current frame.

optional arguments:
  -c, --count    Store the count of each deduplicated chunk.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class dedup(Unit):
    """
    Deduplicates a sequence of multiple inputs. The deduplication is limited to the current `refinery.lib.frame`.
    """
    def __init__(self, count: Arg.Switch('-c', help='Store the count of each deduplicated chunk.') = False):
        super().__init__(count=count)

    def filter(self, chunks):
        if self.args.count:
            from collections import Counter
            barrier = Counter(chunks)
            for chunk in chunks:
                if not chunk.visible:
                    yield chunk
                    continue
                barrier.update(chunk)
            for chunk, count in barrier.items():
                chunk.meta['count'] = count
                yield chunk
        else:
            from hashlib import md5
            barrier = set()
            for chunk in chunks:
                if not chunk.visible:
                    yield chunk
                    continue
                hashed = md5(chunk).digest()
                if hashed not in barrier:
                    barrier.add(hashed)
                    yield chunk

class defang (url_only=False, url_protocol=False, dot_only=False, quote_md=False)

This unit is implemented in refinery.units.pattern.defang and has the following commandline Interface:

usage: defang [-h] [-L] [-Q] [-0] [-v] [-R] [-u] [-p] [-d] [-q]

Defangs all URL, domain and IPv4 address indicators in the input data by replacing the last dot
in the expression by [.]. For example, 127.0.0.1 will be replaced by 127.0.0[.]1. For URL
indicators, the colon after the procol scheme is also wrapped in brackets.

optional arguments:
  -u, --url-only      Only defang URLs, do not look for domains or IPs.
  -p, --url-protocol  Escape the protocol in URLs.
  -d, --dot-only      Do not escape the protocol colon in URLs.
  -q, --quote-md      Wrap all indicators in backticks for markdown code.

generic options:
  -h, --help          Show this help message and exit.
  -L, --lenient       Allow partial results as output.
  -Q, --quiet         Disables all log output.
  -0, --devnull       Do not produce any output.
  -v, --verbose       Specify up to two times to increase log level.
  -R, --reverse       Use the reverse operation; Specify twice to normalize (first decode, then
                      encode).

Expand source code Browse git

class defang(Unit):
    """
    Defangs all URL, domain and IPv4 address indicators in the input data by replacing the last dot
    in the expression by `[.]`. For example, `127.0.0.1` will be replaced by `127.0.0[.]1`. For URL
    indicators, the colon after the procol scheme is also wrapped in brackets.
    """

    _WHITELIST = [
        B'wscript.shell',
    ]

    _PROTOCOL_ESCAPES = {
        B'http': B'hxxp',
        B'https': B'hxxps',
        B'ftp': B'fxp',
        B'ftps': B'fxps',
    }

    def __init__(
        self,
        url_only: Arg.Switch('-u', help='Only defang URLs, do not look for domains or IPs.') = False,
        url_protocol: Arg.Switch('-p', help='Escape the protocol in URLs.') = False,
        dot_only: Arg.Switch('-d', help='Do not escape the protocol colon in URLs.') = False,
        quote_md: Arg.Switch('-q', help='Wrap all indicators in backticks for markdown code.') = False
    ):
        self.superinit(super(), **vars())

    def _quote(self, word):
        return word if not self.args.quote_md else B'`%s`' % word

    def reverse(self, data: bytearray):
        def refang(hostname):
            return hostname[0].replace(B'[.]', B'.')
        data = defanged.hostname.sub(refang, data)
        data = data.replace(B'[:]//', B'://')
        data = data.replace(B'[://]', B'://')
        data = re.sub(B'h.{3}?(s?)://', B'http\\1://', data)
        data = re.sub(B'fxp(s?)://', B'ftp\\1://', data)
        return data

    def process(self, data):
        def replace_hostname(hostname: bytes, match=True):
            if match:
                return self._quote(replace_hostname(hostname[0], False))
            self.log_info('replace:', hostname)
            host = hostname
            user, atsgn, host = host.rpartition(B'@')
            host, colon, port = host.rpartition(B':')
            host = host.lower()
            if not colon:
                host = port
                port = B''
            if host in self._WHITELIST:
                return hostname
            host = re.split(R'(?:\[\.\]|\.)', host.decode('latin1'))
            if len(host) == 1:
                return hostname
            components = iter(reversed(host))
            defanged_parts = [next(components)]
            separator = '[.]'
            for part in components:
                defanged_parts.append(separator)
                defanged_parts.append(part)
                separator = '[.]' if part in tlds else '.'
            defanged_host = ''.join(reversed(defanged_parts)).encode('latin1')
            return user + atsgn + defanged_host + colon + port

        def replace_url(url: bytes):
            if not url:
                return url
            self.log_info('replace:', url)
            url = url.replace(B'[:]//', B'://', 1)
            url = url.replace(B'[.]', B'.')
            prefix = B'tcp'
            if url.startswith(B'://'):
                scheme = 0
            elif url.startswith(B'//'):
                scheme = 1
                prefix = prefix + B':'
            else:
                scheme = 2
                prefix = B''
            parsed = urlparse(prefix + url)
            operations = {
                name: self.process(getattr(parsed, name))
                for name in ('path', 'params', 'query', 'fragment')
            }
            if self.args.url_protocol and parsed.scheme:
                operations.update(scheme=self._PROTOCOL_ESCAPES.get(parsed.scheme.lower(), scheme))
            if scheme < 2:
                operations.update(scheme=B'')
            operations.update(netloc=replace_hostname(parsed.netloc, False))
            url = urlunparse(parsed._replace(**operations))
            if scheme == 0:
                url = B':' + url
            if not self.args.dot_only:
                url = url.replace(B'://', B'[:]//')
            return self._quote(url)

        urlsplit = defanged.url.split(data)
        step = defanged.url.value.groups + 1
        urlsplit[1::step] = [replace_url(t) for t in itertools.islice(iter(urlsplit), 1, None, step)]

        if not self.args.url_only:
            urlsplit[0::step] = [
                indicators.hostname.sub(replace_hostname, t)
                for t in itertools.islice(iter(urlsplit), 0, None, step)
            ]

        def fuse(urlsplit):
            txt = itertools.islice(iter(urlsplit), 0, None, step)
            url = itertools.islice(iter(urlsplit), 1, None, step)
            while True:
                try:
                    yield next(txt)
                    yield next(url)
                except StopIteration:
                    break

        return B''.join(fuse(urlsplit))

class deob_js_arrays

This unit is implemented in refinery.units.obfuscation.js.arrays and has the following commandline Interface:

usage: deob-js-arrays [-h] [-L] [-Q] [-0] [-v]

JavaScript deobfuscator to turn ["Z", "t", "s", "e"][0] into "Z".

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_js_arrays(Deobfuscator):
    """
    JavaScript deobfuscator to turn `["Z", "t", "s", "e"][0]` into `"Z"`.
    """

    def deobfuscate(self, data):

        def litpick(match):
            try:
                array = match[1]
                index = int(match[2])
                lpick = array.split(',')[index].strip()
                self.log_debug(lambda: F'{lpick} = {match[0]}')
            except (TypeError, IndexError):
                lpick = match[0]
            return lpick

        p = R'\s{{0,5}}'.join([
            '\\[', '((?:{i}|{s})', '(?:,', '(?:{i}|{s})', ')*)', '\\]', '\\[', '({i})', '\\]'
        ]).format(i=formats.integer, s=formats.string)
        return re.sub(p, litpick, data)

class deob_js_getattr

This unit is implemented in refinery.units.obfuscation.js.getattr and has the following commandline Interface:

usage: deob-js-getattr [-h] [-L] [-Q] [-0] [-v]

JavaScript deobfuscator to turn WScript["CreateObject"] into WScript.CreateObject.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_js_getattr(Deobfuscator):
    """
    JavaScript deobfuscator to turn `WScript["CreateObject"]` into `WScript.CreateObject`.
    """

    def deobfuscate(self, data):
        def dottify(match):
            name = match[2][1:-1]
            if name.isidentifier():
                return F'{match[1]}.{name}'
            return match[0]
        return re.sub(FR'(\w+)\[({formats.string})\]', dottify, data)

class deob_js_tuples

This unit is implemented in refinery.units.obfuscation.js.tuples and has the following commandline Interface:

usage: deob-js-tuples [-h] [-L] [-Q] [-0] [-v]

JavaScript deobfuscator to turn ("Z", "t", "s", "e") into "e".

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_js_tuples(Deobfuscator):
    """
    JavaScript deobfuscator to turn `("Z", "t", "s", "e")` into `"e"`.
    """

    def deobfuscate(self, data):

        def litpick(match):
            try:
                array = match[1]
                lpick = array.split(',')[-1].strip()
                self.log_debug(lambda: F'{lpick} = {match[0]}')
            except (TypeError, IndexError):
                lpick = match[0]
            return lpick

        p = R'\s{{0,5}}'.join([
            '\\(', '((?:{i}|{s})', '(?:,', '(?:{i}|{s})', ')*)', '\\)'
        ]).format(i=formats.integer, s=formats.string)
        return re.sub(p, litpick, data)

class deob_ps1 (timeout=100)

This unit is implemented in refinery.units.obfuscation.ps1.all and has the following commandline Interface:

usage: deob-ps1 [-h] [-L] [-Q] [-0] [-v] [-t TIMEOUT]

optional arguments:
  -t, --timeout TIMEOUT  Maximum number of iterations; the default is 100.

generic options:
  -h, --help             Show this help message and exit.
  -L, --lenient          Allow partial results as output.
  -Q, --quiet            Disables all log output.
  -0, --devnull          Do not produce any output.
  -v, --verbose          Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1(IterativeDeobfuscator):

    _SUBUNITS = [sub() for sub in [
        deob_ps1_escape,
        deob_ps1_cases,
        deob_ps1_brackets,
        deob_ps1_format,
        deob_ps1_typecast,
        deob_ps1_stringreplace,
        deob_ps1_b64convert,
        deob_ps1_encodings,
        deob_ps1_concat,
        deob_ps1_invoke,
        deob_ps1_uncurly
    ]]

    def deobfuscate(self, data):
        for u in self._SUBUNITS:
            u.log_level = self.log_level
        for unit in self._SUBUNITS:
            self.log_debug(lambda: F'invoking {unit.name}')
            checkpoint = hash(data)
            data = unit.deobfuscate(data)
            if checkpoint != hash(data) and not self.log_debug('data has changed.'):
                self.log_info(F'used {unit.name}')
        return re.sub(R'[\r\n]+', '\n', data)

class deob_ps1_b64convert

This unit is implemented in refinery.units.obfuscation.ps1.b64convert and has the following commandline Interface:

usage: deob-ps1-b64convert [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_b64convert(Deobfuscator):

    _SENTINEL = re.compile('\\s*'.join(
        (re.escape('[System.Convert]::FromBase64String'), '\\(', '({s})', '\\)')
    ).format(s=formats.ps1str), flags=re.IGNORECASE)

    def deobfuscate(self, data):
        strlit = Ps1StringLiterals(data)

        def replacer(match: re.Match[str]):
            if strlit.get_container(match.start()):
                return match[0]
            try:
                string, = string_unquote(match[1])
            except ValueError:
                return match[0]
            try:
                bytes = base64.b64decode(string)
            except Exception:
                return match[0]
            return '@({})'.format(','.join(F'0x{b:02X}' for b in bytes))

        return self._SENTINEL.sub(replacer, data)

class deob_ps1_brackets

This unit is implemented in refinery.units.obfuscation.ps1.brackets and has the following commandline Interface:

usage: deob-ps1-brackets [-h] [-L] [-Q] [-0] [-v]

PowerShell deobfuscation that removes superfluous brackets around constant literals, i.e.
("{0}{2}{1}") is transformed to "{0}{2}{1}". Currently, only integer and string constants are
supported.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_brackets(Deobfuscator):
    """
    PowerShell deobfuscation that removes superfluous brackets around constant
    literals, i.e. `("{0}{2}{1}")` is transformed to `"{0}{2}{1}"`. Currently,
    only integer and string constants are supported.
    """
    _SENTINEL = re.compile(
        RF'''(?<![\w"']{{2}})'''  # this may be a function call
        RF'''(\-\w+)?'''  # not a function call but an argument
        RF'''\(\s*({formats.integer}|{formats.ps1str})\s*(\S)''',
        flags=re.IGNORECASE
    )

    def deobfuscate(self, data):
        strlit = Ps1StringLiterals(data)
        repeat = True

        @strlit.outside
        def replacement(match):
            nonlocal repeat
            if match[3] == ')':
                repeat = True
                return (match[1] or '') + match[2]

        while repeat:
            repeat = False
            data = self._SENTINEL.sub(replacement, data)

        return data

class deob_ps1_cases

This unit is implemented in refinery.units.obfuscation.ps1.cases and has the following commandline Interface:

usage: deob-ps1-cases [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_cases(Deobfuscator):
    _NAMES = [
        '-BXor',
        '-Exec Bypass',
        '-NoLogo',
        '-NonInter',
        '-Replace',
        '-Windows Hidden',
        '.Invoke',
        'Assembly',
        'Byte',
        'Char',
        'ChildItem',
        'CreateThread',
        'Get-Variable',
        'GetType',
        'IntPtr',
        'Invoke-Expression',
        'Invoke',
        'Length',
        'Net.WebClient',
        'PowerShell',
        'PSVersionTable',
        'Set-Item',
        'Set-Variable',
        'Start-Sleep',
        'ToString',
        'Type',
        'Value',
        'Void',
    ]

    @outside(formats.ps1str)
    def deobfuscate(self, data):
        for name in self._NAMES:
            data = re.sub(RF'\b{re.escape(name)}\b', name, data, flags=re.IGNORECASE)
        return data

class deob_ps1_concat (timeout=100)

This unit is implemented in refinery.units.obfuscation.ps1.concat and has the following commandline Interface:

usage: deob-ps1-concat [-h] [-L] [-Q] [-0] [-v] [-t TIMEOUT]

optional arguments:
  -t, --timeout TIMEOUT  Maximum number of iterations; the default is 100.

generic options:
  -h, --help             Show this help message and exit.
  -L, --lenient          Allow partial results as output.
  -Q, --quiet            Disables all log output.
  -0, --devnull          Do not produce any output.
  -v, --verbose          Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_concat(IterativeDeobfuscator):
    _SENTINEL = re.compile(R'''['"]\s*[+&]\s*['"]''')

    def deobfuscate(self, data):

        def concat(data):
            strlit = Ps1StringLiterals(data)
            repeat = True
            while repeat:
                for match in self._SENTINEL.finditer(data):
                    a, b = match.span()
                    a = strlit.get_container(a)
                    if a is None:
                        continue
                    b = strlit.get_container(b)
                    if b is None or b != a + 1:
                        continue
                    a = strlit.ranges[a]
                    b = strlit.ranges[b]
                    stra = data[slice(*a)]
                    strb = data[slice(*b)]
                    parts = list(string_unquote(stra))
                    it = iter(string_unquote(strb))
                    parts[~0] += next(it)
                    parts.extend(it)
                    yield data[:a[0]] + string_quote(parts)
                    data = data[b[1]:]
                    strlit.update(data)
                    break
                else:
                    repeat = False
            yield data

        return ''.join(concat(data))

class deob_ps1_encodings

This unit is implemented in refinery.units.obfuscation.ps1.encodings and has the following commandline Interface:

usage: deob-ps1-encodings [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_encodings(Deobfuscator):

    _SENTINEL = re.compile('\\s*'.join(
        (re.escape('[System.Text.Encoding]::') + '(\\w+)\\.GetString', '\\(', '@\\(', '({a})', '\\)', '\\)')
    ).format(a=formats.intarray), flags=re.IGNORECASE)

    def deobfuscate(self, data):
        strlit = Ps1StringLiterals(data)

        def replacer(match: re.Match[str]):
            if strlit.get_container(match.start()):
                return match[0]
            try:
                bytes = bytearray(int(x.strip(), 0) for x in match[2].split(','))
            except Exception:
                return match[0]
            encoding = {
                'ASCII': 'ascii',
                'BigEndianUnicode': 'utf-16be',
                'Default': 'latin1',
                'Unicode': 'utf-16le',
            }.get(match[1], match[1])
            try:
                codecs.lookup(encoding)
            except LookupError:
                encoding = 'utf8'
            try:
                string = bytes.decode(encoding)
            except Exception:
                return match[0]
            return string_quote(string)

        return self._SENTINEL.sub(replacer, data)

class deob_ps1_escape

This unit is implemented in refinery.units.obfuscation.ps1.escape and has the following commandline Interface:

usage: deob-ps1-escape [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_escape(Deobfuscator):

    def deobfuscate(self, data):
        strlit = Ps1StringLiterals(data)
        @strlit.outside
        def repl(m): return m[1]
        return re.sub(R'''`([^0abfnrtv`#'"\$])''', repl, data)

class deob_ps1_format

This unit is implemented in refinery.units.obfuscation.ps1.format and has the following commandline Interface:

usage: deob-ps1-format [-h] [-L] [-Q] [-0] [-v]

PowerShell deobfuscation for the following "format string"-based technique:

- "{0}{2}{1}"-f 'signa','ures','t'
- "{0}na{2}{1}"-f 'sig','ures','t'

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_format(Deobfuscator):
    """
    PowerShell deobfuscation for the following "format string"-based technique:

    - `"{0}{2}{1}"-f 'signa','ures','t'`
    - `"{0}na{2}{1}"-f 'sig','ures','t'`
    """

    def deobfuscate(self, data):

        repeat = True

        while repeat:

            repeat = False

            for string in re.finditer(str(formats.ps1str), data):
                argmatch = re.search(R'^\s*-[fF]\s*((?:{s},\s*)*{s})'.format(s=formats.ps1str), data[string.end():])
                if not argmatch:
                    continue

                def dbgmsg():
                    sample = string[0]
                    if len(sample) > 33:
                        sample = F"{sample[1:30]}...{sample[0]}"
                    return F'found match at {string.start()}: {sample}'

                self.log_debug(dbgmsg)

                args = re.split(F'({formats.ps1str})', argmatch[1])
                args = [list(string_unquote(a.strip())) for a in args[1::2]]

                def formatter(string):
                    buffer = []
                    for k, part in enumerate(re.split(R'(\{\d+\})', string)):
                        if k % 2 == 0:
                            if part:
                                buffer.append(part)
                            continue
                        try:
                            index = int(part[1:-1])
                            arg = args[index]
                        except IndexError as IE:
                            raise IndexError(F'only found {len(args)} arguments and format sequence {index}, aborting.') from IE

                        it = iter(arg)
                        buffer.append(next(it))

                        if len(arg) > 1:
                            yield ''.join(buffer)
                            buffer = []
                            for last, part in lookahead(it):
                                if last:
                                    buffer.append(part)
                                    break
                                yield part

                    yield ''.join(buffer)

                try:
                    result = string_apply(string[0], formatter)
                except IndexError:
                    continue

                data = data[:string.start()] + result + data[argmatch.end() + string.end():]
                repeat = True
                break

        return data

class deob_ps1_invoke

This unit is implemented in refinery.units.obfuscation.ps1.invoke and has the following commandline Interface:

usage: deob-ps1-invoke [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_invoke(Deobfuscator):
    def deobfuscate(self, data):
        strlit = Ps1StringLiterals(data)

        @strlit.outside
        def invrepl1(m): return m[1] + m[3]

        data = re.sub(
            R'''(\.|::)'''                    # preceeded by dot or namespace delimiter
            R'''(['"])(\w{1,200})\2'''        # quoted string (actually a method name)
            R'''(?=[\s\(\.\,\;\+\-])''',      # only if followed by certain characters
            invrepl1, data                    # remove quotes around symbol
        )

        @strlit.outside
        def invrepl2(m): return m[1] + '('

        data = re.sub(
            '\\s{0,5}'.join([
                '[.&]', '(\\(',               # sourcing operator
                '(?:gcm|get-command)', ')?',  # potentially a get-command
                '([\'"])([-a-z]{1,100})\\2'   # string enclosing a command
                '(?(1)\\s{0,5}\\)|)',         # closing bracket for get-command
            ]), '\\3', data, flags=re.IGNORECASE
        )
        data = re.sub(
            R'''(\w{1,200})\.Invoke\s*\(''',
            invrepl2, data,
            flags=re.IGNORECASE
        )

        return data

class deob_ps1_secstr (*a)

This unit is implemented in refinery.units.obfuscation.ps1.securestring and has the following commandline Interface:

usage: deob-ps1-secstr [-h] [-L] [-Q] [-0] [-v] [a [a ...]]

positional arguments:
  a

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_secstr(Deobfuscator):
    def __init__(self, *a, **kw):
        super().__init__(*a, **kw)

        self._pack = pack()
        self._secstr = secstr()

        self._pattern = re.compile(
            R'\s{{0,20}}'.join([
                R'''(['"])({b})\1''',
                R'\|', R'\.?', R'&?',
                R'''(['"]?)ConvertTo-SecureString\3''',
                R'-ke?y?',
                R'''(\(?)({a}|{i}\s{{0,20}}\.\.\s{{0,20}}{i})''',
                R'((?:\)\s{{0,20}}){{0,10}})?'
            ]).format(
                b=formats.b64,
                a=formats.intarray,
                i=formats.integer
            ),
            flags=re.IGNORECASE | re.DOTALL
        )

    def _decrypt_block(self, data, match):
        if '..' in match[5]:
            a, b = [int(x.strip(), 0) for x in match[5].split('..')]
            key = range(min(a, b), max(a, b) + 1)
            if a > b:
                key = reversed(key)
            self._secstr.args.key = bytes(bytearray(key))
        else:
            self._secstr.args.key = self._pack(match[5].encode(self.codec))
        decoded = self._secstr(match[2].encode(self.codec))
        decoded = decoded.decode(self.codec)
        result = F'\n\n{decoded}\n\n'
        brackets = match[6].count(')')
        start = match.start()
        if match[4]:
            brackets -= 1
        if brackets <= 0:
            if brackets < 0:
                result += ')'
            return start, result
        while brackets:
            start -= 1
            if data[start] == '(':
                brackets -= 1
            if data[start] == ')':
                brackets += 1
        return start, result

    def deobfuscate(self, data):
        while True:
            match = self._pattern.search(data)
            if not match:
                break
            start, result = self._decrypt_block(data, match)
            data = data[:start] + result + data[match.end():]
        return data

class deob_ps1_stringreplace

This unit is implemented in refinery.units.obfuscation.ps1.stringreplace and has the following commandline Interface:

usage: deob-ps1-stringreplace [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_stringreplace(Deobfuscator):

    _SENTINEL = re.compile((
        R'(?i)[\'"]\s*'               # end of haystack string
        R'(-c|-i|-|\.)replace'        # the replace call
        R'([\(\s]*)({s})([\)\s]*),'   # needle (with brackets)
        R'([\(\s]*)({s})([\)\s]*)'    # insert (with brackets)
    ).format(s=formats.ps1str), flags=re.IGNORECASE)

    def deobfuscate(self, data):
        repeat = True
        strlit = Ps1StringLiterals(data)

        while repeat:
            repeat = False
            needle = None

            for match in self._SENTINEL.finditer(data):
                k = strlit.get_container(match.start())
                if k is None:
                    continue
                offset, end = strlit.ranges[k]
                if match.start() != end - 1:
                    continue
                string = data[offset:end]
                pf, bl1, needle, bl2, br1, insert, br2 = match.groups()
                end = match.end()
                case = '' if pf[0] in '.c' else '(?i)'
                bl = bl1.count('(') - bl2.count(')')
                br = br2.count(')') - br1.count('(')
                if pf[0] == '.':
                    bl -= 1
                    br -= 1
                if bl != 0 or br < 0:
                    continue
                needle = list(string_unquote(needle))
                if len(needle) > 1:
                    continue

                needle = needle[0]
                head, *body = string_unquote(insert)

                self.log_info('replacing', needle, 'by', insert)

                if not body:
                    def perform_replacement(string):
                        return re.sub(F'{case}{re.escape(needle)}', lambda _: head, string)
                else:
                    *body, tail = body
                    def perform_replacement(string): # noqa
                        parts = re.split(F'{case}{re.escape(needle)}', string)
                        if len(parts) == 1:
                            yield string
                            return
                        it = iter(parts)
                        yield next(it) + head
                        yield from body
                        for last, part in lookahead(it):
                            if last:
                                yield tail + part
                            else:
                                yield tail + part + head
                                yield from body

                replaced = string_apply(string, perform_replacement) + (br * ')')
                strlit.ranges[k] = offset, offset + len(replaced) - br
                strlit.ranges[k + 1: k + 3] = []
                strlit.shift(len(replaced) + offset - end, k + 1)
                data = data[:offset] + replaced + data[end:]
                repeat = True
                break

        return data

class deob_ps1_typecast

This unit is implemented in refinery.units.obfuscation.ps1.typecast and has the following commandline Interface:

usage: deob-ps1-typecast [-h] [-L] [-Q] [-0] [-v]

Replaces sequences like [Char]120 to their string representation, in this case the string "x".

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_typecast(Deobfuscator):
    """
    Replaces sequences like [Char]120 to their string representation, in this
    case the string "x".
    """

    def deobfuscate(self, data):
        strlit = Ps1StringLiterals(data)

        @strlit.outside
        def strip_typecast(m): return m[1]

        data = re.sub(
            FR'\[(?:string|char\[\])\]\s*({formats.ps1str!s})',
            strip_typecast,
            data,
            flags=re.IGNORECASE
        )

        @strlit.outside
        def char_literal(match):
            c = chr(int(match[1]))
            if c == "'":
                return '''"'"'''
            return F"'{c}'"

        data = re.sub(
            R'\[char\]\s*0*(0x[0-9a-f]+|\d+)',
            char_literal,
            data,
            flags=re.IGNORECASE
        )

        def char_array(match):
            result = bytes(int(x, 0) for x in match[1].split(','))
            try:
                result = result.decode('ascii')
                if not all(x in string.printable or x.isspace() for x in result):
                    raise ValueError
            except ValueError:
                return match[0]
            else:
                return string_quote(result)

        data = re.sub(
            R'\s*'.join([
                R'\[char\[\]\]',
                R'\((',
                R'(?:\s*(?:0x[0-9a-f]+|\d+)\s*,)+',
                R'(?:0x[0-9a-f]+|\d+)',
                R')\)'
            ]),
            char_array,
            data,
            flags=re.IGNORECASE
        )

        return data

class deob_ps1_uncurly

This unit is implemented in refinery.units.obfuscation.ps1.uncurly and has the following commandline Interface:

usage: deob-ps1-uncurly [-h] [-L] [-Q] [-0] [-v]

PowerShell deobfuscation that removes superfluous curly braces around variable names that do not
require it, i.e. ${variable} is transformed to just $variable.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_ps1_uncurly(Deobfuscator):
    """
    PowerShell deobfuscation that removes superfluous curly braces around variable
    names that do not require it, i.e. `${variable}` is transformed to just `$variable`.
    """

    _SENTINEL = re.compile(R'\$\{(\w+)\}')

    def deobfuscate(self, data):
        strlit = Ps1StringLiterals(data)
        @strlit.outside
        def strip(m): return F'${m[1]}'
        return self._SENTINEL.sub(strip, data)

class deob_vba (timeout=100)

This unit is implemented in refinery.units.obfuscation.vba.all and has the following commandline Interface:

usage: deob-vba [-h] [-L] [-Q] [-0] [-v] [-t TIMEOUT]

optional arguments:
  -t, --timeout TIMEOUT  Maximum number of iterations; the default is 100.

generic options:
  -h, --help             Show this help message and exit.
  -L, --lenient          Allow partial results as output.
  -Q, --quiet            Disables all log output.
  -0, --devnull          Do not produce any output.
  -v, --verbose          Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba(IterativeDeobfuscator):

    _SUBUNITS = [sub() for sub in [
        deob_vba_comments,
        deob_vba_brackets,
        deob_vba_char_function,
        deob_vba_concat,
        deob_vba_arithmetic,
        deob_vba_constants,
        deob_vba_dummy_variables,
        deob_vba_stringreplace,
        deob_vba_stringreverse,
    ]]

    def deobfuscate(self, data):
        for u in self._SUBUNITS:
            u.log_level = self.log_level
        for unit in self._SUBUNITS:
            self.log_debug(lambda: F'invoking {unit.name}')
            checkpoint = hash(data)
            data = unit.deobfuscate(data)
            if checkpoint != hash(data) and not self.log_debug('data has changed.'):
                self.log_info(F'used {unit.name}')
        return re.sub(R'[\r\n]+', '\n', data)

class deob_vba_arithmetic

This unit is implemented in refinery.units.obfuscation.vba.arithmetic and has the following commandline Interface:

usage: deob-vba-arithmetic [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba_arithmetic(Deobfuscator):
    def deobfuscate(self, data):
        strings = StringLiterals(formats.vbastr, data)

        def vba_int_eval(match: re.Match[str]) -> str:
            s = match[0].lower()
            if not s.startswith('&'):
                return s
            t, s = s[1], s[2:].rstrip('&')
            if t == 'h':
                return str(int(s, 16))
            if t == 'b':
                return str(int(s, 2))
            if t == 'o':
                return str(int(s, 8))

        @strings.outside
        def evaluate(match: re.Match[str]):
            expression = match[0]
            expression = expression.strip()
            if not any(c.isdigit() for c in expression):
                return expression
            expression = re.sub(str(formats.vbaint), vba_int_eval, expression)
            brackets = 0
            positions = []
            ok = True
            head = tail = rest = ''
            for end, character in enumerate(expression):
                if character == '(':
                    brackets += 1
                    positions.append(end)
                    continue
                if character == ')':
                    brackets -= 1
                    if brackets < 0:
                        expression, tail = expression[:end], expression[end:]
                        break
                    else:
                        positions.pop()
                    if brackets == 0 and expression[0] == '(':
                        expression, rest = expression[:end + 1], expression[end + 1:]
                        break
            if expression.isdigit():
                return match[0]
            if brackets > 0:
                pos = positions[~0] + 1
                head = expression[:pos]
                expression = expression[pos:]
            try:
                result = str(_cautious_vba_eval(expression + rest))
            except Exception:
                ok = False
            else:
                rest = ''
            if not ok and rest:
                try:
                    result = str(_cautious_vba_eval(expression))
                except Exception:
                    expression += rest
                else:
                    ok = True
            if not ok:
                result = expression
                self.log_info(F'error trying to parse arithmetic expression at offset {match.start()}: ({expression})')
            else:
                if expression.startswith('(') and expression.endswith(')'):
                    result = F'({result})'
            if tail:
                tail = self.deobfuscate(tail)
            return F'{head}{result}{rest}{tail}'

        pattern = re.compile(R'(?:{i}|{f}|[-+(])(?:[^\S\r\n]{{0,20}}(?:{i}|{f}|[-%|&~<>()+/*^]))+'.format(
            i=str(formats.vbaint), f=str(formats.float)))

        return pattern.sub(evaluate, data)

class deob_vba_brackets

This unit is implemented in refinery.units.obfuscation.vba.brackets and has the following commandline Interface:

usage: deob-vba-brackets [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba_brackets(Deobfuscator):
    _SENTINEL = re.compile(
        RF'''(?<![\w"']{{2}})'''  # this may be a function call
        RF'''\(\s*({formats.vbaint}|{formats.vbastr}|{formats.float})\s*(\S)''',
        flags=re.IGNORECASE
    )

    def deobfuscate(self, data):
        strlit = StringLiterals(formats.vbastr, data)
        repeat = True

        @strlit.outside
        def replacement(match):
            nonlocal repeat
            if match[2] == ')':
                repeat = True
                return match[1]

        while repeat:
            repeat = False
            data = self._SENTINEL.sub(replacement, data)

        return data

class deob_vba_char_function

This unit is implemented in refinery.units.obfuscation.vba.char and has the following commandline Interface:

usage: deob-vba-char-function [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba_char_function(Deobfuscator):
    def deobfuscate(self, data):
        strings = StringLiterals(formats.vbastr, data)

        @strings.outside
        def evaluate_char_function(match: re.Match[str]):
            try:
                c = chr(int(match[1]))
            except ValueError:
                return match[0]
            if c == '"':
                return '""""'
            if c == '\\':
                return '"\\"'
            c = repr(c)[1:-1]
            if len(c) > 1:
                return match[0]
            return '"{}"'.format(c)

        return re.sub(R'(?i)\bchrw?\s*\(\s*(\d+)\s*\)', evaluate_char_function, data)

class deob_vba_chr_literals

This unit is implemented in refinery.units.obfuscation.vba.vba and has the following commandline Interface:

usage: deob-vba-chr-literals [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba_chr_literals(Unit):
    def process(self, data):
        def _chr(m):
            code = int(m[1], 0)
            if code == 34:
                return B'""""'
            return B'"%s"' % chr(code).encode('unicode_escape')
        data = re.sub(BR'Chr\((\d+x?\d+)\)', _chr, data, flags=re.IGNORECASE)
        data = re.sub(BR'"\s*\&\s*"', B'', data)
        return data

class deob_vba_comments

This unit is implemented in refinery.units.obfuscation.vba.comments and has the following commandline Interface:

usage: deob-vba-comments [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba_comments(Deobfuscator):
    def deobfuscate(self, data):
        return re.sub(R"(?im)^\s{0,20}(?:'|rem\b|dim\b).*(?:\Z|$\n\r?)", '', data)

class deob_vba_concat (timeout=100)

This unit is implemented in refinery.units.obfuscation.vba.concat and has the following commandline Interface:

usage: deob-vba-concat [-h] [-L] [-Q] [-0] [-v] [-t TIMEOUT]

optional arguments:
  -t, --timeout TIMEOUT  Maximum number of iterations; the default is 100.

generic options:
  -h, --help             Show this help message and exit.
  -L, --lenient          Allow partial results as output.
  -Q, --quiet            Disables all log output.
  -0, --devnull          Do not produce any output.
  -v, --verbose          Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba_concat(IterativeDeobfuscator):
    _SENTINEL = re.compile(R'''"\s*(\++|&)\s*"''')

    def deobfuscate(self, data):

        def concat(data):
            strlit = StringLiterals(formats.vbastr, data)
            repeat = True
            while repeat:
                for match in self._SENTINEL.finditer(data):
                    a, b = match.span()
                    a = strlit.get_container(a)
                    if a is None:
                        continue
                    b = strlit.get_container(b)
                    if b is None or b != a + 1:
                        continue
                    _, a = strlit.ranges[a]
                    b, c = strlit.ranges[b]
                    yield data[:a - 1] + data[b + 1:c]
                    data = data[c:]
                    strlit.update(data)
                    break
                else:
                    repeat = False
            yield data

        return ''.join(concat(data))

class deob_vba_constants

This unit is implemented in refinery.units.obfuscation.vba.constants and has the following commandline Interface:

usage: deob-vba-constants [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba_constants(Deobfuscator):
    def deobfuscate(self, data):
        codelines = data.splitlines(keepends=True)
        constants = {}
        constline = {}
        variables = set()
        for k, line in enumerate(codelines):
            match = re.match(R'(?im)^\s*(?:sub|function)\s*(\w+)', line)
            if match:
                variables.add(match[1])
                continue
            match = re.match(
                R'(?im)^(?:\s*const)?\s*(\w+)\s*=\s*({i}|{s})\s*(?:\'|rem|$)'.format(
                    s=formats.ps1str,
                    i=formats.integer
                ), line)
            if match is None or match[1] in variables:
                pass
            elif match[2] != constants.get(match[1], match[2]):
                self.log_debug(F'del {match[1]}')
                del constants[match[1]]
                del constline[match[1]]
                variables.add(match[1])
            else:
                self.log_debug(F'add {match[1]} = {match[2]}')
                constants[match[1]] = match[2]
                constline[match[1]] = k
        codelines = [line for k, line in enumerate(codelines) if k not in constline.values()]
        data = ''.join(codelines)
        for name, value in constants.items():
            data = re.sub(RF'\b{re.escape(name)!s}\b', lambda _: value, data)

        return data

class deob_vba_dummy_variables

This unit is implemented in refinery.units.obfuscation.vba.dummies and has the following commandline Interface:

usage: deob-vba-dummy-variables [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba_dummy_variables(Deobfuscator):
    def deobfuscate(self, data):
        lines = data.splitlines(keepends=False)
        names = collections.defaultdict(list)

        def might_be_used_in(name, line):
            # avoid finding the name within a string literal
            line = '""'.join(re.split(str(formats.ps1str), line))
            line = re.split(RF'\b{name}\b', line)
            try:
                L, R = line
            except ValueError:
                return False
            L = L.strip().lower()
            if L.startswith("'") or L.startswith('rem'):
                return False
            R = R.strip().lower()
            if R.startswith('=') and 'if' not in L:
                return False
            if L.startswith('dim'):
                return False
            return True

        pattern = re.compile(
            R'(?i)^\s{0,8}(?:const\s{1,8})?(\w+)\s{1,8}=\s{1,8}.*$'
        )

        for k, line in enumerate(lines):
            try:
                name = pattern.match(line)[1]
            except (AttributeError, TypeError):
                continue
            if re.search(r'\w+\(', line):
                # might be a function call
                continue
            names[name].append(k)

        for line in lines:
            while True:
                for name in names:
                    if might_be_used_in(name, line):
                        del names[name]
                        break
                else:
                    break

        return '\n'.join(line for k, line in enumerate(lines) if not any(
            k in rows for rows in names.values()))

class deob_vba_stringreplace

This unit is implemented in refinery.units.obfuscation.vba.stringreplace and has the following commandline Interface:

usage: deob-vba-stringreplace [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba_stringreplace(Deobfuscator):

    _SENTINEL = re.compile((
        R'(?i)\bReplace\s*\('  # the replace call
        R'\s*({s}),'           # haystack (with brackets)
        R'\s*({s}),'           # needle (with brackets)
        R'\s*({s})\s*\)'       # insert (with brackets)
    ).format(s=formats.vbastr), flags=re.IGNORECASE)

    def deobfuscate(self, data):
        strlit = StringLiterals(formats.vbastr, data)

        @strlit.outside
        def replacement(match: re.Match[str]):
            return string_quote(
                string_unquote(match[1]).replace(
                    string_unquote(match[2]),
                    string_unquote(match[3])
                )
            )

        return self._SENTINEL.sub(replacement, data)

class deob_vba_stringreverse

This unit is implemented in refinery.units.obfuscation.vba.stringreverse and has the following commandline Interface:

usage: deob-vba-stringreverse [-h] [-L] [-Q] [-0] [-v]

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deob_vba_stringreverse(Deobfuscator):

    _SENTINEL = re.compile((
        R'(?i)\bStrReverse\s*\('  # the reverse call
        R'\s*({s})\s*\)'          # string
    ).format(s=formats.vbastr), flags=re.IGNORECASE)

    def deobfuscate(self, data):
        strlit = StringLiterals(formats.vbastr, data)

        @strlit.outside
        def replacement(match: re.Match[str]):
            return string_quote(''.join(reversed(string_unquote(match[1]))))

        return self._SENTINEL.sub(replacement, data)

class des (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)

This unit is implemented in refinery.units.crypto.cipher.des and has the following commandline Interface:

usage: des [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-M N] key

DES encryption and decryption.

positional arguments:
  key                   The encryption key.

optional arguments:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, EAX,
                        ECB, OFB. By default, the CBC mode is used when an IV is is provided, and
                        ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of bits into which data is segmented. It must be a
                        multiple of 8. The default of 0 means that the block size will be used as
                        the segment size.
  -M, --mac-len N       Only for EAX, GCM, OCB, and CCM: Length of the authentication tag, in
                        bytes.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Allow partial results as output.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation; Specify twice to normalize (first decode, then
                        encode).

Expand source code Browse git

class des(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(DES)):
    """
    DES encryption and decryption.
    """
    pass

class des3 (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)

This unit is implemented in refinery.units.crypto.cipher.des3 and has the following commandline Interface:

usage: des3 [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-M N] key

3-DES encryption and decryption.

positional arguments:
  key                   The encryption key.

optional arguments:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, EAX,
                        ECB, OFB. By default, the CBC mode is used when an IV is is provided, and
                        ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of bits into which data is segmented. It must be a
                        multiple of 8. The default of 0 means that the block size will be used as
                        the segment size.
  -M, --mac-len N       Only for EAX, GCM, OCB, and CCM: Length of the authentication tag, in
                        bytes.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Allow partial results as output.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation; Specify twice to normalize (first decode, then
                        encode).

Expand source code Browse git

class des3(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(DES3)):
    """
    3-DES encryption and decryption.
    """
    pass

class deskd (size=8)

This unit is implemented in refinery.units.crypto.keyderive.deskd and has the following commandline Interface:

usage: deskd [-h] [-L] [-Q] [-0] [-v] [size]

Stands for "DES Key Derivation". It implements the same functionality as DES_string_to_key in
OpenSSL. It converts a string to an 8 byte DES key with odd byte parity, per FIPS specification.
This is not a modern key derivation function.

positional arguments:
  size           The number of bytes to generate, default is the maximum of 8.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class deskd(KeyDerivation):
    """
    Stands for "DES Key Derivation". It implements the same functionality as `DES_string_to_key` in OpenSSL. It
    converts a string to an 8 byte DES key with odd byte parity, per FIPS specification. This is not a modern
    key derivation function.
    """
    def __init__(self, size: Arg(help='The number of bytes to generate, default is the maximum of 8.') = 8):
        super().__init__(size=size, salt=None)

    def process(self, password):
        from Cryptodome.Cipher import DES
        from Cryptodome.Util.strxor import strxor

        key = bytearray(8)

        for i, j in enumerate(password):
            if ((i % 16) < 8):
                key[i % 8] ^= (j << 1) & 0xFF
            else:
                j = (((j << 4) & 0xf0) | ((j >> 4) & 0x0f))
                j = (((j << 2) & 0xcc) | ((j >> 2) & 0x33))
                j = (((j << 1) & 0xaa) | ((j >> 1) & 0x55))
                key[7 - (i % 8)] ^= j

        des_set_odd_parity(key)

        if password:
            n = len(password)
            password = password.ljust(n + 7 - ((n - 1) % 8), b'\0')
            des = DES.new(key, DES.MODE_ECB)
            for k in range(0, n, 8):
                key[:] = des.encrypt(strxor(password[k:k + 8], key))
            des_set_odd_parity(key)

        if self.args.size > 8:
            raise RefineryPartialResult('can provide at most 8 bytes.', partial=key)

        return key[:self.args.size]

class dexstr

This unit is implemented in refinery.units.formats.dexstr and has the following commandline Interface:

usage: dexstr [-h] [-L] [-Q] [-0] [-v]

Extract strings from DEX (Dalvik Executable) files.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class dexstr(Unit):
    """
    Extract strings from DEX (Dalvik Executable) files.
    """
    def process(self, data):
        dex = DexFile(data)
        for string in dex.read_strings():
            yield string.encode(self.codec)

class dnblob

This unit is implemented in refinery.units.formats.pe.dotnet.dnblob and has the following commandline Interface:

usage: dnblob [-h] [-L] [-Q] [-0] [-v]

Extracts all blobs defined in the #Blob stream of .NET executables.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class dnblob(Unit):
    """
    Extracts all blobs defined in the `#Blob` stream of .NET executables.
    """
    def process(self, data):
        header = DotNetHeader(data, parse_resources=False)
        for blob in header.meta.Streams.Blob.values():
            yield blob

class dncfx

This unit is implemented in refinery.units.formats.pe.dotnet.dncfx and has the following commandline Interface:

usage: dncfx [-h] [-L] [-Q] [-0] [-v]

Extracts the encrypted strings from ConfuserX protected .NET execuctables. Each decrypted string
is returned as a single output.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class dncfx(Unit):
    """
    Extracts the encrypted strings from ConfuserX protected .NET execuctables.
    Each decrypted string is returned as a single output.
    """
    _PATTERN_ARRAY_INIT = (
        BR'(\x1F.|\x20....)'      # load size of a chunk
        BR'\x8D.\x00\x00\x01'     # create a UInt32 array
        BR'\x25'                  # dup
        BR'\xD0%s\x04'            # ldtoken: RVA of array data
        BR'\x28.\x00\x00.'        # call to InitializeArray
    )

    def process(self, data):
        header = DotNetHeader(data, parse_resources=False)
        decompressor = lzma()

        class IntegerAssignment:
            def __init__(self, match):
                self.offset = match.start()
                self.value, = struct.unpack('<I', match[1])

        def get_size(match):
            ins = match[1]
            fmt = '<B' if ins[0] == 0x1F else '<I'
            result, = struct.unpack(fmt, ins[-struct.calcsize(fmt):])
            return result

        potential_seeds = [
            IntegerAssignment(m)
            for m in re.finditer(br'\x20(....)', data, re.DOTALL)
        ]

        for entry in header.meta.RVAs:
            offset = header.pe.get_offset_from_rva(entry.RVA)
            index = struct.pack('<I', entry.Field.Index)
            strings_found = 0
            for match in re.finditer(self._PATTERN_ARRAY_INIT % re.escape(index[:3]), data, flags=re.DOTALL):
                ms = match.start()

                def sortkey(t):
                    weight = abs(t.offset - ms)
                    if t.offset < ms:
                        # this weights assignments after the array initialization down, but still
                        # prefers them over assignments that are further away than 2kb
                        weight += 2000
                    return weight

                size = get_size(match)

                if size % 0x10 or size > 10000:
                    continue

                self.log_debug(F'found RVA {entry.Field.Index} initialized with length {size}.')
                potential_seeds.sort(key=sortkey)

                for seed in potential_seeds[1:400]:
                    # the first potential_seed will always be the assignment of the size variable
                    ciphertext = data[offset:offset + size * 4]
                    key = self._xs64star(seed.value)
                    key = chunks.pack(key, 4) + ciphertext[:-0x40]
                    decrypted = strxor(key, ciphertext)
                    try:
                        decompressed = decompressor(decrypted)
                    except Exception as e:
                        self.log_debug(
                            F'decompression failed for seed {seed.value:08X} at offset {seed.offset:08X}: {e}')
                        continue
                    else:
                        self.log_info(
                            F'decompression worked for seed {seed.value:08X} at offset {seed.offset:08X}.')
                    if len(decompressed) < 0x100:
                        continue
                    for string in self._extract_strings(decompressed):
                        strings_found += 1
                        yield string
                    if strings_found > 10:
                        break

    def _xs64star(self, state):
        for i in range(16):
            state ^= (state >> 12) & 0xFFFFFFFF
            state ^= (state << 25) & 0xFFFFFFFF
            state ^= (state >> 27) & 0xFFFFFFFF
            yield state & 0xFFFFFFFF

    def _extract_strings(self, blob):
        reader = StreamReader(blob)
        while reader.tell() < len(blob):
            try:
                size = reader.expect(UInt32)
                string = reader.expect(StringPrimitive, size=size, codec='UTF8', align=4)
            except ParserEOF:
                return
            if string:
                yield string.encode(self.codec)

class dnds (dereference=True, encode=None, digest=None)

This unit is implemented in refinery.units.formats.pe.dotnet.dnds and has the following commandline Interface:

usage: dnds [-h] [-L] [-Q] [-0] [-v] [-r] [-e UNIT | -d HASH]

Stands for "DotNet DeSerialize": Expects data that has been serialized using the .NET class
"BinaryFormatter". The output is a representation of the deserialized data in JSON format.

optional arguments:
  -r, --keep-references  Do not resolve Object references in serialized data.
  -e, --encode UNIT      Select an encoder unit used to represent binary data in the JSON output.
                         Available are: hex, esc, url, b64.
  -d, --digest HASH      Select a hashing algorithm to digest binary data; instead of the data,
                         only the hash will be displayed. The available algorithms are: md5,
                         crc32, sha1, sha256, sha512.

generic options:
  -h, --help             Show this help message and exit.
  -L, --lenient          Allow partial results as output.
  -Q, --quiet            Disables all log output.
  -0, --devnull          Do not produce any output.
  -v, --verbose          Specify up to two times to increase log level.

Expand source code Browse git

class dnds(JSONEncoderUnit):
    """
    Stands for "DotNet DeSerialize": Expects data that has been serialized using the .NET class
    "BinaryFormatter". The output is a representation of the deserialized data in JSON format.
    """

    def __init__(
        self, dereference: Arg.Switch('-r', '--keep-references', off=True,
            help='Do not resolve Object references in serialized data.') = True,
        encode=None, digest=None
    ):
        super().__init__(encode=encode, digest=digest, dereference=dereference)

    def process(self, data):
        self.log_debug('initializing parser, will fail on malformed stream')
        bf = BinaryFormatterParser(
            data,
            keep_meta=True,
            dereference=self.args.dereference,
            ignore_errors=not self.log_debug(),
        )

        return self.to_json([
            {
                'Type': repr(record),
                'Data': record
            } for record in bf
        ])

class dnfields (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.pe.dotnet.dnfields and has the following commandline Interface:

usage: dnfields [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME]
                [path [path ...]]

This unit can extract data from constant field variables in classes of .NET executables. Since
the .NET header stores only the offset and not the size of constant fields, heuristics are used
to search for opcode sequences that load the data and additional heuristics are used to guess the
size of the data type.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class dnfields(PathExtractorUnit):
    """
    This unit can extract data from constant field variables in classes of .NET
    executables. Since the .NET header stores only the offset and not the size of
    constant fields, heuristics are used to search for opcode sequences that load
    the data and additional heuristics are used to guess the size of the data
    type.
    """
    _SIZEMAP = {
        '^s?byte$'       : 1,
        '^s?char$'       : 2,
        '^[us]?int.?16$' : 2,
        '^[us]?int.?32$' : 4,
        '^[us]?int.?64$' : 8,
    }

    def _guess_field_info(self, tables, data, t) -> FieldInfo:
        pattern = (
            BR'(\x20....|\x1F.)'                # ldc.i4  count
            BR'\x8D(...)([\x01\x02])'           # newarr  col|row
            BR'\x25'                            # dup
            BR'\xD0\x%02x\x%02x\x%02x\x04'      # ldtoken t
            BR'(?:.{0,12}'                      # ...
            BR'\x80(...)\x04)?' % (             # stsfld variable
                (t >> 0x00) & 0xFF,
                (t >> 0x08) & 0xFF,
                (t >> 0x10) & 0xFF
            )
        )
        for match in re.finditer(pattern, data, flags=re.DOTALL):
            count, j, r, name = match.groups()
            count, j, r = struct.unpack('<LLB', B'%s%s\0%s' % (count[1:].ljust(4, B'\0'), j, r))
            if name:
                try:
                    name = struct.unpack('<L', B'%s\0' % name)
                    name = name[0]
                    name = tables[4][name - 1].Name
                except Exception as E:
                    self.log_info(F'attempt to parse field name failed: {E!s}')
                    name = None
            element = tables[r][j - 1]
            for pattern, size in self._SIZEMAP.items():
                if re.match(pattern, element.TypeName, flags=re.IGNORECASE):
                    return FieldInfo(element.TypeName, count, size, name)

    def unpack(self, data):
        header = DotNetHeader(data, parse_resources=False)
        tables = header.meta.Streams.Tables
        fields = tables.FieldRVA
        if not fields:
            return
        iwidth = len(str(len(fields)))
        rwidth = max(len(F'{field.RVA:X}') for field in fields)
        rwidth = max(rwidth, 4)
        remaining_field_indices = set(range(len(tables.Field)))

        for k, rv in enumerate(fields):
            _index = rv.Field.Index
            field = tables.Field[_index - 1]
            remaining_field_indices.discard(_index - 1)
            fname = field.Name
            ftype = None
            if len(field.Signature) == 2:
                # Crude signature parser for non-array case. Reference:
                # https://www.codeproject.com/Articles/42649/NET-File-Format-Signatures-Under-the-Hood-Part-1
                # https://www.codeproject.com/Articles/42655/NET-file-format-Signatures-under-the-hood-Part-2
                guess = {
                    0x03: FieldInfo('Char',   1, 1, None),  # noqa
                    0x04: FieldInfo('SByte',  1, 1, None),  # noqa
                    0x05: FieldInfo('Byte',   1, 1, None),  # noqa
                    0x06: FieldInfo('Int16',  1, 2, None),  # noqa
                    0x07: FieldInfo('UInt16', 1, 2, None),  # noqa
                    0x08: FieldInfo('Int32',  1, 4, None),  # noqa
                    0x09: FieldInfo('UInt32', 1, 4, None),  # noqa
                    0x0A: FieldInfo('Int64',  1, 8, None),  # noqa
                    0x0B: FieldInfo('UInt64', 1, 8, None),  # noqa
                    0x0C: FieldInfo('Single', 1, 4, None),  # noqa
                    0x0D: FieldInfo('Double', 1, 8, None),  # noqa
                }.get(field.Signature[1], None)
            else:
                guess = self._guess_field_info(tables, data, _index)
            if guess is None:
                self.log_debug(lambda: F'field {k:0{iwidth}d} name {field.Signature}: unable to guess type information')
                continue
            totalsize = guess.count * guess.size
            if guess.name is not None:
                fname = guess.name
            if not fname.isprintable():
                fname = F'F{rv.RVA:0{rwidth}X}'
            ext = ftype = guess.type.lower()
            if guess.count > 1:
                ftype += F'[{guess.count}]'
            self.log_info(lambda: F'field {k:0{iwidth}d} at RVA 0x{rv.RVA:04X} of type {guess.type}, count: {guess.count}, name: {fname}')
            offset = header.pe.get_offset_from_rva(rv.RVA)
            yield UnpackResult(
                F'{fname}.{ext}',
                lambda t=offset, s=totalsize: data[t:t + s],
                name=fname,
                type=ftype,
            )

        for _index in remaining_field_indices:
            field = tables.Field[_index]
            index = _index + 1
            name = field.Name
            if field.Flags.HasFieldRVA:
                self.log_warn(F'field {name} has RVA flag set, but no RVA was found')
            token = index.to_bytes(3, 'little')
            values = set()
            for match in re.finditer((
                BR'\x72(?P<token>...)\x70'          # ldstr
                BR'(?:\x6F(?P<function>...)\x0A)?'  # call GetBytes
                BR'\x80%s\x04'                      # stsfld
            ) % re.escape(token), data, re.DOTALL):
                md = match.groupdict()
                fn_token = md.get('function')
                fn_index = fn_token and int.from_bytes(fn_token, 'little') or None
                if fn_index is not None:
                    fn_name = tables.MemberRef[fn_index].Name
                    if fn_name != 'GetBytes':
                        self.log_info(F'skipping string assignment passing through call to {fn_name}')
                        continue
                k = int.from_bytes(md['token'], 'little')
                values.add(header.meta.Streams.US[k].encode(self.codec))
            if not values:
                continue
            if len(values) == 1:
                yield UnpackResult(
                    F'{name}.str',
                    next(iter(values)),
                    name=name,
                    type='string'
                )

class dnhdr (resources=False, encode=None, digest=None)

This unit is implemented in refinery.units.formats.pe.dotnet.dnhdr and has the following commandline Interface:

usage: dnhdr [-h] [-L] [-Q] [-0] [-v] [-r] [-e UNIT | -d HASH]

Expects data that has been formatted with the BinaryFormatter class. The output is a
representation of the deserialized data in JSON format.

optional arguments:
  -r, --resources    Also parse .NET resources.
  -e, --encode UNIT  Select an encoder unit used to represent binary data in the JSON output.
                     Available are: hex, esc, url, b64.
  -d, --digest HASH  Select a hashing algorithm to digest binary data; instead of the data, only
                     the hash will be displayed. The available algorithms are: md5, crc32, sha1,
                     sha256, sha512.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class dnhdr(JSONEncoderUnit):
    """
    Expects data that has been formatted with the `BinaryFormatter` class. The
    output is a representation of the deserialized data in JSON format.
    """
    def __init__(
        self,
        resources: Arg.Switch('-r', '--resources', help='Also parse .NET resources.') = False,
        encode=None, digest=None
    ):
        super().__init__(encode=encode, digest=digest, resources=resources)

    def process(self, data):
        dn = DotNetHeader(data, parse_resources=self.args.resources)
        dn = {
            'Head': dn.head,
            'Meta': dn.meta
        }

        if self.args.resources:
            dn['RSRC'] = dn.resources

        return self.to_json(dn)

class dnmr (*paths, list=False, join_path=False, drop_path=False, exact=False, fuzzy=0, regex=False, path=b'name', raw=False)

This unit is implemented in refinery.units.formats.pe.dotnet.dnmr and has the following commandline Interface:

usage: dnmr [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-e | -z] [-r] [-P NAME] [-w]
            [path [path ...]]

Extracts subfiles from .NET managed resources.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -e, --exact      Path patterns never match on substrings.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "name".
  -w, --raw        Do not deserialize the managed resource entry data.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class dnmr(PathExtractorUnit):
    """
    Extracts subfiles from .NET managed resources.
    """
    def __init__(
        self, *paths, list=False, join_path=False, drop_path=False, exact=False, fuzzy=0, regex=False, path=b'name',
        raw: Arg.Switch('-w', help='Do not deserialize the managed resource entry data.') = False
    ):
        super().__init__(
            *paths,
            list=list,
            join_path=join_path,
            drop_path=drop_path,
            path=path,
            raw=raw,
            fuzzy=fuzzy,
            exact=exact,
            regex=regex,
        )

    def unpack(self, data):
        try:
            managed = NetStructuredResources(data)
        except NoManagedResource:
            managed = None
        if not managed:
            raise RefineryPartialResult('no managed resources found', partial=data)
        for entry in managed:
            if entry.Error:
                self.log_warn(F'entry {entry.Name} carried error message: {entry.Error}')
            data = entry.Data
            if not self.args.raw:
                if isinstance(entry.Value, str):
                    data = entry.Value.encode('utf-16le')
                elif isbuffer(entry.Value):
                    data = entry.Value
            yield UnpackResult(entry.Name, data)

class dnrc (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.pe.dotnet.dnrc and has the following commandline Interface:

usage: dnrc [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]]

Extracts all .NET resources whose name matches any of the given patterns and outputs them. Use
the dnmr unit to extract subfiles from managed .NET resources.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class dnrc(PathExtractorUnit):
    """
    Extracts all .NET resources whose name matches any of the given patterns
    and outputs them. Use the `refinery.units.formats.pe.dotnet.dnmr` unit to
    extract subfiles from managed .NET resources.
    """
    def unpack(self, data):
        header = DotNetHeader(data)

        if not header.resources:
            if self.args.list:
                return
            raise ValueError('This file contains no resources.')

        for resource in header.resources:
            yield UnpackResult(resource.Name, resource.Data)

class dnsdomain (min=1, max=None, len=None, stripspace=False, duplicates=False, longest=False, take=None)

This unit is implemented in refinery.units.pattern.dnsdomain and has the following commandline Interface:

usage: dnsdomain [-h] [-L] [-Q] [-0] [-v] [-n N] [-m N] [-e N] [-x] [-r] [-l] [-t N]

Extracts domain names in the format as they appear in DNS requests. This can be used as a quick
and dirty way to extract domains from PCAP files, for example.

optional arguments:
  -n, --min N       Matches must have length at least N.
  -m, --max N       Matches must have length at most N.
  -e, --len N       Matches must be of length N.
  -x, --stripspace  Strip all whitespace from input data.
  -r, --duplicates  Yield every (transformed) Match, even when it was found before.
  -l, --longest     Sort results by length.
  -t, --take N      Return only the first N occurrences in order of appearance.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Allow partial results as output.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class dnsdomain(PatternExtractorBase):
    """
    Extracts domain names in the format as they appear in DNS requests. This
    can be used as a quick and dirty way to extract domains from PCAP files,
    for example.
    """

    _DOMAIN_CHARACTERS = (
        B'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        B'abcdefghijklmnopqrstuvwxyz'
        B'0123456789-_'
    )

    _DOMAIN_PATTERN = BR'(?:%s){1,20}(?:%s)\b' % (_lps(0xFF), _lps(25))

    def process(self, data):

        def transform(match):
            match = bytearray(match[0])
            pos = 0
            while pos < len(match):
                length = match[pos]
                match[pos] = 0x2E
                if len(match) < length + pos:
                    return None
                if any(x not in self._DOMAIN_CHARACTERS for x in match[pos + 1 : pos + length]):
                    return None
                pos += 1 + length
            return match[1:]

        yield from self.matches_filtered(memoryview(data), self._DOMAIN_PATTERN, transform)

class dnsfx (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.pe.dotnet.dnsfx and has the following commandline Interface:

usage: dnsfx [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]]

Extracts files from .NET single file applications.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class dnsfx(PathExtractorUnit):
    """
    Extracts files from .NET single file applications.
    """
    _SIGNATURE = bytes([
        # 32 bytes represent the bundle signature: SHA-256 for '.net core bundle'
        0x8b, 0x12, 0x02, 0xb9, 0x6a, 0x61, 0x20, 0x38,
        0x72, 0x7b, 0x93, 0x02, 0x14, 0xd7, 0xa0, 0x32,
        0x13, 0xf5, 0xb9, 0xe6, 0xef, 0xae, 0x33, 0x18,
        0xee, 0x3b, 0x2d, 0xce, 0x24, 0xb3, 0x6a, 0xae
    ])

    def unpack(self, data):
        reader = StreamReader(data)
        reader.seek(self._find_bundle_manifest_offset(data))

        major_version = reader.expect(UInt32)
        minor_version = reader.expect(UInt32)
        self.log_info(F'version {major_version}.{minor_version}')

        count = reader.expect(UInt32)
        bhash = reader.expect(StringPrimitive)
        self.log_info(F'bundle {bhash} contains {count} files')

        if major_version >= 2:
            reader.expect(UInt64) # depsOffset
            reader.expect(UInt64) # depsSize
            reader.expect(UInt64) # runtimeConfigOffset
            reader.expect(UInt64) # runtimeConfigSize
            reader.expect(UInt64) # flags

        for _ in range(count):
            try:
                offset = reader.expect(UInt64)
                size = reader.expect(UInt64)
                compressed_size = 0
                if major_version >= 6:
                    compressed_size = reader.expect(UInt64)
                type = reader.expect(Byte)
                path = reader.expect(StringPrimitive)

                def _logmsg():
                    _log = F'read item at offset 0x{offset:08X}, type 0x{type:02X}, size {SizeInt(size)!r}'
                    if compressed_size:
                        return F'{_log}, compressed to size {SizeInt(compressed_size)!r}'
                    return F'{_log}, uncompressed'

                self.log_debug(_logmsg)

                with reader.checkpoint():
                    reader.seek(offset)
                    if compressed_size:
                        item_data = reader.read(compressed_size) | zl | bytearray
                    else:
                        item_data = reader.read(size)

                yield UnpackResult(path, item_data)
            except ParserEOF:
                self.log_warn('unexpected EOF while parsing bundle, terminating')
                break

    def _find_bundle_manifest_offset(self, data: bytearray) -> int:
        bundle_sig_offset = data.find(self._SIGNATURE, 0)
        if bundle_sig_offset < 0:
            raise ValueError('Cannot find valid Bundle Manifest offset. Is this a .NET Bundle?')
        return int.from_bytes(data[bundle_sig_offset - 8:bundle_sig_offset], 'little')

    @classmethod
    def handles(self, data: bytearray):
        return self._SIGNATURE in data

class dnstr (user=True, meta=True)

This unit is implemented in refinery.units.formats.pe.dotnet.dnstr and has the following commandline Interface:

usage: dnstr [-h] [-L] [-Q] [-0] [-v] [-m | -u]

Extracts all strings defined in the #Strings and #US streams of .NET executables.

optional arguments:
  -m, --meta     Only extract from #Strings.
  -u, --user     Only extract from #US.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class dnstr(Unit):
    """
    Extracts all strings defined in the `#Strings` and `#US` streams of .NET
    executables.
    """

    def __init__(
        self,
        user: Arg.Switch('-m', '--meta', off=True, group='HEAP', help='Only extract from #Strings.') = True,
        meta: Arg.Switch('-u', '--user', off=True, group='HEAP', help='Only extract from #US.') = True,
    ):
        if not meta and not user:
            raise ValueError('Either ascii or utf16 strings must be enabled.')
        super().__init__(meta=meta, user=user)

    def process(self, data):
        header = DotNetHeader(data, parse_resources=False)
        if self.args.meta:
            for string in header.meta.Streams.Strings.values():
                yield string.encode(self.codec)
        if self.args.user:
            for string in header.meta.Streams.US.values():
                yield string.encode(self.codec)

class doctxt

This unit is implemented in refinery.units.formats.office.doctxt and has the following commandline Interface:

usage: doctxt [-h] [-L] [-Q] [-0] [-v]

Extracts the text body from Word documents.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class doctxt(Unit):
    """
    Extracts the text body from Word documents.
    """

    @Unit.Requires('olefile', 'formats', 'office', 'extended')
    def _olefile():
        import olefile
        return olefile

    def process(self, data: bytearray):
        extractors: Dict[str, Callable[[bytearray], str]] = OrderedDict(
            doc=self._extract_ole,
            docx=self._extract_docx,
            odt=self._extract_odt,
        )
        if data.startswith(B'PK'):
            self.log_debug('document contains zip file signature, likely a odt or docx file')
            extractors.move_to_end('doc')
            if 'opendocument' in str(data | xtzip('mimetype')):
                self.log_debug('odt signature detected')
                extractors.move_to_end('odt', last=False)
        for filetype, extractor in extractors.items():
            self.log_debug(F'trying to extract as {filetype}')
            try:
                result = extractor(data)
            except ImportError:
                raise
            except Exception as error:
                self.log_info(F'failed extractring as {filetype}: {error!s}')
            else:
                return result.encode(self.codec)
        raise ValueError('All extractors failed, the input data is not recognized as any known document format.')

    def _extract_docx(self, data: Chunk) -> str:
        NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
        PARAGRAPH = F'{NAMESPACE}p'
        TEXT = F'{NAMESPACE}t'
        chunk = data | xtzip('word/document.xml') | bytearray
        if not chunk:
            raise ValueError('No document.xml file found.')
        root: Element = XML(chunk)
        with StringIO() as output:
            for index, paragraph in enumerate(root.iter(PARAGRAPH)):
                if index > 0:
                    output.write('\n')
                for node in paragraph.iter(TEXT):
                    if node.text:
                        output.write(node.text)
            return output.getvalue()

    def _extract_odt(self, data: bytes):
        def _extract_text(node: Element):
            NAMESPACE = '{urn:oasis:names:tc:opendocument:xmlns:text:1.0}'
            PARAGRAPH = F'{NAMESPACE}p'
            SPAN = F'{NAMESPACE}span'
            SPACE = F'{NAMESPACE}s'
            with StringIO() as res:
                for element in node:
                    tag = element.tag
                    text = element.text or ''
                    tail = element.tail or ''
                    if tag in [PARAGRAPH, SPAN]:
                        res.write(text)
                    elif tag == SPACE:
                        res.write(' ')
                    else:
                        self.log_debug(F'unknown tag: {tag}')
                    res.write(_extract_text(element))
                    res.write(tail)
                    if tag == PARAGRAPH:
                        res.write('\n')
                return res.getvalue()

        NAMESPACE = '{urn:oasis:names:tc:opendocument:xmlns:office:1.0}'
        BODY = F'{NAMESPACE}body'
        TEXT = F'{NAMESPACE}text'
        for part in xtzip().unpack(data):
            if part.path != 'content.xml':
                continue
            xml_content: bytes = part.get_data()
            root: Element = XML(xml_content)
            body: Element = root.find(BODY)
            text: Element = body.find(TEXT)
            return _extract_text(text)
        else:
            raise ValueError('found no text')

    def _extract_ole(self, data: bytearray) -> str:
        stream = MemoryFile(data)
        with self._olefile.OleFileIO(stream) as ole:
            doc = ole.openstream('WordDocument').read()
            with StructReader(doc) as reader:
                table_name = F'{(doc[11] >> 1) & 1}Table'
                reader.seek(0x1A2)
                offset = reader.u32()
                length = reader.u32()
            with StructReader(ole.openstream(table_name).read()) as reader:
                reader.seek(offset)
                table = reader.read(length)
            piece_table = self._load_piece_table(table)
            return self._get_text(doc, piece_table)

    def _load_piece_table(self, table: bytes) -> bytes:
        with StructReader(table) as reader:
            while not reader.eof:
                entry_type = reader.read_byte()
                if entry_type == 1:
                    reader.seekrel(reader.read_byte())
                    continue
                if entry_type == 2:
                    length = reader.u32()
                    return reader.read(length)
                raise NotImplementedError(F'Unsupported table entry type value 0x{entry_type:X}.')

    def _get_text(self, doc: bytes, piece_table: bytes) -> str:
        piece_count: int = 1 + (len(piece_table) - 4) // 12
        with StringIO() as text:
            with StructReader(piece_table) as reader:
                character_positions = [reader.u32() for _ in range(piece_count)]
                for i in range(piece_count - 1):
                    cp_start = character_positions[i]
                    cp_end = character_positions[i + 1]
                    fc_value = reader.read_struct('xxLxx', unwrap=True)
                    is_ansi = bool((fc_value >> 30) & 1)
                    fc = fc_value & 0xBFFFFFFF
                    cb = cp_end - cp_start
                    if is_ansi:
                        encoding = 'cp1252'
                        fc = fc // 2
                    else:
                        encoding = 'utf16'
                        cb *= 2
                    raw = doc[fc : fc + cb]
                    text.write(raw.decode(encoding).replace('\r', '\n'))
            return text.getvalue()

class drp (consecutive=False, align=False, min=1, max=∞, len=None, all=False, threshold=20, weight=0, buffer=1024, chug=False)

This unit is implemented in refinery.units.misc.drp and has the following commandline Interface:

usage: drp [-h] [-L] [-Q] [-0] [-v] [-c] [-d] [-n N] [-N N] [-l N] [-a] [-t N] [-w N] [-b N | -g]

Detect Repeating Patterns - detects the most prevalent repeating byte pattern in a chunk of data.
The unit computes a suffix tree which may require a lot of memory for large buffers.

optional arguments:
  -c, --consecutive  Assume that the repeating pattern is consecutive when observable.
  -d, --align        Assume that the pattern occurs at offsets that are multiples of its length.
  -n, --min N        Minimum size of the pattern to search for. Default is 1.
  -N, --max N        Maximum size of the pattern to search for. Default is ∞.
  -l, --len N        Set the exact size of the pattern. This is equivalent to --min=N --max=N.
  -a, --all          Produce one output for each repeating pattern that was detected.
  -t, --threshold N  Patterns must match this performance threshold in percent, lest they be
                     discarded.
  -w, --weight N     Specifies how much longer patterns are favored over small ones. Default is
                     0.
  -b, --buffer N     Maximum number of bytes to inspect at once. The default is 1024.
  -g, --chug         Compute the prefix tree for the entire buffer instead of chunking it.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class drp(Unit):
    """
    Detect Repeating Patterns - detects the most prevalent repeating byte pattern
    in a chunk of data. The unit computes a suffix tree which may require a lot of
    memory for large buffers.
    """
    def __init__(
        self,
        consecutive: Arg.Switch('-c', help='Assume that the repeating pattern is consecutive when observable.') = False,
        align: Arg.Switch('-d', help='Assume that the pattern occurs at offsets that are multiples of its length.') = False,
        min: Arg.Number('-n', help='Minimum size of the pattern to search for. Default is {default}.') = 1,
        max: Arg.Number('-N', help='Maximum size of the pattern to search for. Default is {default}.') = INF,
        len: Arg.Number('-l', help='Set the exact size of the pattern. This is equivalent to --min=N --max=N.') = None,
        all: Arg.Switch('-a', help='Produce one output for each repeating pattern that was detected.') = False,
        threshold: Arg.Number('-t', help='Patterns must match this performance threshold in percent, lest they be discarded.') = 20,
        weight: Arg.Number('-w', help='Specifies how much longer patterns are favored over small ones. Default is {default}.') = 0,
        buffer: Arg.Number('-b', group='BFR', help='Maximum number of bytes to inspect at once. The default is {default}.') = 1024,
        chug  : Arg.Switch('-g', group='BFR', help='Compute the prefix tree for the entire buffer instead of chunking it.') = False
    ):
        if len is not None:
            min = max = len
        super().__init__(
            min=min,
            max=max,
            all=all,
            consecutive=consecutive,
            align=align,
            weight=weight,
            buffer=buffer,
            chug=chug,
            threshold=threshold
        )

    def _get_patterns(self, data):
        with stackdepth(len(data)):
            tree = SuffixTree(data)
        min_size = self.args.min
        max_size = self.args.max
        patterns = set()
        cursor = 0
        while cursor < len(data):
            node = tree.root
            rest = data[cursor:]
            remaining = len(rest)
            length = 0
            offset = None
            while node.children and length < remaining:
                for child in node.children.values():
                    if tree.data[child.start] == rest[length]:
                        node = child
                        break
                if node.start >= cursor:
                    break
                offset = node.start - length
                length = node.end + 1 - offset
            if offset is None:
                cursor += 1
                continue
            length = min(remaining, length)
            if max_size >= length >= min_size:
                pattern = rest[:length].tobytes()
                patterns.add(pattern)
            cursor += length
        del tree
        return patterns

    @staticmethod
    def _consecutive_count(data, pattern):
        length = len(pattern)
        if length == 1:
            return data.count(pattern)
        view = memoryview(data)
        return max(sum(1 for i in range(k, len(view), length) if view[i:i + length] == pattern)
            for k in range(len(pattern)))

    @staticmethod
    def _truncate_pattern(pattern):
        offset = 0
        for byte in pattern[1:]:
            if byte == pattern[offset]:
                offset += 1
            else:
                offset = 0
        if offset > 0:
            pattern = pattern[:-offset]
        return pattern

    def process(self, data: bytearray):
        if len(data) <= 1:
            yield data
            return

        memview = memoryview(data)
        weight = 1 + (self.args.weight / 10)

        if self.args.chug:
            patterns = self._get_patterns(memview)
        else:
            patterns = set()
            chunksize = self.args.buffer
            for k in range(0, len(memview), chunksize):
                patterns |= self._get_patterns(memview[k:k + chunksize])
        if not patterns:
            raise RefineryPartialResult('no repeating sequences found', data)

        self.log_debug('removing duplicate pattern detections')
        duplicates = set()
        maxlen = max(len(p) for p in patterns)
        for pattern in sorted(patterns, key=len):
            for k in range(2, maxlen // len(pattern) + 1):
                repeated = pattern * k
                if repeated in patterns:
                    duplicates.add(repeated)
        patterns -= duplicates

        self.log_debug(F'counting coverage of {len(patterns)} patterns')
        pattern_count = {p: data.count(p) for p in patterns}
        pattern_performance = dict(pattern_count)

        for consecutive in (False, True):
            if consecutive:
                self.log_debug(F're-counting coverage of {len(patterns)} patterns')
                patterns = {self._truncate_pattern(p) for p in patterns}
                pattern_performance = {p: self._consecutive_count(data, p) for p in patterns}

            self.log_debug('evaluating pattern performance')
            for pattern, count in pattern_performance.items():
                pattern_performance[pattern] = count * (len(pattern) ** weight)
            best_performance = max(pattern_performance.values())
            for pattern, performance in pattern_performance.items():
                pattern_performance[pattern] = performance / best_performance

            self.log_debug('removing patterns below performance threshold')
            threshold = self.args.threshold
            patterns = {p for p in patterns if pattern_performance[p] * 100 >= threshold}
            pattern_count = {p: data.count(p) for p in patterns}

            if not self.args.consecutive:
                break

        if self.args.all:
            for pattern in sorted(patterns, key=pattern_performance.get, reverse=True):
                yield self.labelled(pattern, count=pattern_count[pattern])
            return

        best_patterns = [p for p in patterns if pattern_performance[p] == 1.0]

        if len(best_patterns) > 1:
            self.log_warn('could not determine unique best repeating pattern, returning the first of these:')
            for k, pattern in enumerate(best_patterns):
                self.log_warn(F'{k:02d}.: {pattern.hex()}')

        result = best_patterns[0]

        if self.args.align:
            def rotated(pattern):
                for k in range(len(pattern)):
                    yield pattern[k:] + pattern[:k]
            rotations = {k % len(result): r for k, r in (
                (data.find(r), r) for r in rotated(result)) if k >= 0}
            result = rotations[min(rotations)]

        yield result

class dsjava

This unit is implemented in refinery.units.formats.java.deserialize and has the following commandline Interface:

usage: dsjava [-h] [-L] [-Q] [-0] [-v]

Deserialize Java serialized data and re-serialize as JSON.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class dsjava(Unit):
    """
    Deserialize Java serialized data and re-serialize as JSON.
    """
    @Unit.Requires('javaobj-py3>=0.4.0.1', 'formats')
    def _javaobj():
        import javaobj.v2
        return javaobj.v2

    def process(self, data):
        with JavaEncoder as encoder:
            return encoder.dumps(self._javaobj.loads(data)).encode(self.codec)

class dsphp

This unit is implemented in refinery.units.formats.deserialize_php and has the following commandline Interface:

usage: dsphp [-h] [-L] [-Q] [-0] [-v] [-R]

Deserialize PHP serialized data and re-serialize as JSON.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class dsphp(Unit):
    """
    Deserialize PHP serialized data and re-serialize as JSON.
    """
    @Unit.Requires('phpserialize', 'formats')
    def _php():
        import phpserialize
        return phpserialize

    def reverse(self, data):
        return self._php.dumps(json.loads(data))

    def process(self, data):
        phpobject = self._php.phpobject

        class encoder(json.JSONEncoder):
            def default(self, obj):
                try:
                    return super().default(obj)
                except TypeError:
                    pass
                if isinstance(obj, bytes) or isinstance(obj, bytearray):
                    return obj.decode('utf8')
                if isinstance(obj, phpobject):
                    return obj._asdict()

        return json.dumps(
            self._php.loads(
                data,
                object_hook=phpobject,
                decode_strings=True
            ),
            indent=4,
            cls=encoder
        ).encode(self.codec)

class dump (*files, tee=False, stream=False, plain=False, force=False)

This unit is implemented in refinery.units.sinks.dump and has the following commandline Interface:

usage: dump [-h] [-L] [-Q] [-0] [-v] [-t] [-s] [-p] [-f] [file [file ...]]

Dump incoming data to files on disk. It is possible to specify filenames with format fields. Any
metadata field on an incoming chunk is available. Additionally, any field that can be populated
by the cm unit is also available. These include the following:

    {ext}    : Automatically guessed file extension
    {crc32}  : CRC32 checksum of the data
    {index}  : Index of the data in the input stream, starting at 0
    {size}   : Size of the data in bytes
    {md5}    : MD5 hash of the data
    {sha1}   : SHA1 hash of the data
    {sha256} : SHA-256 hash of the data
    {path}   : Associated path; defaults to {sha256} if none is given.

When not using formatted file names, the unit ingests as many incoming inputs as filenames were
specified on the command line. Unless connected to a terminal, the remaining inputs will be
forwarded on STDOUT. The -t or --tee switch can be used to forward all inputs, under all
circumstances, regardless of whether or not they have been processed.

If no file is specified, all ingested inputs are concatenated and written to the clipboard. This
will only succeed when the data can successfully be encoded.

positional arguments:
  file           Optionally formatted filename.

optional arguments:
  -t, --tee      Forward all inputs to STDOUT.
  -s, --stream   Dump all incoming data to the same file.
  -p, --plain    Never apply any formatting to file names.
  -f, --force    Remove files if necessary to create dump path.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class dump(Unit):
    """
    Dump incoming data to files on disk. It is possible to specify filenames with format fields.
    Any metadata field on an incoming chunk is available. Additionally, any field that can be
    populated by the `refinery.cm` unit is also available. These include the following:

        {ext}    : Automatically guessed file extension
        {crc32}  : CRC32 checksum of the data
        {index}  : Index of the data in the input stream, starting at 0
        {size}   : Size of the data in bytes
        {md5}    : MD5 hash of the data
        {sha1}   : SHA1 hash of the data
        {sha256} : SHA-256 hash of the data
        {path}   : Associated path; defaults to {sha256} if none is given.

    When not using formatted file names, the unit ingests as many incoming inputs as filenames were
    specified on the command line. Unless connected to a terminal, the remaining inputs will be
    forwarded on STDOUT. The `-t` or `--tee` switch can be used to forward all inputs, under all
    circumstances, regardless of whether or not they have been processed.

    If no file is specified, all ingested inputs are concatenated and written to the clipboard. This
    will only succeed when the data can successfully be encoded.
    """

    def __init__(
        self, *files: Arg(metavar='file', type=str, help='Optionally formatted filename.'),
        tee    : Arg.Switch('-t', help='Forward all inputs to STDOUT.') = False,
        stream : Arg.Switch('-s', help='Dump all incoming data to the same file.') = False,
        plain  : Arg.Switch('-p', help='Never apply any formatting to file names.') = False,
        force  : Arg.Switch('-f', help='Remove files if necessary to create dump path.') = False,
    ):
        if stream and len(files) != 1:
            raise ValueError('Can only use exactly one file in stream mode.')
        super().__init__(files=files, tee=tee, stream=stream, force=force)
        self.stream = None
        self._formatted = not plain and any(self._has_format(f) for f in files)
        self._reset()

    @staticmethod
    def _has_format(filename):
        if not isinstance(filename, str):
            return False
        formatter = Formatter()
        return any(
            any(t.isalnum() for t in fields)
            for _, fields, *__ in formatter.parse(filename) if fields
        )

    def _reset(self):
        self.exhausted = False
        self.paths = cycle(self.args.files) if self._formatted else iter(self.args.files)
        self._close()

    @property
    def _clipcopy(self):
        return not self.args.files

    def _components(self, path):
        def _reversed_components(path):
            while True:
                path, component = os.path.split(path)
                if not component:
                    break
                yield component
            yield path
        components = list(_reversed_components(path))
        components.reverse()
        return components

    def _open(self, path, unc=False):
        if hasattr(path, 'close'):
            return path
        path = os.path.abspath(path)
        base = os.path.dirname(path)
        if not unc:
            self.log_info('opening:', path)
        try:
            os.makedirs(base, exist_ok=True)
        except FileExistsError:
            self.log_info('existed:', path)
            part, components = '', self._components(path)
            while components:
                component, *components = components
                part = os.path.join(part, component)
                if os.path.exists(part) and os.path.isfile(part):
                    if self.args.force:
                        os.unlink(part)
                        return self._open(path, unc)
                    break
            raise RefineryCriticalException(F'Unable to dump to {path} because {part} is a file.')
        except FileNotFoundError:
            if unc or os.name != 'nt':
                raise
            path = F'\\\\?\\{path}'
            return self._open(path, unc=True)
        except OSError as e:
            if not self.log_info():
                self.log_warn('opening:', path)
            self.log_warn('errored:', e.args[1])
            return open(os.devnull, 'wb')
        else:
            mode = 'ab' if self.args.stream else 'wb'
            return open(path, mode)

    def _close(self, final=False):
        if not self.stream:
            return
        self.stream.flush()
        if self.args.stream and not final:
            return
        if self._clipcopy:
            if os.name == 'nt':
                from refinery.lib.winclip import ClipBoard, CF
                try:
                    img = self._image.open(self.stream)
                    with io.BytesIO() as out:
                        img.save(out, 'BMP')
                except Exception:
                    with ClipBoard(CF.TEXT) as cpb:
                        cpb.copy(self.stream.getvalue())
                else:
                    with ClipBoard(CF.DIB) as cpb:
                        out.seek(14, io.SEEK_SET)
                        cpb.copy(out.read())
            else:
                data = self.stream.getvalue()
                data = data.decode(self.codec, errors='backslashreplace')
                self._pyperclip.copy(data)
        self.stream.close()
        self.stream = None

    @Unit.Requires('pyperclip')
    def _pyperclip():
        import pyperclip
        return pyperclip

    @Unit.Requires('Pillow', 'formats')
    def _image():
        from PIL import Image
        return Image

    def process(self, data: bytes):
        forward_input_data = self.args.tee
        if self._clipcopy:
            self.stream.write(data)
        elif not self.exhausted:
            if not self.stream:
                # This should happen only when the unit is called from Python code
                # rather than via the command line.
                try:
                    path = next(self.paths)
                except StopIteration:
                    raise RefineryCriticalException('the list of filenames was exhausted.')
                else:
                    with self._open(path) as stream:
                        stream.write(data)
            else:
                self.stream.write(data)
                self.log_debug(F'wrote 0x{len(data):08X} bytes')
                self._close()
        else:
            forward_input_data = forward_input_data or not self.isatty
            if not forward_input_data:
                size = metavars(data).size
                self.log_warn(F'discarding unprocessed chunk of size {size!s}.')
        if forward_input_data:
            yield data

    def filter(self, chunks):
        if self.exhausted:
            self._reset()

        nostream = not self.args.stream
        clipcopy = self._clipcopy

        if clipcopy:
            self.stream = io.BytesIO()

        for index, chunk in enumerate(chunks, 0):
            if not chunk.visible:
                continue
            if not clipcopy and not self.exhausted and (nostream or not self.stream):
                try:
                    path = next(self.paths)
                except StopIteration:
                    self.exhausted = True
                else:
                    if self._has_format(path):
                        meta = metavars(chunk)
                        meta.ghost = True
                        meta.update_index(index)
                        path = meta.format_str(path, self.codec, [chunk])
                    self.stream = self._open(path)
            yield chunk

        self._close(final=True)
        self.exhausted = True

class eat (name)

This unit is implemented in refinery.units.meta.eat and has the following commandline Interface:

usage: eat [-h] [-L] [-Q] [-0] [-v] name

Consume a meta variable and replace the contents of the current chunk with it. If the variable
contains a string, it is encoded with the default codec. If the variable cannot be converted to a
byte string, the data is lost and an empty chunk is returned.

positional arguments:
  name           The name of the variable to be used.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class eat(Unit):
    """
    Consume a meta variable and replace the contents of the current chunk with it. If the variable
    contains a string, it is encoded with the default codec. If the variable cannot be converted to
    a byte string, the data is lost and an empty chunk is returned.
    """
    def __init__(
        self,
        name: Arg(help='The name of the variable to be used.', type=str),
    ):
        super().__init__(name=check_variable_name(name))

    def process(self, data: Chunk):
        def invalid_type():
            return F'variable {name} is of type "{type}", unable to convert to byte string - data is lost'
        name = self.args.name
        meta = metavars(data)
        data = meta.pop(name)
        type = data.__class__.__name__
        if isinstance(data, int):
            self.log_info(F'variable {name} is an integer, converting to string.')
            data = str(data).encode(self.codec)
        if isinstance(data, str):
            self.log_info(F'variable {name} is a string, encoding as {self.codec}')
            data = data.encode(self.codec)
        elif not isbuffer(data):
            try:
                wrapped = bytearray(data)
            except Exception:
                self.log_warn(invalid_type())
                data = None
            else:
                data = wrapped
        return data

class ef (*filenames, list=False, meta=False, size=0, wild=False, tame=False, linewise=False)

This unit is implemented in refinery.units.meta.ef and has the following commandline Interface:

usage: ef [-h] [-L] [-Q] [-0] [-v] [-l] [-m] [-s N] [-w | -t] [-i] FILEMASK [FILEMASK ...]

Short for "emit file". The unit reads files from disk and outputs them individually. Has the
ability to read large files in chunks.

positional arguments:
  FILEMASK        A list of file masks. Each matching file will be read from disk and emitted.
                  The file masks can include format string expressions which will be substituted
                  from the current meta variables. The masks can use wild-card expressions, but
                  this feature is disabled by default on Posix platforms, where it has to be
                  enabled explicitly using the -w switch. On Windows, the feature is enabled by
                  default and can be disabled using the -t switch.

optional arguments:
  -l, --list      Only lists files with metadata.
  -m, --meta      Adds the atime, mtime, ctime, and size metadata variables.
  -s, --size N    If specified, files will be read in chunks of size N and each chunk is emitted
                  as one element in the output list.
  -w, --wild      Force use of wildcard patterns in file masks.
  -t, --tame      Disable wildcard patterns in file masks.
  -i, --linewise  Read the file linewise. By default, one line is read at a time. In line mode,
                  the --size argument can be used to read the given number of lines in each
                  chunk.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Allow partial results as output.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class ef(Unit):
    """
    Short for "emit file". The unit reads files from disk and outputs them individually. Has the ability to
    read large files in chunks.
    """

    def __init__(self,
        *filenames: Arg(metavar='FILEMASK', nargs='+', type=str, help=(
            'A list of file masks. Each matching file will be read from disk and '
            'emitted. The file masks can include format string expressions which '
            'will be substituted from the current meta variables. The masks can '
            'use wild-card expressions, but this feature is disabled by default on '
            'Posix platforms, where it has to be enabled explicitly using the -w '
            'switch. On Windows, the feature is enabled by default and can be '
            'disabled using the -t switch.'
        )),
        list: Arg.Switch('-l', help='Only lists files with metadata.') = False,
        meta: Arg.Switch('-m', help=(
            'Adds the atime, mtime, ctime, and size metadata variables.'
        )) = False,
        size: Arg.Number('-s', help=(
            'If specified, files will be read in chunks of size N and each '
            'chunk is emitted as one element in the output list.'
        )) = 0,
        wild: Arg.Switch('-w', group='W', help='Force use of wildcard patterns in file masks.') = False,
        tame: Arg.Switch('-t', group='W', help='Disable wildcard patterns in file masks.') = False,
        linewise: Arg.Switch('-i', help=(
            'Read the file linewise. By default, one line is read at a time. '
            'In line mode, the --size argument can be used to read the given '
            'number of lines in each chunk.'
        )) = False
    ):
        if wild and tame:
            raise ValueError('Cannot be both wild and tame!')
        super().__init__(
            size=size,
            list=list,
            meta=meta,
            wild=wild,
            tame=tame,
            linewise=linewise,
            filenames=filenames
        )

    def _read_chunks(self, fd):
        while True:
            buffer = fd.read(self.args.size)
            if not buffer:
                break
            yield buffer

    def _read_lines(self, fd):
        count = self.args.size or 1
        if count == 1:
            while True:
                buffer = fd.readline()
                if not buffer:
                    break
                yield buffer
            return
        with MemoryFile() as out:
            while True:
                for _ in range(count):
                    buffer = fd.readline()
                    if not buffer:
                        break
                    out.write(buffer)
                if not out.tell():
                    break
                yield out.getvalue()
                out.seek(0)
                out.truncate()

    def _absolute_path(self, path_string: str):
        path = Path(path_string).absolute()
        if os.name == 'nt' and not path.parts[0].startswith('\\\\?\\'):
            # The pathlib glob method will simply fail mid-traversal if it attempts to descend into
            # a folder or to a file whose path exceeds MAX_PATH on Windows. As a workaround, we use
            # UNC paths throughout and truncate to relative paths after enumeration.
            path = Path(F'\\\\?\\{path!s}')
        return path

    def _glob(self, pattern: str) -> Iterable[Path]:
        if pattern.endswith('**'):
            pattern += '/*'
        wildcard = re.search(R'[\[\?\*]', pattern)
        if wildcard is None:
            yield self._absolute_path(pattern)
            return
        k = wildcard.start()
        base, pattern = pattern[:k], pattern[k:]
        path = self._absolute_path(base or '.')
        last = path.parts[-1]
        if base.endswith(last):
            # /base/something.*
            pattern = F'{last}{pattern}'
            path = path.parent
        for match in path.glob(pattern):
            yield match

    def process(self, data):
        meta = metavars(data)
        meta.ghost = True
        wild = (os.name == 'nt' or self.args.wild) and not self.args.tame
        root = self._absolute_path('.')
        paths = self._glob if wild else lambda mask: [self._absolute_path(mask)]

        for mask in self.args.filenames:
            mask = meta.format_str(mask, self.codec, [data])
            self.log_debug('scanning for mask:', mask)
            kwargs = dict()
            for path in paths(mask):
                try:
                    path = path.relative_to(root)
                except ValueError:
                    pass
                if wild:
                    try:
                        if not path.is_file():
                            continue
                    except Exception:
                        self.log_info(F'access error while scanning: {path!s}')
                        continue
                if self.args.meta:
                    stat = path.stat()
                    kwargs.update(
                        size=stat.st_size,
                        atime=datetime.fromtimestamp(stat.st_atime).isoformat(' ', 'seconds'),
                        ctime=datetime.fromtimestamp(stat.st_ctime).isoformat(' ', 'seconds'),
                        mtime=datetime.fromtimestamp(stat.st_mtime).isoformat(' ', 'seconds')
                    )
                if self.args.list:
                    try:
                        yield self.labelled(str(path).encode(self.codec), **kwargs)
                    except OSError:
                        self.log_warn(F'os error while scanning: {path!s}')
                    continue
                try:
                    with path.open('rb') as stream:
                        if self.args.linewise:
                            yield from self._read_lines(stream)
                        elif self.args.size:
                            yield from self._read_chunks(stream)
                        else:
                            data = stream.read()
                            self.log_info(lambda: F'reading: {path!s} ({len(data)} bytes)')
                            yield self.labelled(data, path=path.as_posix(), **kwargs)
                except PermissionError:
                    self.log_warn('permission denied:', path.as_posix())
                except FileNotFoundError:
                    self.log_warn('file is missing:', path.as_posix())
                except Exception:
                    self.log_warn('unknown error while reading:', path.as_posix())

class emit (*data)

This unit is implemented in refinery.units.meta.emit and has the following commandline Interface:

usage: emit [-h] [-L] [-Q] [-0] [-v] [data [data ...]]

positional arguments:
  data           Data to be emitted. If no argument is specified, data is retrieved from the
                 clipboard. Multiple arguments are output in framed format.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class emit(Unit):

    def __init__(self, *data: Arg(help=(
        'Data to be emitted. If no argument is specified, data is retrieved from '
        'the clipboard. Multiple arguments are output in framed format.'
    ))):
        super().__init__(data=data)

    @Unit.Requires('pyperclip')
    def _pyperclip():
        import pyperclip
        return pyperclip

    def process(self, data):
        if self.args.data:
            yield from self.args.data
            return
        if os.name == 'nt':
            from refinery.lib.winclip import get_any_data
            mode, data = get_any_data()
            if mode is not None:
                self.log_info(F'retrieved clipboard data in {mode.name} format')
            yield data
        else:
            data = self._pyperclip.paste()
            if not data:
                return
            yield data.encode(self.codec, 'replace')

class esc (hex=False, unicode=False, greedy=False, quoted=False, bare=False)

This unit is implemented in refinery.units.encoding.esc and has the following commandline Interface:

usage: esc [-h] [-L] [-Q] [-0] [-v] [-R] [-x] [-u] [-g] [-q] [-b]

Encodes and decodes common ASCII escape sequences.

optional arguments:
  -x, --hex      Hex encode everything, do not use C escape sequences.
  -u, --unicode  Use unicode escape sequences and UTF-8 encoding.
  -g, --greedy   Replace \x by x and \u by u when not followed by two or four hex digits,
                 respectively.
  -q, --quoted   Remove enclosing quotes while decoding and add them for encoding.
  -b, --bare     Do not escape quote characters.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class esc(Unit):
    """
    Encodes and decodes common ASCII escape sequences.
    """
    _ESCAPE = {
        0x00: BR'\0',
        0x07: BR'\a',
        0x08: BR'\b',
        0x0C: BR'\f',
        0x0A: BR'\n',
        0x0D: BR'\r',
        0x09: BR'\t',
        0x0B: BR'\v',
        0x5C: BR'\\',
        0x27: BR'\'',
        0x22: BR'\"'
    }
    _UNESCAPE = {
        BR'0': B'\x00',
        BR'a': B'\x07',
        BR'b': B'\x08',
        BR'f': B'\x0C',
        BR'n': B'\x0A',
        BR'r': B'\x0D',
        BR't': B'\x09',
        BR'v': B'\x0B',
        B'\\': B'\x5C',
        BR"'": B'\x27',
        BR'"': B'\x22'
    }

    def __init__(self,
        hex     : Arg.Switch('-x', help='Hex encode everything, do not use C escape sequences.') = False,
        unicode : Arg.Switch('-u', help='Use unicode escape sequences and UTF-8 encoding.') = False,
        greedy  : Arg.Switch('-g', help='Replace \\x by x and \\u by u when not followed by two or four hex digits, respectively.') = False,
        quoted  : Arg.Switch('-q', help='Remove enclosing quotes while decoding and add them for encoding.') = False,
        bare    : Arg.Switch('-b', help='Do not escape quote characters.') = False,
    ) -> Unit: pass  # noqa

    def process(self, data):
        if self.args.quoted:
            quote = data[0]
            if data[-1] != quote:
                self.log_info('string is not correctly quoted')
            else:
                data = data[1:-1]

        def unescape(match):
            c = match[1]
            if len(c) > 1:
                if c[0] == 0x75:
                    # unicode
                    upper = int(c[1:3], 16)
                    lower = int(c[3:5], 16)
                    if self.args.unicode:
                        return bytes((lower, upper)).decode('utf-16le').encode(self.codec)
                    return bytes((lower,))
                elif c[0] == 0x78:
                    # hexadecimal
                    return bytes((int(c[1:3], 16),))
                else:
                    # octal escape sequence
                    return bytes((int(c, 8) & 0xFF,))
            elif c in B'ux':
                return c if self.args.greedy else match[0]
            return self._UNESCAPE.get(c, c)
        data = re.sub(
            RB'\\(u[a-fA-F0-9]{4}|x[a-fA-F0-9]{1,2}|[0-7]{3}|.)', unescape, data)
        return data

    def reverse(self, data):
        if self.args.unicode:
            string = data.decode(self.codec).encode('UNICODE_ESCAPE')
        else:
            if not self.args.hex:
                def escape(match):
                    c = match[0][0]
                    return self._ESCAPE.get(c, RB'\x%02x' % c)
                pattern = RB'[\x00-\x1F\x22\x27\x5C\x7F-\xFF]'
                if self.args.bare:
                    pattern = RB'[\x00-\x1F\x5C\x7F-\xFF]'
                string = re.sub(pattern, escape, data)
            else:
                string = bytearray(4 * len(data))
                for k in range(len(data)):
                    a = k * 4
                    b = k * 4 + 4
                    string[a:b] = RB'\x%02x' % data[k]
        if self.args.quoted:
            string = B'"%s"' % string
        return string

class evtx (raw=False)

This unit is implemented in refinery.units.formats.evtx and has the following commandline Interface:

usage: evtx [-h] [-L] [-Q] [-0] [-v] [-r]

Extracts data from Windows Event Log files (EVTX). Each extracted log entry is returned as a
single output chunk in XML format.

optional arguments:
  -r, --raw      Extract raw event data rather than XML.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class evtx(Unit):
    """
    Extracts data from Windows Event Log files (EVTX). Each extracted log entry is returned as a single
    output chunk in XML format.
    """

    def __init__(self, raw: Unit.Arg.Switch('-r', help='Extract raw event data rather than XML.') = False):
        super().__init__(raw=raw)

    @Unit.Requires('python-evtx', 'formats')
    def _evtx():
        from Evtx.Evtx import Evtx
        return Evtx

    def process(self, data):
        with VirtualFileSystem() as vfs:
            raw = self.args.raw
            with self._evtx(vfs.new(data)) as log:
                for record in log.records():
                    yield record.data() if raw else record.xml().encode(self.codec)

class fernet (key)

This unit is implemented in refinery.units.crypto.cipher.fernet and has the following commandline Interface:

usage: fernet [-h] [-L] [-Q] [-0] [-v] key

Decrypt Fernet messages.

positional arguments:
  key            A fernet key, either in base64 or raw binary.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class fernet(Unit):
    """
    Decrypt Fernet messages.
    """
    def __init__(self, key: Arg(help='A fernet key, either in base64 or raw binary.')):
        super().__init__(key=key)

    def _b64(self, data):
        try:
            return data | b64(urlsafe=True) | bytearray
        except Exception:
            return data

    def process(self, data):
        fk = self._b64(self.args.key)
        if len(fk) != 32:
            raise ValueError(F'The given Fernet key has length {len(fk)}, expected 32 bytes.')
        signing_key = fk[:16]
        encryption_key = fk[16:]
        decoded = self._b64(data)
        reader = StructReader(memoryview(decoded), bigendian=True)
        signed_data = reader.peek(reader.remaining_bytes - 32)
        version = reader.u8()
        timestamp = datetime.fromtimestamp(reader.u64())
        iv = reader.read(16)
        if version != 0x80:
            self.log_warn(F'The Fernet version is 0x{version:02X}, the only documented one is 0x80.')
        ciphertext = reader.read(reader.remaining_bytes - 32)
        if len(ciphertext) % 16 != 0:
            raise ValueError('The encoded ciphertext is not 16-byte block aligned.')
        signature = reader.read(32)
        hmac = HMAC.new(signing_key, digestmod=SHA256)
        hmac.update(signed_data)
        if hmac.digest() != signature:
            self.log_warn('HMAC verification failed; the message has been tampered with.')
            self.log_info(F'computed signature: {hmac.hexdigest().upper()}')
            self.log_info(F'provided signature: {signature.hex().upper()}')
        plaintext = ciphertext | aes(mode='cbc', iv=iv, key=encryption_key) | bytearray
        return self.labelled(plaintext, timestamp=timestamp.isoformat(' ', 'seconds'))

class gost (key, iv=b'', padding=None, mode=None, raw=False, swap=False, sbox=SBOX.R34, *, assoc_len=0, mac_len=0, segment_size=0, little_endian=False)

This unit is implemented in refinery.units.crypto.cipher.gost and has the following commandline Interface:

usage: gost [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-s] [-x SBOX] [-e] [-S N]
            key

GOST encryption and decryption.

positional arguments:
  key                   The encryption key.

optional arguments:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB,
                        OFB, PCBC. By default, the CBC mode is used when an IV is is provided,
                        and ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -s, --swap            Decode blocks as big endian rather than little endian.
  -x, --sbox SBOX       Choose an SBOX. The default is R34, which corresponds to the R-34.12.2015
                        standard. The other option is CBR, which is the SBOX used by the Central
                        Bank of Russia.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of bits into which data is segmented. It must be a
                        multiple of 8. The default of 0 means that the block size will be used as
                        the segment size.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Allow partial results as output.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation; Specify twice to normalize (first decode, then
                        encode).

Expand source code Browse git

class gost(StandardBlockCipherUnit, cipher=BlockCipherFactory(GOST)):
    """
    GOST encryption and decryption.
    """
    def __init__(
        self, key, iv=B'', padding=None, mode=None, raw=False,
        swap: Arg.Switch('-s', help='Decode blocks as big endian rather than little endian.') = False,
        sbox: Arg.Option('-x', choices=SBOX, help=(
            'Choose an SBOX. The default is {default}, which corresponds to the R-34.12.2015 standard. '
            'The other option is CBR, which is the SBOX used by the Central Bank of Russia.'
        )) = SBOX.R34, **more
    ):
        sbox = Arg.AsOption(sbox, SBOX)
        super().__init__(key, iv, padding=padding, mode=mode, raw=raw, swap=swap, sbox=sbox, **more)

    def _new_cipher(self, **optionals) -> CipherInterface:
        return super()._new_cipher(
            swap=self.args.swap,
            sbox=self.args.sbox,
            **optionals
        )

class group (size)

This unit is implemented in refinery.units.meta.group and has the following commandline Interface:

usage: group [-h] [-L] [-Q] [-0] [-v] N

Group incoming chunks into frames of the given size.

positional arguments:
  N              Size of each group; must be at least 2.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class group(Unit):
    """
    Group incoming chunks into frames of the given size.
    """
    def __init__(self, size: Arg.Number(help='Size of each group; must be at least 2.', bound=(2, None))):
        super().__init__(size=size)

    def process(self, data: Chunk):
        if not data.temp:
            return
        yield data
        yield from islice(data.temp, 0, self.args.size - 1)

    def filter(self, chunks):
        it = iter(chunks)
        while True:
            try:
                head: Chunk = next(it)
            except StopIteration:
                return
            head.temp = it
            yield head

class groupby (name)

This unit is implemented in refinery.units.meta.groupby and has the following commandline Interface:

usage: groupby [-h] [-L] [-Q] [-0] [-v] name

Group incoming chunks by the contents of a meta variable. Note that the unit blocks and cannot
stream any output until the input frame is consumed: It has to read every input chunk to make
sure that all groupings are complete.

positional arguments:
  name           name of the meta variable

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class groupby(Unit):
    """
    Group incoming chunks by the contents of a meta variable. Note that the unit
    blocks and cannot stream any output until the input frame is consumed: It has
    to read every input chunk to make sure that all groupings are complete.
    """
    def __init__(self, name: Arg(type=str, help='name of the meta variable')):
        super().__init__(name=check_variable_name(name))

    def process(self, data):
        yield from data.temp

    def filter(self, chunks: Iterable[Chunk]) -> Generator[Chunk, None, None]:
        name = self.args.name
        members = defaultdict(list)
        for chunk in chunks:
            try:
                value = chunk.meta[name]
            except KeyError:
                value = None
            members[value].append(chunk)
        for chunklist in members.values():
            dummy = chunklist[0]
            dummy.temp = chunklist
            yield dummy

class hc128 (key, discard=0, stateful=False)

This unit is implemented in refinery.units.crypto.cipher.hc128 and has the following commandline Interface:

usage: hc128 [-h] [-L] [-Q] [-0] [-v] [-R] [-d N] [-s] key

HC-128 encryption and decryption.

positional arguments:
  key              The encryption key.

optional arguments:
  -d, --discard N  Discard the first N bytes of the keystream, 0 by default.
  -s, --stateful   Do not reset the key stream while processing the chunks of one frame.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation; Specify twice to normalize (first decode, then
                   encode).

Expand source code Browse git

class hc128(StreamCipherUnit):
    """
    HC-128 encryption and decryption.
    """
    key_size = {32}

    def keystream(self) -> Iterable[int]:
        return HC128(self.args.key)

class hc256 (key, iv=b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', discard=0, stateful=False)

This unit is implemented in refinery.units.crypto.cipher.hc256 and has the following commandline Interface:

usage: hc256 [-h] [-L] [-Q] [-0] [-v] [-R] [-d N] [-s] key [iv]

HC-256 encryption and decryption.

positional arguments:
  key              The encryption key.
  iv               An initialization vector; the default is a sequence of 32 zero bytes.

optional arguments:
  -d, --discard N  Discard the first N bytes of the keystream, 0 by default.
  -s, --stateful   Do not reset the key stream while processing the chunks of one frame.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation; Specify twice to normalize (first decode, then
                   encode).

Expand source code Browse git

class hc256(StreamCipherUnit):
    """
    HC-256 encryption and decryption.
    """
    key_size = {32}

    def __init__(
        self, key,
        iv: Arg(help='An initialization vector; the default is a sequence of 32 zero bytes.') = bytes(32),
        discard=0, stateful=False,
    ):
        super().__init__(key=key, iv=iv, stateful=stateful, discard=discard)
        self._keystream = None

    def keystream(self) -> Iterable[int]:
        for num in HC256(self.args.key, self.args.iv):
            yield from num.to_bytes(4, 'little')

class hex

This unit is implemented in refinery.units.encoding.hex and has the following commandline Interface:

usage: hex [-h] [-L] [-Q] [-0] [-v] [-R]

Hex-decodes and encodes binary data. Non-hex characters are removed from the input. For decoding,
any odd trailing hex digits are stripped as two hex digits are required to represent a byte.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class hex(Unit):
    """
    Hex-decodes and encodes binary data. Non-hex characters are removed from
    the input. For decoding, any odd trailing hex digits are stripped as two
    hex digits are required to represent a byte.
    """

    def reverse(self, data):
        import base64
        return base64.b16encode(data)

    def process(self, data):
        import re
        import base64
        data = re.sub(B'[^A-Fa-f0-9]+', B'', data)
        if len(data) % 2:
            data = data[:-1]
        return base64.b16decode(data, casefold=True)

class hexload (blocks=1, dense=False, expand=False, narrow=False, width=0)

This unit is implemented in refinery.units.formats.hexload and has the following commandline Interface:

usage: hexload [-h] [-L] [-Q] [-0] [-v] [-R] [-B N] [-D] [-E] [-N] [-W N]

Convert hex dumps back to the original data and vice versa. All options of this unit apply to its
reverse operation where binary data is converted to a readable hexdump format. The default mode
of the unit expects the input data to contain a readable hexdump and converts it back to binary.

optional arguments:
  -B, --blocks N  Group hexadecimal bytes in blocks of the given size; default is 1.
  -D, --dense     Do not insert spaces in hexdump.
  -E, --expand    Do not compress sequences of identical lines in hexdump
  -N, --narrow    Do not show addresses in hexdump
  -W, --width N   Specify the number of hexadecimal characters to use in preview.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Allow partial results as output.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.
  -R, --reverse   Use the reverse operation; Specify twice to normalize (first decode, then
                  encode).

Expand source code Browse git

class hexload(HexViewer):
    """
    Convert hex dumps back to the original data and vice versa. All options of this unit apply
    to its reverse operation where binary data is converted to a readable hexdump format.
    The default mode of the unit expects the input data to contain a readable hexdump and
    converts it back to binary.
    """
    @regex
    class _ENCODED_BYTES:
        R"""
        (?ix)(?:^|(?<=\s))                      # encoded byte patches must be prefixed by white space
        (?:
            (?:                                 # separated chunks of hex data
                [a-f0-9]{2}                     # hexadecimal chunk; single byte (two hexadecimal letters)
                \s{1,2}                         # encoded byte followed by whitespace
                (?:                             # at least one more encoded byte
                    [a-f0-9]{2}                 # followed by more encoded bytes
                    (?:\s{1,2}[a-f0-9]{2})*     # unless it was just a single byte
                )?
            )
            | (?:[a-f0-9]{4}\s{1,2}             # 2-byte chunks
              (?:[a-f0-9]{4}
              (?:\s{1,2}[a-f0-9]{4})*)?)
            | (?:[a-f0-9]{8}\s{1,2}             # 4-byte chunks
              (?:[a-f0-9]{8}
              (?:\s{1,2}[a-f0-9]{8})*)?)
            | (?:(?:[a-f0-9]{2})+)              # continuous line of hexadecimal characters
        )(?=\s|$)                               # terminated by a whitespace or line end
        """

    def __init__(self, blocks=1, dense=False, expand=False, narrow=False, width=0):
        super().__init__(blocks=blocks, dense=dense, expand=expand, narrow=narrow, width=width)
        self._hexline_pattern = re.compile(F'{make_hexline_pattern(1)}(?:[\r\n]|$)', flags=re.MULTILINE)

    def process(self, data: bytearray):
        lines = data.decode(self.codec).splitlines(keepends=False)

        if not lines:
            return None

        decoded_bytes = bytearray()
        encoded_byte_matches: List[Dict[int, int]] = []

        for line in lines:
            matches: Dict[int, int] = {}
            encoded_byte_matches.append(matches)
            for match in self._ENCODED_BYTES.finditer(line):
                a, b = match.span()
                matches[a] = b - a

        it = iter(encoded_byte_matches)
        offsets = set(next(it).keys())
        for matches in it:
            offsets.intersection_update(matches.keys())
        if not offsets:
            raise ValueError('unable to determine the position of the hex bytes in this dump')
        lengths: Dict[int, List[int]] = {offset: [] for offset in offsets}
        del offsets
        for matches in encoded_byte_matches:
            for offset in lengths:
                lengths[offset].append(matches[offset])
        for offset in lengths:
            lengths[offset].sort()
        midpoint = len(encoded_byte_matches) // 2
        offset, length = max(((offset, lengths[offset][midpoint]) for offset in lengths),
            key=operator.itemgetter(1))
        end = offset + length
        del lengths
        for k, line in enumerate(lines, 1):
            encoded_line = line[offset:end]
            onlyhex = re.search(r'^[\sA-Fa-f0-9]+', encoded_line)
            if not onlyhex:
                self.log_warn(F'ignoring line without hexadecimal data: {line}')
                continue
            if onlyhex.group(0) != encoded_line:
                if k != len(lines):
                    self.log_warn(F'ignoring line with mismatching hex data length: {line}')
                    continue
                encoded_line = onlyhex.group(0)
            self.log_debug(F'decoding: {encoded_line.strip()}')
            decoded_line = bytes.fromhex(encoded_line)
            decoded_bytes.extend(decoded_line)
            txt = line[end:]
            txt_stripped = txt.strip()
            if not txt_stripped:
                continue
            if len(decoded_line) not in range(len(txt_stripped), len(txt) + 1):
                self.log_warn(F'preview size {len(txt_stripped)} does not match decoding: {line}')
        if decoded_bytes:
            yield decoded_bytes

    def reverse(self, data):
        metrics = self._get_metrics(len(data))
        if not self.args.width:
            metrics.fit_to_width(allow_increase=True)
        for line in self.hexdump(data, metrics):
            yield line.encode(self.codec)

class HKDF (size, salt, hash='SHA512')

This unit is implemented in refinery.units.crypto.keyderive.hkdf and has the following commandline Interface:

usage: HKDF [-h] [-L] [-Q] [-0] [-v] size salt [hash]

HKDF Key derivation

positional arguments:
  size           The number of bytes to generate.
  salt           Salt for the derivation.
  hash           Specify one of these algorithms (default is SHA512): md2, md4, md5, sha1,
                 sha256, sha512, sha224, sha384

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class HKDF(KeyDerivation):
    """HKDF Key derivation"""

    def __init__(self, size, salt, hash='SHA512'):
        super().__init__(size=size, salt=salt, hash=hash)

    def process(self, data):
        from Cryptodome.Protocol.KDF import HKDF
        return HKDF(data, self.args.size, self.args.salt, self.hash)

class hmac (salt, hash='SHA1', size=None)

This unit is implemented in refinery.units.crypto.keyderive.hmac and has the following commandline Interface:

usage: hmac [-h] [-L] [-Q] [-0] [-v] salt [hash] [size]

HMAC based key derivation

positional arguments:
  salt           Salt for the derivation.
  hash           Specify one of these algorithms (default is SHA1): md2, md4, md5, sha1, sha256,
                 sha512, sha224, sha384
  size           The number of bytes to generate.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class hmac(KeyDerivation):
    """
    HMAC based key derivation
    """

    def __init__(self, salt, hash='SHA1', size=None):
        super().__init__(salt=salt, size=size, hash=hash)

    def process(self, data):
        from Cryptodome.Hash import HMAC
        return HMAC.new(data, self.args.salt, digestmod=self.hash).digest()

class htmlesc

This unit is implemented in refinery.units.encoding.htmlesc and has the following commandline Interface:

usage: htmlesc [-h] [-L] [-Q] [-0] [-v] [-R]

Encodes and decodes HTML entities.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class htmlesc(Unit):
    """
    Encodes and decodes HTML entities.
    """

    @unicoded
    def process(self, data: str) -> str:
        return html_entities.unescape(data)

    @unicoded
    def reverse(self, data: str) -> str:
        return html_entities.escape(data)

class httpresponse

This unit is implemented in refinery.units.formats.httpresponse and has the following commandline Interface:

usage: httpresponse [-h] [-L] [-Q] [-0] [-v]

Parses HTTP response text, as you would obtain from a packet dump. This can be useful if chunked
or compressed transfer encoding was used.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class httpresponse(Unit):
    """
    Parses HTTP response text, as you would obtain from a packet dump. This can be
    useful if chunked or compressed transfer encoding was used.
    """
    def process(self, data):
        with SockWrapper(data) as mock:
            mock.seek(0)
            parser = HTTPResponse(mock)
            parser.begin()
            try:
                return parser.read()
            except IncompleteRead as incomplete:
                msg = F'incomplete read: {len(incomplete.partial)} bytes processed, {incomplete.expected} more expected'
                raise RefineryPartialResult(msg, incomplete.partial) from incomplete

class iemap (legend=False, background=False, block_char='#', *label)

This unit is implemented in refinery.units.sinks.iemap and has the following commandline Interface:

usage: iemap [-h] [-L] [-Q] [-0] [-v] [-l] [-b] [-c C] [label-part [label-part ...]]

The information entropy map displays a colored bar on the terminal visualizing the file's local
entropy from beginning to end.

positional arguments:
  label-part          The remaining command line specifies a format string expression that will
                      be printed over the heat map display of each processed chunk.

optional arguments:
  -l, --legend        Show entropy color legend.
  -b, --background    Generate the bar by coloring the background.
  -c, --block-char C  Character used for filling the bar, default is #

generic options:
  -h, --help          Show this help message and exit.
  -L, --lenient       Allow partial results as output.
  -Q, --quiet         Disables all log output.
  -0, --devnull       Do not produce any output.
  -v, --verbose       Specify up to two times to increase log level.

Expand source code Browse git

class iemap(Unit):
    """
    The information entropy map displays a colored bar on the terminal visualizing the file's local
    entropy from beginning to end.
    """
    def __init__(
        self,
        legend: Unit.Arg.Switch('-l', help='Show entropy color legend.') = False,
        background: Unit.Arg.Switch('-b', help='Generate the bar by coloring the background.') = False,
        block_char: Unit.Arg('-c', '--block-char', type=str, metavar='C',
            help='Character used for filling the bar, default is {default}') = '#',
        *label: Unit.Arg(type=str, metavar='label-part', help=(
            'The remaining command line specifies a format string expression that will be printed '
            'over the heat map display of each processed chunk.'
        ))
    ):
        super().__init__(label=' '.join(label), background=background, legend=legend, block_char=block_char)

    @Unit.Requires('colorama', 'display', 'default', 'extended')
    def _colorama():
        import colorama
        return colorama

    def process(self, data):
        from sys import stderr
        from os import name as os_name
        colorama = self._colorama
        colorama.init(autoreset=False, convert=(os_name == 'nt'))

        nobg = not self.args.background
        meta = metavars(data)

        label = meta.format_str(self.args.label, self.codec, [data])
        if label:
            if not label.endswith(' '):
                label = F'{label} '
            if not label.startswith(' '):
                label = F' {label}'

        bgmap = [
            colorama.Back.BLACK,
            colorama.Back.WHITE,
            colorama.Back.YELLOW,
            colorama.Back.CYAN,
            colorama.Back.BLUE,
            colorama.Back.GREEN,
            colorama.Back.LIGHTRED_EX,
            colorama.Back.MAGENTA,
        ]
        fgmap = [
            colorama.Fore.LIGHTBLACK_EX,
            colorama.Fore.LIGHTWHITE_EX,
            colorama.Fore.LIGHTYELLOW_EX,
            colorama.Fore.LIGHTCYAN_EX,
            colorama.Fore.LIGHTBLUE_EX,
            colorama.Fore.LIGHTGREEN_EX,
            colorama.Fore.LIGHTRED_EX,
            colorama.Fore.LIGHTMAGENTA_EX,
        ]

        _reset = colorama.Back.BLACK + colorama.Fore.WHITE + colorama.Style.RESET_ALL

        clrmap = fgmap if nobg else bgmap
        header = '['
        header_length = 1
        footer_length = 4 + 7

        if self.args.legend:
            header = '[{1}{0}] {2}'.format(_reset, ''.join(F'{bg}{k}' for k, bg in enumerate(clrmap, 1)), header)
            header_length += 3 + len(clrmap)

        _tw = get_terminal_size()
        width = _tw - header_length - footer_length
        if width < 16:
            raise RuntimeError(F'computed terminal width {_tw} is too small for heatmap')

        def entropy_select(value, map):
            index = min(len(map) - 1, math.floor(value * len(map)))
            return map[index]

        view = memoryview(data)
        size = len(data)
        chunk_size = 0

        for block_size in range(1, width + 1):
            block_count = width // block_size
            chunk_size = size // block_count
            if chunk_size > 1024:
                break

        q, remainder = divmod(width, block_size)
        assert q == block_count
        indices = list(range(q))
        random.seed(sum(view[:1024]))
        random.shuffle(indices)

        block_sizes = [block_size] * q
        q, r = divmod(remainder, block_count)
        for i in indices:
            block_sizes[i] += q
        for i in indices[:r]:
            block_sizes[i] += 1
        assert sum(block_sizes) == width

        q, remainder = divmod(size, block_count)
        assert q == chunk_size
        chunk_sizes = [chunk_size] * block_count
        for i in indices[:remainder]:
            chunk_sizes[i] += 1
        assert sum(chunk_sizes) == size

        stream = MemoryFile(view)
        filler = self.args.block_char if nobg else ' '

        try:
            stderr.write(header)
            if label is not None:
                stderr.write(colorama.Fore.WHITE)
                stderr.flush()
            it = itertools.chain(itertools.repeat(filler, 3), label, itertools.cycle(filler))
            cp = None
            for chunk_size, block_size in zip(chunk_sizes, block_sizes):
                chunk = stream.read(chunk_size)
                chunk_entropy = entropy(chunk)
                pp = entropy_select(chunk_entropy, clrmap)
                string = ''.join(itertools.islice(it, block_size))
                if pp != cp:
                    string = F'{pp}{string}'
                cp = pp
                stderr.write(string)
                stderr.flush()
        except BaseException:
            eraser = ' ' * width
            stderr.write(F'\r{_reset}{eraser}\r')
            raise
        else:
            stderr.write(F'{_reset}] [---.--%]')
            te = meta['entropy']
            stderr.write('\b' * footer_length)
            stderr.write(F'] [{te!r:>7}]\n')
            stderr.flush()
        if not self.isatty:
            yield data

class iff (*expression, ge=None, gt=None, le=None, lt=None, ct=None, iN=None, eq=None, single=False, negate=False)

This unit is implemented in refinery.units.meta.iff and has the following commandline Interface:

usage: iff [-h] [-L] [-Q] [-0] [-v]
           [-ge RHS | -gt RHS | -le RHS | -lt RHS | -ct RHS | -in RHS | -eq RHS] [-s] [-n]
           [token [token ...]]

Filter incoming chunks depending on whether a given Python expression evaluates to true. If no
expression is given, the unit filters out empty chunks.

Conditional units can be used in two different ways. When a new frame opens after using this
unit, chunks that did not match the condition are moved out of scope for that frame but still
exist and will re-appear after the frame closes. When used inside a frame, however, the unit
works as a filter and will discard any chunks that do not match.

positional arguments:
  token          All "token" arguments to this unit are joined with spaces to produce the
                 expression to be evaluated. This is done so that unnecessary shell quoting is
                 avoided.

optional arguments:
  -ge RHS        check that the expression is greater or equal to RHS
  -gt RHS        check that the expression is greater than RHS
  -le RHS        check that the expression is less or equal to RHS
  -lt RHS        check that the expression is less than RHS
  -ct RHS        check that the expression contains RHS
  -in, -i RHS    check that the expression is contained in RHS
  -eq, -e RHS    check that the expression is equal to RHS
  -s, --single   discard all chunks after filtering a single one that matches the condition
  -n, --negate   invert the logic of this filter; drop all matching chunks instead of keeping
                 them

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class iff(ConditionalUnit, extend_docs=True):
    """
    Filter incoming chunks depending on whether a given Python expression evaluates to true. If no
    expression is given, the unit filters out empty chunks.
    """
    def __init__(
        self,
        *expression: Arg(metavar='token', type=str, help=(
            'All "token" arguments to this unit are joined with spaces to produce the expression '
            'to be evaluated. This is done so that unnecessary shell quoting is avoided.')),
        ge: Arg('-ge', type=str, metavar='RHS', group='OP',
            help='check that the expression is greater or equal to {varname}') = None,
        gt: Arg('-gt', type=str, metavar='RHS', group='OP',
            help='check that the expression is greater than {varname}') = None,
        le: Arg('-le', type=str, metavar='RHS', group='OP',
            help='check that the expression is less or equal to {varname}') = None,
        lt: Arg('-lt', type=str, metavar='RHS', group='OP',
            help='check that the expression is less than {varname}') = None,
        ct: Arg('-ct', type=str, metavar='RHS', group='OP',
            help='check that the expression contains {varname}') = None,
        iN: Arg('-in', '-i', type=str, metavar='RHS', group='OP',
            help='check that the expression is contained in {varname}') = None,
        eq: Arg('-eq', '-e', type=str, metavar='RHS', group='OP',
            help='check that the expression is equal to {varname}') = None,
        single=False,
        negate=False,
    ):
        operators = [
            (ge, operator.__ge__),
            (gt, operator.__gt__),
            (le, operator.__le__),
            (lt, operator.__lt__),
            (eq, operator.__eq__),
            (ct, operator.__contains__),
            (iN, lambda a, b: operator.__contains__(b, a)),
        ]

        operators = [
            (rhs, cmp)
            for (rhs, cmp) in operators
            if rhs is not None
        ]

        rhs, cmp, lhs = None, None, '\x20'.join(expression) or None

        if len(operators) > 0:
            if not lhs:
                raise ValueError('Comparison operator with empty left hand side.')
            if len(operators) > 1:
                raise ValueError('Only one comparison operation can be specified.')
            rhs, cmp = operators[0]

        super().__init__(
            lhs=lhs,
            rhs=rhs,
            cmp=cmp,
            negate=negate,
            single=single,
        )

    def match(self, chunk):
        meta = metavars(chunk)
        lhs: Optional[str] = self.args.lhs
        rhs: Optional[Any] = self.args.rhs
        cmp: Optional[Callable[[Any, Any], bool]] = self.args.cmp

        if cmp is None and rhs is not None:
            raise ValueError('right hand side defined but no operator')

        if lhs is not None:
            if rhs is not None:
                lhs = DelayedNumSeqArgument(lhs, additional_types=(float, str))(chunk)
            else:
                lhs = PythonExpression.evaluate(lhs, meta)

        self.log_debug('lhs:', lhs)
        self.log_debug('rhs:', rhs)

        rhs = rhs and DelayedNumSeqArgument(rhs, additional_types=(float, str))(chunk)

        if lhs is None:
            return bool(chunk)
        if rhs is None:
            return bool(lhs)

        return cmp(lhs, rhs)

class iffp (*patterns, partial=False, negate=False, single=False)

This unit is implemented in refinery.units.meta.iffp and has the following commandline Interface:

usage: iffp [-h] [-L] [-Q] [-0] [-v] [-p] [-n] [-s] [pattern [pattern ...]]

Filter incoming chunks depending on whether it matches any of a given set of patterns. The
available patterns are the following: integer, float, number, string, multiline_string, cmdstr,
ps1str, vbastr, vbaint, printable, urlquote, urlquote_coarse, urlquote_narrow, intarray,
numarray, word, letters, wshenc, alphanumeric, b32, b64, b85, b64any, b64url, hex, uppercase_hex,
spaced_hex, spaced_b64, spaced_b85, utf8, hexdump, hexarray, uuencode, domain, email, guid, ipv4,
ipv6, md5, sha1, sha256, hostname, socket, subdomain, url, btc, pem, xmr, path, winpath, nixpath,
environment_variable.

Conditional units can be used in two different ways. When a new frame opens after using this
unit, chunks that did not match the condition are moved out of scope for that frame but still
exist and will re-appear after the frame closes. When used inside a frame, however, the unit
works as a filter and will discard any chunks that do not match.

positional arguments:
  pattern

optional arguments:
  -p, --partial  Allow partial matches on the data.
  -n, --negate   invert the logic of this filter; drop all matching chunks instead of keeping
                 them
  -s, --single   discard all chunks after filtering a single one that matches the condition

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class iffp(ConditionalUnit, extend_docs=True):
    """
    Filter incoming chunks depending on whether it matches any of a given set of patterns. The
    available patterns are the following: {}.
    """

    def __init__(
        self,
        *patterns: Arg.Choice(metavar='pattern', choices=_PATTERNS),
        partial: Arg.Switch('-p', help='Allow partial matches on the data.') = False,
        negate=False, single=False
    ):
        super().__init__(
            negate=negate,
            single=single,
            patterns=patterns,
            partial=partial
        )

    def match(self, chunk):
        for name in self.args.patterns:
            p: pattern = _PATTERNS[name]
            matcher = p.match if self.args.partial else p.fullmatch
            if matcher(chunk):
                return True
        return False

class iffs (needle, negate=False, single=False)

This unit is implemented in refinery.units.meta.iffs and has the following commandline Interface:

usage: iffs [-h] [-L] [-Q] [-0] [-v] [-n] [-s] needle

Filter incoming chunks depending on whether they contain a given binary substring.

Conditional units can be used in two different ways. When a new frame opens after using this
unit, chunks that did not match the condition are moved out of scope for that frame but still
exist and will re-appear after the frame closes. When used inside a frame, however, the unit
works as a filter and will discard any chunks that do not match.

positional arguments:
  needle         the string to search for

optional arguments:
  -n, --negate   invert the logic of this filter; drop all matching chunks instead of keeping
                 them
  -s, --single   discard all chunks after filtering a single one that matches the condition

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class iffs(ConditionalUnit, extend_docs=True):
    """
    Filter incoming chunks depending on whether they contain a given binary substring.
    """
    def __init__(
        self,
        needle: Arg(help='the string to search for'),
        negate=False,
        single=False,
    ):
        super().__init__(
            needle=needle,
            negate=negate,
            single=single,
        )

    def match(self, chunk):
        return self.args.needle in chunk

class iffx (regex, multiline=False, ignorecase=False, negate=False, single=False, match=False)

This unit is implemented in refinery.units.meta.iffx and has the following commandline Interface:

usage: iffx [-h] [-L] [-Q] [-0] [-v] [-M] [-I] [-n] [-s] [-m] regex

Filter incoming chunks by discarding those that do not match the given regular expression.

Conditional units can be used in two different ways. When a new frame opens after using this
unit, chunks that did not match the condition are moved out of scope for that frame but still
exist and will re-appear after the frame closes. When used inside a frame, however, the unit
works as a filter and will discard any chunks that do not match.

positional arguments:
  regex             Regular expression to match.

optional arguments:
  -M, --multiline   Caret and dollar match the beginning and end of a line, a dot does not match
                    line breaks.
  -I, --ignorecase  Ignore capitalization for alphabetic characters.
  -n, --negate      invert the logic of this filter; drop all matching chunks instead of keeping
                    them
  -s, --single      discard all chunks after filtering a single one that matches the condition
  -m, --match       Perform a full match rather than matching anywhere in the chunk.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Allow partial results as output.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class iffx(RegexUnit, ConditionalUnit, extend_docs=True):
    """
    Filter incoming chunks by discarding those that do not match the given
    regular expression.
    """
    def __init__(
        self, regex, multiline=False, ignorecase=False, negate=False, single=False,
        match: Arg.Switch('-m',
            help='Perform a full match rather than matching anywhere in the chunk.') = False
    ):
        super().__init__(
            regex=regex,
            negate=negate,
            single=single,
            multiline=multiline,
            ignorecase=ignorecase,
            match=match
        )

    def match(self, chunk):
        return bool(self._matcher(chunk))

    def filter(self, chunks):
        self._matcher = self.regex.fullmatch if self.args.match else self.regex.search
        yield from super().filter(chunks)

class ifps

This unit is implemented in refinery.units.formats.ifps and has the following commandline Interface:

usage: ifps [-h] [-L] [-Q] [-0] [-v]

Disassembles compiled Pascal script files that start with the magic sequence "IFPS". These
scripts can be found, for example, when unpacking InnoSetup installers using innounp.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class ifps(Unit):
    """
    Disassembles compiled Pascal script files that start with the magic sequence "IFPS". These
    scripts can be found, for example, when unpacking InnoSetup installers using innounp.
    """
    def process(self, data):
        return str(IFPSFile(data)).encode(self.codec)

    @classmethod
    def handles(self, data: bytearray) -> bool:
        return data.startswith(IFPSFile.Magic)

class ifpsstr

This unit is implemented in refinery.units.formats.ifpsstr and has the following commandline Interface:

usage: ifpsstr [-h] [-L] [-Q] [-0] [-v]

Extracts strings from compiled Pascal script files that start with the magic sequence "IFPS".
These scripts can be found, for example, when unpacking InnoSetup installers using innounp.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class ifpsstr(Unit):
    """
    Extracts strings from compiled Pascal script files that start with the magic sequence "IFPS".
    These scripts can be found, for example, when unpacking InnoSetup installers using innounp.
    """
    def process(self, data):
        ifps = IFPSFile(data)
        for string in ifps.strings:
            yield string.encode(self.codec)

    @classmethod
    def handles(self, data: bytearray) -> bool:
        return data.startswith(IFPSFile.Magic)

class imphash (text=False)

This unit is implemented in refinery.units.crypto.hash.imphash and has the following commandline Interface:

usage: imphash [-h] [-L] [-Q] [-0] [-v] [-t]

Implements the import hash for PE files.

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class imphash(HashUnit):
    """
    Implements the import hash for PE files.
    """

    def _algorithm(self, data):
        pe = PE(data=data, fast_load=True)
        pe.parse_data_directories(directories=[IMAGE_DIRECTORY_ENTRY_IMPORT])
        th = pe.get_imphash()
        if not th:
            raise ValueError('no import directory.')
        return bytes.fromhex(th)

class isaac (key, discard=0, stateful=False)

This unit is implemented in refinery.units.crypto.cipher.isaac and has the following commandline Interface:

usage: isaac [-h] [-L] [-Q] [-0] [-v] [-R] [-d N] [-s] key

The ISAAC (Indirection, Shift, Accumulate, Add, Count) cipher.

positional arguments:
  key              The encryption key.

optional arguments:
  -d, --discard N  Discard the first N bytes of the keystream, 0 by default.
  -s, --stateful   Do not reset the key stream while processing the chunks of one frame.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation; Specify twice to normalize (first decode, then
                   encode).

Expand source code Browse git

class isaac(StreamCipherUnit):
    """
    The ISAAC (Indirection, Shift, Accumulate, Add, Count) cipher.
    """

    def keystream(self) -> Iterable[int]:
        key = self.args.key

        A: int = 0
        B: int = 0
        C: int = 0
        S: List[int] = [0x9E3779B9] * 8
        T: List[int] = []
        K = list(chunks.unpack(key + bytearray(0x400 - len(key)), 4, bigendian=False))
        U = 0xFFFFFFFF

        def _mix_state():
            a, b, c, d, e, f, g, h = S
            a ^= (b << 0x0B) & U; d = d + a & U; b = b + c & U # noqa
            b ^= (c >> 0x02) & U; e = e + b & U; c = c + d & U # noqa
            c ^= (d << 0x08) & U; f = f + c & U; d = d + e & U # noqa
            d ^= (e >> 0x10) & U; g = g + d & U; e = e + f & U # noqa
            e ^= (f << 0x0A) & U; h = h + e & U; f = f + g & U # noqa
            f ^= (g >> 0x04) & U; a = a + f & U; g = g + h & U # noqa
            g ^= (h << 0x08) & U; b = b + g & U; h = h + a & U # noqa
            h ^= (a >> 0x09) & U; c = c + h & U; a = a + b & U # noqa
            S[:] = a, b, c, d, e, f, g, h
            return S

        def _initialize_with(R: List[int]):
            for i in range(0, 0x100, 8):
                S[:] = (x + R[j] & U for j, x in enumerate(S, i))
                T[i:i + 8] = _mix_state()

        for _ in range(4):
            _mix_state()

        _initialize_with(K)
        _initialize_with(T)

        operations = [
            (__lshift__, 0x0D),
            (__rshift__, 0x06),
            (__lshift__, 0x02),
            (__rshift__, 0x10),
        ]

        while True:
            C = (C + 1) & U
            B = (B + C) & U
            for i in range(0x100):
                X = T[i]
                shift, k = operations[i % 4]
                A = (A ^ shift(A, k)) & U
                A = (A + T[i ^ 0x80]) & U
                Y = T[+i] = T[X // 4 & 0xFF] + A + B & U
                B = K[~i] = X + T[Y // 1024 & 0xFF] & U
            yield from chunks.pack(K, 4, True)

class jcalg (ignore_header=False)

This unit is implemented in refinery.units.compression.jcalg and has the following commandline Interface:

usage: jcalg [-h] [-L] [-Q] [-0] [-v] [-g]

JCALG decompression.

optional arguments:
  -g, --ignore-header  Keep decompressing even after the output has reached the final size as
                       given by the header value.

generic options:
  -h, --help           Show this help message and exit.
  -L, --lenient        Allow partial results as output.
  -Q, --quiet          Disables all log output.
  -0, --devnull        Do not produce any output.
  -v, --verbose        Specify up to two times to increase log level.

Expand source code Browse git

class jcalg(Unit):
    """
    JCALG decompression.
    """
    def __init__(
        self,
        ignore_header: Unit.Arg('-g', help=(
            'Keep decompressing even after the output has reached the final size as given by the header value.')) = False,
    ):
        super().__init__(ignore_header=ignore_header)

    def process(self, data: bytearray):
        with MemoryFile() as output, StructReader(data) as reader:
            if reader.read(2) != B'JC':
                self.log_warn('data does not begin with magic sequence, assuming that header is missing')
                reader.seek(0)
                size = checksum = None
            else:
                size = reader.u32()
                checksum = reader.u32()
            if self.args.ignore_header:
                size = None
            self._decompress(output, reader, size)
            if size is not None:
                if len(output) > size:
                    self.log_info(F'tuncating to size {size}')
                    output.truncate(size)
                elif len(output) < size:
                    self.log_warn(F'header size was {size}, but only {len(data)} bytes were decompressed')
            data = output.getvalue()
            if checksum:
                c = self._checksum(data)
                if c != checksum:
                    self.log_warn(F'header checksum was {checksum:08X}, computed value is {c:08X}')
            return data

    @classmethod
    def handles(cls, data: bytearray):
        if data[:2] == B'JC':
            return True

    def _checksum(self, data):
        from refinery.lib import chunks
        checksum = 0
        it = chunks.unpack(data, 4)
        if len(data) % 4:
            import itertools
            it = itertools.chain(it, (int.from_bytes(data[-4:], 'little'),))
        for chunk in it:
            checksum += chunk
            checksum ^= ((chunk & 0x7FFFFFFF) << 1) + (chunk >> 31) + 1
            checksum &= 0xFFFFFFFF
        return checksum

    def _decompress(self, writer: MemoryFile, reader_: StructReader[bytearray], size: Optional[int] = None):
        index = 1
        base = 8
        literal_bits = None
        literal_offset = None
        flags = BitBufferedReader(reader_, 32)

        while True:
            if size and len(writer) >= size:
                break
            if flags.next():
                b = flags.read(literal_bits) + literal_offset
                b = b & 0xFF
                writer.write_byte(b)
                continue
            if flags.next():
                high = flags.variable_length_integer()
                if high == 2:
                    match_length = flags.variable_length_integer()
                else:
                    index = ((high - 3) << base) + flags.read(base)
                    match_length = flags.variable_length_integer()
                    if index >= 0x10000:
                        match_length += 3
                    elif index >= 0x37FF:
                        match_length += 2
                    elif index >= 0x27F:
                        match_length += 1
                    elif index <= 127:
                        match_length += 4
                writer.replay(index, match_length)
                continue
            if not flags.next():
                new_index = flags.read(7)
                match_length = 2 + flags.read(2)
                if new_index == 0:
                    if match_length == 2:
                        break
                    base = flags.read(match_length + 1)
                else:
                    index = new_index
                    writer.replay(index, match_length)
                continue
            one_byte_phrase_value = flags.read(4) - 1
            if one_byte_phrase_value == 0:
                writer.write_byte(0)
            elif one_byte_phrase_value > 0:
                b = writer.getbuffer()[-one_byte_phrase_value]
                writer.write_byte(b)
            else:
                if not flags.next():
                    literal_bits = 7 + flags.next()
                    literal_offset = 0
                    if literal_bits != 8:
                        literal_offset = flags.read(8)
                    continue
                while True:
                    for _ in range(0x100):
                        b = flags.read(8)
                        writer.write_byte(b)
                    if not flags.next():
                        break

class jvdasm (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.java.jvdasm and has the following commandline Interface:

usage: jvdasm [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]]

Disassembles the JVM bytecode instructions of methods of classes defined in Java class files. The
unit is implemented as a path extractor and each path name corresponds to the name of one method
defined in the class file.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class jvdasm(PathExtractorUnit):
    """
    Disassembles the JVM bytecode instructions of methods of classes defined in Java class
    files. The unit is implemented as a path extractor and each path name corresponds to the
    name of one method defined in the class file.
    """
    _OPC_STRLEN = max(len(op.name) for op in opc)

    def _hex(self, bytestring, sep=''):
        return sep.join(F'{x:02x}' for x in bytestring)

    def unpack(self, data):
        jc = JvClassFile(data)
        tt = '  '
        opcw = self._OPC_STRLEN
        for method in jc.methods:
            for attribute in method.attributes:
                if attribute.name == 'Code': break
            else:
                self.log_warn(F'no code found for method: {method.name}')
                continue
            code: JvCode = attribute.parse(JvCode)
            with io.StringIO() as display:
                args, retval = re.match(R'^\((.*?)\)(.*?)$', method.descriptor).groups()
                print(F'{jc.this!s}::{method!s}{method.descriptor}', file=display)
                for op in code.disassembly:
                    olen = len(op.raw)
                    if op.table is None:
                        args = ', '.join(repr(a) for a in op.arguments)
                    else:
                        ow = 4 if op.code is opc.tableswitch else 8
                        olen = olen - (len(op.table) - 1) * ow
                        args = F'defaultjmp => {op.table[None]:#010x}'
                        jmps = []
                        for k, (key, jmp) in enumerate(op.table.items()):
                            if key is None:
                                continue
                            raw = self._hex(op.raw[olen + k * ow: olen + k * ow + ow], ' ')
                            jmps.append(F'{tt}{raw!s:<{opcw + 15}} {key:#010x} => {jmp:#010x}')
                        args = '\n'.join((args, *jmps))
                    opch = self._hex(op.raw[:olen], ' ')
                    if len(opch) > 14:
                        opch += F'\n{tt}{tt:<15}'
                    print(F'{tt}{opch:<15}{op.code!r:<{opcw}} {args}', file=display)
                name = method.name
                if name.startswith('<'):
                    this = jc.this.value.split('/')
                    this = this[-1]
                    name = F'{this}${name[1:-1]}'
                yield UnpackResult(F'{name}.jd', display.getvalue().encode(self.codec))

class jvstr

This unit is implemented in refinery.units.formats.java.jvstr and has the following commandline Interface:

usage: jvstr [-h] [-L] [-Q] [-0] [-v]

Extract string constants from Java class files.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class jvstr(Unit):
    """
    Extract string constants from Java class files.
    """
    def process(self, data):
        jc = JvClassFile(data)
        for string in jc.strings:
            yield string.encode(self.codec)

class kblob

This unit is implemented in refinery.units.crypto.keyderive.kblob and has the following commandline Interface:

usage: kblob [-h] [-L] [-Q] [-0] [-v]

Extracts a key from a Microsoft Crypto API BLOB structure.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class kblob(Unit):
    """
    Extracts a key from a Microsoft Crypto API BLOB structure.
    """

    def process(self, data):
        blob = CRYPTOKEY(data)
        try:
            return self.labelled(
                bytes(blob.key),
                type=blob.header.type.name,
                algorithm=blob.header.algorithm.name
            )
        except AttributeError as A:
            raise ValueError(F'unable to derive key from {blob.header.type!s}') from A

class lnk (tabular=False)

This unit is implemented in refinery.units.formats.lnk and has the following commandline Interface:

usage: lnk [-h] [-L] [-Q] [-0] [-v] [-t]

Parse Windows Shortcuts (LNK files) and returns the parsed information in JSON format. This unit
is a thin wrapper around the LnkParse3 library.

optional arguments:
  -t, --tabular  Print information in a table rather than as JSON

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class lnk(Unit):
    """
    Parse Windows Shortcuts (LNK files) and returns the parsed information in JSON format. This
    unit is a thin wrapper around the LnkParse3 library.
    """

    @Unit.Requires('LnkParse3>=1.4.0', 'formats', 'default', 'extended')
    def _LnkParse3():
        import LnkParse3
        return LnkParse3

    def __init__(self, tabular: Unit.Arg('-t', help='Print information in a table rather than as JSON') = False):
        super().__init__(tabular=tabular)

    def process(self, data):
        with NoLogging():
            parsed = self._LnkParse3.lnk_file(MemoryFile(data)).get_json()
        with JSONEncoderEx as encoder:
            pp = ppjson(tabular=self.args.tabular)
            yield from pp._pretty_output(
                parsed, indent=4, cls=encoder, ensure_ascii=False)

class lz4

This unit is implemented in refinery.units.compression.lz4 and has the following commandline Interface:

usage: lz4 [-h] [-L] [-Q] [-0] [-v]

LZ4 block decompression. See also:
https://github.com/lz4/lz4/blob/master/doc/lz4_Block_format.md#compressed-block-format

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class lz4(Unit):
    """
    LZ4 block decompression. See also:
    https://github.com/lz4/lz4/blob/master/doc/lz4_Block_format.md#compressed-block-format
    """
    def _read_block(self, reader, output, ubound=None):
        entry = reader.tell()
        lastend = 0

        def ubound_check():
            if ubound is None:
                return False
            consumed = reader.tell() - entry
            if consumed > ubound:
                raise ValueError(F'upper bound {ubound} exceeded by {consumed - ubound} in LZ4 block')
            return consumed == ubound

        while not reader.eof:
            reflen = reader.read_nibble()
            litlen = reader.read_nibble()
            litlen = reader.read_size(litlen)
            literal = reader.read(litlen)
            output.write(literal)
            if ubound_check(): break
            try: refpos = reader.u16()
            except EOF: break
            if refpos - 1 not in range(output.tell()):
                with StreamDetour(output, lastend):
                    if output.read(len(literal)) == literal:
                        # This literal could have been encoded in the last match, but it wasn't.
                        # Therefore, it is very likely that we have reached the end of the stream.
                        break
                position = reader.tell()
                remaining = len(literal) - position
                raise RefineryPartialResult(
                    F'encountered invalid match offset value {refpos} at position {position} with {remaining} bytes remaining',
                    partial=output.getvalue())
            reflen = reader.read_size(reflen)
            if ubound_check():
                raise ValueError('last sequence in block contained a match')
            reflen += 4
            available_bytes = min(refpos, reflen)
            q, r = divmod(reflen, available_bytes)
            with StreamDetour(output, -refpos, io.SEEK_CUR):
                match = output.read(available_bytes)
                match = q * match + match[:r]
                assert len(match) == reflen
                lastend = output.tell() - available_bytes + r
            output.write(match)

    def process(self, data):
        output = io.BytesIO()
        reader = LZ4Reader(memoryview(data))
        try:
            magic = reader.u32() == 0x184D2204
        except EOF:
            magic = False
        if not magic:
            reader.seek(0)
            self._read_block(reader, output)
            return output.getbuffer()

        (dict_id, rsrv1, content_checksummed, content_size,
            blocks_checksummed, blocks_independent, v2, v1) = reader.read_bits(8)
        rsrv2 = reader.read_nibble()
        try:
            block_maximum = {
                7: 0x400000,
                6: 0x100000,
                5: 0x040000,
                4: 0x010000,
            }[reader.read_integer(3)]
        except KeyError:
            raise ValueError('unknown maximum block size value in LZ4 frame header')
        rsrv3 = reader.read_bit()
        if any((rsrv1, rsrv2, rsrv3)):
            self.log_warn('nonzero reserved value in LZ4 frame header')
        if (v1, v2) != (0, 1):
            self.log_warn(F'invalid version ({v1},{v2}) in LZ4 frame header')
        content_size = content_size and reader.u64() or None
        dict_id = dict_id and reader.u32() or None
        # Header Checksum
        xxh = xxhash(data[4:reader.tell()]).intdigest() >> 8 & 0xFF
        chk = reader.read_byte()
        if chk != xxh:
            self.log_warn(F'header checksum {chk:02X} does not match computed value {xxh:02X}')

        self.log_debug(lambda: F'dictionary id: {dict_id}')
        self.log_debug(lambda: F'block max: 0x{block_maximum:X}')
        if content_size is not None:
            self.log_debug(lambda: F'chunk max: 0x{content_size:X}')
        self.log_debug(lambda: F'blocks independent: {bool(blocks_independent)}')
        self.log_debug(lambda: F'blocks checksummed: {bool(blocks_checksummed)}')

        blockindex = 0

        while True:
            blockindex += 1
            size = reader.read_integer(31)
            uncompressed = reader.read_bit()
            if not size:
                assert not uncompressed
                break
            self.log_info(F'reading block of size 0x{size:06X}')
            assert reader.byte_aligned
            assert size <= block_maximum, 'block size exceeds maximum size'
            if uncompressed:
                output.write(reader.read_exactly(size))
            else:
                self._read_block(reader, output, size)
            if blocks_checksummed:
                with StreamDetour(reader, -size, io.SEEK_CUR):
                    xxh = xxhash(reader.read_exactly(size)).intdigest()
                chk = reader.u32()
                if chk != xxh:
                    self.log_warn(F'block {blockindex} had checksum {chk:08X} which did not match computed value {xxh:08X}')
        if content_checksummed:
            self.log_info('computing checksum')
            xxh = xxhash(output.getbuffer()).intdigest()
            chk = reader.u32()
            if chk != xxh:
                self.log_warn(F'the given checksum {chk:08X} did not match the computed checksum {xxh:08X}')
        if not reader.eof:
            pos = reader.tell()
            self.log_warn(F'found {len(data) - pos} additional bytes starting at position 0x{pos:X} after compressed data')
        return output.getbuffer()

class lzf (fast=False)

This unit is implemented in refinery.units.compression.lzf and has the following commandline Interface:

usage: lzf [-h] [-L] [-Q] [-0] [-v] [-R] [-x]

This unit implements LZF compression and decompression.

optional arguments:
  -x, --fast     Enable fast compression mode.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class lzf(Unit):
    """
    This unit implements LZF compression and decompression.
    """

    def __init__(self, fast: Arg.Switch('-x', help='Enable fast compression mode.') = False):
        super().__init__(fast=fast)

    def reverse(self, data):
        def FRST(p: memoryview) -> int:
            return ((p[0]) << 8) | p[1]

        def NEXT(v: int, p: memoryview) -> int:
            return ((v << 8) | p[2]) & 0xFFFFFFFF

        def DELTA(p: memoryview):
            return view.nbytes - p.nbytes

        if self.args.fast:
            def HIDX(h: int) -> int:
                return (((h >> (3 * 8 - _HSLOG)) - h * 5) & (_HSIZE - 1))
        else:
            def HIDX(h: int) -> int:
                q = (h ^ (h << 5))
                return (((q >> (3 * 8 - _HSLOG)) - h * 5) & (_HSIZE - 1))

        if not data:
            return data

        ip = view = memoryview(data)
        op = bytearray()

        if len(data) == 1:
            op.append(0)
            op.extend(data)
            return op

        hval = FRST(ip)
        htab = [0] * _HSIZE
        fast = 1 if self.args.fast else 0

        lit = 0

        def begin_literal():
            nonlocal lit
            op.append(0)
            lit = 0

        def advance_literal():
            nonlocal lit, ip
            lit += 1
            op.append(ip[0])
            ip = ip[1:]
            if lit == _MAX_LIT:
                op[-lit - 1] = lit - 1
                begin_literal()

        def commit_literal():
            if lit > 0:
                op[-lit - 1] = lit - 1
            else:
                op.pop()

        begin_literal()

        while ip.nbytes > 2:
            hval = NEXT(hval, ip)
            hpos = HIDX(hval)
            ipos = DELTA(ip)
            length = 2
            r, htab[hpos] = htab[hpos], ipos
            off = ipos - r - 1
            ref = view[r:]

            if off >= _MAX_OFF or r <= 0 or ref[:3] != ip[:3]:
                advance_literal()
                continue
            else:
                commit_literal()

            maxlen = min(_MAX_REF, ip.nbytes - length)

            while True:
                length += 1
                if length >= maxlen or ref[length] != ip[length]:
                    length -= 2
                    break

            if length < 7:
                op.append((off >> 8) + (length << 5))
            else:
                op.append((off >> 8) + (7 << 5))
                op.append(length - 7)

            op.append(off & 0xFF)
            begin_literal()

            if ip.nbytes <= length + 3:
                ip = ip[length + 2:]
                break
            if fast:
                ip = ip[length:]
                hval = FRST(ip)
                for _ in range(2):
                    hval = NEXT(hval, ip)
                    htab[HIDX(hval)] = DELTA(ip)
                    ip = ip[1:]
            else:
                ip = ip[1:]
                for _ in range(length + 1):
                    hval = NEXT(hval, ip)
                    htab[HIDX(hval)] = DELTA(ip)
                    ip = ip[1:]
        while ip.nbytes:
            advance_literal()
        commit_literal()
        return op

    def _decompress_chunk(self, data: memoryview, out: MemoryFile):
        ip = StructReader(data)
        while not ip.eof:
            ctrl = ip.u8()
            if ctrl < 0B100000:
                ctrl += 1
                out.write(ip.read_exactly(ctrl))
            else:
                length = ctrl >> 5
                offset = 1 + ((ctrl & 0B11111) << 8)
                if length == 7:
                    length += ip.u8()
                offset += ip.u8()
                length += 2
                out.replay(offset, length)

    def process(self, data):
        mem = memoryview(data)
        out = MemoryFile()

        try:
            reader = StructReader(mem)
            header = LZFHeader(reader)
        except Exception:
            self.log_info('no header detected, decompressing as raw stream')
            self._decompress_chunk(mem, out)
            return out.getvalue()

        for k in itertools.count(1):
            self.log_info(F'chunk: e=0x{header.encoded_size:04X} d=0x{header.decoded_size:04X}')
            chunk = reader.read(header.encoded_size)
            if header.compressed:
                self._decompress_chunk(chunk, out)
            else:
                out.write(chunk)
            if reader.eof:
                break
            try:
                header = LZFHeader(reader)
            except Exception as E:
                msg = F'failed parsing next header after {k} chunks: {E!s}'
                raise RefineryPartialResult(msg, out.getvalue())

        return out.getvalue()

class lzg

This unit is implemented in refinery.units.compression.lzg and has the following commandline Interface:

usage: lzg [-h] [-L] [-Q] [-0] [-v]

LZG decompression.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class lzg(Unit):
    """
    LZG decompression.
    """
    def process(self, data: bytearray):
        stream = LZGStream(data)
        out = stream.decompress()
        if len(out) != stream.decoded_size:
            msg = F'LZG header announced {stream.decoded_size} bytes, but decompressed buffer had size {len(out)}.'
            raise RefineryPartialResult(msg, out)
        return out

    @classmethod
    def handles(cls, data: bytearray):
        if data[:3] == B'LZG':
            return True

class lzip

This unit is implemented in refinery.units.compression.lzip and has the following commandline Interface:

usage: lzip [-h] [-L] [-Q] [-0] [-v]

LZIP decompression

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class lzip(Unit):
    """
    LZIP decompression
    """
    def process(self, data: bytearray):
        view = memoryview(data)
        with MemoryFile() as output, StructReader(view) as reader:
            for k in count(1):
                if reader.eof:
                    break
                trailing_size = len(data) - reader.tell()
                try:
                    ID, VN, DS = reader.read_struct('4sBB')
                    if ID != B'LZIP':
                        if k > 1:
                            raise EOF
                        else:
                            self.log_warn(F'ignoring invalid LZIP signature: {ID.hex()}')
                    if VN != 1:
                        self.log_warn(F'ignoring invalid LZIP version: {VN}')
                    dict_size = 1 << (DS & 0x1F)
                    dict_size -= (dict_size // 16) * ((DS >> 5) & 7)
                    if dict_size not in range(_MIN_DICT_SIZE, _MAX_DICT_SIZE + 1):
                        raise ValueError(
                            F'The dictionary size {dict_size} is out of the valid range '
                            F'[{_MIN_DICT_SIZE}, {_MAX_DICT_SIZE}]; unable to proceed.'
                        )
                    decoder = MemberDecoder(dict_size, reader, output)
                    if not decoder():
                        raise ValueError(F'Data error in stream {k}.')
                    crc32, data_size, member_size = reader.read_struct('<LQQ')
                    if crc32 != decoder.crc32:
                        self.log_warn(F'checksum in stream {k} was {decoder.crc:08X}, should have been {crc32:08X}.')
                    if member_size - 20 != decoder.member_position:
                        self.log_warn(F'member size in stream {k} was {decoder.member_position}, should have been {member_size}.')
                    if data_size != decoder.data_position:
                        self.log_warn(F'data size in stream {k} was {decoder.data_position}, should have been {data_size}.')
                except EOF:
                    if k <= 1:
                        raise
                    self.log_info(F'silently ignoring {trailing_size} bytes of trailing data')
                    break

            return output.getvalue()

    @classmethod
    def handles(self, data: bytearray):
        return data[:4] == B'LZIP'

class lzjb

This unit is implemented in refinery.units.compression.lzjb and has the following commandline Interface:

usage: lzjb [-h] [-L] [-Q] [-0] [-v] [-R]

LZJB compression and decompression. This LZ-type compression is used in the ZFS file system.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class lzjb(Unit):
    """
    LZJB compression and decompression. This LZ-type compression is used in the ZFS file system.
    """
    def reverse(self, src):
        # https://web.archive.org/web/20100807223517/ ..
        # .. http://cvs.opensolaris.org/source/xref/onnv/onnv-gate/usr/src/uts/common/fs/zfs/lzjb.c
        output = bytearray()
        lempel = [0] * _LEMPEL_SIZE
        copymask = 0x80
        position = 0
        while position < len(src):
            copymask <<= 1
            if copymask >= 0x100:
                copymask = 1
                copymap = len(output)
                output.append(0)
            if position > len(src) - _MATCH_MAX:
                output.append(src[position])
                position += 1
                continue
            hsh = (src[position] << 16) + (src[position + 1] << 8) + src[position + 2]
            hsh += hsh >> 9
            hsh += hsh >> 5
            hsh %= len(lempel)
            offset = (position - lempel[hsh]) & _OFFSET_MASK
            lempel[hsh] = position
            cpy = position - offset
            if cpy >= 0 and cpy != position and src[position:position + 3] == src[cpy:cpy + 3]:
                output[copymap] |= copymask
                for mlen in range(_MATCH_MIN, min(len(src) - position, _MATCH_MAX)):
                    if src[position + mlen] != src[cpy + mlen]:
                        break
                output.append(((mlen - _MATCH_MIN) << (8 - _MATCH_LEN)) | (offset >> 8))
                output.append(offset & 255)
                position += mlen
            else:
                output.append(src[position])
                position += 1
        return output

    def process(self, data):
        dst = bytearray()
        src = StructReader(data)
        while not src.eof:
            copy = src.read_byte()
            for mask in (0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80):
                if src.eof:
                    break
                if not copy & mask:
                    dst.append(src.read_byte())
                    continue
                elif not dst:
                    raise ValueError('copy requested against empty buffer')
                with src.be:
                    match_len = src.read_integer(6) + _MATCH_MIN
                    match_pos = src.read_integer(10)
                if not match_pos or match_pos > len(dst):
                    raise RuntimeError(F'invalid match offset at position {src.tell()}')
                match_pos = len(dst) - match_pos
                while match_len > 0:
                    match = dst[match_pos:match_pos + match_len]
                    dst.extend(match)
                    match_pos += len(match)
                    match_len -= len(match)
        return dst

class lzma (filter=None, raw=False, alone=False, xz=False, level=9, delta=None)

This unit is implemented in refinery.units.compression.lz and has the following commandline Interface:

usage: lzma [-h] [-L] [-Q] [-0] [-v] [-R] [-r | -a | -x] [-l N] [-d N] [FILTER]

LZMA compression and decompression.

positional arguments:
  FILTER         Specifies a bcj filter to be applied. Possible values are: ARM, ARMTHUMB, IA64,
                 LZMA1, LZMA2, POWERPC, SPARC, X86

optional arguments:
  -r, --raw      Use raw (no container) format.
  -a, --alone    Use the lzma container format.
  -x, --xz       Use the default xz format.
  -l, --level N  The compression level preset; between 0 and 9.
  -d, --delta N  Add a delta filter when compressing.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class lzma(Unit):
    """
    LZMA compression and decompression.
    """
    _LZMA_FILTER = extract_options(std_lzma, 'FILTER_', 'DELTA')
    _LZMA_PARSER = OptionFactory(_LZMA_FILTER)

    def __init__(
        self, filter: Arg.Choice(choices=list(_LZMA_FILTER), metavar='FILTER', help=(
            'Specifies a bcj filter to be applied. Possible values are: {choices}')) = None,
        raw   : Arg.Switch('-r', group='MODE', help='Use raw (no container) format.') = False,
        alone : Arg.Switch('-a', group='MODE', help='Use the lzma container format.') = False,
        xz    : Arg.Switch('-x', group='MODE', help='Use the default xz format.') = False,
        level : Arg.Number('-l', bound=(0, 9), help='The compression level preset; between 0 and 9.') = 9,
        delta : Arg.Number('-d', help='Add a delta filter when compressing.') = None,
    ):
        filter = filter and self._LZMA_PARSER(filter)
        if (raw, alone, xz).count(True) > 1:
            raise ValueError('Only one container format can be enabled.')
        if level not in range(10):
            raise ValueError('Compression level must be a number between 0 and 9.')
        super().__init__(filter=filter, raw=raw, alone=alone, xz=xz, delta=delta,
            level=level | std_lzma.PRESET_EXTREME)

    def _get_lz_mode_and_filters(self, reverse=False):
        mode = std_lzma.FORMAT_AUTO
        filters = []
        if self.args.filter is not None:
            filters.append({'id': self.args.filter.value})
        if self.args.delta is not None:
            self.log_debug('adding delta filter')
            filters.append({
                'id': std_lzma.FILTER_DELTA,
                'dist': self.args.delta
            })
        if self.args.alone:
            self.log_debug('setting alone format')
            mode = std_lzma.FORMAT_ALONE
            filters.append({
                'id': std_lzma.FILTER_LZMA1,
                'preset': self.args.level
            })
        elif self.args.raw:
            self.log_debug('setting raw format')
            mode = std_lzma.FORMAT_RAW
            filters.append({
                'id': std_lzma.FILTER_LZMA2,
                'preset': self.args.level
            })
        elif self.args.xz or reverse:
            if reverse and not self.log_debug('setting xz container format'):
                self.log_info('choosing default .xz container format for compression.')
            mode = std_lzma.FORMAT_XZ
            filters.append({
                'id': std_lzma.FILTER_LZMA2,
                'preset': self.args.level
            })
        return mode, filters

    def reverse(self, data):
        mode, filters = self._get_lz_mode_and_filters(True)
        lz = std_lzma.LZMACompressor(mode, filters=filters)
        output = lz.compress(data)
        output += lz.flush()
        return output

    def _process_stream(self, data: ByteString, strategy: F, keywords):
        if strategy & F.STEPWISE:
            sizes = repeat(1)
        else:
            sizes = [len(data)]
        lz = std_lzma.LZMADecompressor(**keywords)
        with MemoryFile() as output:
            with MemoryFile(data) as stream:
                if strategy & F.INJECT:
                    output.write(lz.decompress(stream.read(5)))
                    output.write(lz.decompress(B'\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF'))
                for size in sizes:
                    if stream.eof or stream.closed:
                        break
                    try:
                        position = stream.tell()
                        output.write(lz.decompress(stream.read(size)))
                    except (EOFError, std_lzma.LZMAError) as error:
                        msg = error.args[0] if len(error.args) == 1 else error.__class__.__name__
                        raise RefineryPartialResult(F'compression failed at offset {position}: {msg!s}',
                            output.getvalue())
            return output.getvalue()

    def process(self, data: bytearray):
        errors: List[RefineryPartialResult] = []
        view = memoryview(data)
        keywords = {}
        keywords['format'], filters = self._get_lz_mode_and_filters(False)
        if self.args.raw:
            keywords['filters'] = filters
        for strategy in (F.DEFAULT, F.INJECT, F.STEPWISE, F.INJECT | F.STEPWISE):
            try:
                return self._process_stream(view, strategy, keywords)
            except RefineryPartialResult as p:
                self.log_info(F'decompression failed with strategy {strategy}: {p.message}')
                errors.append(p)
        raise max(errors, key=lambda e: len(e.partial))

    @classmethod
    def handles(self, data: bytearray):
        if data[:4] == B'\x5D\0\0\0':
            return True
        if data[:5] == B'\xFD7zXZ':
            return True

class lznt1 (chunk_size=4096)

This unit is implemented in refinery.units.compression.lznt1 and has the following commandline Interface:

usage: lznt1 [-h] [-L] [-Q] [-0] [-v] [-R] [-c N]

LZNT1 compression and decompression. This compression algorithm is expected by the Win32 API
routine RtlDecompressBuffer, for example.

optional arguments:
  -c, --chunk-size N  Optionally specify the chunk size for compression, default is 0x1000.

generic options:
  -h, --help          Show this help message and exit.
  -L, --lenient       Allow partial results as output.
  -Q, --quiet         Disables all log output.
  -0, --devnull       Do not produce any output.
  -v, --verbose       Specify up to two times to increase log level.
  -R, --reverse       Use the reverse operation; Specify twice to normalize (first decode, then
                      encode).

Expand source code Browse git

class lznt1(Unit):
    """
    LZNT1 compression and decompression. This compression algorithm is expected
    by the Win32 API routine `RtlDecompressBuffer`, for example.
    """

    def _decompress_chunk(self, chunk):
        out = B''
        while chunk:
            flags = chunk[0]
            chunk = chunk[1:]
            for i in range(8):
                if not (flags >> i & 1):
                    out += chunk[:1]
                    chunk = chunk[1:]
                else:
                    flag = struct.unpack('<H', chunk[:2])[0]
                    pos = len(out) - 1
                    l_mask = 0xFFF
                    o_shift = 12
                    while pos >= 0x10:
                        l_mask >>= 1
                        o_shift -= 1
                        pos >>= 1
                    length = (flag & l_mask) + 3
                    offset = (flag >> o_shift) + 1
                    if length >= offset:
                        tmp = out[-offset:] * (0xFFF // len(out[-offset:]) + 1)
                        out += tmp[:length]
                    else:
                        out += out[-offset:length - offset]
                    chunk = chunk[2:]
                if len(chunk) == 0:
                    break
        return out

    def _find(self, src, target, max_len):
        result_offset = 0
        result_length = 0
        for i in range(1, max_len):
            offset = src.rfind(target[:i])
            if offset == -1:
                break
            tmp_offset = len(src) - offset
            tmp_length = i
            if tmp_offset == tmp_length:
                tmp = src[offset:] * (0xFFF // len(src[offset:]) + 1)
                for j in range(i, max_len + 1):
                    offset = tmp.rfind(target[:j])
                    if offset == -1:
                        break
                    tmp_length = j
            if tmp_length > result_length:
                result_offset = tmp_offset
                result_length = tmp_length
        if result_length < 3:
            return 0, 0
        return result_offset, result_length

    def _compress_chunk(self, chunk):
        blob = copy.copy(chunk)
        out = B''
        pow2 = 0x10
        l_mask3 = 0x1002
        o_shift = 12
        while len(blob) > 0:
            bits = 0
            tmp = B''
            for i in range(8):
                bits >>= 1
                while pow2 < (len(chunk) - len(blob)):
                    pow2 <<= 1
                    l_mask3 = (l_mask3 >> 1) + 1
                    o_shift -= 1
                if len(blob) < l_mask3:
                    max_len = len(blob)
                else:
                    max_len = l_mask3
                offset1, length1 = self._find(
                    chunk[:len(chunk) - len(blob)], blob, max_len)
                # try to find more compressed pattern
                offset2, length2 = self._find(
                    chunk[:len(chunk) - len(blob) + 1], blob[1:], max_len)
                if length1 < length2:
                    length1 = 0
                if length1 > 0:
                    symbol = ((offset1 - 1) << o_shift) | (length1 - 3)
                    tmp += struct.pack('<H', symbol)
                    bits |= 0x80  # set the highest bit
                    blob = blob[length1:]
                else:
                    tmp += blob[:1]
                    blob = blob[1:]
                if len(blob) == 0:
                    break
            out += struct.pack('B', bits >> (7 - i))
            out += tmp
        return out

    def reverse(self, buf):
        out = B''
        while buf:
            chunk = buf[:self.args.chunk_size]
            compressed = self._compress_chunk(chunk)
            if len(compressed) < len(chunk):  # chunk is compressed
                flags = 0xB000
                header = struct.pack('<H', flags | (len(compressed) - 1))
                out += header + compressed
            else:
                flags = 0x3000
                header = struct.pack('<H', flags | (len(chunk) - 1))
                out += header + chunk
            buf = buf[self.args.chunk_size:]
        return out

    def process(self, data):
        out = io.BytesIO()
        offset = 0
        while offset < len(data):
            try:
                header, = struct.unpack('<H', data[offset:offset + 2])
            except struct.error as err:
                raise RefineryPartialResult(str(err), partial=out.getvalue())
            offset += 2
            size = (header & 0xFFF) + 1
            if size + 1 >= len(data):
                raise RefineryPartialResult(
                    F'chunk header indicates size {size}, but only {len(data)} bytes remain.',
                    partial=out.getvalue()
                )
            chunk = data[offset:offset + size]
            offset += size
            if header & 0x8000:
                chunk = self._decompress_chunk(chunk)
            out.write(chunk)
        return out.getvalue()

    def __init__(self, chunk_size: Arg.Number('-c', help='Optionally specify the chunk size for compression, default is 0x1000.') = 0x1000):
        super().__init__(chunk_size=chunk_size)

class lzo

This unit is implemented in refinery.units.compression.lzo and has the following commandline Interface:

usage: lzo [-h] [-L] [-Q] [-0] [-v]

LZO decompression. The code works against simple test cases, but it is known to fail for certain
outputs produced by the lzop command-line tool when high compression ratio is favoured (i.e. when
the -9 switch is used).

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class lzo(Unit):
    """
    LZO decompression. The code works against simple test cases, but it is known to fail for certain outputs produced by the lzop
    command-line tool when high compression ratio is favoured (i.e. when the -9 switch is used).
    """
    def decompress_stream(self, data: ByteString, LZOv1: bool = False) -> bytearray:
        """
        An implementation of LZO decompression. We use the article
        "[LZO stream format as understood by Linux's LZO decompressor](https://www.kernel.org/doc/html/latest/staging/lzo.html)"
        as a reference since no proper specification is available.
        """
        def integer() -> int:
            length = 0
            while True:
                byte = src.read_byte()
                if byte:
                    return length + byte
                length += 0xFF
                if length > 0x100000:
                    raise LZOError('Too many zeros in integer encoding.')

        def literal(count):
            dst.write(src.read_bytes(count))

        def copy(distance: int, length: int):
            if distance > len(dst):
                raise LZOError(F'Distance {distance} > bufsize {len(dst)}')
            buffer = dst.getbuffer()
            if distance > length:
                start = len(buffer) - distance
                end = start + length
                dst.write(buffer[start:end])
            else:
                block = buffer[-distance:]
                while len(block) < length:
                    block += block[:length - len(block)]
                if len(block) > length:
                    block[length:] = ()
                dst.write(block)

        src = StructReader(memoryview(data))
        dst = MemoryFile()

        state = 0
        first = src.read_byte()

        if first == 0x10:
            raise LZOError('Invalid first stream byte 0x10.')
        elif first <= 0x12:
            src.seekrel(-1)
        elif first <= 0x15:
            state = first - 0x11
            literal(state)
        else:
            state = 4
            literal(first - 0x11)

        while True:
            instruction = src.read_byte()
            if instruction < 0x10:
                if state == 0:
                    length = instruction or integer() + 15
                    state = length + 3
                    if state < 4:
                        raise LZOError('Literal encoding is too short.')
                else:
                    state = instruction & 0b0011
                    D = (instruction & 0b1100) >> 2
                    H = src.read_byte()
                    distance = (H << 2) + D + 1
                    if state >= 4:
                        distance += 0x800
                        length = 3
                    else:
                        length = 2
                    copy(distance, length)
            elif instruction < 0x20:
                L = instruction & 0b0111
                H = instruction & 0b1000
                length = L or integer() + 7
                argument = src.u16()
                state = argument & 3
                distance = (H << 11) + (argument >> 2)
                if not distance:
                    return dst.getbuffer()
                if LZOv1 and distance & 0x803F == 0x803F and length in range(261, 265):
                    raise LZOError('Compressed data contains sequence that is banned in LZOv1.')
                if LZOv1 and distance == 0xBFFF:
                    X = src.read_byte()
                    count = ((X << 3) | L) + 4
                    self.log_debug(F'Writing run of {X} zero bytes according to LZOv1.')
                    dst.write(B'\0' * count)
                else:
                    copy(distance + 0x4000, length + 2)
            elif instruction < 0x40:
                L = instruction & 0b11111
                length = L or integer() + 31
                argument = src.u16()
                state = argument & 3
                distance = (argument >> 2) + 1
                copy(distance, length + 2)
            else:
                if instruction < 0x80:
                    length = 3 + ((instruction >> 5) & 1)
                else:
                    length = 5 + ((instruction >> 5) & 3)
                H = src.read_byte()
                D = (instruction & 0b11100) >> 2
                state = instruction & 3
                distance = (H << 3) + D + 1
                copy(distance, length)
            if state:
                literal(state)

    def process(self, data):
        try:
            lzo = LZO(data)
        except LZOError:
            self.log_info('Not an LZO archive, processing raw stream.')
            return self.decompress_stream(data)
        with MemoryFile() as output:
            for k, chunk in enumerate(lzo, 1):
                self.log_debug(F'decompressing chunk {k}')
                output.write(self.decompress_stream(chunk.data))
            return self.labelled(
                output.getbuffer(),
                path=lzo.name,
                date=date_from_timestamp(lzo.mtime)
            )

    @classmethod
    def handles(self, data: bytearray) -> Optional[bool]:
        sig = LZO.SIGNATURE
        if data[:len(sig)] == sig:
            return True

Methods

def decompress_stream(self, data, LZOv1=False)

An implementation of LZO decompression. We use the article "LZO stream format as understood by Linux's LZO decompressor" as a reference since no proper specification is available.

Expand source code Browse git

def decompress_stream(self, data: ByteString, LZOv1: bool = False) -> bytearray:
    """
    An implementation of LZO decompression. We use the article
    "[LZO stream format as understood by Linux's LZO decompressor](https://www.kernel.org/doc/html/latest/staging/lzo.html)"
    as a reference since no proper specification is available.
    """
    def integer() -> int:
        length = 0
        while True:
            byte = src.read_byte()
            if byte:
                return length + byte
            length += 0xFF
            if length > 0x100000:
                raise LZOError('Too many zeros in integer encoding.')

    def literal(count):
        dst.write(src.read_bytes(count))

    def copy(distance: int, length: int):
        if distance > len(dst):
            raise LZOError(F'Distance {distance} > bufsize {len(dst)}')
        buffer = dst.getbuffer()
        if distance > length:
            start = len(buffer) - distance
            end = start + length
            dst.write(buffer[start:end])
        else:
            block = buffer[-distance:]
            while len(block) < length:
                block += block[:length - len(block)]
            if len(block) > length:
                block[length:] = ()
            dst.write(block)

    src = StructReader(memoryview(data))
    dst = MemoryFile()

    state = 0
    first = src.read_byte()

    if first == 0x10:
        raise LZOError('Invalid first stream byte 0x10.')
    elif first <= 0x12:
        src.seekrel(-1)
    elif first <= 0x15:
        state = first - 0x11
        literal(state)
    else:
        state = 4
        literal(first - 0x11)

    while True:
        instruction = src.read_byte()
        if instruction < 0x10:
            if state == 0:
                length = instruction or integer() + 15
                state = length + 3
                if state < 4:
                    raise LZOError('Literal encoding is too short.')
            else:
                state = instruction & 0b0011
                D = (instruction & 0b1100) >> 2
                H = src.read_byte()
                distance = (H << 2) + D + 1
                if state >= 4:
                    distance += 0x800
                    length = 3
                else:
                    length = 2
                copy(distance, length)
        elif instruction < 0x20:
            L = instruction & 0b0111
            H = instruction & 0b1000
            length = L or integer() + 7
            argument = src.u16()
            state = argument & 3
            distance = (H << 11) + (argument >> 2)
            if not distance:
                return dst.getbuffer()
            if LZOv1 and distance & 0x803F == 0x803F and length in range(261, 265):
                raise LZOError('Compressed data contains sequence that is banned in LZOv1.')
            if LZOv1 and distance == 0xBFFF:
                X = src.read_byte()
                count = ((X << 3) | L) + 4
                self.log_debug(F'Writing run of {X} zero bytes according to LZOv1.')
                dst.write(B'\0' * count)
            else:
                copy(distance + 0x4000, length + 2)
        elif instruction < 0x40:
            L = instruction & 0b11111
            length = L or integer() + 31
            argument = src.u16()
            state = argument & 3
            distance = (argument >> 2) + 1
            copy(distance, length + 2)
        else:
            if instruction < 0x80:
                length = 3 + ((instruction >> 5) & 1)
            else:
                length = 5 + ((instruction >> 5) & 3)
            H = src.read_byte()
            D = (instruction & 0b11100) >> 2
            state = instruction & 3
            distance = (H << 3) + D + 1
            copy(distance, length)
        if state:
            literal(state)

class lzw

This unit is implemented in refinery.units.compression.lzw and has the following commandline Interface:

usage: lzw [-h] [-L] [-Q] [-0] [-v]

LZW decompression based on ancient Unix sources.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class lzw(Unit):
    '''
    LZW decompression based on ancient Unix sources.
    '''

    _MAGIC = B'\x1F\x9D'

    def process(self, data: bytearray):
        out = MemoryFile()
        inf = StructReader(memoryview(data))

        if inf.peek(2) != self._MAGIC:
            self.log_info('No LZW signature found, assuming raw stream.')
            maxbits = LZW.BITS
            block_mode = True
        else:
            inf.seekrel(2)
            maxbits = inf.read_integer(5)
            if inf.read_integer(2) != 0:
                self.log_info('reserved bits were set in LZW header')
            block_mode = bool(inf.read_bit())

        if maxbits > LZW.BITS:
            raise ValueError(F'Compressed with {maxbits} bits; cannot handle file.')

        maxmaxcode = 1 << maxbits

        ibuf = inf.read()

        tab_suffix = bytearray(LZW.WSIZE * 2)
        tab_prefix = array('H', itertools.repeat(0, 1 << LZW.BITS))

        n_bits = LZW.INIT_BITS
        maxcode = (1 << n_bits) - 1
        bitmask = (1 << n_bits) - 1
        oldcode = ~0
        finchar = +0
        posbits = +0

        free_entry = LZW.FIRST if block_mode else 0x100
        tab_suffix[:0x100] = range(0x100)
        resetbuf = True

        while resetbuf:
            resetbuf = False

            ibuf = ibuf[posbits >> 3:]
            insize = len(ibuf)
            posbits = 0

            if insize < LZW.EXTRA:
                inbits = (insize - insize % n_bits) << 3
            else:
                inbits = (insize << 3) - (n_bits - 1)

            while inbits > posbits:
                if free_entry > maxcode:
                    n = n_bits << 3
                    p = posbits - 1
                    posbits = p + (n - (p + n) % n)
                    n_bits += 1
                    if (n_bits == maxbits):
                        maxcode = maxmaxcode
                    else:
                        maxcode = (1 << n_bits) - 1
                    bitmask = (1 << n_bits) - 1
                    resetbuf = True
                    break

                p = ibuf[posbits >> 3:]
                code = int.from_bytes(p[:3], 'little') >> (posbits & 7) & bitmask
                posbits += n_bits

                if oldcode == -1:
                    if code >= 256:
                        raise ValueError('corrupt input.')
                    oldcode = code
                    finchar = oldcode
                    out.write_byte(finchar)
                    continue

                if code == LZW.CLEAR and block_mode:
                    tab_prefix[:0x100] = array('H', itertools.repeat(0, 0x100))
                    free_entry = LZW.FIRST - 1
                    n = n_bits << 3
                    p = posbits - 1
                    posbits = p + (n - (p + n) % n)
                    n_bits = LZW.INIT_BITS
                    maxcode = (1 << n_bits) - 1
                    bitmask = (1 << n_bits) - 1
                    resetbuf = True
                    break

                incode = code
                stack = bytearray()

                if code >= free_entry:
                    if code > free_entry:
                        raise RefineryPartialResult('corrupt input.', out.getbuffer())
                    stack.append(finchar)
                    code = oldcode
                while code >= 256:
                    stack.append(tab_suffix[code])
                    code = tab_prefix[code]

                finchar = tab_suffix[code]
                stack.append(finchar)
                stack.reverse()
                out.write(stack)
                code = free_entry

                if code < maxmaxcode:
                    tab_prefix[code] = oldcode & 0xFFFF
                    tab_suffix[code] = finchar & 0x00FF
                    free_entry = code + 1

                oldcode = incode

        return out.getvalue()

    @classmethod
    def handles(self, data: bytearray) -> Optional[bool]:
        sig = self._MAGIC
        if data[:len(sig)] == sig:
            return True

class machometa (all=True, header=False, linked_images=False, signatures=False, version=False, load_commands=False, exports=False, imports=False, tabular=False)

This unit is implemented in refinery.units.formats.macho.machometa and has the following commandline Interface:

usage: machometa [-h] [-L] [-Q] [-0] [-v] [-c] [-H] [-K] [-S] [-V] [-D] [-E] [-I] [-t]

Extract metadata from Mach-O files.

optional arguments:
  -c, --custom         Unless enabled, all default categories will be extracted.
  -H, --header         Parse basic data from the Mach-O header.
  -K, --linked-images  Parse all library images linked by the Mach-O.
  -S, --signatures     Parse signature and entitlement information.
  -V, --version        Parse version information from the Mach-O load commands.
  -D, --load-commands  Parse load commands from the Mach-O header.
  -E, --exports        List all exported functions.
  -I, --imports        List all imported functions.
  -t, --tabular        Print information in a table rather than as JSON

generic options:
  -h, --help           Show this help message and exit.
  -L, --lenient        Allow partial results as output.
  -Q, --quiet          Disables all log output.
  -0, --devnull        Do not produce any output.
  -v, --verbose        Specify up to two times to increase log level.

Expand source code Browse git

class machometa(Unit):
    """
    Extract metadata from Mach-O files.
    """
    def __init__(
        self, all: Arg('-c', '--custom',
            help='Unless enabled, all default categories will be extracted.') = True,
        header: Arg('-H', help='Parse basic data from the Mach-O header.') = False,
        linked_images: Arg('-K', help='Parse all library images linked by the Mach-O.') = False,
        signatures: Arg('-S', help='Parse signature and entitlement information.') = False,
        version: Arg('-V', help='Parse version information from the Mach-O load commands.') = False,
        load_commands: Arg('-D', help='Parse load commands from the Mach-O header.') = False,
        exports: Arg('-E', help='List all exported functions.') = False,
        imports: Arg('-I', help='List all imported functions.') = False,
        tabular: Arg('-t', help='Print information in a table rather than as JSON') = False,
    ):
        super().__init__(
            header=all or header,
            linked_images=all or linked_images,
            version=all or version,
            signatures=all or signatures,
            load_commands=load_commands,
            imports=imports,
            exports=exports,
            tabular=tabular,
        )

    @Unit.Requires('k2l>=2.0', 'all')
    def _ktool():
        import ktool
        import ktool.macho
        import ktool.codesign
        return ktool

    def parse_macho_header(self, macho_image: Image, data=None) -> Dict:
        info = {}
        macho_header = macho_image.macho_header
        dyld_header = macho_image.macho_header.dyld_header
        if dyld_header is not None:
            info['Type'] = dyld_header.type_name
            info['Magic'] = dyld_header.magic
            info['CPUType'] = macho_image.slice.type.name
            info['CPUSubType'] = macho_image.slice.subtype.name
            info['FileType'] = macho_image.macho_header.filetype.name
            info['LoadCount'] = dyld_header.loadcnt
            info['LoadSize'] = dyld_header.loadsize
            info['Flags'] = [flag.name for flag in macho_header.flags]
            info['Reserved'] = dyld_header.reserved
        return info

    def parse_linked_images(self, macho_image: Image, data=None) -> Dict:
        load_command_images = {}
        linked_images = macho_image.linked_images
        LOAD_COMMAND = self._ktool.macho.LOAD_COMMAND
        for linked_image in linked_images:
            load_command_name = LOAD_COMMAND(linked_image.cmd.cmd).name
            load_command_images.setdefault(load_command_name, []).append(linked_image.install_name)
        return load_command_images

    def parse_signature(self, macho_image: Image, data=None) -> Dict:

        _km = self._ktool.macho
        _kc = self._ktool.codesign

        class CodeDirectoryBlob(_km.Struct):
            FIELDS = {
                'magic': _km.uint32_t,
                'length': _km.uint32_t,
                'version': _km.uint32_t,
                'flags': _km.uint32_t,
                'hashOffset': _km.uint32_t,
                'identOffset': _km.uint32_t,
                'nSpecialSlots': _km.uint32_t,
                'nCodeSlots': _km.uint32_t,
                'codeLimit': _km.uint32_t,
                'hashSize': _km.uint8_t,
                'hashType': _km.uint8_t,
                'platform': _km.uint8_t,
                'pageSize': _km.uint8_t,
                'spare2': _km.uint32_t
            }

            def __init__(self, byte_order='little'):
                super().__init__(byte_order=byte_order)
                self.magic = 0
                self.length = 0
                self.version = 0
                self.flags = 0
                self.hashOffset = 0
                self.identOffset = 0
                self.nSpecialSlots = 0
                self.nCodeSlots = 0
                self.codeLimit = 0
                self.hashSize = 0
                self.hashType = 0
                self.platform = 0
                self.pageSize = 0
                self.spare2 = 0

        info = {}
        if macho_image.codesign_info is not None:
            superblob: SuperBlob = macho_image.codesign_info.superblob

            for blob in macho_image.codesign_info.slots:
                blob: BlobIndex
                # ktool does not include code for extracting Blobs of types
                # CSSLOT_CODEDIRECTORY, CSSLOT_CMS_SIGNATURE
                # so we must do it ourselves here.
                if blob.type == _kc.CSSLOT_CODEDIRECTORY:
                    start = superblob.off + blob.offset
                    codedirectory_blob = macho_image.read_struct(start, CodeDirectoryBlob)

                    # Ad-hoc signing
                    flags = _kc.swap_32(codedirectory_blob.flags)
                    if flags & CS_ADHOC != 0:
                        info['AdHocSigned'] = True
                    else:
                        info['AdHocSigned'] = False

                    # Signature identifier
                    identifier_offset = _kc.swap_32(codedirectory_blob.identOffset)
                    identifier_data = macho_image.read_cstr(start + identifier_offset)
                    info['SignatureIdentifier'] = identifier_data

                if blob.type == 0x10000:  # CSSLOT_CMS_SIGNATURE
                    start = superblob.off + blob.offset
                    blob_data = macho_image.read_struct(start, _kc.Blob)
                    blob_data.magic = _kc.swap_32(blob_data.magic)
                    blob_data.length = _kc.swap_32(blob_data.length)
                    cms_signature = macho_image.read_bytearray(start + _kc.Blob.SIZE, blob_data.length - _kc.Blob.SIZE)

                    if len(cms_signature) != 0:
                        try:
                            parsed_cms_signature = pemeta.parse_signature(bytearray(cms_signature))
                            info['Signature'] = parsed_cms_signature
                        except ValueError as pkcs7_parse_error:
                            self.log_warn(F'Could not parse the data in CSSLOT_CMS_SIGNATURE as valid PKCS7 data: {pkcs7_parse_error!s}')

            if macho_image.codesign_info.req_dat is not None:
                # TODO: Parse the requirements blob,
                # which is encoded according to the code signing requirements language:
                # https://developer.apple.com/library/archive/documentation/Security/Conceptual/CodeSigningGuide/RequirementLang/RequirementLang.html
                info['Requirements'] = macho_image.codesign_info.req_dat.hex()
            if macho_image.codesign_info.entitlements is not None:
                info['Entitlements'] = macho_image.codesign_info.entitlements
        return info

    def parse_version(self, macho_image: Image, data=None) -> Dict:
        info = {}
        load_commands = macho_image.macho_header.load_commands

        SVC = self._ktool.macho.source_version_command
        BVC = self._ktool.macho.build_version_command

        for load_command in load_commands:
            if isinstance(load_command, SVC):
                if 'SourceVersion' not in info:
                    info['SourceVersion'] = load_command.version
                else:
                    self.log_warn('More than one load command of type source_version_command found; the MachO file is possibly malformed')
            elif isinstance(load_command, BVC):
                if 'BuildVersion' not in info:
                    info['BuildVersion'] = {}
                    info['BuildVersion']['Platform'] = macho_image.platform.name
                    info['BuildVersion']['MinOS'] = F'{macho_image.minos.x}.{macho_image.minos.y}.{macho_image.minos.z}'
                    info['BuildVersion']['SDK'] = F'{macho_image.sdk_version.x}.{macho_image.sdk_version.y}.{macho_image.sdk_version.z}'
                    info['BuildVersion']['Ntools'] = load_command.ntools
                else:
                    self.log_warn('More than one load command of type build_version_command found; the MachO file is possibly malformed')
        return info

    def parse_load_commands(self, macho_image: Image, data=None) -> List:
        info = []
        load_commands = macho_image.macho_header.load_commands
        for load_command in load_commands:
            info.append(load_command.serialize())
        return info

    def parse_imports(self, macho_image: Image, data=None) -> List:
        info = []
        for imp in macho_image.imports:
            info.append(imp.name)
        return info

    def parse_exports(self, macho_image: Image, data=None) -> List:
        info = []
        for exp in macho_image.exports:
            info.append(exp.name)
        return info

    def process(self, data: bytearray):
        result = {}
        ktool = self._ktool
        macho = ktool.load_macho_file(fp=BytesIO(data), use_mmaped_io=False)
        if macho.type is ktool.MachOFileType.FAT:
            result['FileType'] = 'FAT'
        elif macho.type is ktool.MachOFileType.THIN:
            result['FileType'] = 'THIN'

        slices = []

        for macho_slice in macho.slices:
            slice_result = {}
            macho_image = ktool.load_image(fp=macho_slice)

            for switch, resolver, name in [
                (self.args.header, self.parse_macho_header, 'Header'),
                (self.args.linked_images, self.parse_linked_images, 'LinkedImages'),
                (self.args.signatures, self.parse_signature, 'Signatures'),
                (self.args.version, self.parse_version, 'Version'),
                (self.args.load_commands, self.parse_load_commands, 'LoadCommands'),
                (self.args.imports, self.parse_imports, 'Imports'),
                (self.args.exports, self.parse_exports, 'Exports'),
            ]:
                if not switch:
                    continue
                self.log_debug(F'parsing: {name}')
                try:
                    info = resolver(macho_image, data)
                except Exception as E:
                    self.log_info(F'failed to obtain {name}: {E!s}')
                    continue
                if info:
                    slice_result[name] = info

            if macho_image.uuid is not None:
                uuid: bytes = macho_image.uuid
                slice_result['UUID'] = uuid.hex()
            slice_result['BaseName'] = macho_image.base_name
            slice_result['InstallName'] = macho_image.install_name
            slices.append(slice_result)

        if slices:
            result['Slices'] = slices
            yield from ppjson(tabular=self.args.tabular)._pretty_output(result, indent=4, ensure_ascii=False)

Methods

def parse_macho_header(self, macho_image, data=None)

Expand source code Browse git

def parse_macho_header(self, macho_image: Image, data=None) -> Dict:
    info = {}
    macho_header = macho_image.macho_header
    dyld_header = macho_image.macho_header.dyld_header
    if dyld_header is not None:
        info['Type'] = dyld_header.type_name
        info['Magic'] = dyld_header.magic
        info['CPUType'] = macho_image.slice.type.name
        info['CPUSubType'] = macho_image.slice.subtype.name
        info['FileType'] = macho_image.macho_header.filetype.name
        info['LoadCount'] = dyld_header.loadcnt
        info['LoadSize'] = dyld_header.loadsize
        info['Flags'] = [flag.name for flag in macho_header.flags]
        info['Reserved'] = dyld_header.reserved
    return info

def parse_linked_images(self, macho_image, data=None)

Expand source code Browse git

def parse_linked_images(self, macho_image: Image, data=None) -> Dict:
    load_command_images = {}
    linked_images = macho_image.linked_images
    LOAD_COMMAND = self._ktool.macho.LOAD_COMMAND
    for linked_image in linked_images:
        load_command_name = LOAD_COMMAND(linked_image.cmd.cmd).name
        load_command_images.setdefault(load_command_name, []).append(linked_image.install_name)
    return load_command_images

def parse_signature(self, macho_image, data=None)

Expand source code Browse git

def parse_signature(self, macho_image: Image, data=None) -> Dict:

    _km = self._ktool.macho
    _kc = self._ktool.codesign

    class CodeDirectoryBlob(_km.Struct):
        FIELDS = {
            'magic': _km.uint32_t,
            'length': _km.uint32_t,
            'version': _km.uint32_t,
            'flags': _km.uint32_t,
            'hashOffset': _km.uint32_t,
            'identOffset': _km.uint32_t,
            'nSpecialSlots': _km.uint32_t,
            'nCodeSlots': _km.uint32_t,
            'codeLimit': _km.uint32_t,
            'hashSize': _km.uint8_t,
            'hashType': _km.uint8_t,
            'platform': _km.uint8_t,
            'pageSize': _km.uint8_t,
            'spare2': _km.uint32_t
        }

        def __init__(self, byte_order='little'):
            super().__init__(byte_order=byte_order)
            self.magic = 0
            self.length = 0
            self.version = 0
            self.flags = 0
            self.hashOffset = 0
            self.identOffset = 0
            self.nSpecialSlots = 0
            self.nCodeSlots = 0
            self.codeLimit = 0
            self.hashSize = 0
            self.hashType = 0
            self.platform = 0
            self.pageSize = 0
            self.spare2 = 0

    info = {}
    if macho_image.codesign_info is not None:
        superblob: SuperBlob = macho_image.codesign_info.superblob

        for blob in macho_image.codesign_info.slots:
            blob: BlobIndex
            # ktool does not include code for extracting Blobs of types
            # CSSLOT_CODEDIRECTORY, CSSLOT_CMS_SIGNATURE
            # so we must do it ourselves here.
            if blob.type == _kc.CSSLOT_CODEDIRECTORY:
                start = superblob.off + blob.offset
                codedirectory_blob = macho_image.read_struct(start, CodeDirectoryBlob)

                # Ad-hoc signing
                flags = _kc.swap_32(codedirectory_blob.flags)
                if flags & CS_ADHOC != 0:
                    info['AdHocSigned'] = True
                else:
                    info['AdHocSigned'] = False

                # Signature identifier
                identifier_offset = _kc.swap_32(codedirectory_blob.identOffset)
                identifier_data = macho_image.read_cstr(start + identifier_offset)
                info['SignatureIdentifier'] = identifier_data

            if blob.type == 0x10000:  # CSSLOT_CMS_SIGNATURE
                start = superblob.off + blob.offset
                blob_data = macho_image.read_struct(start, _kc.Blob)
                blob_data.magic = _kc.swap_32(blob_data.magic)
                blob_data.length = _kc.swap_32(blob_data.length)
                cms_signature = macho_image.read_bytearray(start + _kc.Blob.SIZE, blob_data.length - _kc.Blob.SIZE)

                if len(cms_signature) != 0:
                    try:
                        parsed_cms_signature = pemeta.parse_signature(bytearray(cms_signature))
                        info['Signature'] = parsed_cms_signature
                    except ValueError as pkcs7_parse_error:
                        self.log_warn(F'Could not parse the data in CSSLOT_CMS_SIGNATURE as valid PKCS7 data: {pkcs7_parse_error!s}')

        if macho_image.codesign_info.req_dat is not None:
            # TODO: Parse the requirements blob,
            # which is encoded according to the code signing requirements language:
            # https://developer.apple.com/library/archive/documentation/Security/Conceptual/CodeSigningGuide/RequirementLang/RequirementLang.html
            info['Requirements'] = macho_image.codesign_info.req_dat.hex()
        if macho_image.codesign_info.entitlements is not None:
            info['Entitlements'] = macho_image.codesign_info.entitlements
    return info

def parse_version(self, macho_image, data=None)

Expand source code Browse git

def parse_version(self, macho_image: Image, data=None) -> Dict:
    info = {}
    load_commands = macho_image.macho_header.load_commands

    SVC = self._ktool.macho.source_version_command
    BVC = self._ktool.macho.build_version_command

    for load_command in load_commands:
        if isinstance(load_command, SVC):
            if 'SourceVersion' not in info:
                info['SourceVersion'] = load_command.version
            else:
                self.log_warn('More than one load command of type source_version_command found; the MachO file is possibly malformed')
        elif isinstance(load_command, BVC):
            if 'BuildVersion' not in info:
                info['BuildVersion'] = {}
                info['BuildVersion']['Platform'] = macho_image.platform.name
                info['BuildVersion']['MinOS'] = F'{macho_image.minos.x}.{macho_image.minos.y}.{macho_image.minos.z}'
                info['BuildVersion']['SDK'] = F'{macho_image.sdk_version.x}.{macho_image.sdk_version.y}.{macho_image.sdk_version.z}'
                info['BuildVersion']['Ntools'] = load_command.ntools
            else:
                self.log_warn('More than one load command of type build_version_command found; the MachO file is possibly malformed')
    return info

def parse_load_commands(self, macho_image, data=None)

Expand source code Browse git

def parse_load_commands(self, macho_image: Image, data=None) -> List:
    info = []
    load_commands = macho_image.macho_header.load_commands
    for load_command in load_commands:
        info.append(load_command.serialize())
    return info

def parse_imports(self, macho_image, data=None)

Expand source code Browse git

def parse_imports(self, macho_image: Image, data=None) -> List:
    info = []
    for imp in macho_image.imports:
        info.append(imp.name)
    return info

def parse_exports(self, macho_image, data=None)

Expand source code Browse git

def parse_exports(self, macho_image: Image, data=None) -> List:
    info = []
    for exp in macho_image.exports:
        info.append(exp.name)
    return info

class map (index, image, blocksize=None)

This unit is implemented in refinery.units.blockwise.map and has the following commandline Interface:

usage: map [-h] [-L] [-Q] [-0] [-v] [-R] [-B N] index image

Each block of the input data which occurs as a block of the index argument is replaced by the
corresponding block of the image argument. If a block size is specified, and if the index or
image argument are byte sequences, they are unpacked into chunks of that size, and excess bytes
that are not an integer multiple of the block size are discarded. To prevent any automatic
chunking, the btoi handler can be used.

positional arguments:
  index              index characters
  image              image characters

optional arguments:
  -B, --blocksize N  The size of each block in bytes, default is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.
  -R, --reverse      Use the reverse operation; Specify twice to normalize (first decode, then
                     encode).

Expand source code Browse git

class map(BlockTransformation):
    """
    Each block of the input data which occurs as a block of the index argument is replaced by the
    corresponding block of the image argument. If a block size is specified, and if the index or
    image argument are byte sequences, they are unpacked into chunks of that size, and excess bytes
    that are not an integer multiple of the block size are discarded. To prevent any automatic
    chunking, the `refinery.lib.argformats.DelayedArgument.btoi` handler can be used.
    """
    _map: Optional[Dict[int, int]]

    def __init__(
        self,
        index: Arg.NumSeq(help='index characters'),
        image: Arg.NumSeq(help='image characters'),
        blocksize=None
    ):
        super().__init__(blocksize=blocksize, index=index, image=image, _truncate=2)
        self._map = None

    def reverse(self, data):
        return self._process(data, self.args.image, self.args.index)

    def process(self, data):
        return self._process(data, self.args.index, self.args.image)

    def _process(self, data: bytearray, index: Sequence[int], image: Sequence[int]):
        if not self.bytestream:
            if isbuffer(index):
                self.log_info(F'chunking index sequence into blocks of size {self.blocksize}')
                index = list(self.chunk(index))
                self.log_debug(F'index sequence: {index}')
            if isbuffer(image):
                self.log_info(F'chunking image sequence into blocks of size {self.blocksize}')
                image = list(self.chunk(image))
                self.log_debug(F'image sequence: {image}')
        if len(set(index)) != len(index):
            raise ValueError('The index sequence contains duplicates.')
        if len(index) > len(image):
            raise ValueError('The index sequence is longer than the image sequence.')
        if self.bytestream:
            mapping = dict(zip(index, image))
            mapping = bytes(mapping.get(c, c) for c in range(0x100))
            if not isinstance(data, bytearray):
                data = bytearray(data)
            data[:] = (mapping[b] for b in data)
            return data
        try:
            self._map = dict(zip(index, image))
            return super().process(data)
        finally:
            self._map = None

    def process_block(self, token):
        return self._map.get(token, token)

class max_ (key=None)

This unit is implemented in refinery.units.meta.max and has the following commandline Interface:

usage: max [-h] [-L] [-Q] [-0] [-v] [key]

Picks the maximum of all elements in the current frame.

positional arguments:
  key            A meta variable expression to sort by instead of sorting the content.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class max_(Unit):
    """
    Picks the maximum of all elements in the current `refinery.lib.frame`.
    """

    def __init__(
        self,
        key: Arg('key', type=str, help='A meta variable expression to sort by instead of sorting the content.') = None,
    ):
        super().__init__(key=key)

    def filter(self, chunks: Iterable[Chunk]):
        def get_value(chunk: Chunk):
            if key is None:
                return chunk
            return metavars(chunk).get(key)

        key = self.args.key
        it = iter(chunks)

        for max_chunk in it:
            if not max_chunk.visible:
                yield max_chunk
            else:
                max_index = 0
                max_value = get_value(max_chunk)
                break
        else:
            return

        for index, chunk in enumerate(chunks, 1):
            if not chunk.visible:
                yield chunk
                continue
            value = get_value(chunk)
            try:
                is_max = value > max_value
            except TypeError:
                if max_value is None:
                    self.log_info(
                        F'Discarding chunk {max_index} in favor of {index} because {key} was not '
                        F'set on the former; new maximum is {value!r}.')
                    is_max = True
                else:
                    self.log_info(
                        F'Discarding chunk {index} because {key} had value {value!r}; it could not '
                        F'be compared to the current maximum {max_value!r} on chunk {max_index}.')
                    is_max = False
            if is_max:
                max_value = value
                max_chunk = chunk
                max_index = index

        yield max_chunk

class md2 (text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: md2 [-h] [-L] [-Q] [-0] [-v] [-t]

Returns the MD2 hash of the input data.

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class md2(HashUnit):
    """
    Returns the MD2 hash of the input data.
    """
    def _algorithm(self, data):
        from Cryptodome.Hash import MD2
        return MD2.new(data)

class md4 (text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: md4 [-h] [-L] [-Q] [-0] [-v] [-t]

Returns the MD4 hash of the input data.

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class md4(HashUnit):
    """
    Returns the MD4 hash of the input data.
    """
    def _algorithm(self, data):
        from Cryptodome.Hash import MD4
        return MD4.new(data)

class md5 (text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: md5 [-h] [-L] [-Q] [-0] [-v] [-t]

Returns the MD5 hash of the input data.

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class md5(HashUnit):
    """
    Returns the MD5 hash of the input data.
    """
    def _algorithm(self, data):
        return hashlib.md5(data)

class mimewords

This unit is implemented in refinery.units.pattern.mimewords and has the following commandline Interface:

usage: mimewords [-h] [-L] [-Q] [-0] [-v]

Implements the decoding of MIME encoded-word syntax from RFC-2047.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class mimewords(Unit):
    """
    Implements the decoding of MIME encoded-word syntax from RFC-2047.
    """

    @unicoded
    def process(self, data: str) -> str:
        def replacer(match):
            self.log_info('encoded mime word:', match[0])
            decoded, = decode_header(match[0])
            raw, codec = decoded
            return codecs.decode(raw, codec, errors='surrogateescape')
        return re.sub(R"=(?:\?[^\?]*){3}\?=", replacer, data)

class min_ (key=None)

This unit is implemented in refinery.units.meta.min and has the following commandline Interface:

usage: min [-h] [-L] [-Q] [-0] [-v] [key]

Picks the minimum of all elements in the current frame.

positional arguments:
  key            A meta variable expression to sort by instead of sorting the content.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class min_(Unit):
    """
    Picks the minimum of all elements in the current `refinery.lib.frame`.
    """

    def __init__(
        self,
        key: Arg('key', type=str, help='A meta variable expression to sort by instead of sorting the content.') = None,
    ):
        super().__init__(key=key)

    def filter(self, chunks: Iterable[Chunk]):
        def get_value(chunk: Chunk):
            if key is None:
                return chunk
            return metavars(chunk).get(key)

        key = self.args.key
        it = iter(chunks)

        for min_chunk in it:
            if not min_chunk.visible:
                yield min_chunk
            else:
                min_index = 0
                min_value = get_value(min_chunk)
                break
        else:
            return

        for index, chunk in enumerate(chunks, 1):
            if not chunk.visible:
                yield chunk
                continue
            value = get_value(chunk)
            try:
                is_min = value < min_value
            except TypeError:
                if min_value is None:
                    self.log_info(
                        F'Discarding chunk {min_index} in favor of {index} because {key} was not '
                        F'set on the former; new minimum is {value!r}.')
                    is_min = True
                else:
                    self.log_info(
                        F'Discarding chunk {index} because {key} had value {value!r}; it could not '
                        F'be compared to the current minimum {min_value!r} on chunk {min_index}.')
                    is_min = False
            if is_min:
                min_value = value
                min_chunk = chunk
                min_index = index

        yield min_chunk

class mmh128x32 (seed=0, text=False)

This unit is implemented in refinery.units.crypto.hash.murmur and has the following commandline Interface:

usage: mmh128x32 [-h] [-L] [-Q] [-0] [-v] [-t] [N]

Returns the 128bit Murmur Hash of the input data, 64bit variant.

positional arguments:
  N              optional seed value

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class mmh128x32(MurMurHash):
    """
    Returns the 128bit Murmur Hash of the input data, 64bit variant.
    """
    def _algorithm(self, data: bytes) -> bytes:
        return mmh128digest32(data, self.args.seed)

class mmh128x64 (seed=0, text=False)

This unit is implemented in refinery.units.crypto.hash.murmur and has the following commandline Interface:

usage: mmh128x64 [-h] [-L] [-Q] [-0] [-v] [-t] [N]

Returns the 128bit Murmur Hash of the input data, 64bit variant.

positional arguments:
  N              optional seed value

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class mmh128x64(MurMurHash):
    """
    Returns the 128bit Murmur Hash of the input data, 64bit variant.
    """
    def _algorithm(self, data: bytes) -> bytes:
        return mmh128digest64(data, self.args.seed)

class mmh32 (seed=0, text=False)

This unit is implemented in refinery.units.crypto.hash.murmur and has the following commandline Interface:

usage: mmh32 [-h] [-L] [-Q] [-0] [-v] [-t] [N]

Returns the 32bit Murmur Hash of the input data.

positional arguments:
  N              optional seed value

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class mmh32(MurMurHash):
    """
    Returns the 32bit Murmur Hash of the input data.
    """
    def _algorithm(self, data: bytes) -> bytes:
        return mmh32digest(data, self.args.seed)

class mscdk (size, hash='MD5')

This unit is implemented in refinery.units.crypto.keyderive.mscdk and has the following commandline Interface:

usage: mscdk [-h] [-L] [-Q] [-0] [-v] size [hash]

An implementation of the CryptDeriveKey routine available from the Win32 API.

positional arguments:
  size           The number of bytes to generate.
  hash           Specify one of these algorithms (default is MD5): md2, md4, md5, sha1, sha256,
                 sha512, sha224, sha384

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class mscdk(KeyDerivation):
    """
    An implementation of the CryptDeriveKey routine available from the Win32 API.
    """

    def __init__(self, size, hash='MD5'):
        super().__init__(size=size, salt=None, hash=hash)

    def process(self, data):
        def digest(x):
            return self.hash.new(x).digest()
        size = self.args.size
        if self.args.hash in (HASH.SHA224, HASH.SHA256, HASH.SHA384, HASH.SHA512):
            buffer = digest(data)
            max_size = len(buffer)
        else:
            max_size = 2 * self.hash.digest_size
            value = digest(data)
            del data
            buffer1 = bytearray([0x36] * 64)
            buffer2 = bytearray([0x5C] * 64)
            for k, b in enumerate(value):
                buffer1[k] ^= b
                buffer2[k] ^= b
            buffer = digest(buffer1) + digest(buffer2)
        if size > max_size:
            raise RefineryPartialResult(F'too many bytes requested, can only provide {max_size}', partial=buffer)
        return buffer[:size]

class mscf (mode=None)

This unit is implemented in refinery.units.compression.mscf and has the following commandline Interface:

usage: mscf [-h] [-L] [-Q] [-0] [-v] [MODE]

The Microsoft Compression Format unit implements the format and algorithms used by the Microsoft
Compression API. The implementation for LZMS is currently missing, but MSZIP and XPRESS (both
with and without Huffman table) are supported. This pure Python implementation is very slow when
compared to native code, so decompressing very large inputs can take several minutes.

positional arguments:
  MODE           Manually select decompression mode (mszip, xpress, xpress-huff, lzms); by
                 default the unit attempts to derive the mode from the header, but this will fail
                 for raw streams. However, even if a header is found, a manually specified mode
                 will take precedence.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class mscf(Unit):
    """
    The Microsoft Compression Format unit implements the format and algorithms used by the Microsoft
    Compression API. The implementation for LZMS is currently missing, but MSZIP and XPRESS (both
    with and without Huffman table) are supported. This pure Python implementation is very slow when
    compared to native code, so decompressing very large inputs can take several minutes.
    """

    _SIGNATURE = B'\x0A\x51\xE5\xC0'

    def __init__(
        self,
        mode: Unit.Arg.Option(choices=MODE, help=(
            'Manually select decompression mode ({choices}); by default the unit attempts to derive the '
            'mode from the header, but this will fail for raw streams. However, even if a header is '
            'found, a manually specified mode will take precedence.')) = None,
    ):
        mode = Unit.Arg.AsOption(mode, MODE)
        super().__init__(mode=mode)

    def process(self, data):
        mode: MODE = self.args.mode
        with StructReader(memoryview(data)) as reader, MemoryFile() as writer:
            reader: StructReader[memoryview]
            check = zlib.crc32(reader.peek(6))
            magic = reader.read(4)
            if magic != self._SIGNATURE:
                if mode is None:
                    self.log_warn(
                        F'data starts with {magic.hex().upper()} rather than the expected sequence '
                        F'{self._SIGNATURE.hex().upper()}; this could be a raw stream.')
                else:
                    reader.seek(0)
                    handler = self._get_handler(mode)
                    handler(reader, writer, None)
                    return writer.getbuffer()

            header_size = reader.u16()
            if header_size != 24:
                self.log_warn(F'the header size {header_size} was not equal to 24')

            crc32byte = reader.u8()
            check = zlib.crc32(reader.peek(0x11), check) & 0xFF
            if check != crc32byte:
                self.log_warn(F'the CRC32 check byte was {crc32byte}, computed value was {check}')

            _mode_code = reader.u8()

            try:
                _mode = MODE(_mode_code)
            except ValueError:
                msg = F'header contains unknown compression type code {_mode_code}'
                if mode is None:
                    raise ValueError(msg)
                else:
                    self.log_warn(msg)
            else:
                if mode is not None and mode != _mode:
                    logger = self.log_warn
                else:
                    logger = self.log_info
                    mode = _mode
                logger(F'header specifies algorithm {_mode.name}')

            self.log_info(F'using algorithm {mode.name}')
            decompress = self._get_handler(mode)

            final_size = reader.u32()
            _unknown_1 = reader.u32()
            chunk_size = reader.u32()
            _unknown_2 = reader.u32()

            if _unknown_1 != 0:
                self.log_warn(F'unknown value 1 was unexpectedly nonzero: 0x{_unknown_1:08X}')
            if _unknown_2 != 0:
                self.log_warn(F'unknown value 2 was unexpectedly nonzero: 0x{_unknown_2:08X}')

            self.log_debug(F'final size: 0x{final_size:08X}')
            self.log_debug(F'chunk size: 0x{chunk_size:08X}')

            if chunk_size > COMPRESS_MAX_CHUNK:
                raise ValueError('the header chunk size is greater than the maximum value')

            while len(writer) < final_size:
                src_size = reader.u32()
                src_data = reader.read(src_size)
                if len(src_data) != src_size:
                    raise IndexError(F'Attempted to read {src_size} bytes, but got only {len(src_data)}.')
                if src_size + len(writer) == final_size:
                    self.log_debug(F'final chunk is uncompressed, appending {src_size} raw bytes to output')
                    writer.write(src_data)
                    break
                self.log_debug(F'reading chunk of size {src_size}')
                start = writer.tell()
                chunk = StructReader(src_data)
                target = min(chunk_size, final_size - len(writer))
                decompress(chunk, writer, target)
                writer.flush()
                written = writer.tell() - start
                if written != target:
                    raise RuntimeError(F'decompressed output had unexpected size {written} instead of {chunk_size}')

            if not reader.eof:
                self.log_info(F'compression complete with {reader.remaining_bytes} bytes remaining in input')
            return writer.getbuffer()

    def _get_handler(self, mode: MODE) -> Callable[[StructReader, MemoryFile, Optional[int]], None]:
        decompress = {
            mode.MSZIP       : self._decompress_mszip,
            mode.XPRESS_HUFF : self._decompress_xpress_huffman,
            mode.XPRESS      : self._decompress_xpress,
        }.get(mode, None)
        if decompress is None:
            raise NotImplementedError(F'algorithm {mode.name} is not yet implemented')
        return decompress

    def _decompress_mszip(self, reader: StructReader, writer: MemoryFile, target: Optional[int] = None):
        header = bytes(reader.read(2))
        if header != B'CK':
            raise ValueError(F'chunk did not begin with CK header, got {header!r} instead')
        decompress = zlib.decompressobj(-zlib.MAX_WBITS, zdict=writer.getbuffer())
        writer.write(decompress.decompress(reader.read()))
        writer.write(decompress.flush())

    def _decompress_xpress_huffman(
        self,
        reader: StructReader,
        writer: MemoryFile,
        target: Optional[int] = None,
        max_chunk_size: int = 0x10000
    ) -> None:
        limit = writer.tell()
        if target is not None:
            target += limit

        while not reader.eof:

            if reader.remaining_bytes < XPRESS_NUM_SYMBOLS // 2:
                raise IndexError(
                    F'There are only {reader.remaining_bytes} bytes reamining in the input buffer,'
                    F' but at least {XPRESS_NUM_SYMBOLS // 2} are required to read a Huffman table.')

            table = bytearray(reader.read_integer(4) for _ in range(XPRESS_NUM_SYMBOLS))
            table = make_huffman_decode_table(table, XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN)
            limit = limit + max_chunk_size
            flags = BitBufferedReader(reader, 16)

            while True:
                position = writer.tell()
                if position == target:
                    if reader.remaining_bytes:
                        self.log_info(F'chunk decompressed with {reader.remaining_bytes} bytes remaining in input buffer')
                    return
                if position >= limit:
                    if position > limit:
                        limit = position
                        self.log_info(F'decompression of one chunk generated more than the limit of {max_chunk_size} bytes')
                    flags.collect()
                    break
                try:
                    sym = read_huffman_symbol(flags, table, XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN)
                except EOFError:
                    self.log_debug('end of file while reading huffman symbol')
                    break
                if sym < XPRESS_NUM_CHARS:
                    writer.write_byte(sym)
                    continue
                length = sym & 0xF
                offsetlog = (sym >> 4) & 0xF
                flags.collect()
                if reader.eof:
                    break
                offset = (1 << offsetlog) | flags.read(offsetlog)
                if length == 0xF:
                    nudge = reader.read_byte()
                    if nudge < 0xFF:
                        length += nudge
                    else:
                        length = reader.u16() or reader.u32()
                length += XPRESS_MIN_MATCH_LEN
                writer.replay(offset, length)

    def _decompress_xpress(self, reader: StructReader, writer: MemoryFile, target: Optional[int] = None) -> bytearray:
        if target is not None:
            target += writer.tell()
        flags = BitBufferedReader(reader)
        nibble_cache = None
        while not reader.eof:
            if target is not None and writer.tell() >= target:
                return
            if not flags.next():
                writer.write(reader.read(1))
                continue
            offset, length = divmod(reader.u16(), 8)
            offset += 1
            if length == 7:
                length = nibble_cache
                if length is None:
                    length_pair = reader.u8()
                    nibble_cache = length_pair >> 4
                    length = length_pair & 0xF
                else:
                    nibble_cache = None
                if length == 15:
                    length = reader.u8()
                    if length == 0xFF:
                        length = reader.u16() or reader.u32()
                        length -= 22
                        if length < 0:
                            raise RuntimeError(F'Invalid match length of {length} for long delta sequence')
                    length += 15
                length += 7
            length += 3
            writer.replay(offset, length)

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        sig = cls._SIGNATURE
        if data[:len(sig)] == sig:
            return True

class msgpack

This unit is implemented in refinery.units.formats.msgpack and has the following commandline Interface:

usage: msgpack [-h] [-L] [-Q] [-0] [-v] [-R]

Converts a message-pack (msgpack) buffer to JSON and vice-versa.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class msgpack(Unit):
    """
    Converts a message-pack (msgpack) buffer to JSON and vice-versa.
    """
    def reverse(self, data):
        return mp.dumps(json.loads(data))

    def process(self, data):
        unpacker = mp.Unpacker(MemoryFile(data, read_as_bytes=True))
        while True:
            try:
                item = unpacker.unpack()
            except mp.exceptions.OutOfData:
                position = unpacker.tell()
                if position < len(data):
                    self.log_warn("oops")
                break
            except Exception as E:
                position = unpacker.tell()
                if not position:
                    raise
                view = memoryview(data)
                raise RefineryPartialResult(str(E), view[position:])
            else:
                yield json.dumps(item).encode(self.codec)

class mspdb (size, salt, iter=100, hash='SHA1')

This unit is implemented in refinery.units.crypto.keyderive.mspdb and has the following commandline Interface:

usage: mspdb [-h] [-L] [-Q] [-0] [-v] size salt [iter] [hash]

An implementation of the PasswordDeriveBytes routine available from the .NET standard library.
According to documentation, it is an extension of PBKDF1.

positional arguments:
  size           The number of bytes to generate.
  salt           Salt for the derivation.
  iter           Number of iterations; default is 100.
  hash           Specify one of these algorithms (default is SHA1): md2, md4, md5, sha1, sha256,
                 sha512, sha224, sha384

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class mspdb(KeyDerivation):
    """
    An implementation of the PasswordDeriveBytes routine available from the .NET
    standard library. According to documentation, it is an extension of PBKDF1.
    """
    def __init__(self, size, salt, iter=100, hash='SHA1'):
        self.superinit(super(), **vars())

    def process(self, data):
        if self.codec != 'UTF8':
            data = data.decode(self.codec).encode('UTF8')
        data += self.args.salt
        for _ in range(self.args.iter - 1):
            data = self.hash.new(data).digest()
        counter, seedhash = 1, data
        data = self.hash.new(data).digest()
        while len(data) < self.args.size:
            data += self.hash.new(B'%d%s' % (counter, seedhash)).digest()
            counter += 1
        return data[:self.args.size]

class mvg (*names, top=False)

This unit is implemented in refinery.units.meta.mvg and has the following commandline Interface:

usage: mvg [-h] [-L] [-Q] [-0] [-v] [-t] [name [name ...]]

Short for "Make Variable Global": This unit can move meta variables into the scope of the parent
frame. If used at the end of a frame, the variables will be moved the scope of the frame that the
pipeline will return to. Otherwise and if the --top switch is being used, variables will be moved
to scope 0, i.e. to the topmost frame in the current tree.

Note that it is not possible to promote a variable to a parent frame if that variable does not
have the same value on all chunks in the current frame - such variables will always be removed
when the frame closes.

positional arguments:
  name           Name of a variable to be removed. If no variables are explicitly specified, all
                 variables in the current chunk will be rescoped.

optional arguments:
  -t, --top      Move the variable(s) to the topmost frame layer.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class mvg(Unit):
    """
    Short for "Make Variable Global": This unit can move meta variables into the scope of the
    parent frame. If used at the end of a frame, the variables will be moved the scope of the
    frame that the pipeline will return to. Otherwise and if the --top switch is being used,
    variables will be moved to scope 0, i.e. to the topmost frame in the current tree.

    Note that it is not possible to promote a variable to a parent frame if that variable does not
    have the same value on all chunks in the current frame - such variables will always be removed
    when the frame closes.
    """
    def __init__(
        self,
        *names: Arg(type=str, metavar='name', help=(
            'Name of a variable to be removed. If no variables are explicitly specified, all '
            'variables in the current chunk will be rescoped.'
        )),
        top: Arg.Switch('-t', help='Move the variable(s) to the topmost frame layer.') = False
    ):
        super().__init__(names=names, top=top)

    def process(self, data):
        meta = metavars(data)
        nest = self.args.nesting
        if nest < 0 and not self.args.top:
            spot = meta.scope + nest
        else:
            spot = 1
        for name in self.args.names or meta.variable_names():
            try:
                if meta.get_scope(name) <= spot:
                    continue
                meta.set_scope(name, spot)
            except KeyError:
                self.log_info(F'variable not defined: {name}')
        return data

class n40 (key)

This unit is implemented in refinery.units.malware.n40 and has the following commandline Interface:

usage: n40 [-h] [-L] [-Q] [-0] [-v] key

Decrypts hex-encoded strings in various latin-american banker families, including N40.

positional arguments:
  key            Decryption key.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class n40(Unit):
    """
    Decrypts hex-encoded strings in various latin-american banker families, including N40.
    """
    def __init__(self, key: Arg(help='Decryption key.')):
        ...

    def process(self, data):
        try:
            data = b16decode(data, casefold=True)
        except Error:
            self.log_info('Input was not hex-encoded; ignoring this step.')
        mask = data[1:] | xor(self.args.key) | bytearray
        return bytearray(0xFF + b - a if b <= a else b - a for a, b in zip(data, mask))

class neg (bigendian=False, blocksize=None)

This unit is implemented in refinery.units.blockwise.neg and has the following commandline Interface:

usage: neg [-h] [-L] [-Q] [-0] [-v] [-E] [-B N]

Each block of the input data is negated bitwise. This is sometimes also called the bitwise
complement or inverse.

optional arguments:
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes, default is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class neg(UnaryOperation):
    """
    Each block of the input data is negated bitwise. This is sometimes
    also called the bitwise complement or inverse.
    """
    def operate(self, a): return ~a
    def inplace(self, a): a ^= self.fmask

class netbios (key=b'A')

This unit is implemented in refinery.units.encoding.netbios and has the following commandline Interface:

usage: netbios [-h] [-L] [-Q] [-0] [-v] [-R] [key]

Encodes and decodes strings using the same algorithm that is used for NetBIOS labels. Each byte
0xUL is encoded as two bytes, which are the sum of 0xU and 0xL with an offset character,
respectively. The default offset is the capital letter A.

positional arguments:
  key            Provide a single letter to use as the offset.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class netbios(Unit):
    """
    Encodes and decodes strings using the same algorithm that is used for NetBIOS
    labels. Each byte 0xUL is encoded as two bytes, which are the sum of 0xU and
    0xL with an offset character, respectively. The default offset is the capital
    letter A.
    """

    def __init__(self, key: Arg(help="Provide a single letter to use as the offset.") = B'A'):
        if len(key) != 1:
            raise ValueError("The key must be a binary string of length exactly 1")
        super().__init__(key=key[0])

    def reverse(self, data):
        result = bytearray(2 * len(data))
        for k, byte in enumerate(data):
            hi, lo = byte >> 4, byte & 15
            result[2 * k + 0] = hi + self.args.key
            result[2 * k + 1] = lo + self.args.key
        return result

    def process(self, data):
        def merge(it):
            while True:
                try:
                    hi = next(it) - self.args.key
                    lo = next(it) - self.args.key
                    if hi not in range(16) or lo not in range(16):
                        raise ValueError(F'Invalid character encoding detected: hi={hi:X}, lo={lo:X}.')
                    yield (hi << 4) | lo
                except StopIteration:
                    break
        return bytearray(merge(iter(data)))

class ngrams (size=slice(2, None, None))

This unit is implemented in refinery.units.strings.ngrams and has the following commandline Interface:

usage: ngrams [-h] [-L] [-Q] [-0] [-v] [start:end:step]

Extract all n-grams from the input. The algorithm is naive, i.e. it simply iterates all n-grams
and deduplicates using a set data structure. The number n is taken from an arbitrary range given
as a Python slice expression.

positional arguments:
  start:end:step  Specifies the sizes of each n-gram, i.e. the number n. Defaults to 2:.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Allow partial results as output.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class ngrams(Unit):
    """
    Extract all n-grams from the input. The algorithm is naive, i.e. it simply iterates all n-grams
    and deduplicates using a set data structure. The number n is taken from an arbitrary range given
    as a Python slice expression.
    """
    def __init__(
        self, size: Arg.Bounds(
            help='Specifies the sizes of each n-gram, i.e. the number n. Defaults to {default}.') = slice(2, None),
    ):
        super().__init__(size=size)

    def process(self, data: bytearray):
        for n in integers_of_slice(self.args.size):
            if n > len(data):
                break
            deduplicator = set()
            view = memoryview(data)
            for index in range(len(data) - n + 1):
                block = bytes(view[index:index + n])
                if block in deduplicator:
                    continue
                deduplicator.add(block)
                yield self.labelled(block, offset=index)

class nop

This unit is implemented in refinery.units.misc.nop and has the following commandline Interface:

usage: nop [-h] [-L] [-Q] [-0] [-v]

The unit generates the exact output that was received as input. All unknown arguments passed to
nop are completely ignored, which is different from the behavior of other units. As such, nop can
be used to comment out other units in longer refinery pipelines by simply prefixing a command
with nop.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class nop(Unit):
    """
    The unit generates the exact output that was received as input. All unknown arguments passed
    to nop are completely ignored, which is different from the behavior of other units. As such,
    nop can be used to comment out other units in longer refinery pipelines by simply prefixing a
    command with nop.
    """
    @classmethod
    def argparser(cls, **keywords):
        argp = NopArgParser(
            keywords, prog=cls.name, description=documentation(cls), add_help=False)
        argp.set_defaults(nesting=0)
        return cls._interface(argp)

Static methods

def argparser(**keywords)

Expand source code Browse git

@classmethod
def argparser(cls, **keywords):
    argp = NopArgParser(
        keywords, prog=cls.name, description=documentation(cls), add_help=False)
    argp.set_defaults(nesting=0)
    return cls._interface(argp)

class nrv2b (bits=32)

This unit is implemented in refinery.units.compression.nrv and has the following commandline Interface:

usage: nrv2b [-h] [-L] [-Q] [-0] [-v] [N]

Decompress data using the NRV2B algorithm.

positional arguments:
  N              Specify the number of codec bits. The default is 32.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class nrv2b(NRVUnit):
    """
    Decompress data using the NRV2B algorithm.
    """
    def _decompress(self, src: StructReader, dst: MemoryFile, bb: BitBufferedReader):
        last_offset = 1
        while not src.eof:
            while next(bb):
                dst.write_byte(src.read_byte())
            offset = 2 + next(bb)
            while not next(bb):
                offset = 2 * offset + next(bb)
            if offset == 2:
                offset = last_offset
            else:
                offset = (offset - 3) * 0x100 + src.read_byte()
                if offset & 0xFFFFFFFF == 0xFFFFFFFF:
                    break
                offset += 1
                last_offset = offset
            length = next(bb)
            length = 2 * length + next(bb)
            if length == 0:
                length = 2 + next(bb)
                while not next(bb):
                    length = 2 * length + next(bb)
                length += 2
            length += int(bool(offset > 0xD00))
            dst.replay(offset, length + 1)

class nrv2d (bits=32)

This unit is implemented in refinery.units.compression.nrv and has the following commandline Interface:

usage: nrv2d [-h] [-L] [-Q] [-0] [-v] [N]

Decompress data using the NRV2D algorithm.

positional arguments:
  N              Specify the number of codec bits. The default is 32.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class nrv2d(NRVUnit):
    """
    Decompress data using the NRV2D algorithm.
    """
    def _decompress(self, src: StructReader, dst: MemoryFile, bb: BitBufferedReader):
        last_offset = 1
        while not src.eof:
            while next(bb):
                dst.write_byte(src.read_byte())
            offset = 2 + next(bb)
            while not next(bb):
                offset = 2 * (offset - 1) + next(bb) # noqa
                offset = 2 *  offset      + next(bb) # noqa
            if offset == 2:
                offset = last_offset
                length = next(bb)
            else:
                offset = (offset - 3) * 0x100 + src.read_byte()
                if offset & 0xFFFFFFFF == 0xFFFFFFFF:
                    break
                length = (offset  ^ 1) & 1 # noqa
                offset = (offset >> 1) + 1
                last_offset = offset
            length = 2 * length + next(bb)
            if length == 0:
                length = 2 + next(bb)
                while not next(bb):
                    length = 2 * length + next(bb)
                length += 2
            length += int(bool(offset > 0x500))
            dst.replay(offset, length + 1)

class nrv2e (bits=32)

This unit is implemented in refinery.units.compression.nrv and has the following commandline Interface:

usage: nrv2e [-h] [-L] [-Q] [-0] [-v] [N]

Decompress data using the NRV2E algorithm.

positional arguments:
  N              Specify the number of codec bits. The default is 32.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class nrv2e(NRVUnit):
    """
    Decompress data using the NRV2E algorithm.
    """
    def _decompress(self, src: StructReader, dst: MemoryFile, bb: BitBufferedReader):
        last_offset = 1
        while not src.eof:
            while next(bb):
                dst.write_byte(src.read_byte())
            offset = 2 + next(bb)
            while not next(bb):
                offset = 2 * (offset - 1) + next(bb) # noqa
                offset = 2 *  offset      + next(bb) # noqa
            if offset == 2:
                offset = last_offset
                length = next(bb)
            else:
                offset = (offset - 3) * 0x100 + src.read_byte()
                if offset & 0xFFFFFFFF == 0xFFFFFFFF:
                    break
                length = (offset ^  1) & 1 # noqa
                offset = (offset >> 1) + 1
                last_offset = offset
            if length:
                length = 1 + next(bb)
            elif next(bb):
                length = 3 + next(bb)
            else:
                length = 2 + next(bb)
                while not next(bb):
                    length = 2 * length + next(bb)
                length += 3
            length += int(bool(offset > 0x500))
            dst.replay(offset, length + 1)

class ntlm (text=False)

This unit is implemented in refinery.units.crypto.hash.password_hashes and has the following commandline Interface:

usage: ntlm [-h] [-L] [-Q] [-0] [-v] [-t]

Returns the Windows NTLM hash of the input.

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class ntlm(HashUnit):
    """
    Returns the Windows NTLM hash of the input.
    """
    def _algorithm(self, data: bytes) -> bytes:
        from Cryptodome.Hash import MD4
        return MD4.new(data.decode(self.codec).encode('utf-16le'))

class officecrypt (password=b'VelvetSweatshop')

This unit is implemented in refinery.units.formats.office.officecrypt and has the following commandline Interface:

usage: officecrypt [-h] [-L] [-Q] [-0] [-v] [password]

A simple proxy for the msoffcrypto package to decrypt office documents.

positional arguments:
  password       The document password. By default, the Excel default password "VelvetSweatshop"
                 is used.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class officecrypt(Unit):
    """
    A simple proxy for the `msoffcrypto` package to decrypt office documents.
    """

    def __init__(self, password: Arg.Binary(help=(
        'The document password. By default, the Excel default password "{default}" is used.'
    )) = b'VelvetSweatshop'):
        super().__init__(password=password)

    @Unit.Requires('msoffcrypto-tool', 'formats', 'office')
    def _msoffcrypto():
        import msoffcrypto
        return msoffcrypto

    def process(self, data):
        password: bytes = self.args.password
        with MemoryFile(data) as stream:
            doc = self._msoffcrypto.OfficeFile(stream)
            if not doc.is_encrypted():
                self.log_warn('the document is not encrypted; returning input')
                return data
            if password:
                doc.load_key(password=password.decode(self.codec))
            with MemoryFile(bytearray()) as output:
                doc.decrypt(output)
                return output.getvalue()

class opc (mode='x32', *, count=None, until=None, nvar='name', avar='addr', ovar='arg')

This unit is implemented in refinery.units.formats.exe.opc and has the following commandline Interface:

usage: opc [-h] [-L] [-Q] [-0] [-v] [-c N] [-u STR] [-n STR] [-a STR] [-o STR] [[x32|x64|..]]

Disassembles the input data using capstone and generates opcodes with metadata as output. This is
useful for programmatic disassembly, while the asm unit outputs a human-readable representation.
Internally, asm uses this unit and pretty-prints the output.

positional arguments:
  [x32|x64|..]     Machine code architecture, default is x32. Select from the following list:
                   x16, x32, x64, ppc32, ppc64, mips32, mips64.

optional arguments:
  -c, --count N    Maximum number of bytes to disassemble, infinite by default.
  -u, --until STR  Disassemble until the given string appears among the disassembly.
  -n, --nvar STR   Variable to receive the disassembled mnemonic. Default is "name".
  -a, --avar STR   Variable to receive the address of the instruction. Default is "addr".
  -o, --ovar STR   Variable prefix for instruction operands. Default is "arg". The complete
                   operand string will be in args, the first argument in arg1, the second in
                   arg2, and so on.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class opc(Unit):
    """
    Disassembles the input data using capstone and generates opcodes with metadata as output. This
    is useful for programmatic disassembly, while the `refinery.asm` unit outputs a human-readable
    representation. Internally, `refinery.asm` uses this unit and pretty-prints the output.
    """
    def __init__(
        self,
        mode: Arg.Choice(
            help='Machine code architecture, default is {default}. Select from the following list: {choices}.',
            choices=_ARCHES, metavar='[x32|x64|..]') = 'x32', *,
        count: Arg.Number('-c', help='Maximum number of bytes to disassemble, infinite by default.') = None,
        until: Arg.String('-u', help='Disassemble until the given string appears among the disassembly.') = None,
        nvar: Arg.String('-n', help=(
            'Variable to receive the disassembled mnemonic. Default is "{default}".')) = 'name',
        avar: Arg.String('-a', help=(
            'Variable to receive the address of the instruction. Default is "{default}".')) = 'addr',
        ovar: Arg.String('-o', help=(
            'Variable prefix for instruction operands. Default is "{default}". The complete operand '
            'string will be in {default}s, the first argument in {default}1, the second in {default}2, '
            'and so on.')) = 'arg',
        **more
    ):
        super().__init__(
            mode=mode,
            count=count,
            until=until,
            nvar=nvar,
            avar=avar,
            ovar=ovar,
            **more)

    @Unit.Requires('capstone')
    def _capstone():
        import capstone
        return capstone

    @property
    def _capstone_engine(self) -> Cs:
        cs = self._capstone
        return cs.Cs(*{
            'arm'    : (cs.CS_ARCH_ARM, cs.CS_MODE_ARM),
            'mips32' : (cs.CS_ARCH_MIPS, cs.CS_MODE_MIPS32),
            'mips64' : (cs.CS_ARCH_MIPS, cs.CS_MODE_MIPS64),
            'ppc32'  : (cs.CS_ARCH_PPC, cs.CS_MODE_32),
            'ppc64'  : (cs.CS_ARCH_PPC, cs.CS_MODE_64),
            'x16'    : (cs.CS_ARCH_X86, cs.CS_MODE_16),
            'x32'    : (cs.CS_ARCH_X86, cs.CS_MODE_32),
            'x64'    : (cs.CS_ARCH_X86, cs.CS_MODE_64),
        }.get(self.args.mode.lower()))

    def process(self, data):
        count = self.args.count or 0
        until = self.args.until
        nvar = self.args.nvar
        avar = self.args.avar
        ovar = self.args.ovar
        if isinstance(until, str):
            until = until.lower()
        for insn in self._capstone_engine.disasm(data, 0, count):
            kwargs = {
                avar: insn.address,
                nvar: insn.mnemonic,
            }
            try:
                ops = insn.op_str
                operands = [op.strip() for op in ops.split(',')]
            except Exception:
                operands = []
            else:
                kwargs[F'{ovar}s'] = ops
            for k, op in enumerate(operands, 1):
                if not op:
                    break
                try:
                    op = int(op, 0)
                except Exception:
                    pass
                kwargs[F'{ovar}{k}'] = op
            yield self.labelled(insn.bytes, **kwargs)
            if until is None:
                continue
            if until in ops.lower() or until in insn.mnemonic.lower():
                break

class pack (base=0, prefix=False, strict=False, width=0, single_floats=False, double_floats=False, bigendian=False, blocksize=None)

This unit is implemented in refinery.units.blockwise.pack and has the following commandline Interface:

usage: pack [-h] [-L] [-Q] [-0] [-v] [-R] [-r] [-s] [-w N] [-f] [-d] [-E] [-B N] [base]

Scans the input data for numeric constants and packs them into a binary format. This is useful to
convert the textual representation of an array of numbers into its binary form. For example,
123,34,256,12,1,234 would be transformed into the byte sequence 7B22000C01EA, where 256 was
wrapped and packed as a null byte because the default block size is one byte. If the above
sequence would be packed with options -EB2, the result would be equal to 007B00220100000C000100EA
in hexadecimal.

positional arguments:
  base                 Find only numbers in given base. Default of 0 means that common
                       expressions for hexadecimal, octal and binary are accepted.

optional arguments:
  -r, --prefix         Add numeric prefixes like 0x, 0b, and 0o in reverse mode.
  -s, --strict         Only parse integers that fit in one block of the given block size.
  -w, --width N        Pad numbers with the specified amount of leading zeros.
  -f, --single-floats  Pack single-precision floating-point numbers. Implies -B4.
  -d, --double-floats  Pack double-precision floating-point numbers. Implies -B8.
  -E, --bigendian      Read chunks in big endian.
  -B, --blocksize N    The size of each block in bytes, default is 1.

generic options:
  -h, --help           Show this help message and exit.
  -L, --lenient        Allow partial results as output.
  -Q, --quiet          Disables all log output.
  -0, --devnull        Do not produce any output.
  -v, --verbose        Specify up to two times to increase log level.
  -R, --reverse        Use the reverse operation; Specify twice to normalize (first decode, then
                       encode).

Expand source code Browse git

class pack(BlockTransformationBase):
    """
    Scans the input data for numeric constants and packs them into a binary format. This is useful to convert the textual representation of
    an array of numbers into its binary form. For example, `123,34,256,12,1,234` would be transformed into the byte sequence `7B22000C01EA`,
    where `256` was wrapped and packed as a null byte because the default block size is one byte. If the above sequence would be packed
    with options -EB2, the result would be equal to `007B00220100000C000100EA` in hexadecimal.
    """

    def __init__(self,
        base: Arg(type=number[2:36], help=(
            'Find only numbers in given base. Default of 0 means that '
            'common expressions for hexadecimal, octal and binary are '
            'accepted.')) = 0,
        prefix : Arg.Switch('-r', group='FLT', help='Add numeric prefixes like 0x, 0b, and 0o in reverse mode.') = False,
        strict : Arg.Switch('-s', help='Only parse integers that fit in one block of the given block size.') = False,
        width  : Arg.Number('-w', help='Pad numbers with the specified amount of leading zeros.') = 0,
        single_floats: Arg.Switch('-f', group='FLT', help='Pack single-precision floating-point numbers. Implies -B4.') = False,
        double_floats: Arg.Switch('-d', group='FLT', help='Pack double-precision floating-point numbers. Implies -B8.') = False,
        bigendian=False, blocksize=None
    ):
        if single_floats and double_floats:
            raise ValueError('The floats and doubles option are mutually exclusive.')
        elif single_floats:
            fmode = FMode.SINGLE
            blocksize = 4
        elif double_floats:
            fmode = FMode.DOUBLE
            blocksize = 8
        else:
            fmode = FMode.TO_INT
        super().__init__(
            base=base,
            prefix=prefix,
            strict=strict,
            width=width,
            bigendian=bigendian,
            blocksize=blocksize,
            fmode=fmode,
            _truncate=2,
        )

    @property
    def bytestream(self):
        # never alow bytes to be left unchunked
        return False

    def reverse(self, data):
        base = self.args.base or 10
        width = self.args.width
        mode: FMode = self.args.fmode
        prefix = B''

        self.log_debug(F'using base {base:d}')

        if self.args.prefix:
            prefix = {
                0x02: b'0b',
                0x08: b'0o',
                0x10: b'0x'
            }.get(base, B'')

        if mode is FMode.TO_INT:
            converter = BaseUnit(
                base,
                little_endian=not self.args.bigendian,
                strip_padding=True,
            )
            for n in self.chunk_into_bytes(data):
                converted = converter.reverse(n)
                if width:
                    converted = converted.rjust(width, B'0')
                if prefix:
                    converted = prefix + converted
                yield converted
            return

        elif mode is FMode.SINGLE:
            float_format = 'f'
            float_size = 4

        elif mode is FMode.DOUBLE:
            float_format = 'd'
            float_size = 8

        count, rest = divmod(len(data), float_size)
        if rest:
            self.log_warn(F'data contained {rest} trailing bytes that were ignored')
            data = memoryview(data)[:-rest]
        float_format *= count
        if self.args.bigendian:
            float_format = F'>{float_format}'
        else:
            float_format = F'<{float_format}'
        for n in struct.unpack(float_format, data):
            yield str(n).encode(self.codec)

    def process(self, data):
        base: int = self.args.base
        strict: bool = self.args.strict
        mode: FMode = self.args.fmode
        ep = '>' if self.args.bigendian else '<'

        def evaluate_literals(literals: Iterable[bytes]):
            for literal in literals:
                if mode is FMode.TO_INT:
                    if base == 0 and literal[0] == 0x30 and literal[1:].isdigit():
                        literal = B'0o%s' % literal
                    N = int(literal, base)
                elif mode is FMode.SINGLE:
                    N, = struct.unpack(F'{ep}I', struct.pack(F'{ep}f', float(literal)))
                elif mode is FMode.DOUBLE:
                    N, = struct.unpack(F'{ep}Q', struct.pack(F'{ep}d', float(literal)))
                else:
                    raise TypeError('unexpected floating point mode')
                M = N & self.fmask
                if strict and M != N:
                    continue
                yield M

        if base == 0:
            pattern = formats.number
        elif base <= 10:
            pattern = re.compile(B'[-+]?[0-%d]{1,64}' % (base - 1))
        else:
            pattern = re.compile(B'[-+]?[0-9a-%c]{1,20}' % (0x57 + base), re.IGNORECASE)

        return self.unchunk(evaluate_literals(m[0] for m in pattern.finditer(data)))

class pad (width, padding=b'\x00', left=False, absolute=False)

This unit is implemented in refinery.units.meta.pad and has the following commandline Interface:

usage: pad [-h] [-L] [-Q] [-0] [-v] [-l] [-a] N [padding]

Allows padding of the input data.

positional arguments:
  N               Input is padded to the nearest multiple of this size.
  padding         This custom binary sequence is used (repeatedly, if necessary) to pad the
                  input. The default is a zero byte.

optional arguments:
  -l, --left      Pad on the left instead of the right.
  -a, --absolute  The width argument specifies an absolute size, not a block size.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Allow partial results as output.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class pad(Unit):
    """
    Allows padding of the input data.
    """

    def __init__(
        self,
        width: Arg.Number(help='Input is padded to the nearest multiple of this size.'),
        padding: Arg(help=(
            'This custom binary sequence is used (repeatedly, if necessary) to pad the '
            'input. The default is a zero byte.')) = B'\0',
        left: Arg.Switch('-l', help='Pad on the left instead of the right.') = False,
        absolute: Arg.Switch('-a', help=(
            'The width argument specifies an absolute size, not a block size.')) = False
    ):
        super().__init__(width=width, padding=padding, left=left, absolute=absolute)

    def process(self, data):
        width = self.args.width
        if self.args.absolute and len(data) >= width:
            return data
        q, r = divmod(len(data), width)
        size = (q + bool(r)) * width
        missing = (size - len(data))
        if missing <= 0:
            return data
        pad = self.args.padding
        if missing > len(pad):
            pad *= missing // len(pad)
        if self.args.left:
            return pad[:missing] + data
        else:
            data += pad[:missing]
            return data

class pbkdf1 (size, salt=b'\x00\x00\x00\x00\x00\x00\x00\x00', iter=1000, hash='SHA1')

This unit is implemented in refinery.units.crypto.keyderive.pbkdf1 and has the following commandline Interface:

usage: pbkdf1 [-h] [-L] [-Q] [-0] [-v] size [salt] [iter] [hash]

PBKDF1 Key derivation

positional arguments:
  size           The number of bytes to generate.
  salt           Salt for the derivation; default are 8 null bytes.
  iter           Number of iterations; default is 1000.
  hash           Specify one of these algorithms (default is SHA1): md2, md4, md5, sha1, sha256,
                 sha512, sha224, sha384

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class pbkdf1(KeyDerivation):
    """PBKDF1 Key derivation"""

    @Arg('salt', help='Salt for the derivation; default are 8 null bytes.')
    def __init__(self, size, salt=bytes(8), iter=1000, hash='SHA1'):
        self.superinit(super(), **vars())

    def process(self, data):
        from Cryptodome.Protocol.KDF import PBKDF1
        return multidecode(data, lambda pwd: (
            PBKDF1(pwd, self.args.salt, dkLen=self.args.size, count=self.args.iter, hashAlgo=self.hash)
        ))

class pbkdf2 (size, salt, iter=1000, hash='SHA1')

This unit is implemented in refinery.units.crypto.keyderive.pbkdf2 and has the following commandline Interface:

usage: pbkdf2 [-h] [-L] [-Q] [-0] [-v] size salt [iter] [hash]

PBKDF2 Key derivation. This is implemented as Rfc2898DeriveBytes in .NET binaries.

positional arguments:
  size           The number of bytes to generate.
  salt           Salt for the derivation.
  iter           Number of iterations; default is 1000.
  hash           Specify one of these algorithms (default is SHA1): md2, md4, md5, sha1, sha256,
                 sha512, sha224, sha384

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class pbkdf2(KeyDerivation):
    """
    PBKDF2 Key derivation. This is implemented as Rfc2898DeriveBytes in .NET
    binaries.
    """

    def __init__(self, size, salt, iter=1000, hash='SHA1'):
        self.superinit(super(), **vars())

    def process(self, data: ByteStr):
        from Cryptodome.Protocol.KDF import PBKDF2
        return multidecode(data, partial(
            PBKDF2,
            salt=self.args.salt,
            dkLen=self.args.size,
            hmac_hash_module=self.hash,
            count=self.args.iter
        ))

class pcap (merge=False)

This unit is implemented in refinery.units.formats.pcap and has the following commandline Interface:

usage: pcap [-h] [-L] [-Q] [-0] [-v] [-m]

Performs TCP stream reassembly from packet capture (PCAP) files. By default, the unit emits the
parts of each TCP conversation, attaching several pieces of metadata to each such output:
Included are the source and destination socket address as well as the variable stream which
identifies the conversation which it was part of. The chunks are returned in the order that the
bytes were exchanged between source and destination. When the --merge parameter is specified, the
unit instead collects all bytes going forward and backwards, respectively, and emitting these as
two chunks, for each TCP conversation that took place.

optional arguments:
  -m, --merge    Merge both parts of each TCP conversation into one chunk.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class pcap(Unit):
    """
    Performs TCP stream reassembly from packet capture (PCAP) files. By default, the unit emits the parts of
    each TCP conversation, attaching several pieces of metadata to each such output: Included are the source
    and destination socket address as well as the variable `stream` which identifies the conversation which
    it was part of. The chunks are returned in the order that the bytes were exchanged between source and
    destination. When the `--merge` parameter is specified, the unit instead collects all bytes going forward
    and backwards, respectively, and emitting these as two chunks, for each TCP conversation that took place.
    """

    def __init__(self, merge: Arg.Switch('-m', help='Merge both parts of each TCP conversation into one chunk.') = False):
        super().__init__(merge=merge)

    @Unit.Requires('pypcapkit[scapy]<0.16.0', 'all')
    def _pcapkit():
        import pcapkit
        return pcapkit

    def process(self, data):
        with NoLogging():
            pcapkit = self._pcapkit
            logging.getLogger('pcapkit').disabled = True

        merge = self.args.merge

        with VirtualFileSystem() as fs:
            vf = VirtualFile(fs, data, 'pcap')
            extraction = pcapkit.extract(
                fin=vf.path, engine='scapy', store=False, nofile=True, extension=False, tcp=True, strict=True)
            tcp: list = list(extraction.reassembly.tcp)

        tcp.sort(key=lambda p: min(p.index, default=0))

        count, convo = 0, None
        src_buffer = MemoryFile()
        dst_buffer = MemoryFile()
        for stream in tcp:
            this_convo = Conversation.FromID(stream.id)
            if this_convo != convo:
                if count and merge:
                    if src_buffer.tell():
                        yield self.labelled(src_buffer.getvalue(), **convo.src_to_dst())
                        src_buffer.truncate(0)
                    if dst_buffer.tell():
                        yield self.labelled(dst_buffer.getvalue(), **convo.dst_to_src())
                        dst_buffer.truncate(0)
                count = count + 1
                convo = this_convo
            for packet in stream.packets:
                if not merge:
                    yield self.labelled(packet.data, **this_convo.src_to_dst(), stream=count)
                elif this_convo.src == convo.src:
                    src_buffer.write(packet.data)
                elif this_convo.dst == convo.src:
                    dst_buffer.write(packet.data)
                else:
                    raise RuntimeError(F'direction of packet {convo!s} in conversation {count} is unknown')

class pcap_http

This unit is implemented in refinery.units.formats.pcap_http and has the following commandline Interface:

usage: pcap-http [-h] [-L] [-Q] [-0] [-v]

Extracts HTTP payloads from packet capture (PCAP) files.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class pcap_http(Unit):
    """
    Extracts HTTP payloads from packet capture (PCAP) files.
    """
    pcap = pcap()

    def process(self, data):
        http_parser = httpresponse()
        requests: List[_HTTP_Request] = []
        responses: List[bytearray] = []

        def lookup(src, dst):
            for k, request in enumerate(requests):
                if request.src == dst and request.dst == src:
                    requests.pop(k)
                    return self.labelled(data, url=request.url)
            return None

        for stream in data | pcap:
            try:
                data = http_parser.process(stream)
            except Exception:
                try:
                    rq = _parse_http_request(stream)
                    requests.append(rq)
                except _HTTPParseError as E:
                    self.log_info(F'error parsing http request: {E!s}')
                except Exception:
                    pass
                continue
            if not data:
                continue
            src, dst = stream['src'], stream['dst']
            item = lookup(src, dst)
            if item is None:
                responses.append((src, dst, data))
                continue
            yield item

        while responses:
            src, dst, data = responses.pop()
            item = lookup(src, dst)
            yield data if item is None else item

class pedebloat (*names, certificate=False, directories=False, memdump=False, resources=False, sections=False, trim_text=False, trim_rsrc=False, threshold=0.05, size_limit=10.0 MB, keep_limit=False, aggressive=False)

This unit is implemented in refinery.units.formats.pe.pedebloat and has the following commandline Interface:

usage: pedebloat [-h] [-L] [-Q] [-0] [-v] [-c] [-d] [-m] [-r] [-s] [-X] [-Y] [-t T] [-l N] [-k]
                 [-a]
                 [names [names ...]]

Removes junk or excess data from PE files and returns the stripped executable. By default, only
the PE overlay is considered; use the flags -r and -s to also consider resources and entire
sections. Any buffer is only considered for removal if it exceeds a certain size. If this
condition is met, a binary search is performed to determine the offset inside the buffer up to
which the compression ratio is above a certain threshold; everything beyond that point is then
removed. By setting the threshold compression ratio to 1, each large buffer is removed entirely.

positional arguments:
  names

optional arguments:
  -c, --certificate   Include digital signatures for the size computation.
  -d, --directories   Include data directories for size computation.
  -m, --memdump       Assume that the file data was a memory-mapped PE file.
  -r, --resources     Strip large resources.
  -s, --sections      Strip large sections.
  -X, --trim-text     Lift the exception on .TEXT section for stripping.
  -Y, --trim-rsrc     Lift the exception on .RSRC section for stripping.
  -t, --threshold T   Trailing data from resources and sections is stripped until the compression
                      ratio of the remaining data rises above this threshold. The default value
                      is 0.05. Set this to 1 to ignore the limit entirely and trim every
                      structure as much as possible without violating alignment. Setting this
                      value to 0 will only strip repeated occurrences of the last byte.
  -l, --size-limit N  Structures below this size are not stripped. Default is 10.0 MB.
  -k, --keep-limit    Do not strip structures to below the above size limit.
  -a, --aggressive    Equivalent to -srt1: Strip large sections and resources aggressively.

generic options:
  -h, --help          Show this help message and exit.
  -L, --lenient       Allow partial results as output.
  -Q, --quiet         Disables all log output.
  -0, --devnull       Do not produce any output.
  -v, --verbose       Specify up to two times to increase log level.

Expand source code Browse git

class pedebloat(OverlayUnit):
    """
    Removes junk or excess data from PE files and returns the stripped executable. By default, only
    the PE overlay is considered; use the flags `-r` and `-s` to also consider resources and entire
    sections. Any buffer is only considered for removal if it exceeds a certain size. If this
    condition is met, a binary search is performed to determine the offset inside the buffer up to
    which the compression ratio is above a certain threshold; everything beyond that point is then
    removed. By setting the threshold compression ratio to 1, each large buffer is removed entirely.
    """
    def __init__(
        self,
        *names: Arg(type=str),
        certificate=False,
        directories=False,
        memdump=False,
        resources: Arg.Switch('-r', help='Strip large resources.') = False,
        sections : Arg.Switch('-s', help='Strip large sections.') = False,
        trim_text: Arg.Switch('-X', help='Lift the exception on .TEXT section for stripping.') = False,
        trim_rsrc: Arg.Switch('-Y', help='Lift the exception on .RSRC section for stripping.') = False,
        threshold: Arg('-t', metavar='T', type=percent, help=(
            'Trailing data from resources and sections is stripped until the compression ratio '
            'of the remaining data rises above this threshold. The default value is {default}. '
            'Set this to 1 to ignore the limit entirely and trim every structure as much as '
            'possible without violating alignment. Setting this value to 0 will only strip repeated '
            'occurrences of the last byte.')) = 0.05,
        size_limit: Arg.Number('-l', help=(
            'Structures below this size are not stripped. Default is {default!r}.')) = _STRIP,
        keep_limit: Arg.Switch('-k', help=(
            'Do not strip structures to below the above size limit.')) = False,
        aggressive: Arg.Switch('-a', help=(
            'Equivalent to -srt1: Strip large sections and resources aggressively.')) = False,
    ):
        if aggressive:
            sections = True
            resources = True
            threshold = 1

        super().__init__(
            certificate,
            directories,
            memdump,
            sections=sections,
            resources=resources,
            size_limit=size_limit,
            keep_limit=keep_limit,
            threshold=threshold,
            trim_rsrc=trim_rsrc,
            trim_text=trim_text,
            names=names,
        )

    def _right_strip_data(self, data: memoryview, alignment=1, block_size=_MB) -> int:
        if not data:
            return 0
        threshold = self.args.threshold
        data_overhang = len(data) % alignment
        result = data_overhang

        if 0 < threshold < 1:
            def compression_ratio(offset: int):
                ratio = len(zlib.compress(data[:offset], level=1)) / offset
                self.log_debug(F'compressing {SizeInt(offset)!r} ratio={ratio:6.4f}')
                return ratio
            upper = len(data)
            lower = result
            if compression_ratio(upper) <= threshold:
                while block_size < upper - lower:
                    pivot = (lower + upper) // 2
                    ratio = compression_ratio(pivot)
                    if ratio > threshold:
                        lower = pivot + 1
                        continue
                    upper = pivot
                    if abs(ratio - threshold) < 1e-10:
                        break
            result = upper
        elif threshold == 0:
            result = len(data)
        elif threshold == 1:
            result = 0

        while result > 1 and data[result - 2] == data[result - 1]:
            result -= 1

        result = max(result, data_overhang)

        if self.args.keep_limit:
            result = max(result, self.args.size_limit)

        result = result + (data_overhang - result) % alignment

        if result > len(data):
            excess = result - len(data)
            excess = excess + (-excess % alignment)
            result = result - excess

        return result

    def _adjust_offsets(self, pe: PE, gap_offset: int, gap_size: int):
        base = pe.OPTIONAL_HEADER.ImageBase
        alignment = pe.OPTIONAL_HEADER.FileAlignment
        rva_offset = pe.get_rva_from_offset(gap_offset)
        tva_offset = rva_offset + base

        section = pe.get_section_by_offset(gap_offset)
        new_section_size = section.SizeOfRawData - gap_size
        if new_section_size % alignment != 0:
            raise RuntimeError(
                F'trimming 0x{gap_size:X} bytes from section {_ASCII(section.Name)} of size 0x{section.SizeOfRawData:X} '
                F'violates required section alignment of 0x{alignment:X} bytes')
        inside_section_offset = gap_offset - section.PointerToRawData
        if inside_section_offset > new_section_size:
            overlap = inside_section_offset - new_section_size
            raise RuntimeError(F'trimming from section {_ASCII(section.Name)}; data extends {overlap} beyond section')

        rva_lbound = section.VirtualAddress
        rva_ubound = section.VirtualAddress + section.Misc_VirtualSize - 1
        tva_lbound = rva_lbound + base
        tva_ubound = rva_ubound + base

        def adjust_attributes_of_structure(
            structure: Structure,
            gap_offset: int,
            valid_values_lower_bound: Optional[int],
            valid_values_upper_bound: Optional[int],
            attributes: Iterable[str]
        ):
            for attribute in attributes:
                old_value = getattr(structure, attribute, 0)
                if old_value <= gap_offset:
                    continue
                if valid_values_lower_bound is not None and old_value < valid_values_lower_bound:
                    continue
                if valid_values_upper_bound is not None and old_value > valid_values_upper_bound:
                    continue
                new_value = old_value - gap_size
                if new_value < gap_offset:
                    raise BrokenLink(F'attribute {attribute} points into removed region')
                self.log_debug(F'adjusting field in {structure.name}: {attribute}')
                setattr(structure, attribute, new_value)

        it: Iterable[Structure] = iter(pe.__structures__)
        remove = []

        for index, structure in enumerate(it):
            old_offset = structure.get_file_offset()
            new_offset = old_offset - gap_offset

            if old_offset > gap_offset:
                if old_offset < gap_offset + gap_size:
                    self.log_debug(F'removing structure {structure.name}; starts inside removed region')
                    remove.append(index)
                    continue
                if isinstance(structure, SectionStructure) and new_offset % alignment != 0:
                    raise RuntimeError(
                        F'structure {structure.name} would be moved to offset 0x{new_offset:X}, '
                        F'violating section alignment value 0x{alignment:X}.')
                structure.set_file_offset(new_offset)

            try:
                adjust_attributes_of_structure(structure, rva_offset, rva_lbound, rva_ubound, (
                    'OffsetToData',
                    'AddressOfData',
                    'VirtualAddress',
                    'AddressOfNames',
                    'AddressOfNameOrdinals',
                    'AddressOfFunctions',
                    'AddressOfEntryPoint',
                    'AddressOfRawData',
                    'BaseOfCode',
                    'BaseOfData',
                ))
                adjust_attributes_of_structure(structure, tva_offset, tva_lbound, tva_ubound, (
                    'StartAddressOfRawData',
                    'EndAddressOfRawData',
                    'AddressOfIndex',
                    'AddressOfCallBacks',
                ))
                adjust_attributes_of_structure(structure, gap_offset, None, None, (
                    'OffsetModuleName',
                    'PointerToRawData',
                ))
            except BrokenLink as error:
                self.log_debug(F'removing structure {structure.name}; {error!s}')
                remove.append(index)
                continue

            for attribute in (
                'CvHeaderOffset',
                'OffsetIn2Qwords',
                'OffsetInQwords',
                'Offset',
                'OffsetLow',
                'OffsetHigh'
            ):
                if not hasattr(structure, attribute):
                    continue
                self.log_warn(F'potential offset in structure {structure.name} ignored: {attribute}')

        while remove:
            index = remove.pop()
            pe.__structures__[index:index + 1] = []

        section.SizeOfRawData = new_section_size

    def _trim_sections(self, pe: PE, data: bytearray) -> int:
        S = self.args.size_limit
        P = self.args.names
        trimmed = 0
        for section in pe.sections:
            section: SectionStructure
            offset = section.PointerToRawData
            name = _ASCII(section.Name)
            if not self.args.trim_text and name.lower() in ('.text', '.code'):
                self.log_info(F'skipping code section {name}')
                continue
            if not self.args.trim_rsrc and name.lower() == '.rsrc':
                self.log_info(F'skipping rsrc section {name}')
                continue
            old_size = section.SizeOfRawData
            if old_size <= S and not any(fnmatch(name, p) for p in P):
                self.log_debug(F'criteria not satisfied for section: {SizeInt(old_size)!r} {name}')
                continue
            new_size = self._right_strip_data(
                memoryview(data)[offset:offset + old_size],
                pe.OPTIONAL_HEADER.FileAlignment)
            self.log_info(F'stripping section {name} from {TI(old_size)!r} to {TI(new_size)!r}')
            gap_size = old_size - new_size
            gap_offset = offset + new_size
            if gap_size <= 0:
                continue
            self._adjust_offsets(pe, gap_offset, gap_size)
            trimmed += gap_size
            data[gap_offset:gap_offset + gap_size] = []
        return trimmed

    def _trim_pe_resources(self, pe: PE, data: bytearray) -> int:
        S = self.args.size_limit
        P = self.args.names
        trimmed = 0

        def find_bloated_resources(pe: PE, directory, level: int = 0, *path) -> Generator[Structure, None, None]:
            for entry in directory.entries:
                name = getattr(entry, 'name')
                numeric = getattr(entry, 'id')
                if not name:
                    if level == 0 and numeric in iter(RSRC):
                        name = RSRC(entry.id)
                    elif numeric is not None:
                        name = str(numeric)
                name = name and str(name) or '?'
                if entry.struct.DataIsDirectory:
                    yield from find_bloated_resources(pe, entry.directory, level + 1, *path, name)
                    continue
                struct: Structure = entry.data.struct
                name = '/'.join((*path, name))
                if struct.Size <= S and not any(fnmatch(name, p) for p in P):
                    self.log_debug(F'criteria not satisfied for resource: {SizeInt(struct.Size)!r} {name}')
                    continue
                yield name, struct

        RSRC_INDEX = DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_RESOURCE']
        pe.parse_data_directories(directories=[RSRC_INDEX])

        try:
            resources = pe.DIRECTORY_ENTRY_RESOURCE
        except AttributeError:
            return 0
        for name, resource in find_bloated_resources(pe, resources):
            offset = pe.get_offset_from_rva(resource.OffsetToData)
            old_size = resource.Size
            new_size = self._right_strip_data(
                memoryview(data)[offset:offset + old_size],
                pe.OPTIONAL_HEADER.FileAlignment)
            self.log_info(F'stripping resource {name} from {old_size} to {new_size}')
            gap_size = old_size - new_size
            gap_offset = offset + new_size
            if gap_size <= 0:
                continue
            resource.Size = new_size
            self._adjust_offsets(pe, gap_offset, gap_size)
            trimmed += gap_size
            data[gap_offset:gap_offset + gap_size] = []

        pe.OPTIONAL_HEADER.DATA_DIRECTORY[RSRC_INDEX].Size -= trimmed
        self.log_info(F'trimming size of resource data directory by {TI(trimmed)!r}')
        return trimmed

    def process(self, data: bytearray) -> bytearray:
        overlay_offset = self._get_size(data)
        if len(data) - overlay_offset >= self.args.size_limit:
            view = memoryview(data)
            overlay_length = self._right_strip_data(view[overlay_offset:])
            body_size = overlay_offset + overlay_length
            try:
                data[body_size:] = []
            except Exception:
                data = data[:body_size]
        if not self.args.resources and not self.args.sections:
            return data
        pe = PE(data=data, fast_load=True)
        total = len(data)
        trimmed = 0
        view = pe.__data__
        copy = False
        if not isinstance(view, bytearray):
            view = memoryview(view)
            try:
                view[0] = 0x4D
            except Exception:
                copy = True
                view = bytearray(pe.__data__)
        if self.args.resources:
            trimmed += self._trim_pe_resources(pe, view)
        if self.args.sections:
            trimmed += self._trim_sections(pe, view)
        if copy:
            pe.__data__ = view
        data = pe.write()
        end = total - trimmed
        if end < len(data):
            self.log_warn(F'output contains {len(data) - end} trailing bytes')
        return data

class peek (lines=10, all=False, brief=False, decode=0, escape=False, bare=False, meta=0, gray=False, index=False, stdout=False, narrow=False, blocks=1, dense=False, expand=False, width=0)

This unit is implemented in refinery.units.sinks.peek and has the following commandline Interface:

usage: peek [-h] [-L] [-Q] [-0] [-v] [-l N | -a | -b] [-d | -e] [-r | -m] [-g] [-i] [-2] [-N]
            [-B N] [-D] [-E] [-W N]

The unit extracts preview information of the input data and displays it on the standard error
stream. If the standard output of this unit is connected by a pipe, the incoming data is
forwarded. However, if the unit outputs to a terminal, the data is discarded instead.

optional arguments:
  -l, --lines N   Specify number N of lines in the preview, default is 10.
  -a, --all       Output all possible preview lines without restriction
  -b, --brief     One line peek, implies --lines=1.
  -d, --decode    Attempt to decode and display printable data. Specify twice to enable line
                  wrapping.
  -e, --escape    Always peek data as string, escape characters if necessary.
  -r, --bare      Only peek the data itself, do not show a metadata preview.
  -m, --meta      Show more auto-derivable metadata. Specify multiple times to populate more
                  variables.
  -g, --gray      Do not colorize the output.
  -i, --index     Display the index of each chunk within the current frame.
  -2, --stdout    Print the peek to STDOUT rather than STDERR; the input data is lost.
  -N, --narrow    Do not show addresses in hexdump
  -B, --blocks N  Group hexadecimal bytes in blocks of the given size; default is 1.
  -D, --dense     Do not insert spaces in hexdump.
  -E, --expand    Do not compress sequences of identical lines in hexdump
  -W, --width N   Specify the number of hexadecimal characters to use in preview.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Allow partial results as output.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class peek(HexViewer):
    """
    The unit extracts preview information of the input data and displays it on the standard error stream. If the standard
    output of this unit is connected by a pipe, the incoming data is forwarded. However, if the unit outputs to a terminal,
    the data is discarded instead.
    """

    def __init__(
        self,
        lines  : Arg.Number('-l', group='SIZE', help='Specify number N of lines in the preview, default is 10.') = 10,
        all    : Arg.Switch('-a', group='SIZE', help='Output all possible preview lines without restriction') = False,
        brief  : Arg.Switch('-b', group='SIZE', help='One line peek, implies --lines=1.') = False,
        decode : Arg.Counts('-d', group='MODE', help=(
            'Attempt to decode and display printable data. Specify twice to enable line wrapping.')) = 0,
        escape : Arg.Switch('-e', group='MODE', help='Always peek data as string, escape characters if necessary.') = False,
        bare   : Arg.Switch('-r', group='META', help='Only peek the data itself, do not show a metadata preview.') = False,
        meta   : Arg.Counts('-m', group='META', help=(
            'Show more auto-derivable metadata. Specify multiple times to populate more variables.')) = 0,
        gray   : Arg.Switch('-g', help='Do not colorize the output.') = False,
        index  : Arg.Switch('-i', help='Display the index of each chunk within the current frame.') = False,
        stdout : Arg.Switch('-2', help='Print the peek to STDOUT rather than STDERR; the input data is lost.') = False,
        narrow=False, blocks=1, dense=False, expand=False, width=0
    ):
        if decode and escape:
            raise ValueError('The decode and esc options are exclusive.')
        if brief:
            narrow = True
        if environment.colorless.value:
            gray = True
        lines = 1 if brief else INF if all else lines
        super(peek, self).__init__(
            brief=brief,
            gray=gray,
            blocks=blocks,
            decode=decode,
            dense=dense,
            index=index,
            escape=escape,
            expand=expand,
            narrow=narrow,
            lines=lines,
            meta=meta,
            bare=bare,
            width=width,
            stdout=stdout,
        )

    @HexViewer.Requires('colorama', 'display', 'default', 'extended')
    def _colorama():
        import colorama
        return colorama

    def process(self, data):
        colorize = not self.args.gray and not self.args.stdout
        lines = self._peeklines(data, colorize)

        if self.args.stdout:
            for line in lines:
                yield line.encode(self.codec)
            return

        stderr = sys.stderr

        if colorize:
            colorama = self._colorama
            if os.name == 'nt':
                stderr = colorama.AnsiToWin32(stderr).stream
            _erase = ' ' * get_terminal_size()
            _reset = F'\r{colorama.Style.RESET_ALL}{_erase}\r'
        else:
            _reset = ''

        try:
            for line in lines:
                print(line, file=stderr)
        except BaseException:
            stderr.write(_reset)
            raise
        if not self.isatty:
            self.log_info('forwarding input to next unit')
            yield data

    def _peekmeta(self, linewidth, sep, _x_peek=None, **meta) -> Generator[str, None, None]:
        if not meta and not _x_peek:
            return
        width = max((len(name) for name in meta), default=0)
        separators = iter([sep])
        if _x_peek is not None:
            if len(_x_peek) > linewidth:
                _x_peek = _x_peek[:linewidth - 3] + '...'
            yield from separators
            yield _x_peek
        for name in sorted(meta):
            value = meta[name]
            if value is None:
                continue
            if isinstance(value, CustomStringRepresentation):
                value = repr(value).strip()
            elif isbuffer(value):
                value = repr(ByteStringWrapper(value))
            elif isinstance(value, int):
                if value in range(-999, 1000):
                    value = str(value)
                elif value > 0:
                    value = F'0x{value:X}'
                else:
                    value = F'-0x{-value:X}'
            elif isinstance(value, float):
                value = F'{value:.4f}'
            metavar = F'{name:>{width + 2}} = {value!s}'
            if len(metavar) > linewidth:
                metavar = metavar[:linewidth - 3] + '...'
            yield from separators
            yield metavar

    def _trydecode(self, data, codec: Optional[str], width: int, linecount: int) -> str:
        remaining = linecount
        result = []
        wrap = self.args.decode > 1
        if codec is None:
            from refinery.units.encoding.esc import esc
            decoded = data[:abs(width * linecount)]
            decoded = str(decoded | -esc(bare=True))
            limit = abs(min(linecount * width, len(decoded)))
            for k in range(0, limit, width):
                result.append(decoded[k:k + width])
            return result
        try:
            import unicodedata
            unprintable = {'Cc', 'Cf', 'Co', 'Cs'}
            self.log_info(F'trying to decode as {codec}.')
            decoded = codecs.decode(data, codec, errors='strict')
            count = sum(unicodedata.category(c) not in unprintable for c in decoded)
            ratio = count / len(decoded)
        except UnicodeDecodeError as DE:
            self.log_info('decoding failed:', DE.reason)
            return None
        except ValueError as V:
            self.log_info('decoding failed:', V)
            return None
        if ratio < 0.8:
            self.log_info(F'data contains {ratio * 100:.2f}% printable characters, this is too low.')
            return None
        decoded = decoded.splitlines(False)
        if not wrap:
            for k, line in enumerate(decoded):
                if len(line) <= width:
                    continue
                if self.args.gray:
                    decoded[k] = line[:width]
                else:
                    c = self._colorama
                    d = line.replace('\t', '    ')[:width - 1]
                    decoded[k] = F'{d}{c.Fore.LIGHTRED_EX}…{c.Style.RESET_ALL}'
            return decoded[:abs(linecount)]
        for paragraph in decoded:
            if not remaining:
                break
            wrapped = [
                line for chunk in textwrap.wrap(
                    paragraph,
                    width,
                    break_long_words=True,
                    break_on_hyphens=False,
                    drop_whitespace=False,
                    expand_tabs=True,
                    max_lines=abs(remaining + 1),
                    replace_whitespace=False,
                    tabsize=4,
                )
                for line in chunk.splitlines(keepends=False)
            ]
            remaining -= len(wrapped)
            result.extend(wrapped)
        return result[:abs(linecount)]

    def _peeklines(self, data: bytearray, colorize: bool) -> Generator[str, None, None]:

        meta = metavars(data)

        codec = None
        lines = None
        final = data.temp or False
        empty = True

        if not self.args.index:
            meta.discard('index')
            index = None
        else:
            index = meta.get('index', None)

        if not self.args.brief:
            padding = 0
        else:
            padding = SizeInt.width + 2
            if index is not None:
                padding += 6

        metrics = self._get_metrics(len(data), self.args.lines, padding)

        if self.args.brief:
            metrics.address_width = 0
            metrics.fit_to_width(allow_increase=True)

        sepsize = metrics.hexdump_width
        txtsize = self.args.width or sepsize

        if self.args.lines and data:
            if self.args.escape:
                lines = self._trydecode(data, None, txtsize, metrics.line_count)
            if self.args.decode > 0:
                for codec in ('utf8', 'utf-16le', 'utf-16', 'utf-16be'):
                    lines = self._trydecode(data, codec, txtsize, metrics.line_count)
                    if lines:
                        codec = codec
                        break
                else:
                    codec = None
            if lines is None:
                lines = list(self.hexdump(data, metrics, colorize))
            else:
                sepsize = txtsize

        def separator(title=None):
            if title is None or sepsize <= len(title) + 8:
                return sepsize * '-'
            return '-' * (sepsize - len(title) - 5) + F'[{title}]---'

        if self.args.brief:
            final = False
        elif not self.args.bare:
            peek = repr(meta.size)
            line = separator()
            if len(data) <= 5_000_000:
                peek = F'{peek}; {meta.entropy!r} entropy'
            peek = F'{peek}; {meta.magic!s}'
            if self.args.lines == 0:
                peek = None
            elif not data:
                peek = None
                line = separator('empty chunk')
            if self.args.meta > 0:
                meta.derive('size')
                meta.derive('magic')
                meta.derive('entropy')
                peek = None
            if self.args.meta > 1:
                meta.derive('crc32')
                meta.derive('sha256')
            if self.args.meta > 2:
                for name in meta.derivations:
                    meta[name]
            for line in self._peekmeta(metrics.hexdump_width, line, _x_peek=peek, **meta):
                empty = False
                yield line

        if lines:
            empty = False
            if not self.args.brief:
                yield separator(codec or None)
                yield from lines
            else:
                brief = next(iter(lines))
                brief = F'{SizeInt(len(data))!r}: {brief}'
                if index is not None:
                    brief = F'#{index:03d}: {brief}'
                yield brief

        if final and not empty:
            yield separator()

    def filter(self, chunks):
        try:
            self._colorama.init(wrap=False)
        except ImportError:
            pass
        discarded = 0
        it = iter(chunks)
        buffer = collections.deque(itertools.islice(it, 0, 2))
        buffer.reverse()

        while buffer:
            if self.isatty and not buffer[0].visible:
                buffer.popleft()
                discarded += 1
            else:
                item = buffer.pop()
                last = not bool(buffer)
                item.temp = last
                if not item.visible and self.isatty:
                    discarded += 1
                else:
                    yield item
            try:
                buffer.appendleft(next(it))
            except StopIteration:
                pass

        if discarded:
            self.log_warn(F'discarded {discarded} invisible chunks to prevent them from leaking into the terminal.')

class pemeta (all=True, debug=False, dotnet=False, signatures=False, timestamps=False, version=False, header=False, exports=False, imports=False, tabular=False, timeraw=False)

This unit is implemented in refinery.units.formats.pe.pemeta and has the following commandline Interface:

usage: pemeta [-h] [-L] [-Q] [-0] [-v] [-c] [-D] [-N] [-S] [-T] [-V] [-H] [-E] [-I] [-t] [-r]

Extract metadata from PE files. By default, all information except for imports and exports are
extracted.

optional arguments:
  -c, --custom      Unless enabled, all default categories will be extracted.
  -D, --debug       Parse the PDB path from the debug directory.
  -N, --dotnet      Parse the .NET header.
  -S, --signatures  Parse digital signatures.
  -T, --timestamps  Extract time stamps.
  -V, --version     Parse the VERSION resource.
  -H, --header      Parse data from the PE header.
  -E, --exports     List all exported functions.
  -I, --imports     List all imported functions.
  -t, --tabular     Print information in a table rather than as JSON
  -r, --timeraw     Extract time stamps as numbers instead of human-readable format.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Allow partial results as output.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class pemeta(Unit):
    """
    Extract metadata from PE files. By default, all information except for imports and exports are
    extracted.
    """
    def __init__(
        self, all : Arg('-c', '--custom',
            help='Unless enabled, all default categories will be extracted.') = True,
        debug      : Arg('-D', help='Parse the PDB path from the debug directory.') = False,
        dotnet     : Arg('-N', help='Parse the .NET header.') = False,
        signatures : Arg('-S', help='Parse digital signatures.') = False,
        timestamps : Arg('-T', help='Extract time stamps.') = False,
        version    : Arg('-V', help='Parse the VERSION resource.') = False,
        header     : Arg('-H', help='Parse data from the PE header.') = False,
        exports    : Arg('-E', help='List all exported functions.') = False,
        imports    : Arg('-I', help='List all imported functions.') = False,
        tabular    : Arg('-t', help='Print information in a table rather than as JSON') = False,
        timeraw    : Arg('-r', help='Extract time stamps as numbers instead of human-readable format.') = False,
    ):
        super().__init__(
            debug=all or debug,
            dotnet=all or dotnet,
            signatures=all or signatures,
            timestamps=all or timestamps,
            version=all or version,
            header=all or header,
            imports=imports,
            exports=exports,
            timeraw=timeraw,
            tabular=tabular,
        )

    @classmethod
    def _ensure_string(cls, x):
        if not isinstance(x, str):
            x = repr(x) if not isinstance(x, bytes) else x.decode(cls.codec, 'backslashreplace')
        return x

    @classmethod
    def _parse_pedict(cls, bin):
        return dict((
            cls._ensure_string(key),
            cls._ensure_string(val)
        ) for key, val in bin.items() if val)

    @classmethod
    def parse_signature(cls, data: bytearray) -> dict:
        """
        Extracts a JSON-serializable and human-readable dictionary with information about
        time stamp and code signing certificates that are attached to the input PE file.
        """
        from refinery.units.formats.pkcs7 import pkcs7

        try:
            signature = data | pkcs7 | json.loads
        except Exception as E:
            raise ValueError(F'PKCS7 parser failed with error: {E!s}')

        info = {}

        def find_timestamps(entry):
            if isinstance(entry, dict):
                if set(entry.keys()) == {'type', 'value'}:
                    if entry['type'] == 'signing_time':
                        return {'Timestamp': entry['value']}
                for value in entry.values():
                    result = find_timestamps(value)
                    if result is None:
                        continue
                    with suppress(KeyError):
                        result.setdefault('TimestampIssuer', entry['sid']['issuer']['common_name'])
                    return result
            elif isinstance(entry, list):
                for value in entry:
                    result = find_timestamps(value)
                    if result is None:
                        continue
                    return result

        timestamp_info = find_timestamps(signature)
        if timestamp_info is not None:
            info.update(timestamp_info)

        try:
            certificates = signature['content']['certificates']
        except KeyError:
            return info

        if len(certificates) == 1:
            main_certificate = certificates[0]
        else:
            certificates_with_extended_use = []
            main_certificate = None
            for certificate in certificates:
                with suppress(Exception):
                    crt = certificate['tbs_certificate']
                    ext = [e for e in crt['extensions'] if e['extn_id'] == 'extended_key_usage' and e['extn_value'] != ['time_stamping']]
                    key = [e for e in crt['extensions'] if e['extn_id'] == 'key_usage']
                    if ext:
                        certificates_with_extended_use.append(certificate)
                    if any('key_cert_sign' in e['extn_value'] for e in key):
                        continue
                    if any('code_signing' in e['extn_value'] for e in ext):
                        main_certificate = certificate
                        break
            if main_certificate is None and len(certificates_with_extended_use) == 1:
                main_certificate = certificates_with_extended_use[0]
        if main_certificate:
            crt = main_certificate['tbs_certificate']
            serial = crt['serial_number']
            if isinstance(serial, int):
                serial = F'{serial:x}'
            if len(serial) % 2 != 0:
                serial = F'0{serial}'
            assert bytes.fromhex(serial) in data
            subject = crt['subject']
            location = [subject.get(t, '') for t in ('locality_name', 'state_or_province_name', 'country_name')]
            info.update(Subject=subject['common_name'])
            if any(location):
                info.update(SubjectLocation=', '.join(filter(None, location)))
            for signer_info in signature['content'].get('signer_infos', ()):
                try:
                    if signer_info['sid']['serial_number'] != crt['serial_number']:
                        continue
                    for attr in signer_info['signed_attrs']:
                        if attr['type'] == 'authenticode_info':
                            info.update(ProgramName=attr['value']['programName'])
                            info.update(MoreInfo=attr['value']['moreInfo'])
                except KeyError:
                    continue
            try:
                valid_from = crt['validity']['not_before']
                valid_until = crt['validity']['not_after']
            except KeyError:
                pass
            else:
                info.update(ValidFrom=valid_from, ValidUntil=valid_until)
            info.update(
                Issuer=crt['issuer']['common_name'], Fingerprint=main_certificate['fingerprint'], Serial=serial)
            return info
        return info

    def parse_version(self, pe: PE, data=None) -> dict:
        """
        Extracts a JSON-serializable and human-readable dictionary with information about
        the version resource of an input PE file, if available.
        """
        pe.parse_data_directories(directories=[DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_RESOURCE']])
        string_table_entries = []
        for FileInfo in pe.FileInfo:
            for FileInfoEntry in FileInfo:
                with suppress(AttributeError):
                    for StringTableEntry in FileInfoEntry.StringTable:
                        StringTableEntryParsed = self._parse_pedict(StringTableEntry.entries)
                        with suppress(AttributeError):
                            LangID = StringTableEntry.entries.get('LangID', None) or StringTableEntry.LangID
                            LangID = int(LangID, 0x10) if not isinstance(LangID, int) else LangID
                            LangHi = LangID >> 0x10
                            LangLo = LangID & 0xFFFF
                            Language = self._LCID.get(LangHi, 'Language Neutral')
                            Charset = self._CHARSET.get(LangLo, 'Unknown Charset')
                            StringTableEntryParsed.update(
                                LangID=F'{LangID:08X}',
                                Charset=Charset,
                                Language=Language
                            )
                        for key in StringTableEntryParsed:
                            if key.endswith('Version'):
                                value = StringTableEntryParsed[key]
                                separator = ', '
                                if re.match(F'\\d+({re.escape(separator)}\\d+){{3}}', value):
                                    StringTableEntryParsed[key] = '.'.join(value.split(separator))
                        string_table_entries.append(StringTableEntryParsed)
        if not string_table_entries:
            return None
        elif len(string_table_entries) == 1:
            return string_table_entries[0]
        else:
            return string_table_entries

    def parse_exports(self, pe: PE, data=None) -> list:
        pe.parse_data_directories(directories=[DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_EXPORT']])
        info = []
        for k, exp in enumerate(pe.DIRECTORY_ENTRY_EXPORT.symbols):
            if not exp.name:
                info.append(F'@{k}')
            else:
                info.append(exp.name.decode('ascii'))
        return info

    def parse_imports(self, pe: PE, data=None) -> list:
        info = {}
        dirs = []
        for name in [
            'DIRECTORY_ENTRY_IMPORT',
            'DIRECTORY_ENTRY_DELAY_IMPORT',
            'DIRECTORY_ENTRY_BOUND_IMPORT',
        ]:
            pe.parse_data_directories(directories=[DIRECTORY_ENTRY[F'IMAGE_{name}']])
            with suppress(AttributeError):
                dirs.append(getattr(pe, name))
        for idd in itertools.chain(*dirs):
            dll = idd.dll.decode('ascii')
            if dll.lower().endswith('.dll'):
                dll = dll[:-4]
            imports = info.setdefault(dll, [])
            for imp in idd.imports:
                name = imp.name and imp.name.decode('ascii') or F'@{imp.ordinal}'
                imports.append(name)
        return info

    def parse_header(self, pe: PE, data=None) -> dict:
        def format_macro_name(name: str, prefix, convert=True):
            name = name.split('_')[prefix:]
            if convert:
                for k, part in enumerate(name):
                    name[k] = part.upper() if len(part) <= 3 else part.capitalize()
            return ' '.join(name)

        major = pe.OPTIONAL_HEADER.MajorOperatingSystemVersion
        minor = pe.OPTIONAL_HEADER.MinorOperatingSystemVersion
        version = self._WINVER.get(major, {0: 'Unknown'})

        try:
            MinimumOS = version[minor]
        except LookupError:
            MinimumOS = version[0]
        header_information = {
            'Machine': format_macro_name(MACHINE_TYPE[pe.FILE_HEADER.Machine], 3, False),
            'Subsystem': format_macro_name(SUBSYSTEM_TYPE[pe.OPTIONAL_HEADER.Subsystem], 2),
            'MinimumOS': MinimumOS,
        }

        pe.parse_data_directories(directories=[
            DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_EXPORT'],
        ])

        try:
            export_name = pe.DIRECTORY_ENTRY_EXPORT.name
            if isinstance(export_name, bytes):
                export_name = export_name.decode('utf8')
            if not export_name.isprintable():
                export_name = None
        except Exception:
            export_name = None
        if export_name:
            header_information['ExportName'] = export_name

        rich_header = pe.parse_rich_header()
        rich = []
        if rich_header:
            it = rich_header.get('values', [])
            if self.args.tabular:
                cw = max(len(F'{c:d}') for c in it[1::2])
            for idv, count in zip(it[0::2], it[1::2]):
                info = get_rich_info(idv)
                if not info:
                    continue
                pid = info.pid.upper()
                if self.args.tabular:
                    short_pid = get_rich_short_pid(pid)
                    rich.append(F'[{idv:08x}] {count:>0{cw}d} {short_pid!s} {info.ver}')
                else:
                    rich.append({
                        'Counter': count,
                        'Encoded': F'{idv:08x}',
                        'Library': pid,
                        'Product': info.ver,
                    })
            header_information['RICH'] = rich

        characteristics = [
            name for name, mask in image_characteristics
            if pe.FILE_HEADER.Characteristics & mask
        ]
        for typespec, flag in {
            'EXE': 'IMAGE_FILE_EXECUTABLE_IMAGE',
            'DLL': 'IMAGE_FILE_DLL',
            'SYS': 'IMAGE_FILE_SYSTEM'
        }.items():
            if flag in characteristics:
                header_information['Type'] = typespec
        address_width = None
        if 'IMAGE_FILE_16BIT_MACHINE' in characteristics:
            address_width = 4
        elif MACHINE_TYPE[pe.FILE_HEADER.Machine] in ['IMAGE_FILE_MACHINE_I386']:
            address_width = 8
        elif MACHINE_TYPE[pe.FILE_HEADER.Machine] in [
            'IMAGE_FILE_MACHINE_AMD64',
            'IMAGE_FILE_MACHINE_IA64',
        ]:
            address_width = 16
        if address_width:
            header_information['Bits'] = 4 * address_width
        else:
            address_width = 16
        header_information['ImageBase'] = F'0x{pe.OPTIONAL_HEADER.ImageBase:0{address_width}X}'
        header_information['ImageSize'] = get_pe_size(pe)
        return header_information

    def parse_time_stamps(self, pe: PE, raw_time_stamps: bool) -> dict:
        """
        Extracts time stamps from the PE header (link time), as well as from the imports,
        exports, debug, and resource directory. The resource time stamp is also parsed as
        a DOS time stamp and returned as the "Delphi" time stamp.
        """
        if raw_time_stamps:
            def dt(ts): return ts
        else:
            def dt(ts):
                # parse as UTC but then forget time zone information
                return datetime.fromtimestamp(
                    ts,
                    tz=timezone.utc
                ).replace(tzinfo=None)

        pe.parse_data_directories(directories=[
            DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_IMPORT'],
            DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_EXPORT'],
            DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_DEBUG'],
            DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_RESOURCE']
        ])

        info = {}

        with suppress(AttributeError):
            info.update(Linker=dt(pe.FILE_HEADER.TimeDateStamp))

        with suppress(AttributeError):
            for entry in pe.DIRECTORY_ENTRY_IMPORT:
                info.update(Import=dt(entry.TimeDateStamp()))

        with suppress(AttributeError):
            for entry in pe.DIRECTORY_ENTRY_DEBUG:
                info.update(DbgDir=dt(entry.struct.TimeDateStamp))

        with suppress(AttributeError):
            Export = pe.DIRECTORY_ENTRY_EXPORT.struct.TimeDateStamp
            if Export: info.update(Export=dt(Export))

        with suppress(AttributeError):
            res_timestamp = pe.DIRECTORY_ENTRY_RESOURCE.struct.TimeDateStamp
            if res_timestamp:
                with suppress(ValueError):
                    from refinery.units.misc.datefix import datefix
                    dos = datefix.dostime(res_timestamp)
                    info.update(Delphi=dos)
                    info.update(RsrcTS=dt(res_timestamp))

        def norm(value):
            if isinstance(value, int):
                return value
            return str(value)

        return {key: norm(value) for key, value in info.items()}

    def parse_dotnet(self, pe: PE, data):
        """
        Extracts a JSON-serializable and human-readable dictionary with information about
        the .NET metadata of an input PE file.
        """
        header = DotNetHeader(data, pe=pe)
        tables = header.meta.Streams.Tables
        info = dict(
            RuntimeVersion=F'{header.head.MajorRuntimeVersion}.{header.head.MinorRuntimeVersion}',
            Version=F'{header.meta.MajorVersion}.{header.meta.MinorVersion}',
            VersionString=header.meta.VersionString
        )

        info['Flags'] = [name for name, check in header.head.KnownFlags.items() if check]

        if len(tables.Assembly) == 1:
            assembly = tables.Assembly[0]
            info.update(
                AssemblyName=assembly.Name,
                Release='{}.{}.{}.{}'.format(
                    assembly.MajorVersion,
                    assembly.MinorVersion,
                    assembly.BuildNumber,
                    assembly.RevisionNumber
                )
            )

        try:
            entry = header.head.EntryPointToken + pe.OPTIONAL_HEADER.ImageBase
            info.update(EntryPoint=F'0x{entry:08X}')
        except AttributeError:
            pass

        if len(tables.Module) == 1:
            module = tables.Module[0]
            info.update(ModuleName=module.Name)

        return info

    def parse_debug(self, pe: PE, data=None):
        result = {}
        pe.parse_data_directories(directories=[
            DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_DEBUG']])
        for dbg in pe.DIRECTORY_ENTRY_DEBUG:
            if DEBUG_TYPE.get(dbg.struct.Type, None) != 'IMAGE_DEBUG_TYPE_CODEVIEW':
                continue
            with suppress(Exception):
                pdb = dbg.entry.PdbFileName
                if 0 in pdb:
                    pdb = pdb[:pdb.index(0)]
                result.update(
                    PdbPath=pdb.decode(self.codec),
                    PdbAge=dbg.entry.Age
                )
        return result

    def process(self, data):
        result = {}
        pe = PE(data=data, fast_load=True)

        for switch, resolver, name in [
            (self.args.debug,   self.parse_debug,    'Debug'),    # noqa
            (self.args.dotnet,  self.parse_dotnet,   'DotNet'),   # noqa
            (self.args.header,  self.parse_header,   'Header'),   # noqa
            (self.args.version, self.parse_version,  'Version'),  # noqa
            (self.args.imports, self.parse_imports,  'Imports'),  # noqa
            (self.args.exports, self.parse_exports,  'Exports'),  # noqa
        ]:
            if not switch:
                continue
            self.log_debug(F'parsing: {name}')
            try:
                info = resolver(pe, data)
            except Exception as E:
                self.log_info(F'failed to obtain {name}: {E!s}')
                continue
            if info:
                result[name] = info

        signature = {}

        if self.args.timestamps or self.args.signatures:
            with suppress(Exception):
                from refinery.units.formats.pe.pesig import pesig
                signature = self.parse_signature(next(data | pesig))

        if self.args.timestamps:
            ts = self.parse_time_stamps(pe, self.args.timeraw)
            with suppress(KeyError):
                ts.update(Signed=signature['Timestamp'])
            result.update(TimeStamp=ts)

        if signature and self.args.signatures:
            result['Signature'] = signature

        if result:
            yield from ppjson(tabular=self.args.tabular)._pretty_output(result, indent=4, ensure_ascii=False)

    _LCID = {
        0x0C00: 'Default Custom Locale Language',
        0x1400: 'Default Custom MUI Locale Language',
        0x007F: 'Invariant Locale Language',
        0x0000: 'Neutral Locale Language',
        0x0800: 'System Default Locale Language',
        0x1000: 'Unspecified Custom Locale Language',
        0x0400: 'User Default Locale Language',
        0x0436: 'Afrikaans-South Africa',
        0x041c: 'Albanian-Albania',
        0x045e: 'Amharic-Ethiopia',
        0x0401: 'Arabic (Saudi Arabia)',
        0x1401: 'Arabic (Algeria)',
        0x3c01: 'Arabic (Bahrain)',
        0x0c01: 'Arabic (Egypt)',
        0x0801: 'Arabic (Iraq)',
        0x2c01: 'Arabic (Jordan)',
        0x3401: 'Arabic (Kuwait)',
        0x3001: 'Arabic (Lebanon)',
        0x1001: 'Arabic (Libya)',
        0x1801: 'Arabic (Morocco)',
        0x2001: 'Arabic (Oman)',
        0x4001: 'Arabic (Qatar)',
        0x2801: 'Arabic (Syria)',
        0x1c01: 'Arabic (Tunisia)',
        0x3801: 'Arabic (U.A.E.)',
        0x2401: 'Arabic (Yemen)',
        0x042b: 'Armenian-Armenia',
        0x044d: 'Assamese',
        0x082c: 'Azeri (Cyrillic)',
        0x042c: 'Azeri (Latin)',
        0x042d: 'Basque',
        0x0423: 'Belarusian',
        0x0445: 'Bengali (India)',
        0x0845: 'Bengali (Bangladesh)',
        0x141A: 'Bosnian (Bosnia/Herzegovina)',
        0x0402: 'Bulgarian',
        0x0455: 'Burmese',
        0x0403: 'Catalan',
        0x045c: 'Cherokee-United States',
        0x0804: 'Chinese (People\'s Republic of China)',
        0x1004: 'Chinese (Singapore)',
        0x0404: 'Chinese (Taiwan)',
        0x0c04: 'Chinese (Hong Kong SAR)',
        0x1404: 'Chinese (Macao SAR)',
        0x041a: 'Croatian',
        0x101a: 'Croatian (Bosnia/Herzegovina)',
        0x0405: 'Czech',
        0x0406: 'Danish',
        0x0465: 'Divehi',
        0x0413: 'Dutch-Netherlands',
        0x0813: 'Dutch-Belgium',
        0x0466: 'Edo',
        0x0409: 'English (United States)',
        0x0809: 'English (United Kingdom)',
        0x0c09: 'English (Australia)',
        0x2809: 'English (Belize)',
        0x1009: 'English (Canada)',
        0x2409: 'English (Caribbean)',
        0x3c09: 'English (Hong Kong SAR)',
        0x4009: 'English (India)',
        0x3809: 'English (Indonesia)',
        0x1809: 'English (Ireland)',
        0x2009: 'English (Jamaica)',
        0x4409: 'English (Malaysia)',
        0x1409: 'English (New Zealand)',
        0x3409: 'English (Philippines)',
        0x4809: 'English (Singapore)',
        0x1c09: 'English (South Africa)',
        0x2c09: 'English (Trinidad)',
        0x3009: 'English (Zimbabwe)',
        0x0425: 'Estonian',
        0x0438: 'Faroese',
        0x0429: 'Farsi',
        0x0464: 'Filipino',
        0x040b: 'Finnish',
        0x040c: 'French (France)',
        0x080c: 'French (Belgium)',
        0x2c0c: 'French (Cameroon)',
        0x0c0c: 'French (Canada)',
        0x240c: 'French (Democratic Rep. of Congo)',
        0x300c: 'French (Cote d\'Ivoire)',
        0x3c0c: 'French (Haiti)',
        0x140c: 'French (Luxembourg)',
        0x340c: 'French (Mali)',
        0x180c: 'French (Monaco)',
        0x380c: 'French (Morocco)',
        0xe40c: 'French (North Africa)',
        0x200c: 'French (Reunion)',
        0x280c: 'French (Senegal)',
        0x100c: 'French (Switzerland)',
        0x1c0c: 'French (West Indies)',
        0x0462: 'Frisian-Netherlands',
        0x0467: 'Fulfulde-Nigeria',
        0x042f: 'FYRO Macedonian',
        0x083c: 'Gaelic (Ireland)',
        0x043c: 'Gaelic (Scotland)',
        0x0456: 'Galician',
        0x0437: 'Georgian',
        0x0407: 'German (Germany)',
        0x0c07: 'German (Austria)',
        0x1407: 'German (Liechtenstein)',
        0x1007: 'German (Luxembourg)',
        0x0807: 'German (Switzerland)',
        0x0408: 'Greek',
        0x0474: 'Guarani-Paraguay',
        0x0447: 'Gujarati',
        0x0468: 'Hausa-Nigeria',
        0x0475: 'Hawaiian (United States)',
        0x040d: 'Hebrew',
        0x0439: 'Hindi',
        0x040e: 'Hungarian',
        0x0469: 'Ibibio-Nigeria',
        0x040f: 'Icelandic',
        0x0470: 'Igbo-Nigeria',
        0x0421: 'Indonesian',
        0x045d: 'Inuktitut',
        0x0410: 'Italian (Italy)',
        0x0810: 'Italian (Switzerland)',
        0x0411: 'Japanese',
        0x044b: 'Kannada',
        0x0471: 'Kanuri-Nigeria',
        0x0860: 'Kashmiri',
        0x0460: 'Kashmiri (Arabic)',
        0x043f: 'Kazakh',
        0x0453: 'Khmer',
        0x0457: 'Konkani',
        0x0412: 'Korean',
        0x0440: 'Kyrgyz (Cyrillic)',
        0x0454: 'Lao',
        0x0476: 'Latin',
        0x0426: 'Latvian',
        0x0427: 'Lithuanian',
        0x043e: 'Malay-Malaysia',
        0x083e: 'Malay-Brunei Darussalam',
        0x044c: 'Malayalam',
        0x043a: 'Maltese',
        0x0458: 'Manipuri',
        0x0481: 'Maori-New Zealand',
        0x044e: 'Marathi',
        0x0450: 'Mongolian (Cyrillic)',
        0x0850: 'Mongolian (Mongolian)',
        0x0461: 'Nepali',
        0x0861: 'Nepali-India',
        0x0414: 'Norwegian (Bokmål)',
        0x0814: 'Norwegian (Nynorsk)',
        0x0448: 'Oriya',
        0x0472: 'Oromo',
        0x0479: 'Papiamentu',
        0x0463: 'Pashto',
        0x0415: 'Polish',
        0x0416: 'Portuguese-Brazil',
        0x0816: 'Portuguese-Portugal',
        0x0446: 'Punjabi',
        0x0846: 'Punjabi (Pakistan)',
        0x046B: 'Quecha (Bolivia)',
        0x086B: 'Quecha (Ecuador)',
        0x0C6B: 'Quecha (Peru)',
        0x0417: 'Rhaeto-Romanic',
        0x0418: 'Romanian',
        0x0818: 'Romanian (Moldava)',
        0x0419: 'Russian',
        0x0819: 'Russian (Moldava)',
        0x043b: 'Sami (Lappish)',
        0x044f: 'Sanskrit',
        0x046c: 'Sepedi',
        0x0c1a: 'Serbian (Cyrillic)',
        0x081a: 'Serbian (Latin)',
        0x0459: 'Sindhi (India)',
        0x0859: 'Sindhi (Pakistan)',
        0x045b: 'Sinhalese-Sri Lanka',
        0x041b: 'Slovak',
        0x0424: 'Slovenian',
        0x0477: 'Somali',
        0x042e: 'Sorbian',
        0x0c0a: 'Spanish (Modern Sort)',
        0x040a: 'Spanish (Traditional Sort)',
        0x2c0a: 'Spanish (Argentina)',
        0x400a: 'Spanish (Bolivia)',
        0x340a: 'Spanish (Chile)',
        0x240a: 'Spanish (Colombia)',
        0x140a: 'Spanish (Costa Rica)',
        0x1c0a: 'Spanish (Dominican Republic)',
        0x300a: 'Spanish (Ecuador)',
        0x440a: 'Spanish (El Salvador)',
        0x100a: 'Spanish (Guatemala)',
        0x480a: 'Spanish (Honduras)',
        0x580a: 'Spanish (Latin America)',
        0x080a: 'Spanish (Mexico)',
        0x4c0a: 'Spanish (Nicaragua)',
        0x180a: 'Spanish (Panama)',
        0x3c0a: 'Spanish (Paraguay)',
        0x280a: 'Spanish (Peru)',
        0x500a: 'Spanish (Puerto Rico)',
        0x540a: 'Spanish (United States)',
        0x380a: 'Spanish (Uruguay)',
        0x200a: 'Spanish (Venezuela)',
        0x0430: 'Sutu',
        0x0441: 'Swahili',
        0x041d: 'Swedish',
        0x081d: 'Swedish-Finland',
        0x045a: 'Syriac',
        0x0428: 'Tajik',
        0x045f: 'Tamazight (Arabic)',
        0x085f: 'Tamazight (Latin)',
        0x0449: 'Tamil',
        0x0444: 'Tatar',
        0x044a: 'Telugu',
        0x041e: 'Thai',
        0x0851: 'Tibetan (Bhutan)',
        0x0451: 'Tibetan (People\'s Republic of China)',
        0x0873: 'Tigrigna (Eritrea)',
        0x0473: 'Tigrigna (Ethiopia)',
        0x0431: 'Tsonga',
        0x0432: 'Tswana',
        0x041f: 'Turkish',
        0x0442: 'Turkmen',
        0x0480: 'Uighur-China',
        0x0422: 'Ukrainian',
        0x0420: 'Urdu',
        0x0820: 'Urdu-India',
        0x0843: 'Uzbek (Cyrillic)',
        0x0443: 'Uzbek (Latin)',
        0x0433: 'Venda',
        0x042a: 'Vietnamese',
        0x0452: 'Welsh',
        0x0434: 'Xhosa',
        0x0478: 'Yi',
        0x043d: 'Yiddish',
        0x046a: 'Yoruba',
        0x0435: 'Zulu',
        0x04ff: 'HID (Human Interface DeVITe)'
    }

    _CHARSET = {
        0x0000: '7-bit ASCII',
        0x03A4: 'Japan (Shift ? JIS X-0208)',
        0x03B5: 'Korea (Shift ? KSC 5601)',
        0x03B6: 'Taiwan (Big5)',
        0x04B0: 'Unicode',
        0x04E2: 'Latin-2 (Eastern European)',
        0x04E3: 'Cyrillic',
        0x04E4: 'Multilingual',
        0x04E5: 'Greek',
        0x04E6: 'Turkish',
        0x04E7: 'Hebrew',
        0x04E8: 'Arabic',
    }

    _WINVER = {
        3: {
            0x00: 'Windows NT 3',
            0x0A: 'Windows NT 3.1',
            0x32: 'Windows NT 3.5',
            0x33: 'Windows NT 3.51',
        },
        4: {
            0x00: 'Windows 95',
            0x0A: 'Windows 98',
        },
        5: {
            0x00: 'Windows 2000',
            0x5A: 'Windows Me',
            0x01: 'Windows XP',
            0x02: 'Windows Server 2003',
        },
        6: {
            0x00: 'Windows Vista',
            0x01: 'Windows 7',
            0x02: 'Windows 8',
            0x03: 'Windows 8.1',
        },
        10: {
            0x00: 'Windows 10',
        }
    }

Static methods

def parse_signature(data)

Extracts a JSON-serializable and human-readable dictionary with information about time stamp and code signing certificates that are attached to the input PE file.

Expand source code Browse git

@classmethod
def parse_signature(cls, data: bytearray) -> dict:
    """
    Extracts a JSON-serializable and human-readable dictionary with information about
    time stamp and code signing certificates that are attached to the input PE file.
    """
    from refinery.units.formats.pkcs7 import pkcs7

    try:
        signature = data | pkcs7 | json.loads
    except Exception as E:
        raise ValueError(F'PKCS7 parser failed with error: {E!s}')

    info = {}

    def find_timestamps(entry):
        if isinstance(entry, dict):
            if set(entry.keys()) == {'type', 'value'}:
                if entry['type'] == 'signing_time':
                    return {'Timestamp': entry['value']}
            for value in entry.values():
                result = find_timestamps(value)
                if result is None:
                    continue
                with suppress(KeyError):
                    result.setdefault('TimestampIssuer', entry['sid']['issuer']['common_name'])
                return result
        elif isinstance(entry, list):
            for value in entry:
                result = find_timestamps(value)
                if result is None:
                    continue
                return result

    timestamp_info = find_timestamps(signature)
    if timestamp_info is not None:
        info.update(timestamp_info)

    try:
        certificates = signature['content']['certificates']
    except KeyError:
        return info

    if len(certificates) == 1:
        main_certificate = certificates[0]
    else:
        certificates_with_extended_use = []
        main_certificate = None
        for certificate in certificates:
            with suppress(Exception):
                crt = certificate['tbs_certificate']
                ext = [e for e in crt['extensions'] if e['extn_id'] == 'extended_key_usage' and e['extn_value'] != ['time_stamping']]
                key = [e for e in crt['extensions'] if e['extn_id'] == 'key_usage']
                if ext:
                    certificates_with_extended_use.append(certificate)
                if any('key_cert_sign' in e['extn_value'] for e in key):
                    continue
                if any('code_signing' in e['extn_value'] for e in ext):
                    main_certificate = certificate
                    break
        if main_certificate is None and len(certificates_with_extended_use) == 1:
            main_certificate = certificates_with_extended_use[0]
    if main_certificate:
        crt = main_certificate['tbs_certificate']
        serial = crt['serial_number']
        if isinstance(serial, int):
            serial = F'{serial:x}'
        if len(serial) % 2 != 0:
            serial = F'0{serial}'
        assert bytes.fromhex(serial) in data
        subject = crt['subject']
        location = [subject.get(t, '') for t in ('locality_name', 'state_or_province_name', 'country_name')]
        info.update(Subject=subject['common_name'])
        if any(location):
            info.update(SubjectLocation=', '.join(filter(None, location)))
        for signer_info in signature['content'].get('signer_infos', ()):
            try:
                if signer_info['sid']['serial_number'] != crt['serial_number']:
                    continue
                for attr in signer_info['signed_attrs']:
                    if attr['type'] == 'authenticode_info':
                        info.update(ProgramName=attr['value']['programName'])
                        info.update(MoreInfo=attr['value']['moreInfo'])
            except KeyError:
                continue
        try:
            valid_from = crt['validity']['not_before']
            valid_until = crt['validity']['not_after']
        except KeyError:
            pass
        else:
            info.update(ValidFrom=valid_from, ValidUntil=valid_until)
        info.update(
            Issuer=crt['issuer']['common_name'], Fingerprint=main_certificate['fingerprint'], Serial=serial)
        return info
    return info

Methods

def parse_version(self, pe, data=None)

Extracts a JSON-serializable and human-readable dictionary with information about the version resource of an input PE file, if available.

Expand source code Browse git

def parse_version(self, pe: PE, data=None) -> dict:
    """
    Extracts a JSON-serializable and human-readable dictionary with information about
    the version resource of an input PE file, if available.
    """
    pe.parse_data_directories(directories=[DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_RESOURCE']])
    string_table_entries = []
    for FileInfo in pe.FileInfo:
        for FileInfoEntry in FileInfo:
            with suppress(AttributeError):
                for StringTableEntry in FileInfoEntry.StringTable:
                    StringTableEntryParsed = self._parse_pedict(StringTableEntry.entries)
                    with suppress(AttributeError):
                        LangID = StringTableEntry.entries.get('LangID', None) or StringTableEntry.LangID
                        LangID = int(LangID, 0x10) if not isinstance(LangID, int) else LangID
                        LangHi = LangID >> 0x10
                        LangLo = LangID & 0xFFFF
                        Language = self._LCID.get(LangHi, 'Language Neutral')
                        Charset = self._CHARSET.get(LangLo, 'Unknown Charset')
                        StringTableEntryParsed.update(
                            LangID=F'{LangID:08X}',
                            Charset=Charset,
                            Language=Language
                        )
                    for key in StringTableEntryParsed:
                        if key.endswith('Version'):
                            value = StringTableEntryParsed[key]
                            separator = ', '
                            if re.match(F'\\d+({re.escape(separator)}\\d+){{3}}', value):
                                StringTableEntryParsed[key] = '.'.join(value.split(separator))
                    string_table_entries.append(StringTableEntryParsed)
    if not string_table_entries:
        return None
    elif len(string_table_entries) == 1:
        return string_table_entries[0]
    else:
        return string_table_entries

def parse_exports(self, pe, data=None)

Expand source code Browse git

def parse_exports(self, pe: PE, data=None) -> list:
    pe.parse_data_directories(directories=[DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_EXPORT']])
    info = []
    for k, exp in enumerate(pe.DIRECTORY_ENTRY_EXPORT.symbols):
        if not exp.name:
            info.append(F'@{k}')
        else:
            info.append(exp.name.decode('ascii'))
    return info

def parse_imports(self, pe, data=None)

Expand source code Browse git

def parse_imports(self, pe: PE, data=None) -> list:
    info = {}
    dirs = []
    for name in [
        'DIRECTORY_ENTRY_IMPORT',
        'DIRECTORY_ENTRY_DELAY_IMPORT',
        'DIRECTORY_ENTRY_BOUND_IMPORT',
    ]:
        pe.parse_data_directories(directories=[DIRECTORY_ENTRY[F'IMAGE_{name}']])
        with suppress(AttributeError):
            dirs.append(getattr(pe, name))
    for idd in itertools.chain(*dirs):
        dll = idd.dll.decode('ascii')
        if dll.lower().endswith('.dll'):
            dll = dll[:-4]
        imports = info.setdefault(dll, [])
        for imp in idd.imports:
            name = imp.name and imp.name.decode('ascii') or F'@{imp.ordinal}'
            imports.append(name)
    return info

def parse_header(self, pe, data=None)

Expand source code Browse git

def parse_header(self, pe: PE, data=None) -> dict:
    def format_macro_name(name: str, prefix, convert=True):
        name = name.split('_')[prefix:]
        if convert:
            for k, part in enumerate(name):
                name[k] = part.upper() if len(part) <= 3 else part.capitalize()
        return ' '.join(name)

    major = pe.OPTIONAL_HEADER.MajorOperatingSystemVersion
    minor = pe.OPTIONAL_HEADER.MinorOperatingSystemVersion
    version = self._WINVER.get(major, {0: 'Unknown'})

    try:
        MinimumOS = version[minor]
    except LookupError:
        MinimumOS = version[0]
    header_information = {
        'Machine': format_macro_name(MACHINE_TYPE[pe.FILE_HEADER.Machine], 3, False),
        'Subsystem': format_macro_name(SUBSYSTEM_TYPE[pe.OPTIONAL_HEADER.Subsystem], 2),
        'MinimumOS': MinimumOS,
    }

    pe.parse_data_directories(directories=[
        DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_EXPORT'],
    ])

    try:
        export_name = pe.DIRECTORY_ENTRY_EXPORT.name
        if isinstance(export_name, bytes):
            export_name = export_name.decode('utf8')
        if not export_name.isprintable():
            export_name = None
    except Exception:
        export_name = None
    if export_name:
        header_information['ExportName'] = export_name

    rich_header = pe.parse_rich_header()
    rich = []
    if rich_header:
        it = rich_header.get('values', [])
        if self.args.tabular:
            cw = max(len(F'{c:d}') for c in it[1::2])
        for idv, count in zip(it[0::2], it[1::2]):
            info = get_rich_info(idv)
            if not info:
                continue
            pid = info.pid.upper()
            if self.args.tabular:
                short_pid = get_rich_short_pid(pid)
                rich.append(F'[{idv:08x}] {count:>0{cw}d} {short_pid!s} {info.ver}')
            else:
                rich.append({
                    'Counter': count,
                    'Encoded': F'{idv:08x}',
                    'Library': pid,
                    'Product': info.ver,
                })
        header_information['RICH'] = rich

    characteristics = [
        name for name, mask in image_characteristics
        if pe.FILE_HEADER.Characteristics & mask
    ]
    for typespec, flag in {
        'EXE': 'IMAGE_FILE_EXECUTABLE_IMAGE',
        'DLL': 'IMAGE_FILE_DLL',
        'SYS': 'IMAGE_FILE_SYSTEM'
    }.items():
        if flag in characteristics:
            header_information['Type'] = typespec
    address_width = None
    if 'IMAGE_FILE_16BIT_MACHINE' in characteristics:
        address_width = 4
    elif MACHINE_TYPE[pe.FILE_HEADER.Machine] in ['IMAGE_FILE_MACHINE_I386']:
        address_width = 8
    elif MACHINE_TYPE[pe.FILE_HEADER.Machine] in [
        'IMAGE_FILE_MACHINE_AMD64',
        'IMAGE_FILE_MACHINE_IA64',
    ]:
        address_width = 16
    if address_width:
        header_information['Bits'] = 4 * address_width
    else:
        address_width = 16
    header_information['ImageBase'] = F'0x{pe.OPTIONAL_HEADER.ImageBase:0{address_width}X}'
    header_information['ImageSize'] = get_pe_size(pe)
    return header_information

def parse_time_stamps(self, pe, raw_time_stamps)

Extracts time stamps from the PE header (link time), as well as from the imports, exports, debug, and resource directory. The resource time stamp is also parsed as a DOS time stamp and returned as the "Delphi" time stamp.

Expand source code Browse git

def parse_time_stamps(self, pe: PE, raw_time_stamps: bool) -> dict:
    """
    Extracts time stamps from the PE header (link time), as well as from the imports,
    exports, debug, and resource directory. The resource time stamp is also parsed as
    a DOS time stamp and returned as the "Delphi" time stamp.
    """
    if raw_time_stamps:
        def dt(ts): return ts
    else:
        def dt(ts):
            # parse as UTC but then forget time zone information
            return datetime.fromtimestamp(
                ts,
                tz=timezone.utc
            ).replace(tzinfo=None)

    pe.parse_data_directories(directories=[
        DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_IMPORT'],
        DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_EXPORT'],
        DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_DEBUG'],
        DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_RESOURCE']
    ])

    info = {}

    with suppress(AttributeError):
        info.update(Linker=dt(pe.FILE_HEADER.TimeDateStamp))

    with suppress(AttributeError):
        for entry in pe.DIRECTORY_ENTRY_IMPORT:
            info.update(Import=dt(entry.TimeDateStamp()))

    with suppress(AttributeError):
        for entry in pe.DIRECTORY_ENTRY_DEBUG:
            info.update(DbgDir=dt(entry.struct.TimeDateStamp))

    with suppress(AttributeError):
        Export = pe.DIRECTORY_ENTRY_EXPORT.struct.TimeDateStamp
        if Export: info.update(Export=dt(Export))

    with suppress(AttributeError):
        res_timestamp = pe.DIRECTORY_ENTRY_RESOURCE.struct.TimeDateStamp
        if res_timestamp:
            with suppress(ValueError):
                from refinery.units.misc.datefix import datefix
                dos = datefix.dostime(res_timestamp)
                info.update(Delphi=dos)
                info.update(RsrcTS=dt(res_timestamp))

    def norm(value):
        if isinstance(value, int):
            return value
        return str(value)

    return {key: norm(value) for key, value in info.items()}

def parse_dotnet(self, pe, data)

Extracts a JSON-serializable and human-readable dictionary with information about the .NET metadata of an input PE file.

Expand source code Browse git

def parse_dotnet(self, pe: PE, data):
    """
    Extracts a JSON-serializable and human-readable dictionary with information about
    the .NET metadata of an input PE file.
    """
    header = DotNetHeader(data, pe=pe)
    tables = header.meta.Streams.Tables
    info = dict(
        RuntimeVersion=F'{header.head.MajorRuntimeVersion}.{header.head.MinorRuntimeVersion}',
        Version=F'{header.meta.MajorVersion}.{header.meta.MinorVersion}',
        VersionString=header.meta.VersionString
    )

    info['Flags'] = [name for name, check in header.head.KnownFlags.items() if check]

    if len(tables.Assembly) == 1:
        assembly = tables.Assembly[0]
        info.update(
            AssemblyName=assembly.Name,
            Release='{}.{}.{}.{}'.format(
                assembly.MajorVersion,
                assembly.MinorVersion,
                assembly.BuildNumber,
                assembly.RevisionNumber
            )
        )

    try:
        entry = header.head.EntryPointToken + pe.OPTIONAL_HEADER.ImageBase
        info.update(EntryPoint=F'0x{entry:08X}')
    except AttributeError:
        pass

    if len(tables.Module) == 1:
        module = tables.Module[0]
        info.update(ModuleName=module.Name)

    return info

def parse_debug(self, pe, data=None)

Expand source code Browse git

def parse_debug(self, pe: PE, data=None):
    result = {}
    pe.parse_data_directories(directories=[
        DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_DEBUG']])
    for dbg in pe.DIRECTORY_ENTRY_DEBUG:
        if DEBUG_TYPE.get(dbg.struct.Type, None) != 'IMAGE_DEBUG_TYPE_CODEVIEW':
            continue
        with suppress(Exception):
            pdb = dbg.entry.PdbFileName
            if 0 in pdb:
                pdb = pdb[:pdb.index(0)]
            result.update(
                PdbPath=pdb.decode(self.codec),
                PdbAge=dbg.entry.Age
            )
    return result

class peoverlay (certificate=False, directories=False, memdump=False)

This unit is implemented in refinery.units.formats.pe.peoverlay and has the following commandline Interface:

usage: peoverlay [-h] [-L] [-Q] [-0] [-v] [-c] [-d] [-m]

Returns the overlay of a PE file, i.e. anything that may have been appended to the file. This
does not include digital signatures. Use pestrip to obtain only the body of the PE file after
removing the overlay.

optional arguments:
  -c, --cert     Include digital signatures for the size computation.
  -d, --dirs     Include data directories for size computation.
  -m, --memdump  Assume that the file data was a memory-mapped PE file.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class peoverlay(OverlayUnit):
    """
    Returns the overlay of a PE file, i.e. anything that may have been appended to the file.
    This does not include digital signatures. Use `refinery.pestrip` to obtain only the body
    of the PE file after removing the overlay.
    """
    def process(self, data: bytearray) -> bytearray:
        size = self._get_size(data)
        try:
            data[:size] = []
        except Exception:
            return data[size:]
        else:
            return data

class perc (*paths, pretty=False, path=b'path', regex=False, exact=False, fuzzy=0, drop_path=False, join_path=False, list=False)

This unit is implemented in refinery.units.formats.pe.perc and has the following commandline Interface:

usage: perc [-h] [-L] [-Q] [-0] [-v] [-p] [-l] [-j | -d] [-z | -e] [-r] [-P NAME]
            [path [path ...]]

Extract PE file resources.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -p, --pretty     Add missing headers to bitmap and icon resources.
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class perc(PathExtractorUnit):
    """
    Extract PE file resources.
    """
    def __init__(
        self, *paths,
        pretty: Arg.Switch('-p', help='Add missing headers to bitmap and icon resources.') = False,
        **kwargs
    ):
        super().__init__(*paths, pretty=pretty, **kwargs)

    def _get_icon_dir(self, pe: pefile.PE):
        try:
            group, = (e for e in pe.DIRECTORY_ENTRY_RESOURCE.entries if e.id == RSRC.ICON_GROUP.value)
            group = group.directory.entries[0].directory.entries[0].data.struct
            return GRPICONDIR(pe.get_data(group.OffsetToData, group.Size))
        except Exception:
            return None

    def _search(self, pe: pefile.PE, directory, level=0, *parts):
        if level >= 3:
            self.log_warn(F'unexpected resource tree level {level + 1:d}')
        for entry in directory.entries:
            if entry.name:
                identifier = str(entry.name)
            elif level == 0 and entry.id in iter(RSRC):
                identifier = RSRC(entry.id)
            elif entry.id is not None:
                identifier = entry.id
            else:
                self.log_warn(F'resource entry has name {entry.name} and id {entry.id} at level {level + 1:d}')
                continue
            if entry.struct.DataIsDirectory:
                yield from self._search(pe, entry.directory, level + 1, *parts, identifier)
            else:
                rva = entry.data.struct.OffsetToData
                size = entry.data.struct.Size
                path = '/'.join(str(p) for p in (*parts, identifier))
                extract = None
                if self.args.pretty:
                    if parts[0] is RSRC.BITMAP:
                        extract = self._handle_bitmap(pe, rva, size)
                    elif parts[0] is RSRC.ICON:
                        extract = self._handle_icon(pe, parts, rva, size)
                    elif parts[0] is RSRC.STRING:
                        extract = self._handle_strings(pe, parts, rva, size)
                if extract is None:
                    def extract(pe=pe):
                        return pe.get_data(rva, size)
                yield UnpackResult(
                    path,
                    extract,
                    offset=pe.get_offset_from_rva(rva),
                    lcid=self._get_lcid(entry.data),
                )

    def _get_lcid(self, node_data) -> Optional[str]:
        try:
            pid = node_data.lang or 0
            sid = node_data.sublang or 0
        except AttributeError:
            return None
        try:
            pid = self._LANG_ID_TO_LCID[pid]
        except KeyError:
            return None
        lcid = pid.get(sid, 0)
        return pemeta._LCID.get(lcid)

    def _handle_strings(self, pe: pefile.PE, parts: Tuple[RSRC, int, int], rva: int, size: int):
        def extract(pe=pe):
            self.log_debug(parts)
            base = (parts[1] - 1) << 4
            reader = StructReader(pe.get_data(rva, size))
            table = {}
            index = 0
            while not reader.eof:
                string = reader.read_exactly(reader.u16() * 2)
                if not string:
                    break
                key = F'{base + index:04X}'
                table[key] = string.decode('utf-16le')
                index += 1
            return json.dumps(table, indent=4).encode(self.codec)
        return extract

    def _handle_bitmap(self, pe: pefile.PE, rva: int, size: int):
        def extract(pe=pe):
            bitmap = pe.get_data(rva, size)
            total = (len(bitmap) + 14).to_bytes(4, 'little')
            return B'BM' + total + B'\0\0\0\0\x36\0\0\0' + bitmap
        return extract

    def _handle_icon(self, pe: pefile.PE, parts: Tuple[RSRC, int, int], rva: int, size: int):
        try:
            icondir = self._get_icon_dir(pe)
            index = int(parts[1]) - 1
            info = icondir.entries[index]
            icon = pe.get_data(rva, size)
        except Exception:
            return None
        if icon.startswith(B'(\0\0\0'):
            header = struct.pack('<HHHBBBBHHII',
                0,
                1,
                1,
                info.width,
                info.height,
                info.color_count,
                0,
                info.planes,
                info.bit_count,
                len(icon),
                0x16
            )
            icon = header + icon
        return icon

    def unpack(self, data):
        pe = pefile.PE(data=data, fast_load=True)
        pe.parse_data_directories(
            directories=pefile.DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_RESOURCE'])
        try:
            rsrc = pe.DIRECTORY_ENTRY_RESOURCE
        except AttributeError:
            pass
        else:
            yield from self._search(pe, rsrc)

    def _mktbl(ids: List[Tuple[int, int, int]]) -> Dict[int, Dict[int, int]]:
        table = {}
        for pid, sid, lcid in ids:
            if pid not in table:
                table[pid] = {0: lcid}
            table[pid][sid] = lcid
        return table

    _LANG_ID_TO_LCID = _mktbl([
        (0x00, 0x03, 0x0C00),
        (0x00, 0x05, 0x1400),
        (0x7F, 0x00, 0x007F),
        (0x00, 0x00, 0x0000),
        (0x02, 0x02, 0x0800),
        (0x00, 0x04, 0x1000),
        (0x00, 0x01, 0x0400),
        (0x36, 0x01, 0x0436),
        (0x1c, 0x01, 0x041C),
        (0x84, 0x01, 0x0484),
        (0x5E, 0x01, 0x045E),
        (0x01, 0x05, 0x1401),
        (0x01, 0x0f, 0x3C01),
        (0x01, 0x03, 0x0C01),
        (0x01, 0x02, 0x0801),
        (0x01, 0x0B, 0x2C01),
        (0x01, 0x0D, 0x3401),
        (0x01, 0x0C, 0x3001),
        (0x01, 0x04, 0x1001),
        (0x01, 0x06, 0x1801),
        (0x01, 0x08, 0x2001),
        (0x01, 0x10, 0x4001),
        (0x01, 0x01, 0x0401),
        (0x01, 0x0A, 0x2801),
        (0x01, 0x07, 0x1C01),
        (0x01, 0x0E, 0x3801),
        (0x01, 0x09, 0x2401),
        (0x2B, 0x01, 0x042B),
        (0x4D, 0x01, 0x044D),
        (0x2C, 0x02, 0x082C),
        (0x2C, 0x01, 0x042C),
        (0x45, 0x02, 0x0445),
        (0x6D, 0x01, 0x046D),
        (0x2d, 0x01, 0x042D),
        (0x23, 0x01, 0x0423),
        (0x1A, 0x08, 0x201A),
        (0x1A, 0x05, 0x141A),
        (0x7E, 0x01, 0x047E),
        (0x02, 0x01, 0x0402),
        (0x92, 0x01, 0x0492),
        (0x5C, 0x01, 0x045C),
        (0x03, 0x01, 0x0403),
        (0x04, 0x03, 0x0C04),
        (0x04, 0x05, 0x1404),
        (0x04, 0x04, 0x1004),
        (0x04, 0x02, 0x0004),
        (0x04, 0x01, 0x7C04),
        (0x83, 0x01, 0x0483),
        (0x1A, None, 0x001A),
        (0x1a, 0x04, 0x101A),
        (0x1a, 0x01, 0x041A),
        (0x05, 0x01, 0x0405),
        (0x06, 0x01, 0x0406),
        (0x8C, 0x01, 0x048C),
        (0x65, 0x01, 0x0465),
        (0x13, 0x02, 0x0813),
        (0x13, 0x01, 0x0413),
        (0x09, 0x03, 0x0C09),
        (0x09, 0x0A, 0x2809),
        (0x09, 0x04, 0x1009),
        (0x09, 0x09, 0x2409),
        (0x09, 0x10, 0x4009),
        (0x09, 0x06, 0x1809),
        (0x09, 0x08, 0x2009),
        (0x09, 0x11, 0x4409),
        (0x09, 0x05, 0x1409),
        (0x09, 0x0D, 0x3409),
        (0x09, 0x12, 0x4809),
        (0x09, 0x07, 0x1c09),
        (0x09, 0x0B, 0x2C09),
        (0x09, 0x02, 0x0809),
        (0x09, 0x01, 0x0409),
        (0x09, 0x0C, 0x3009),
        (0x25, 0x01, 0x0425),
        (0x38, 0x01, 0x0438),
        (0x64, 0x01, 0x0464),
        (0x0B, 0x01, 0x040B),
        (0x0C, 0x02, 0x080c),
        (0x0C, 0x03, 0x0C0C),
        (0x0C, 0x01, 0x040c),
        (0x0C, 0x05, 0x140C),
        (0x0C, 0x06, 0x180C),
        (0x0C, 0x04, 0x100C),
        (0x62, 0x01, 0x0462),
        (0x56, 0x01, 0x0456),
        (0x37, 0x01, 0x0437),
        (0x07, 0x03, 0x0C07),
        (0x07, 0x01, 0x0407),
        (0x07, 0x05, 0x1407),
        (0x07, 0x04, 0x1007),
        (0x07, 0x02, 0x0807),
        (0x08, 0x01, 0x0408),
        (0x6F, 0x01, 0x046F),
        (0x47, 0x01, 0x0447),
        (0x68, 0x01, 0x0468),
        (0x75, 0x01, 0x0475),
        (0x0D, 0x01, 0x040D),
        (0x39, 0x01, 0x0439),
        (0x0E, 0x01, 0x040E),
        (0x0F, 0x01, 0x040F),
        (0x70, 0x01, 0x0470),
        (0x21, 0x01, 0x0421),
        (0x5D, 0x02, 0x085D),
        (0x5D, 0x01, 0x045D),
        (0x3C, 0x02, 0x083C),
        (0x34, 0x01, 0x0434),
        (0x35, 0x01, 0x0435),
        (0x10, 0x01, 0x0410),
        (0x10, 0x02, 0x0810),
        (0x11, 0x01, 0x0411),
        (0x4B, 0x01, 0x044B),
        (0x3F, 0x01, 0x043F),
        (0x53, 0x01, 0x0453),
        (0x86, 0x01, 0x0486),
        (0x87, 0x01, 0x0487),
        (0x57, 0x01, 0x0457),
        (0x12, 0x01, 0x0412),
        (0x40, 0x01, 0x0440),
        (0x54, 0x01, 0x0454),
        (0x26, 0x01, 0x0426),
        (0x27, 0x01, 0x0427),
        (0x2E, 0x02, 0x082E),
        (0x6E, 0x01, 0x046E),
        (0x2F, 0x01, 0x042F),
        (0x3E, 0x02, 0x083E),
        (0x3E, 0x01, 0x043e),
        (0x4C, 0x01, 0x044C),
        (0x3A, 0x01, 0x043A),
        (0x81, 0x01, 0x0481),
        (0x7A, 0x01, 0x047A),
        (0x4E, 0x01, 0x044E),
        (0x7C, 0x01, 0x047C),
        (0x50, 0x01, 0x0450),
        (0x50, 0x02, 0x0850),
        (0x61, 0x01, 0x0461),
        (0x14, 0x01, 0x0414),
        (0x14, 0x02, 0x0814),
        (0x82, 0x01, 0x0482),
        (0x48, 0x01, 0x0448),
        (0x63, 0x01, 0x0463),
        (0x29, 0x01, 0x0429),
        (0x15, 0x01, 0x0415),
        (0x16, 0x01, 0x0416),
        (0x16, 0x02, 0x0816),
        (0x67, 0x02, 0x0867),
        (0x46, 0x01, 0x0446),
        (0x46, 0x02, 0x0846),
        (0x6B, 0x01, 0x046B),
        (0x6B, 0x02, 0x086B),
        (0x6B, 0x03, 0x0C6B),
        (0x18, 0x01, 0x0418),
        (0x17, 0x01, 0x0417),
        (0x19, 0x01, 0x0419),
        (0x85, 0x01, 0x0485),
        (0x3B, 0x09, 0x243B),
        (0x3B, 0x04, 0x103B),
        (0x3B, 0x05, 0x143B),
        (0x3B, 0x03, 0x0C3B),
        (0x3B, 0x01, 0x043B),
        (0x3B, 0x02, 0x083B),
        (0x3B, 0x08, 0x203B),
        (0x3B, 0x06, 0x183B),
        (0x3B, 0x07, 0x1C3B),
        (0x4F, 0x01, 0x044F),
        (0x1a, 0x07, 0x1C1A),
        (0x1a, 0x06, 0x181A),
        (0x1a, 0x03, 0x0C1A),
        (0x1a, 0x02, 0x081A),
        (0x6C, 0x01, 0x046C),
        (0x32, 0x02, 0x0832),
        (0x32, 0x01, 0x0432),
        (0x32, 0x01, 0x0459),
        (0x32, 0x02, 0x0859),
        (0x5B, 0x01, 0x045B),
        (0x1b, 0x01, 0x041B),
        (0x24, 0x01, 0x0424),
        (0x0A, 0x0b, 0x2C0A),
        (0x0A, 0x10, 0x400A),
        (0x0A, 0x0D, 0x340A),
        (0x0A, 0x09, 0x240A),
        (0x0A, 0x05, 0x140A),
        (0x0A, 0x07, 0x1C0A),
        (0x0A, 0x0C, 0x300A),
        (0x0A, 0x11, 0x440A),
        (0x0A, 0x04, 0x100A),
        (0x0A, 0x12, 0x480A),
        (0x0A, 0x02, 0x080A),
        (0x0A, 0x13, 0x4C0A),
        (0x0A, 0x06, 0x180A),
        (0x0A, 0x0F, 0x3C0A),
        (0x0A, 0x0A, 0x280A),
        (0x0A, 0x14, 0x500A),
        (0x0A, 0x03, 0x0C0A),
        (0x0A, 0x01, 0x040A),
        (0x0A, 0x15, 0x540A),
        (0x0A, 0x0E, 0x380A),
        (0x0A, 0x08, 0x200A),
        (0x41, 0x01, 0x0441),
        (0x1D, 0x02, 0x081D),
        (0x1D, 0x01, 0x041D),
        (0x5A, 0x01, 0x045A),
        (0x28, 0x01, 0x0428),
        (0x5F, 0x02, 0x085F),
        (0x49, 0x01, 0x0449),
        (0x49, 0x02, 0x0849),
        (0x44, 0x01, 0x0444),
        (0x4A, 0x01, 0x044A),
        (0x1E, 0x01, 0x041E),
        (0x51, 0x01, 0x0451),
        (0x73, 0x02, 0x0873),
        (0x73, 0x01, 0x0473),
        (0x1F, 0x01, 0x041F),
        (0x42, 0x01, 0x0442),
        (0x22, 0x01, 0x0422),
        (0x2E, 0x01, 0x042E),
        (0x20, 0x02, 0x0820),
        (0x20, 0x01, 0x0420),
        (0x80, 0x01, 0x0480),
        (0x43, 0x02, 0x0843),
        (0x43, 0x01, 0x0443),
        (0x03, 0x02, 0x0803),
        (0x2A, 0x01, 0x042A),
        (0x52, 0x01, 0x0452),
        (0x88, 0x01, 0x0488),
        (0x78, 0x01, 0x0478),
        (0x6A, 0x01, 0x046A),
    ])

class pesig

This unit is implemented in refinery.units.formats.pe.pesig and has the following commandline Interface:

usage: pesig [-h] [-L] [-Q] [-0] [-v]

Extracts the contents of the IMAGE_DIRECTORY_ENTRY_SECURITY entry of a PE file, i.e. the digital
signatures in DER format.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class pesig(Unit):
    """
    Extracts the contents of the IMAGE_DIRECTORY_ENTRY_SECURITY entry of a PE file,
    i.e. the digital signatures in DER format.
    """

    _SECDIRID = DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_SECURITY']

    def __init__(self): pass

    def process(self, data: bytearray) -> bytearray:
        pe = PE(data=data, fast_load=True)
        pe.parse_data_directories(directories=[self._SECDIRID])
        security = pe.OPTIONAL_HEADER.DATA_DIRECTORY[self._SECDIRID]
        self.log_info(F'signature offset: 0x{security.VirtualAddress:08X}')
        self.log_info(F'signature length: 0x{security.Size:08X}')
        if security.VirtualAddress == 0 or security.Size == 0:
            raise ValueError(F'IMAGE_DIRECTORY_ENTRY_SECURITY ({self._SECDIRID}) is corrupt.')
        sgnoff = security.VirtualAddress + 8
        sgnend = sgnoff + security.Size
        length, revision, certtype = unpack('<IHH', data[sgnoff - 8:sgnoff])
        signature = data[sgnoff:sgnend]

        if len(signature) + 8 != length:
            raise RefineryPartialResult(
                F'Found {len(signature) + 8} bytes of signature, but length should be {length}.',
                partial=signature)

        return signature

class pestrip (certificate=False, directories=False, memdump=False)

This unit is implemented in refinery.units.formats.pe.pestrip and has the following commandline Interface:

usage: pestrip [-h] [-L] [-Q] [-0] [-v] [-c] [-d] [-m]

Removes the overlay of a PE file and returns the main executable. Use peoverlay to extract the
overlay.

optional arguments:
  -c, --cert     Include digital signatures for the size computation.
  -d, --dirs     Include data directories for size computation.
  -m, --memdump  Assume that the file data was a memory-mapped PE file.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class pestrip(OverlayUnit):
    """
    Removes the overlay of a PE file and returns the main executable. Use `refinery.peoverlay` to
    extract the overlay.
    """

    def process(self, data: bytearray) -> bytearray:
        size = self._get_size(data)
        try:
            data[size:] = []
        except Exception:
            data = data[:size]
        else:
            return data

class pick (*bounds)

This unit is implemented in refinery.units.meta.pick and has the following commandline Interface:

usage: pick [-h] [-L] [-Q] [-0] [-v] [start:end:step [start:end:step ...]]

Picks sequences from the array of multiple inputs. For example, pick 0 2: will return all but the
second ingested input (which has index 1).

positional arguments:
  start:end:step  Specify start:end:step in Python slice syntax. The default is 0.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Allow partial results as output.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class pick(Unit):
    """
    Picks sequences from the array of multiple inputs. For example, `pick 0 2:`
    will return all but the second ingested input (which has index `1`).
    """
    def __init__(self, *bounds: Arg.Bounds(nargs='*', default=[0])):
        super().__init__(bounds=[sliceobj(s) for s in bounds])

    def process(self, data: Chunk):
        if not data.visible:
            yield data
            return

        state: _PickState = data.temp
        a = state.accessor
        lower = a.start
        upper = a.stop

        if lower is not None:
            lower -= state.discarded
        if upper is not None:
            upper -= state.discarded
        if state.consumed:
            yield from state.remaining[slice(lower, upper, a.step)]
            return

        while lower:
            try:
                chunk = next(state.chunks)
            except StopIteration:
                upper = None
                break
            if chunk.visible:
                lower -= 1
                upper -= 1
                state.discarded += 1
            else:
                yield chunk
        if upper is None:
            yield from state.chunks
            return
        while upper:
            try:
                chunk = next(state.chunks)
            except StopIteration:
                break
            if chunk.visible:
                upper -= 1
                state.discarded += 1
            yield chunk

    def filter(self, chunks: Iterable[Chunk]):
        chunks = begin(chunks)
        if chunks is None:
            return
        container, chunks = chunks
        container = container.copy()
        container.visible = True
        state = _PickState(deque(self.args.bounds), chunks)
        while state.next():
            if not state.consumed:
                if not state.discardable():
                    self.log_debug(F'consumed input into buffer after {state.discarded} skips')
                    for chunk in state.chunks:
                        if not chunk.visible:
                            yield chunk
                            continue
                        state.remaining.append(chunk)
                    state.consumed = True
            container.temp = state
            yield container

class pkcs7

This unit is implemented in refinery.units.formats.pkcs7 and has the following commandline Interface:

usage: pkcs7 [-h] [-L] [-Q] [-0] [-v]

Converts PKCS7 encoded data to a JSON representation.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class pkcs7(Unit):
    """
    Converts PKCS7 encoded data to a JSON representation.
    """
    @Unit.Requires('asn1crypto', 'default', 'extended')
    def _asn1crypto():
        import asn1crypto
        import asn1crypto.cms
        import asn1crypto.core
        import asn1crypto.x509
        return asn1crypto

    def process(self, data: bytes):
        asn1 = self._asn1crypto.core
        cms = self._asn1crypto.cms
        signature = cms.ContentInfo.load(data)

        def unsign(data):
            if isinstance(data, int):
                size = data.bit_length()
                if data < 0:
                    data = (1 << (size + 1)) - ~data - 1
                if data > 0xFFFFFFFF_FFFFFFFF:
                    size, r = divmod(size, 8)
                    size += bool(r)
                    data = data.to_bytes(size, 'big').hex()
                return data
            elif isinstance(data, dict):
                return {key: unsign(value) for key, value in data.items()}
            elif isinstance(data, list):
                return [unsign(x) for x in data]
            else:
                return data

        class SpcString(asn1.Choice):
            _alternatives = [
                ('unicode', asn1.BMPString, {'implicit': 0}),
                ('ascii', asn1.IA5String, {'implicit': 1})
            ]

        SpcUuid = asn1.OctetString

        class SpcSerializedObject(asn1.Sequence):
            _fields = [
                ('classId', SpcUuid),
                ('serializedData', asn1.OctetString),
            ]

        class SpcLink(asn1.Choice):
            _alternatives = [
                ('url', asn1.IA5String, {'implicit': 0}),
                ('monikier', SpcSerializedObject, {'implicit': 1}),
                ('file', SpcString, {'explicit': 2})
            ]

        class SpcSpOpusInfo(asn1.Sequence):
            _fields = [
                ('programName', SpcString, {'optional': True, 'explicit': 0}),
                ('moreInfo', SpcLink, {'optional': True, 'explicit': 1}),
            ]

        class SetOfInfos(asn1.SetOf):
            _child_spec = SpcSpOpusInfo

        cms.CMSAttributeType._map['1.3.6.1.4.1.311.2.1.12'] = 'authenticode_info'
        cms.CMSAttribute._oid_specs['authenticode_info'] = SetOfInfos

        class ParsedASN1ToJSON(BytesAsArrayEncoder):
            unit = self

            @classmethod
            def _is_keyval(cls, obj):
                return (
                    isinstance(obj, dict)
                    and set(obj.keys()) == {'type', 'values'}
                    and len(obj['values']) == 1
                )

            @classmethod
            def handled(cls, obj) -> bool:
                return BytesAsArrayEncoder.handled(obj) or cls._is_keyval(obj)

            def encode_bytes(self, obj: bytes):
                with suppress(Exception):
                    string = obj.decode('latin1')
                    if string.isprintable():
                        return string
                return super().encode_bytes(obj)

            def default(self, obj):
                if self._is_keyval(obj):
                    return dict(type=obj['type'], value=obj['values'][0])
                with suppress(TypeError):
                    return super().default(obj)
                if isinstance(obj, (set, tuple)):
                    return list(obj)
                if isinstance(obj, datetime):
                    return str(obj)
                dict_result = {}
                list_result = None
                if isinstance(obj, self.unit._asn1crypto.x509.Certificate):
                    dict_result.update(fingerprint=obj.sha1.hex())
                if isinstance(obj, asn1.BitString):
                    return {'bit_string': obj.native}
                with suppress(Exception):
                    list_result = list(obj)
                    if all(isinstance(k, str) for k in list_result):
                        dict_result.update((key, obj[key]) for key in list_result)
                if dict_result:
                    return dict_result
                if list_result is not None:
                    return list_result
                if isinstance(obj, self.unit._asn1crypto.cms.CertificateChoices):
                    return obj.chosen
                if isinstance(obj, asn1.Sequence):
                    children = obj.children
                    if children:
                        return children
                    return obj.dump()
                with suppress(Exception):
                    return obj.native
                if isinstance(obj, asn1.Any):
                    parsed = None
                    with suppress(Exception):
                        parsed = obj.parse()
                    if parsed:
                        return parsed
                    return obj.dump()
                if isinstance(obj, asn1.Asn1Value):
                    return obj.dump()
                raise ValueError(F'Unable to determine JSON encoding of {obj.__class__.__name__} object.')

        with ParsedASN1ToJSON as encoder:
            encoded = encoder.dumps(signature)
            converted = unsign(json.loads(encoded))
            return json.dumps(converted, indent=4).encode(self.codec)

class pkcs7sig (tabular=False)

This unit is implemented in refinery.units.formats.pkcs7sig and has the following commandline Interface:

usage: pkcs7sig [-h] [-L] [-Q] [-0] [-v] [-t]

Converts PKCS7 encoded signatures into a human-readable JSON representation. This can be used to
parse authenticode signatures appended to files that are not PE files to get the same output that
is produced by the pemeta unit.

optional arguments:
  -t, --tabular  Print information in a table rather than as JSON

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class pkcs7sig(Unit):
    """
    Converts PKCS7 encoded signatures into a human-readable JSON representation. This can be used
    to parse authenticode signatures appended to files that are not PE files to get the same output
    that is produced by the pemeta unit.
    """
    def __init__(self, tabular: Arg('-t', help='Print information in a table rather than as JSON') = False):
        super().__init__(tabular=tabular)

    def process(self, data: bytes):
        json = pemeta.parse_signature(data)
        yield from ppjson(tabular=self.args.tabular)._pretty_output(json, indent=4, ensure_ascii=False)

class pop (*names)

This unit is implemented in refinery.units.meta.pop and has the following commandline Interface:

usage: pop [-h] [-L] [-Q] [-0] [-v]
           [[name[:conversion]|count|@] [[name[:conversion]|count|@] ...]]

In processing order, remove visible chunks from the current frame and store their contents in the
given meta variables. All chunks in the input stream are consequently made visible again. If pop
is used at the end of a frame, then variables will be local to the parent frame.

positional arguments:
  [name[:conversion]|count|@]
                              Specify either the name of a single variable to receive the
                              contents of an input chunk, or an integer expression that specifies
                              a number of values to be removed from the input without storing
                              them. Additionally, it is possible to specify the symbol "@" to
                              remove a single chunk from the input and merge its meta data into
                              the following ones. By default, a single merge is performed. When a
                              variable name is specified, a sequence of transformations can be
                              appended to be applied before storing it. For example, the argument
                              k:le:b64 would first decode the chunk using base64, then convert it
                              to an integer in little endian format, and store the integer result
                              in the variable k. The visual aid is that the content is passed
                              from right to left through all conversions, into the variable k.

generic options:
  -h, --help                  Show this help message and exit.
  -L, --lenient               Allow partial results as output.
  -Q, --quiet                 Disables all log output.
  -0, --devnull               Do not produce any output.
  -v, --verbose               Specify up to two times to increase log level.

Expand source code Browse git

class pop(Unit):
    """
    In processing order, remove visible chunks from the current frame and store their contents in the given
    meta variables. All chunks in the input stream are consequently made visible again. If pop is used at
    the end of a frame, then variables will be local to the parent frame.
    """
    def __init__(
        self,
        *names: Arg(type=str, metavar=F'[name[:conversion]|count|{_popcount._MERGE_SYMBOL}]', help=(
            R'Specify either the name of a single variable to receive the contents of an input chunk, or '
            R'an integer expression that specifies a number of values to be removed from the input without '
            F'storing them. Additionally, it is possible to specify the symbol "{_popcount._MERGE_SYMBOL}" '
            R'to remove a single chunk from the input and merge its meta data into the following ones. By '
            R'default, a single merge is performed. When a variable name is specified, a sequence of '
            R'transformations can be appended to be applied before storing it. For example, the argument '
            R'k:le:b64 would first decode the chunk using base64, then convert it to an integer in little '
            R'endian format, and store the integer result in the variable `k`. The visual aid is that the '
            R'content is passed from right to left through all conversions, into the variable `k`.'
        ))
    ):
        if not names:
            names = _popcount._MERGE_SYMBOL,
        super().__init__(names=[_popcount(n) for n in names])

    def process(self, data):
        return data

    def filter(self, chunks: Iterable[Chunk]):
        invisible = []
        variables = {}
        remaining: Iterator[_popcount] = iter(self.args.names)

        it = iter(chunks)
        pop = next(remaining).reset()
        done = False

        for chunk in it:
            if not chunk.visible:
                self.log_debug('buffering invisible chunk')
                invisible.append(chunk)
                continue
            try:
                while not pop.into(variables, chunk):
                    pop = next(remaining).reset()
            except StopIteration:
                done = True
                invisible.append(chunk)
                break

        if not done and pop.done:
            try:
                next(remaining)
            except StopIteration:
                done = True

        if not done:
            raise ValueError('Not all variables could be assigned.')

        nesting = self.args.nesting

        for chunk in chain(invisible, it):
            meta = chunk.meta
            meta.update(variables)
            if nesting < 0:
                for name in variables:
                    meta.set_scope(name, chunk.scope + nesting)
            chunk.visible = True
            yield chunk

class ppjscript (indent=4)

This unit is implemented in refinery.units.sinks.ppjscript and has the following commandline Interface:

usage: ppjscript [-h] [-L] [-Q] [-0] [-v] [-i N]

Pretty-prints JavaScript without any reflection or evaluation.

optional arguments:
  -i, --indent N  Controls the amount of space characters used for indentation in the output.
                  Default is 4.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Allow partial results as output.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class ppjscript(Unit):
    """
    Pretty-prints JavaScript without any reflection or evaluation.
    """
    def __init__(self, indent: Arg.Number('-i', help=(
        'Controls the amount of space characters used for indentation in the output. Default is 4.')) = 4
    ):
        return super().__init__(indent=indent)

    @Unit.Requires('jsbeautifier', 'display', 'extended')
    def _jsb():
        import jsbeautifier
        import jsbeautifier.unpackers.javascriptobfuscator
        # TODO: This is a workaround for the following bug:
        # https://github.com/beautify-web/js-beautify/issues/1350
        jsbeautifier.unpackers.javascriptobfuscator.detect = lambda *_: False
        return jsbeautifier

    @unicoded
    def process(self, data: str) -> str:
        return self._jsb.beautify(data, dict(eval_code=False, indent_size=self.args.indent))

class ppjson (tabular=False, indent=4)

This unit is implemented in refinery.units.sinks.ppjson and has the following commandline Interface:

usage: ppjson [-h] [-L] [-Q] [-0] [-v] [-t | -i N]

Expects JSON input data and outputs it in a neatly formatted manner. If the indentation is set to
zero, the output is minified.

optional arguments:
  -t, --tabular   Convert JSON input into a flattened table.
  -i, --indent N  Number of spaces used for indentation. Default is 4.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Allow partial results as output.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class ppjson(Unit):
    """
    Expects JSON input data and outputs it in a neatly formatted manner.
    If the indentation is set to zero, the output is minified.
    """
    _TRAILING_COMMA = re.compile(BR',\s*(}|])')

    def __init__(
        self,
        tabular: Arg.Switch('-t', group='OUT', help='Convert JSON input into a flattened table.') = False,
        indent : Arg.Number('-i', group='OUT', help='Number of spaces used for indentation. Default is {default}.') = 4
    ):
        return super().__init__(indent=indent, tabular=tabular)

    def _pretty_output(self, parsed, **kwargs):
        if self.args.tabular:
            table = list(flattened(parsed))
            width = max(len(key) for key, _ in table)
            tsize = get_terminal_size(80) - width - 4
            for key, value in table:
                value = str(value).rstrip()
                value = textwrap.wrap(value, tsize)
                it = iter(value)
                try:
                    item = next(it)
                except StopIteration:
                    continue
                yield F'{key:<{width}} : {item}'.encode(self.codec)
                for wrap in it:
                    yield F'{"":<{width + 3}}{wrap}'.encode(self.codec)
        else:
            yield json.dumps(parsed, **kwargs).encode(self.codec)

    def process(self, data):
        if self._TRAILING_COMMA.search(data):
            def smartfix(match):
                k = match.start()
                return match.group(0 if any(k in s for s in strings) else 1)
            from refinery.lib.patterns import formats
            strings = {range(*m.span()) for m in formats.string.finditer(data)}
            data = self._TRAILING_COMMA.sub(smartfix, data)
        kwargs = {'indent': self.args.indent} if self.args.indent else {'separators': (',', ':')}
        yield from self._pretty_output(json.loads(data), **kwargs)

class ppxml (indent=4, header=False)

This unit is implemented in refinery.units.sinks.ppxml and has the following commandline Interface:

usage: ppxml [-h] [-L] [-Q] [-0] [-v] [-i N] [-x]

Expects XML input data and outputs it in a neatly formatted manner.

optional arguments:
  -i, --indent N  Controls the amount of space characters used for indentation in the output.
                  Default is 4.
  -x, --header    Add an XML header to the formatted output.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Allow partial results as output.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class ppxml(Unit):
    """
    Expects XML input data and outputs it in a neatly formatted manner.
    """

    def __init__(self,
        indent: Arg.Number('-i', help=(
            'Controls the amount of space characters used for indentation in the output. Default is 4.')) = 4,
        header: Arg.Switch('-x', help='Add an XML header to the formatted output.') = False
    ):
        super().__init__(indent=indent, header=header)

    def process(self, data):

        pad = self.args.indent * ' '
        etm = {}

        try:
            dom = ForgivingParse(data, etm)
        except Exception:
            from refinery.lib.meta import metavars
            msg = 'error parsing as XML, returning original content'
            path = metavars(data).get('path')
            if path:
                msg = F'{msg}: {path}'
            self.log_warn(msg)
            return data

        def indent(element, level=0, more_sibs=False):
            """
            The credit for this one goes to:
            https://stackoverflow.com/a/12940014
            """
            indentation = '\n'
            if level:
                indentation += (level - 1) * pad
            childcount = len(element)
            if childcount:
                if not element.text or not element.text.strip():
                    element.text = indentation + pad
                    if level:
                        element.text += pad
                for count, child in enumerate(element):
                    indent(child, level + 1, count < childcount - 1)
                if level and (not element.tail or element.tail.isspace()):
                    element.tail = indentation
                    if more_sibs:
                        element.tail += pad
            elif level and (not element.tail or element.tail.isspace()):
                element.tail = indentation
                if more_sibs: element.tail += pad

        indent(dom.getroot())

        with io.BytesIO() as output:
            dom.write(output, encoding=self.codec, xml_declaration=self.args.header)
            result = output.getvalue()

        for uid, key in etm.items():
            entity = F'&{key};'.encode(self.codec)
            needle = uid.encode(self.codec)
            result = result.replace(needle, entity)

        return result

class ps1str

This unit is implemented in refinery.units.encoding.ps1str and has the following commandline Interface:

usage: ps1str [-h] [-L] [-Q] [-0] [-v] [-R]

Escapes and unescapes PowerShell strings.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class ps1str(Unit):
    """
    Escapes and unescapes PowerShell strings.
    """
    UNESCAPE = {
        '`0': '\0',
        '`a': '\a',
        '`b': '\b',
        '`f': '\f',
        '`n': '\n',
        '`r': '\r',
        '`t': '\t',
        '`v': '\v',
        '``': '`',
        "`'": '\'',
        '`"': '\"',
    }
    ESCAPE = {
        '`' : '``',
        '$' : '`$',
        '\0': '`0',
        '\a': '`a',
        '\b': '`b',
        '\f': '`f',
        '\n': '`n',
        '\r': '`r',
        '\t': '`t',
        '\v': '`v',
        '\'': "`'",
        '\"': '""',
    }

    def __init__(self): pass

    @unicoded
    def process(self, data):
        match = re.fullmatch(R'''@(['"])\s*?[\r\n](.*?)[\r\n]\1@''', data, flags=re.DOTALL)
        if match:
            return match.group(2)
        if data[0] not in ''''"''' or data[-1] != data[0]:
            raise ValueError(
                'No quotes found at beginning of input. To escape a PowerShell string, the '
                'quotes must be included because quote escaping depends on whether a single '
                'or double quote was used.')

        quote, data = data[0], data[1:-1]

        def unescape(match):
            string = match[0]
            return self.UNESCAPE.get(string, string[1:])

        if quote == '"':
            if re.search(R'(?<!`)\$(?=[\w\(\{\$\?\^:])', data):
                self.log_warn('Loss of information: double quoted string contains variable substitutions.')
            data = re.sub('`.', unescape, data)

        return data.replace(quote + quote, quote)

    @unicoded
    def reverse(self, data):
        def escaper(match):
            char = match[0]
            return ps1str.ESCAPE.get(char, char)
        return '"{}"'.format(re.sub(R'''[\x00\x07-\x0D`$'"]''', escaper, data))

class push (data=b'')

This unit is implemented in refinery.units.meta.push and has the following commandline Interface:

usage: push [-h] [-L] [-Q] [-0] [-v] [data]

The unit inserts an additional chunk before each input chunk and moves the original data out of
scope. This chunk is considered the "original" data, while the one inserted in front of it is
used as an intermediate result. By default, this intermediate data is a copy of the input data.
For example:

    emit key=value | push [[| rex =(.*)$ {1} | pop v ]| repl var:v censored ]

will output key=censored. The application of rex turns the (duplicated) data into just the value,
which is then stored in the variable v. The application of repl replaces this value with the
hard-coded string censored.

positional arguments:
  data           The data to be pushed, by default a copy of the input.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class push(Unit):
    """
    The unit inserts an additional chunk before each input chunk and moves the original
    data out of scope. This chunk is considered the "original" data, while the one inserted
    in front of it is used as an intermediate result. By default, this intermediate data is
    a copy of the input data. For example:

        emit key=value | push [[| rex =(.*)$ {1} | pop v ]| repl var:v censored ]

    will output `key=censored`. The application of `refinery.rex` turns the (duplicated)
    data into just the value, which is then stored in the variable `v`. The application
    of `refinery.repl` replaces this value with the hard-coded string `censored`.
    """
    def __init__(self, data: Arg(help='The data to be pushed, by default a copy of the input.') = B''):
        super().__init__(data=data)

    def process(self, data: Chunk):
        src = self.args.data
        tos = data.copy(meta=True, data=False)
        tos[:] = src or data
        if self.args.nesting > 0:
            data.set_next_scope(False)
        else:
            try:
                data.visible = False
            except AttributeError:
                self.log_warn('application has no effect outside frame.')
        yield data
        yield tos

class put (name, value=<object object>)

This unit is implemented in refinery.units.meta.put and has the following commandline Interface:

usage: put [-h] [-L] [-Q] [-0] [-v] name [value]

Can be used to add a meta variable to the processed chunk. Note that meta variables cease to
exist outside a frame.

positional arguments:
  name           The name of the variable to be used.
  value          The value for the variable. If no value is given, the entire current chunk is
                 stored.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class put(Unit):
    """
    Can be used to add a meta variable to the processed chunk. Note that meta variables
    cease to exist outside a frame.
    """
    def __init__(
        self,
        name : Arg(help='The name of the variable to be used.', type=str),
        value: Arg(help='The value for the variable. If no value is given, the entire current chunk is stored.',
            type=functools.partial(numseq, typecheck=False)) = _EMPTY
    ):
        super().__init__(name=check_variable_name(name), value=value)

    def process(self, data: Chunk):
        value = self.args.value
        if value is _EMPTY:
            value = data
        if not isinstance(value, (int, float)) and not isbuffer(value):
            try:
                len(value)
            except TypeError:
                if isinstance(value, itertools.repeat):
                    value = next(value)
                if not isinstance(value, (int, float)):
                    raise NotImplementedError(F'put does not support {value.__class__.__name__} values.')
            else:
                if not isinstance(value, list):
                    value = list(value)
        self.log_debug(F'storing {typename(value)}:', value, clip=True)
        data.meta[self.args.name] = value
        return data

class pyc (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.pyc and has the following commandline Interface:

usage: pyc [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD]
           [path [path ...]]

Decompiles Python bytecode (PYC) files back to source code. A known limitation is that it does
not work on recent Python versions, but anything below 3.9 should work.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class pyc(ArchiveUnit):
    """
    Decompiles Python bytecode (PYC) files back to source code. A known limitation is that it does
    not work on recent Python versions, but anything below 3.9 should work.
    """

    def unpack(self, data):
        input_path = metavars(data).get(self.args.path.decode(self.codec))
        for k, code in enumerate(extract_code_from_buffer(bytes(data), input_path)):
            path = code.container.co_filename or F'__unknown_name_{k:02d}.py'
            date = datetime.fromtimestamp(code.timestamp)
            data = decompile_buffer(code)
            yield self._pack(path, date, data)

class qb (*data)

This unit is implemented in refinery.units.meta.queue and has the following commandline Interface:

usage: qb [-h] [-L] [-Q] [-0] [-v] [data [data ...]]

Short for "queue back": Insert new chunks at the end of the current frame.

positional arguments:
  data           The arguments are inserted into the current frame in the given order. These
                 arguments are multibin expressions; If the expression depends on the input data,
                 it will always refer to the first chunk in the current frame. If no argument is
                 given, a single empty chunk is inserted.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class qb(QueueUnit):
    """
    Short for "queue back": Insert new chunks at the end of the current frame.
    """
    def filter(self, chunks: Iterable[Chunk]):
        yield from self._queue(chunks, False)

class qf (*data)

This unit is implemented in refinery.units.meta.queue and has the following commandline Interface:

usage: qf [-h] [-L] [-Q] [-0] [-v] [data [data ...]]

Short for "queue front": Insert new chunks at the beginning of the current frame.

positional arguments:
  data           The arguments are inserted into the current frame in the given order. These
                 arguments are multibin expressions; If the expression depends on the input data,
                 it will always refer to the first chunk in the current frame. If no argument is
                 given, a single empty chunk is inserted.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class qf(QueueUnit):
    """
    Short for "queue front": Insert new chunks at the beginning of the current frame.
    """
    def filter(self, chunks: Iterable[Chunk]):
        yield from self._queue(chunks, True)

class qlz

This unit is implemented in refinery.units.compression.qlz and has the following commandline Interface:

usage: qlz [-h] [-L] [-Q] [-0] [-v]

This unit implements QuickLZ decompression levels 1 and 3.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class qlz(Unit):
    """
    This unit implements QuickLZ decompression levels 1 and 3.
    """

    def process(self, data):
        source = memoryview(data)
        head = source[0]
        clvl = (head >> 2) & 0x3

        if head & 2:
            self.log_info('long header detected')
            size = int.from_bytes(source[5:9], 'little')
            source = source[9:]
        else:
            self.log_info('short header detected')
            size = source[3]
            source = source[3:]
        if head & 1 != 1:
            self.log_warn('header indicates that data is uncompressed, returning remaining data')
            return source
        else:
            self.log_info(F'compression level {clvl}, decompressed size {SizeInt(size)!r}')

        def fetchhash():
            return int.from_bytes(destination[hashvalue + 1:hashvalue + 4], byteorder='little')

        codeword = 1
        destination = bytearray()
        hashtable = [0] * _HASH_VALUES
        hashvalue = -1
        last_matchstart = size - _UNCONDITIONAL_MATCHLEN - _UNCOMPRESSED_END - 1
        fetch = 0

        if clvl == 2:
            raise ValueError("This version only supports level 1 and 3")
        while source:
            if codeword == 1:
                codeword = int.from_bytes(source[:4], byteorder='little')
                source = source[4:]
                if len(destination) <= last_matchstart:
                    c = 3 if clvl == 1 else 4
                    fetch = int.from_bytes(source[:c], byteorder='little')
            if codeword & 1:
                codeword = codeword >> 1
                if clvl == 1:
                    hash = (fetch >> 4) & 0xFFF
                    offset = hashtable[hash]
                    if fetch & 0xF:
                        matchlen = (fetch & 0xF) + 2
                        source = source[2:]
                    else:
                        matchlen = source[2]
                        source = source[3:]
                else:
                    if (fetch & 3) == 0:
                        delta = (fetch & 0xFF) >> 2
                        matchlen = 3
                        source = source[1:]
                    elif (fetch & 2) == 0:
                        delta = (fetch & 0xFFFF) >> 2
                        matchlen = 3
                        source = source[2:]
                    elif (fetch & 1) == 0:
                        delta = (fetch & 0xFFFF) >> 6
                        matchlen = ((fetch >> 2) & 15) + 3
                        source = source[2:]
                    elif (fetch & 127) != 3:
                        delta = (fetch >> 7) & 0x1FFFF
                        matchlen = ((fetch >> 2) & 0x1F) + 2
                        source = source[3:]
                    else:
                        delta = fetch >> 15
                        matchlen = ((fetch >> 7) & 255) + 3
                        source = source[4:]
                    offset = (len(destination) - delta) & 0xFFFFFFFF

                for i in range(offset, offset + matchlen):
                    destination.append(destination[i])

                if clvl == 1:
                    fetch = fetchhash()
                    while hashvalue < len(destination) - matchlen:
                        hashvalue += 1
                        hash = ((fetch >> 12) ^ fetch) & _HASH_MASK
                        hashtable[hash] = hashvalue
                        fetch = fetch >> 8 & 0xFFFF
                        try:
                            fetch |= destination[hashvalue + 3] << 16
                        except IndexError:
                            pass
                    fetch = int.from_bytes(source[:3], byteorder='little')
                else:
                    fetch = int.from_bytes(source[:4], byteorder='little')
                hashvalue = len(destination) - 1
            else:
                if len(destination) <= last_matchstart:
                    destination.append(source[0])
                    source = source[1:]
                    codeword = codeword >> 1
                    if clvl == 1:
                        while hashvalue < len(destination) - 3:
                            fetch2 = fetchhash()
                            hashvalue += 1
                            hash = ((fetch2 >> 12) ^ fetch2) & _HASH_MASK
                            hashtable[hash] = hashvalue
                        fetch = fetch >> 8 & 0xFFFF | source[2] << 16
                    else:
                        fetch = fetch >> 8 & 0xFFFF
                        fetch |= source[2] << 16
                        fetch |= source[3] << 24
                else:
                    while len(destination) <= size - 1:
                        if codeword == 1:
                            source = source[4:]
                            codeword = 0x80000000
                        destination.append(source[0])
                        source = source[1:]
                        codeword = codeword >> 1
                    break
        if len(destination) != size:
            raise RefineryPartialResult(
                F'Header indicates decompressed size 0x{size:X}, but 0x{len(destination):X} bytes '
                F'were decompressed.', destination)
        return destination

class rabbit (key, discard=0, stateful=False, iv=b'')

This unit is implemented in refinery.units.crypto.cipher.rabbit and has the following commandline Interface:

usage: rabbit [-h] [-L] [-Q] [-0] [-v] [-R] [-d N] [-s] [-i IV] key

RABBIT encryption and decryption.

positional arguments:
  key              The encryption key.

optional arguments:
  -d, --discard N  Discard the first N bytes of the keystream, 0 by default.
  -s, --stateful   Do not reset the key stream while processing the chunks of one frame.
  -i, --iv IV      Optional initialization vector.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation; Specify twice to normalize (first decode, then
                   encode).

Expand source code Browse git

class rabbit(StreamCipherUnit):
    """
    RABBIT encryption and decryption.
    """
    key_size = {16}

    def __init__(self, key, discard=0, stateful=False, iv: Arg('-i', '--iv', help='Optional initialization vector.') = B''):
        super().__init__(key=key, iv=iv, stateful=stateful, discard=discard)

    def keystream(self) -> Iterable[int]:
        if len(self.args.iv) not in (0, 8):
            raise ValueError('The IV length must be exactly 8 bytes.')
        return RabbitCipher(self.args.key, self.args.iv)

class rc2 (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)

This unit is implemented in refinery.units.crypto.cipher.rc2 and has the following commandline Interface:

usage: rc2 [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-M N] key

RC2 encryption and decryption.

positional arguments:
  key                   The encryption key.

optional arguments:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, EAX,
                        ECB, OFB. By default, the CBC mode is used when an IV is is provided, and
                        ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of bits into which data is segmented. It must be a
                        multiple of 8. The default of 0 means that the block size will be used as
                        the segment size.
  -M, --mac-len N       Only for EAX, GCM, OCB, and CCM: Length of the authentication tag, in
                        bytes.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Allow partial results as output.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation; Specify twice to normalize (first decode, then
                        encode).

Expand source code Browse git

class rc2(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(ARC2)):
    """
    RC2 encryption and decryption.
    """

    def _new_cipher(self, **optionals) -> CipherInterface:
        optionals.update(effective_keylen=max(ARC2.key_size))
        return super()._new_cipher(**optionals)

class rc4 (key, discard=0)

This unit is implemented in refinery.units.crypto.cipher.rc4 and has the following commandline Interface:

usage: rc4 [-h] [-L] [-Q] [-0] [-v] [-R] [-d N] key

RC4 encryption and decryption.

positional arguments:
  key              The encryption key.

optional arguments:
  -d, --discard N  Discard the first N bytes of the keystream, 0 by default.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation; Specify twice to normalize (first decode, then
                   encode).

Expand source code Browse git

class rc4(StandardCipherUnit, cipher=PyCryptoFactoryWrapper(ARC4)):
    """
    RC4 encryption and decryption.
    """
    def __init__(
        self, key,
        discard: Arg.Number('-d', help='Discard the first {varname} bytes of the keystream, {default} by default.') = 0,
    ):
        super().__init__(key, discard=discard)

    def _new_cipher(self, **optionals):
        return super()._new_cipher(drop=self.args.discard, **optionals)

class rc4mod (key, stateful=False, discard=0, *, size=256)

This unit is implemented in refinery.units.crypto.cipher.rc4mod and has the following commandline Interface:

usage: rc4mod [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-d N] [-t N] key

Implements a modified version of the RC4 stream cipher where the size of the RC4 SBox can be
altered.

positional arguments:
  key              The encryption key.

optional arguments:
  -s, --stateful   Do not reset the key stream while processing the chunks of one frame.
  -d, --discard N  Discard the first N bytes of the keystream, 0 by default.
  -t, --size N     Table size, 256 by default.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation; Specify twice to normalize (first decode, then
                   encode).

Expand source code Browse git

class rc4mod(StreamCipherUnit):
    """
    Implements a modified version of the RC4 stream cipher where the size of the RC4 SBox can be altered.
    """

    def __init__(
        self, key, stateful=False, discard=0, *,
        size: Arg.Number('-t', help='Table size, {default} by default.', bound=(1, None)) = 0x100
    ):
        super().__init__(key=key, stateful=stateful, discard=discard, size=size)

    def keystream(self):
        size = self.args.size
        tablerange = range(max(size, 0x100))
        b, table = 0, bytearray(k & 0xFF for k in tablerange)
        for a, keybyte in zip(tablerange, cycle(self.args.key)):
            t = table[a]
            b = (b + keybyte + t) % size
            table[a] = table[b]
            table[b] = t
        self.log_debug(lambda: F'SBOX = {table.hex(" ").upper()}', clip=True)
        b, a = 0, 0
        while True:
            a = (a + 1) % size
            t = table[a]
            b = (b + t) % size
            table[a] = table[b]
            table[b] = t
            yield table[(table[a] + t) % size]

class rc5 (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, rounds=12, word_size=32, assoc_len=0, mac_len=0)

This unit is implemented in refinery.units.crypto.cipher.rc5 and has the following commandline Interface:

usage: rc5 [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-k N] [-w N] key

RC5 encryption and decryption.

positional arguments:
  key                   The encryption key.

optional arguments:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB,
                        OFB, PCBC. By default, the CBC mode is used when an IV is is provided,
                        and ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of bits into which data is segmented. It must be a
                        multiple of 8. The default of 0 means that the block size will be used as
                        the segment size.
  -k, --rounds N        Number of rounds to use, the default is 12
  -w, --word-size N     The word size in bits, 32 by default.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Allow partial results as output.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation; Specify twice to normalize (first decode, then
                        encode).

Expand source code Browse git

class rc5(StandardBlockCipherUnit, cipher=BlockCipherFactory(RC5)):
    """
    RC5 encryption and decryption.
    """
    def __init__(
        self, key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0,
        rounds    : Arg.Number('-k', help='Number of rounds to use, the default is {default}') = _R,
        word_size : Arg.Number('-w', help='The word size in bits, {default} by default.') = _W,
        **more
    ):
        super().__init__(
            key,
            iv,
            padding=padding,
            mode=mode,
            raw=raw,
            little_endian=little_endian,
            segment_size=segment_size,
            rounds=rounds,
            word_size=word_size,
            **more
        )

    @property
    def block_size(self):
        return self.args.word_size // 4

    def _new_cipher(self, **optionals) -> CipherInterface:
        return super()._new_cipher(
            rounds=self.args.rounds,
            word_size=self.args.word_size,
            **optionals
        )

class rc6 (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, rounds=20, word_size=32)

This unit is implemented in refinery.units.crypto.cipher.rc6 and has the following commandline Interface:

usage: rc6 [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-k N] [-w N] key

RC6 encryption and decryption. The parameter defaults are the RC6 parameters that were chosen for
the AES candidacy. Only key sizes of 128, 192, and 256 bits are used for AES candidates, but the
unit will allow any key size up to 256 bits.

positional arguments:
  key                   The encryption key.

optional arguments:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB,
                        OFB, PCBC. By default, the CBC mode is used when an IV is is provided,
                        and ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of bits into which data is segmented. It must be a
                        multiple of 8. The default of 0 means that the block size will be used as
                        the segment size.
  -k, --rounds N        Number of rounds to use, the default is 20
  -w, --word-size N     The word size in bits, 32 by default.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Allow partial results as output.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation; Specify twice to normalize (first decode, then
                        encode).

Expand source code Browse git

class rc6(StandardBlockCipherUnit, cipher=BlockCipherFactory(RC6)):
    """
    RC6 encryption and decryption. The parameter defaults are the RC6 parameters that were chosen
    for the AES candidacy. Only key sizes of 128, 192, and 256 bits are used for AES candidates, but
    the unit will allow any key size up to 256 bits.
    """
    def __init__(
        self, key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0,
        rounds    : Arg.Number('-k', help='Number of rounds to use, the default is {default}') = _R,
        word_size : Arg.Number('-w', help='The word size in bits, {default} by default.') = _W,
    ):
        super().__init__(
            key,
            iv,
            padding=padding,
            mode=mode,
            raw=raw,
            little_endian=little_endian,
            segment_size=segment_size,
            rounds=rounds,
            word_size=word_size
        )

    @property
    def block_size(self):
        return self.args.word_size // 2

    def _new_cipher(self, **optionals) -> CipherInterface:
        return super()._new_cipher(
            rounds=self.args.rounds,
            word_size=self.args.word_size,
            **optionals
        )

class recode (decode=None, encode='UTF8', decerr=None, encerr=None, errors=None)

This unit is implemented in refinery.units.encoding.recode and has the following commandline Interface:

usage: recode [-h] [-L] [-Q] [-0] [-v] [-R] [-d Handler] [-e Handler] [-E Handler]
              [decode-as] [encode-as]

Expects input string data encoded in the from encoding and encodes it in the to encoding, then
outputs the result.

positional arguments:
  decode-as             Input encoding; Guess encoding by default.
  encode-as             Output encoding; The default is UTF8.

optional arguments:
  -d, --decerr Handler  Specify an error handler for decoding.
  -e, --encerr Handler  Specify an error handler for encoding.
  -E, --errors Handler  Specify an error handler for both encoding and decoding. The possible
                        choices are the following: strict, ignore, replace, xmlref, backslash,
                        surrogate

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Allow partial results as output.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation; Specify twice to normalize (first decode, then
                        encode).

Expand source code Browse git

class recode(Unit):
    """
    Expects input string data encoded in the `from` encoding and encodes it in
    the `to` encoding, then outputs the result.
    """

    def __init__(
        self,
        decode: Arg(metavar='decode-as', type=str, help='Input encoding; Guess encoding by default.') = None,
        encode: Arg(metavar='encode-as', type=str, help=F'Output encoding; The default is {Unit.codec}.') = Unit.codec,
        decerr: Arg.Option('-d', choices=Handler,
            help='Specify an error handler for decoding.') = None,
        encerr: Arg.Option('-e', choices=Handler,
            help='Specify an error handler for encoding.') = None,
        errors: Arg.Option('-E', choices=Handler, help=(
            'Specify an error handler for both encoding and decoding. '
            'The possible choices are the following: {choices}')) = None,
    ):
        super().__init__(
            decode=decode,
            encode=encode,
            decerr=Arg.AsOption(decerr or errors or 'STRICT', Handler).value,
            encerr=Arg.AsOption(encerr or errors or 'STRICT', Handler).value
        )

    @Unit.Requires('chardet', 'default', 'extended')
    def _chardet():
        import chardet
        return chardet

    def _detect(self, data):
        mv = memoryview(data)
        if not any(mv[1::2]): return 'utf-16le'
        if not any(mv[0::2]): return 'utf-16be'
        detection = self._chardet.detect(data)
        codec = detection['encoding']
        self.log_info(lambda: F'Using input encoding: {codec}, detected with {int(detection["confidence"] * 100)}% confidence.')
        return codec

    def _recode(self, enc, dec, encerr, decerr, data):
        dec = dec or self._detect(data)
        return codecs.encode(codecs.decode(data, dec, errors=decerr), enc, errors=encerr)

    def reverse(self, data):
        return self._recode(self.args.decode, self.args.encode, self.args.decerr, self.args.encerr, data)

    def process(self, data):
        return self._recode(self.args.encode, self.args.decode, self.args.encerr, self.args.decerr, data)

class reduce (*reduction, just=0, temp='t')

This unit is implemented in refinery.units.meta.reduce and has the following commandline Interface:

usage: reduce [-h] [-L] [-Q] [-0] [-v] [-j N] [-t name] [pipeline [pipeline ...]]

The reduce unit applies an arbitrary pipeline repeatedly to reduce the current frame to a single
chunk. The first chunk in the frame serves as initialization.

positional arguments:
  pipeline         The remaining command line is a refinery pipeline. While reducing the frame,
                   the binary contents of each chunk are placed in a temporary variable, then the
                   chunk body is replaced with the accumulated data, and then processed with this
                   pipeline. The result is written back to the accumulator and the process
                   repeats.

optional arguments:
  -j, --just N     Optionally specify a maximum number of chunks to process beyond the first.
  -t, --temp name  The name of the temporary variable. The default is "t".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class reduce(Unit):
    """
    The reduce unit applies an arbitrary pipeline repeatedly to reduce the current frame to a
    single chunk. The first chunk in the frame serves as initialization.
    """

    def __init__(self,
        *reduction: Arg(type=str, metavar='pipeline', help=(
            'The remaining command line is a refinery pipeline. While reducing the frame, the '
            'binary contents of each chunk are placed in a temporary variable, then the chunk '
            'body is replaced with the accumulated data, and then processed with this pipeline. '
            'The result is written back to the accumulator and the process repeats.'
        )),
        just: Arg.Number('-j',
            help='Optionally specify a maximum number of chunks to process beyond the first.') = 0,
        temp: Arg.String('-t', metavar='name',
            help='The name of the temporary variable. The default is "{default}".') = 't',
    ):
        super().__init__(reduction=reduction, temp=temp, just=just)

    def filter(self, chunks: Iterable[Chunk]):
        it = iter(chunks)
        just = self.args.just
        name = self.args.temp
        accu = next(it)
        init = ' '.join(self.args.reduction)
        unit: Unit = load_pipeline(init)
        meta = dict(accu.meta)
        for chunk in islice(it, 0, just) if just else it:
            accu.meta.update(chunk.meta)
            accu[name] = chunk
            unit.args(accu)
            self.log_info('current input:', accu, clip=True)
            accu[:] = unit.act(accu)
        temp = [key for key in accu.meta.keys() if key not in meta]
        for key in temp:
            accu.meta.discard(key)
        accu.meta.update(meta)
        yield accu
        yield from it

class rep (count=2, label=None)

This unit is implemented in refinery.units.strings.rep and has the following commandline Interface:

usage: rep [-h] [-L] [-Q] [-0] [-v] [count] [label]

Duplicates the given input a given number of times. It is also possible to specify an iterable
instead of a number, in which case the input will be replicated once for each item in this
iterable.

positional arguments:
  count          Defines the number of outputs to generate for each input. The default is 2. You
                 can specify any multibin expression that defines an integer iterable here: Each
                 input chunk will be replicated once for each element of that sequence.
  label          If specified, the meta variable with this name will be populated with the index
                 of the replicated chunk. When the count parameter is an integer, this label will
                 be equivalent to the index meta variable.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class rep(Unit):
    """
    Duplicates the given input a given number of times. It is also possible to specify
    an iterable instead of a number, in which case the input will be replicated once for
    each item in this iterable.
    """

    def __init__(
        self,
        count: Arg.NumSeq(help=(
            'Defines the number of outputs to generate for each input. The default is {default}. '
            'You can specify any multibin expression that defines an integer iterable here: Each '
            'input chunk will be replicated once for each element of that sequence.')) = 2,
        label: Arg(type=str, help=(
            'If specified, the meta variable with this name will be populated with the index of '
            'the replicated chunk. When the count parameter is an integer, this label will be '
            'equivalent to the index meta variable.')) = None
    ):
        super().__init__(count=count, label=label)

    def process(self, data: bytes):
        def count():
            count = self.args.count
            if isinstance(count, int):
                return count
            return sum(1 for _ in count)

        if self.args.squeeze or not self._framed:
            self.log_debug('compressing all repeated items into a single chunk')
            yield data * count()
            return

        self.log_debug('emitting each repeated item as an individual chunk')

        label = self.args.label
        if label is None:
            yield from repeat(data, count())
            return

        meta = {}
        for counter in self.args.count:
            meta[label] = counter
            yield self.labelled(data, **meta)

class repl (search, replace=b'', count=-1)

This unit is implemented in refinery.units.strings.repl and has the following commandline Interface:

usage: repl [-h] [-L] [-Q] [-0] [-v] [-n N] search [replace]

Performs a simple binary string replacement on the input data.

positional arguments:
  search         This is the search term.
  replace        The substitution string. Leave this empty to remove all occurrences of the
                 search term.

optional arguments:
  -n, --count N  Only replace the given number of occurrences

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class repl(Unit):
    """
    Performs a simple binary string replacement on the input data.
    """

    def __init__(
        self,
        search : Arg(help='This is the search term.'),
        replace: Arg(help='The substitution string. Leave this empty to remove all occurrences of the search term.') = B'',
        count  : Arg.Number('-n', help='Only replace the given number of occurrences') = -1
    ):
        super().__init__(search=search, replace=replace, count=count)

    def process(self, data: bytes):
        return data.replace(
            self.args.search,
            self.args.replace,
            self.args.count
        )

class resplit (regex=b'\\r?\\n', multiline=False, ignorecase=False, count=0)

This unit is implemented in refinery.units.pattern.resplit and has the following commandline Interface:

usage: resplit [-h] [-L] [-Q] [-0] [-v] [-M] [-I] [-c N] [regex]

Splits the data at the given regular expression and returns the sequence of chunks between the
separators. By default, the input is split along line breaks.

positional arguments:
  regex             Regular expression to match.

optional arguments:
  -M, --multiline   Caret and dollar match the beginning and end of a line, a dot does not match
                    line breaks.
  -I, --ignorecase  Ignore capitalization for alphabetic characters.
  -c, --count N     Specify the maximum number of operations to perform.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Allow partial results as output.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class resplit(RegexUnit):
    """
    Splits the data at the given regular expression and returns the sequence of
    chunks between the separators. By default, the input is split along line breaks.
    """

    def __init__(
        self, regex=RB'\r?\n', multiline=False, ignorecase=False, count=0
    ):
        super().__init__(regex=regex, multiline=multiline, ignorecase=ignorecase, count=count)

    def process(self, data):
        view = memoryview(data)
        cursor = 0
        count = self.args.count
        for k, match in enumerate(self.regex.finditer(view), 2):
            yield view[cursor:match.start()]
            cursor = match.end()
            yield from match.groups()
            if k > count > 0:
                break
        yield view[cursor:]

class resub (regex='\\s+', subst=b'', multiline=False, ignorecase=False, count=0)

This unit is implemented in refinery.units.pattern.resub and has the following commandline Interface:

usage: resub [-h] [-L] [-Q] [-0] [-v] [-M] [-I] [-c N] [regex] [subst]

A unit for performing substitutions based on a binary regular expression pattern. Besides the
syntax {k} to insert the k-th match group, the unit supports processing the contents of match
groups with arbitrary refinery units. To do so, use the following F-string-like syntax:

    {match-group:handlers}

where :handlers is an optional reverse multibin expression that is used to post-process the
binary data from the match. For example, {2:hex:b64} represents the base64-decoding of the hex-
decoding of the second match group.

positional arguments:
  regex             Regular expression to be searched and replaced. The default is "\s+".
  subst             Substitution value: use {1} for group 1, {0} for entire match. Matches are
                    removed (replaced by an empty string) by default.

optional arguments:
  -M, --multiline   Caret and dollar match the beginning and end of a line, a dot does not match
                    line breaks.
  -I, --ignorecase  Ignore capitalization for alphabetic characters.
  -c, --count N     Specify the maximum number of operations to perform.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Allow partial results as output.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class resub(RegexUnit):
    """
    A unit for performing substitutions based on a binary regular expression pattern. Besides the
    syntax `{k}` to insert the `k`-th match group, the unit supports processing the contents of
    match groups with arbitrary refinery units. To do so, use the following F-string-like syntax:

        {match-group:handlers}

    where `:handlers` is an optional reverse multibin expression that is used to post-process the
    binary data from the match. For example, `{2:hex:b64}` represents the base64-decoding of the
    hex-decoding of the second match group.
    """
    def __init__(
        self,
        regex: Arg(help='Regular expression to be searched and replaced. The default is "{default}".') = '\\s+',
        subst: Arg('subst', help=(
            'Substitution value: use {1} for group 1, {0} for entire match. Matches are removed '
            '(replaced by an empty string) by default.'
        )) = B'',
        multiline=False,
        ignorecase=False,
        count=0
    ):
        super().__init__(regex=regex, subst=subst, multiline=multiline, ignorecase=ignorecase, count=count)

    def process(self, data):
        def repl(match: Match):
            return meta.format_bin(spec, self.codec, [match[0], *match.groups()], match.groupdict())
        self.log_info('pattern:', getattr(self.regex, 'pattern', self.regex))
        self.log_info('replace:', self.args.subst)
        meta = metavars(data)
        spec = self.args.subst.decode('ascii', 'backslashreplace')
        substitute = self.regex.sub
        if self.args.count:
            from functools import partial
            substitute = partial(substitute, count=self.args.count)
        return substitute(repl, data)

class rev (blocksize=None)

This unit is implemented in refinery.units.blockwise.rev and has the following commandline Interface:

usage: rev [-h] [-L] [-Q] [-0] [-v] [-B N]

The blocks of the input data are output in reverse order. If the length of the input data is not
a multiple of the block size, the data is truncated.

optional arguments:
  -B, --blocksize N  The size of each block in bytes, default is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class rev(UnaryOperation):
    """
    The blocks of the input data are output in reverse order. If the length of
    the input data is not a multiple of the block size, the data is truncated.
    """

    def __init__(self, blocksize=None):
        super().__init__(blocksize=blocksize, _truncate=2)

    def inplace(self, block: ndarray):
        return self._numpy.flip(block)

    operate = NotImplemented

    def process(self, data: bytearray):
        if self.bytestream:
            data.reverse()
            return data
        try:
            return self._fastblock(data)
        except FastBlockError:
            b = self.blocksize
            n = len(data)
            q = n // b
            m = q * b
            view = memoryview(data)
            temp = bytearray(b)
            for k in range(0, (q // 2) * b, b):
                lhs = slice(k, k + b)
                rhs = slice(m - k - b, m - k)
                temp[:] = view[rhs]
                data[rhs] = view[lhs]
                data[lhs] = temp
            if m < n:
                del view
                del temp
                del data[m:]
            return data

class rex (regex, *transformation, unicode=False, unique=False, multiline=False, ignorecase=False, min=1, max=None, len=None, stripspace=False, longest=False, take=None)

This unit is implemented in refinery.units.pattern.rex and has the following commandline Interface:

usage: rex [-h] [-L] [-Q] [-0] [-v] [-u] [-q] [-M] [-I] [-n N] [-m N] [-e N] [-x] [-l] [-t N]
           regex [transformation [transformation ...]]

Short for Regular Expression eXtractor: A binary grep which can apply a transformation to each
match. Each match is an individual output. Besides the syntax {k} to insert the k-th match group,
the unit supports processing the contents of match groups with arbitrary refinery units. To do
so, use the following F-string-like syntax:

    {match-group:pipeline}

where :pipeline is an optional pipeline of refinery commands as it would be specified on the
command line. The value of the corresponding match is post-processed with this command.

positional arguments:
  regex             Regular expression to match.
  transformation    An optional sequence of transformations to be applied to each match. Each
                    transformation produces one output in the order in which they are given. The
                    default transformation is {0}, i.e. the entire match.

optional arguments:
  -u, --unicode     Also find unicode strings.
  -q, --unique      Yield every (transformed) match only once.
  -M, --multiline   Caret and dollar match the beginning and end of a line, a dot does not match
                    line breaks.
  -I, --ignorecase  Ignore capitalization for alphabetic characters.
  -n, --min N       Matches must have length at least N.
  -m, --max N       Matches must have length at most N.
  -e, --len N       Matches must be of length N.
  -x, --stripspace  Strip all whitespace from input data.
  -l, --longest     Sort results by length.
  -t, --take N      Return only the first N occurrences in order of appearance.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Allow partial results as output.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class rex(RegexUnit, PatternExtractor):
    """
    Short for Regular Expression eXtractor: A binary grep which can apply a transformation to each
    match. Each match is an individual output. Besides the syntax `{k}` to insert the `k`-th match
    group, the unit supports processing the contents of match groups with arbitrary refinery units.
    To do so, use the following F-string-like syntax:

        {match-group:pipeline}

    where `:pipeline` is an optional pipeline of refinery commands as it would be specified on
    the command line. The value of the corresponding match is post-processed with this command.
    """
    def __init__(
        self, regex,
        # TODO: Use positional only in Python 3.8
        # /,
        *transformation: Arg(type=utf8, help=(
            'An optional sequence of transformations to be applied to each match. '
            'Each transformation produces one output in the order in which they   '
            'are given. The default transformation is {0}, i.e. the entire match.  '
        )),
        unicode: Arg.Switch('-u', help='Also find unicode strings.') = False,
        unique: Arg.Switch('-q', help='Yield every (transformed) match only once.') = False,
        multiline=False, ignorecase=False, min=1, max=None, len=None, stripspace=False,
        longest=False, take=None
    ):
        super().__init__(
            regex=regex,
            transformation=transformation,
            unicode=unicode,
            unique=unique,
            multiline=multiline,
            ignorecase=ignorecase,
            min=min,
            max=max,
            len=len,
            stripspace=stripspace,
            longest=longest,
            take=take,
            utf16=unicode,
            ascii=True,
            duplicates=not unique
        )

    def process(self, data):
        meta = metavars(data)
        self.log_debug('regular expression:', getattr(self.regex, 'pattern', self.regex))
        transformations = []
        specs: List[bytes] = list(self.args.transformation)
        if not specs:
            specs.append(B'{0}')
        for spec in specs:
            def transformation(match: Match, s=spec.decode(self.codec)):
                symb: dict = match.groupdict()
                args: list = [match.group(0), *match.groups()]
                used = set()
                for key, value in symb.items():
                    if value is None:
                        symb[key] = B''
                item = meta.format(s, self.codec, args, symb, True, True, used)
                used.update(key for key, value in symb.items() if not value)
                for variable in used:
                    symb.pop(variable, None)
                symb.update(offset=match.start())
                chunk = Chunk(item)
                chunk.meta.update(meta)
                chunk.meta.update(symb)
                return chunk
            transformations.append(transformation)
        yield from self.matches_filtered(memoryview(data), self.regex, *transformations)

class rijndael (key, iv=b'', block_size=16, *, assoc_len=0, mac_len=0, segment_size=0, little_endian=False, raw=False, mode=None, padding=None)

This unit is implemented in refinery.units.crypto.cipher.rijndael and has the following commandline Interface:

usage: rijndael [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-b N] [-p P] [-m M] [-r] [-e] [-S N] key

Rijndael encryption and decryption. Note that there is also a aes unit which has much better
performance because it calls into the PyCryptodome library. You would have to use this specific
Rijndael unit only if Rijndael is used with a block size that is different from 16 bytes, in
which case it is equivalent to AES.

positional arguments:
  key                   The encryption key.

optional arguments:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -b, --block-size N    Cipher block size, default is 16. Valid choices are 16, 24, and 32.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB,
                        OFB, PCBC. By default, the CBC mode is used when an IV is is provided,
                        and ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of bits into which data is segmented. It must be a
                        multiple of 8. The default of 0 means that the block size will be used as
                        the segment size.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Allow partial results as output.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation; Specify twice to normalize (first decode, then
                        encode).

Expand source code Browse git

class rijndael(StandardBlockCipherUnit, cipher=BlockCipherFactory(Rijndael)):
    """
    Rijndael encryption and decryption. Note that there is also a `refinery.aes` unit which has
    much better performance because it calls into the PyCryptodome library. You would have to
    use this specific Rijndael unit only if Rijndael is used with a block size that is different
    from 16 bytes, in which case it is equivalent to AES.
    """
    def __init__(
        self, key, iv=b'',
        block_size: Arg.Number('-b', help='Cipher block size, default is {default}. Valid choices are 16, 24, and 32.') = 16,
        **more
    ):
        return super().__init__(key, iv, block_size=block_size, **more)

    @property
    def block_size(self):
        return self.args.block_size

    def _new_cipher(self, **optionals) -> CipherInterface:
        return super()._new_cipher(block_size=self.args.block_size, **optionals)

class ripemd128 (text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: ripemd128 [-h] [-L] [-Q] [-0] [-v] [-t]

Returns the RIPEMD-128 hash of the input data.

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class ripemd128(HashUnit):
    """
    Returns the RIPEMD-128 hash of the input data.
    """
    def _algorithm(self, data):
        from refinery.lib.ripemd128 import ripemd128
        return ripemd128(data)

class ripemd160 (text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: ripemd160 [-h] [-L] [-Q] [-0] [-v] [-t]

Returns the RIPEMD-160 hash of the input data.

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class ripemd160(HashUnit):
    """
    Returns the RIPEMD-160 hash of the input data.
    """
    def _algorithm(self, data):
        from Cryptodome.Hash import RIPEMD160
        return RIPEMD160.new(data)

class rmv (*names)

This unit is implemented in refinery.units.meta.rmv and has the following commandline Interface:

usage: rmv [-h] [-L] [-Q] [-0] [-v] [name [name ...]]

Short for "ReMove Variable": Removes meta variables that were created in the current frame. If no
variable names are given, the unit removes all of them. Note that this can recover variables from
outer frames that were previously shadowed.

positional arguments:
  name           Name of a variable to be removed.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class rmv(Unit):
    """
    Short for "ReMove Variable": Removes meta variables that were created in the current frame. If no
    variable names are given, the unit removes all of them. Note that this can recover variables from
    outer frames that were previously shadowed.
    """
    def __init__(self, *names: Arg(type=str, metavar='name', help='Name of a variable to be removed.')):
        super().__init__(names=names)

    def process(self, data: Chunk):
        meta = metavars(data)
        keys = self.args.names or list(meta.variable_names())
        for key in keys:
            meta.discard(key)
        return data

class rncrypt (password)

This unit is implemented in refinery.units.crypto.cipher.rncrypt and has the following commandline Interface:

usage: rncrypt [-h] [-L] [-Q] [-0] [-v] [-R] password

Implements encryption and decryption using the RNCryptor specification. See also:
https://github.com/RNCryptor

positional arguments:
  password

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class rncrypt(Unit):
    """
    Implements encryption and decryption using the RNCryptor specification.
    See also: https://github.com/RNCryptor
    """
    def __init__(self, password: bytearray):
        super().__init__(password=password)

    def process(self, data: bytes) -> bytes:
        encryption_salt = data[2:10]
        hmac_salt = data[10:18]
        iv = data[18:34]
        cipher_text = data[34:-32]
        hmac_signature = data[-32:]
        encryption_key = self._pbkdf2(self.args.password, encryption_salt)
        hmac_key = self._pbkdf2(self.args.password, hmac_salt)
        if not hmac.compare_digest(self._hmac(hmac_key, data[:-32]), hmac_signature):
            raise ValueError("Failed to verify signature.")
        return unpad(
            self._aes_decrypt(encryption_key, iv, cipher_text),
            block_size=AES.block_size
        )

    def reverse(self, data: bytes) -> bytes:
        prng = Random.new()
        data = pad(data, block_size=AES.block_size)
        encryption_salt = prng.read(8)
        encryption_key = self._pbkdf2(self.args.password, encryption_salt)
        hmac_salt = prng.read(8)
        hmac_key = self._pbkdf2(self.args.password, hmac_salt)
        iv = prng.read(AES.block_size)
        cipher_text = self._aes_encrypt(encryption_key, iv, data)
        new_data = b'\x03\x01' + encryption_salt + hmac_salt + iv + cipher_text
        return new_data + self._hmac(hmac_key, new_data)

    def _aes_encrypt(self, key, iv, text):
        return AES.new(key, AES.MODE_CBC, iv).encrypt(text)

    def _aes_decrypt(self, key, iv, text):
        return AES.new(key, AES.MODE_CBC, iv).decrypt(text)

    def _hmac(self, key, data):
        return hmac.new(key, data, hashlib.sha256).digest()

    def _prf(self, secret, salt):
        return hmac.new(secret, salt, hashlib.sha1).digest()

    def _pbkdf2(self, password, salt, iterations=10000, key_length=32):
        return KDF.PBKDF2(password, salt, dkLen=key_length, count=iterations, prf=self._prf)

class rot (amount=13)

This unit is implemented in refinery.units.crypto.cipher.rot and has the following commandline Interface:

usage: rot [-h] [-L] [-Q] [-0] [-v] [N]

Rotate the characters of the alphabet by the given amount. The default amount is 13, providing
the common (and weak) string obfuscation method.

positional arguments:
  N              Number of letters to rotate by; Default is 13.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class rot(Unit):
    """
    Rotate the characters of the alphabet by the given amount. The default
    amount is 13, providing the common (and weak) string obfuscation method.
    """

    def __init__(self, amount: Arg.Number(help='Number of letters to rotate by; Default is 13.') = 13):
        super().__init__(amount=amount)

    def process(self, data: bytearray):
        rot = self.args.amount % 26
        for index, byte in enumerate(data):
            for alphabet in _LCASE, _UCASE:
                if byte in alphabet:
                    zero = alphabet[0]
                    data[index] = zero + (byte - zero + rot) % 26
                    break
        return data

class rotl (argument, bigendian=False, blocksize=None)

This unit is implemented in refinery.units.blockwise.rotl and has the following commandline Interface:

usage: rotl [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] argument

Rotate the bits of each block left.

positional arguments:
  argument           A single numeric expression which provides the right argument to the
                     operation, where the left argument is each block in the input data. This
                     argument can also contain a sequence of bytes which is then split into
                     blocks of the same size as the input data and used cyclically.

optional arguments:
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes, default is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class rotl(BinaryOperation):
    """
    Rotate the bits of each block left.
    """
    def operate(self, value, shift):
        shift %= self.fbits
        return (value << shift) | (value >> (self.fbits - shift))

    def inplace(self, value, shift):
        shift %= self.fbits
        lower = value >> (self.fbits - shift)
        value <<= shift
        value |= lower

class rotr (argument, bigendian=False, blocksize=None)

This unit is implemented in refinery.units.blockwise.rotr and has the following commandline Interface:

usage: rotr [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] argument

Rotate the bits of each block right.

positional arguments:
  argument           A single numeric expression which provides the right argument to the
                     operation, where the left argument is each block in the input data. This
                     argument can also contain a sequence of bytes which is then split into
                     blocks of the same size as the input data and used cyclically.

optional arguments:
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes, default is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class rotr(BinaryOperation):
    """
    Rotate the bits of each block right.
    """
    def operate(self, value, shift):
        shift %= self.fbits
        return (value >> shift) | (value << (self.fbits - shift))

    def inplace(self, value, shift):
        shift %= self.fbits
        lower = value >> shift
        value <<= self.fbits - shift
        value |= lower

class rsa (key, swapkeys=False, textbook=False, padding=PAD.AUTO, rsautl=False)

This unit is implemented in refinery.units.crypto.cipher.rsa and has the following commandline Interface:

usage: rsa [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-t | -p PAD | -r] key

Implements single block RSA encryption and decryption. This unit can be used to encrypt and
decrypt blocks generated by openssl's rsautl tool when using the mode -verify. When it is
executed with a public key for decryption or with a private key for encryption, it will perform a
raw RSA operation. The result of these operations are (un)padded using EMSA-PKCS1-v1_5.

positional arguments:
  key                RSA key in PEM, DER, or Microsoft BLOB format.

optional arguments:
  -s, --swapkeys     Swap public and private exponent.
  -t, --textbook     Equivalent to --padding=NONE.
  -p, --padding PAD  Choose one of the following padding modes: auto, none, oaep, pkcs15, pkcs10.
                     The default is AUTO.
  -r, --rsautl       Act as rsautl from OpenSSH; This is equivalent to --swapkeys
                     --padding=PKCS10

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.
  -R, --reverse      Use the reverse operation; Specify twice to normalize (first decode, then
                     encode).

Expand source code Browse git

class rsa(Unit):
    """
    Implements single block RSA encryption and decryption. This unit can be used to encrypt
    and decrypt blocks generated by openssl's `rsautl` tool when using the mode `-verify`.
    When it is executed with a public key for decryption or with a private key for encryption,
    it will perform a raw RSA operation. The result of these operations are (un)padded using
    EMSA-PKCS1-v1_5.
    """
    def __init__(
        self,
        key: Arg(help='RSA key in PEM, DER, or Microsoft BLOB format.'),
        swapkeys: Arg.Switch('-s', help='Swap public and private exponent.') = False,
        textbook: Arg.Switch('-t', group='PAD', help='Equivalent to --padding=NONE.') = False,
        padding : Arg.Option('-p', group='PAD', choices=PAD,
            help='Choose one of the following padding modes: {choices}. The default is AUTO.') = PAD.AUTO,
        rsautl  : Arg.Switch('-r', group='PAD',
            help='Act as rsautl from OpenSSH; This is equivalent to --swapkeys --padding=PKCS10') = False,
    ):
        padding = Arg.AsOption(padding, PAD)
        if textbook:
            if padding != PAD.AUTO:
                raise ValueError('Conflicting padding options!')
            padding = padding.NONE
        if rsautl:
            if padding and padding != PAD.PKCS10:
                raise ValueError('Conflicting padding options!')
            swapkeys = True
            padding = PAD.PKCS10

        super().__init__(key=key, textbook=textbook, padding=padding, swapkeys=swapkeys)

        self._key_hash = None
        self._key_data = None

    @property
    def blocksize(self) -> int:
        return self.key.size_in_bytes()

    @property
    def _blocksize_plain(self) -> int:
        # PKCS#1 v1.5 padding is at least 11 bytes.
        return self.blocksize - 11

    @property
    def pub(self):
        return self.key.d if self.args.swapkeys else self.key.e

    @property
    def prv(self):
        return self.key.e if self.args.swapkeys else self.key.d

    def _get_msg(self, data):
        msg = int.from_bytes(data, byteorder='big')
        if msg > self.key.n:
            raise ValueError(F'This key can only handle messages of size {self.blocksize}.')
        return msg

    def _encrypt_raw(self, data):
        return pow(
            self._get_msg(data),
            self.pub,
            self.key.n
        ).to_bytes(self.blocksize, byteorder='big')

    def _decrypt_raw(self, data):
        return pow(
            self._get_msg(data),
            self.prv,
            self.key.n
        ).to_bytes(self.blocksize, byteorder='big')

    def _unpad(self, data, head, padbyte=None):
        if len(data) > self.blocksize:
            raise ValueError(F'This key can only handle messages of size {self.blocksize}.')
        if data.startswith(head):
            pos = data.find(B'\0', 2)
            if pos > 0:
                pad = data[2:pos]
                if padbyte is None or all(b == padbyte for b in pad):
                    return data[pos + 1:]
        raise ValueError('Incorrect padding')

    def _pad(self, data, head, padbyte=None):
        if len(data) > self._blocksize_plain:
            raise ValueError(F'This key can only encrypt messages of size at most {self._blocksize_plain}.')
        pad = self.blocksize - len(data) - len(head) - 1
        if padbyte is not None:
            padding = pad * bytes((padbyte,))
        else:
            padding = bytearray(1)
            while not all(padding):
                padding = bytearray(filter(None, padding))
                padding.extend(get_random_bytes(pad - len(padding)))
        return head + padding + B'\0' + data

    def _unpad_pkcs10(self, data):
        return self._unpad(data, B'\x00\x01', 0xFF)

    def _unpad_pkcs15(self, data):
        return self._unpad(data, B'\x00\x02', None)

    def _pad_pkcs10(self, data):
        return self._pad(data, B'\x00\x01', 0xFF)

    def _pad_pkcs15(self, data):
        return self._pad(data, B'\x00\x02', None)

    def _decrypt_block_OAEP(self, data):
        self.log_debug('Attempting decryption with PyCrypto PKCS1 OAEP.')
        result = PKCS1_OAEP.new(self.key).decrypt(data)
        if result is not None:
            return result
        raise ValueError('OAEP decryption was unsuccessful.')

    def _encrypt_block_OAEP(self, data):
        self.log_debug('Attempting encryption with PyCrypto PKCS1 OAEP.')
        result = PKCS1_OAEP.new(self.key).encrypt(data)
        if result is None:
            return result
        raise ValueError('OAEP encryption was unsuccessful.')

    def _decrypt_block(self, data):
        if self._oaep and self._pads in {PAD.AUTO, PAD.OAEP}:
            try:
                return self._decrypt_block_OAEP(data)
            except ValueError:
                if self._pads: raise
                self.log_debug('PyCrypto primitives failed, no longer attempting OAEP.')
                self._oaep = False

        data = self._decrypt_raw(data)
        return self._unpad_per_argument(data)

    def _unpad_per_argument(self, data):
        if self._pads == PAD.NONE:
            return data
        elif self._pads == PAD.PKCS10:
            return self._unpad_pkcs10(data)
        elif self._pads == PAD.PKCS15:
            return self._unpad_pkcs15(data)
        elif self._pads == PAD.AUTO:
            with suppress(ValueError):
                data = self._unpad_pkcs10(data)
                self.log_info('Detected PKCS1.0 padding.')
                self._pads = PAD.PKCS10
                return data
            with suppress(ValueError):
                data = self._unpad_pkcs15(data)
                self.log_info('Detected PKCS1.5 padding.')
                self._pads = PAD.PKCS15
                return data
            self.log_warn('No padding worked, returning raw decrypted blocks.')
            self._pads = PAD.NONE
            return data
        else:
            raise ValueError(F'Invalid padding value: {self._pads!r}')

    def _encrypt_block(self, data):
        if self._pads in {PAD.AUTO, PAD.OAEP}:
            try:
                return self._encrypt_block_OAEP(data)
            except ValueError:
                if self._pads: raise
                self.log_debug('PyCrypto primitives for OAEP failed, falling back to PKCS1.5.')
                self._pads = PAD.PKCS15

        if self._pads == PAD.PKCS15:
            data = self._pad_pkcs15(data)
        elif self._pads == PAD.PKCS10:
            data = self._pad_pkcs10(data)

        return self._encrypt_raw(data)

    @property
    def key(self) -> RSA.RsaKey:
        key_blob = self.args.key
        key_hash = hash(key_blob)
        if key_hash != self._key_hash:
            self._key_hash = key_hash
            self._key_data = normalize_rsa_key(key_blob)
        return self._key_data

    def process(self, data):
        self._oaep = True
        self._pads = self.args.padding
        if not self.key.has_private():
            try:
                return self._unpad_per_argument(self._encrypt_raw(data))
            except Exception as E:
                raise ValueError(F'A public key was given for decryption and rsautl mode resulted in an error: {E}') from E
        return B''.join(self._decrypt_block(block) for block in splitchunks(data, self.blocksize))

    def reverse(self, data):
        self._pads = self.args.padding
        return B''.join(self._encrypt_block(block) for block in splitchunks(data, self._blocksize_plain))

class rsakey (public=False, output=RSAFormat.PEM)

This unit is implemented in refinery.units.crypto.cipher.rsakey and has the following commandline Interface:

usage: rsakey [-h] [-L] [-Q] [-0] [-v] [-p] [output]

Parse RSA keys in various formats; PEM, DER, Microsoft BLOB, and W3C-XKMS (XML) format are
supported. The same formats are supported for the input format, but you can also specify a key in
the following format, where both modulus and exponent have to be hex-encoded:
[modulus]:[exponent]

positional arguments:
  output         Select an output format (PEM/DER/XKMS/TEXT/JSON), default is PEM.

optional arguments:
  -p, --public   Force public key output even if the input is private.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class rsakey(Unit):
    """
    Parse RSA keys in various formats; PEM, DER, Microsoft BLOB, and W3C-XKMS (XML) format are supported.
    The same formats are supported for the input format, but you can also specify a key in the following
    format, where both modulus and exponent have to be hex-encoded: `[modulus]:[exponent]`
    """
    def __init__(
        self,
        public: Arg.Switch('-p', help='Force public key output even if the input is private.') = False,
        output: Arg(help='Select an output format (PEM/DER/XKMS/TEXT/JSON), default is PEM.') = RSAFormat.PEM
    ):
        super().__init__(public=public, output=Arg.AsOption(output, RSAFormat))

    def _xkms_wrap(self, number: int):
        size, r = divmod(number.bit_length(), 8)
        size += int(bool(r))
        return base64.b64encode(number.to_bytes(size, 'big'))

    def process(self, data):
        key = normalize_rsa_key(data, force_public=self.args.public)
        out = self.args.output
        if out is RSAFormat.PEM:
            yield key.export_key('PEM')
            return
        if out is RSAFormat.DER:
            yield key.export_key('DER')
            return
        components = {
            'Modulus' : key.n,
            'Exponent': key.e,
        }
        if key.has_private():
            decoded = DerSequence()
            decoded.decode(key.export_key('DER'))
            it = itertools.islice(decoded, 3, None)
            for v in ('D', 'P', 'Q', 'DP', 'DQ', 'InverseQ'):
                try:
                    components[v] = next(it)
                except StopIteration:
                    break
        if out is RSAFormat.XKMS:
            for tag in components:
                components[tag] = base64.b64encode(number.long_to_bytes(components[tag])).decode('ascii')
            tags = '\n'.join(F'\t<{tag}>{value}</{tag}>' for tag, value in components.items())
            yield F'<RSAKeyPair>\n{tags}\n</RSAKeyPair>'.encode(self.codec)
            return
        components['BitSize'] = key.n.bit_length()
        for tag, value in components.items():
            if value.bit_length() > 32:
                components[tag] = F'{value:X}'
        if out is RSAFormat.JSON:
            yield json.dumps(components, indent=4).encode(self.codec)
            return
        if out is RSAFormat.TEXT:
            table = list(flattened(components))
            for key, value in table:
                value = F'0x{value}' if isinstance(value, str) else str(value)
                value = '\n'.join(F'{L}' for L in textwrap.wrap(value, 80))
                yield F'-- {key + " ":-<77}\n{value!s}'.encode(self.codec)

class salsa (key, stateful=False, discard=0, nonce=b'REFINERY', magic=b'', offset=0, rounds=20)

This unit is implemented in refinery.units.crypto.cipher.salsa and has the following commandline Interface:

usage: salsa [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-d N] [-m MAGIC] [-x N] [-r N] key [nonce]

Salsa encryption and decryption. The nonce must be 8 bytes long. When 64 bytes are provided as
the key, this data is interpreted as the initial state box and all other parameters are ignored.

positional arguments:
  key                The encryption key.
  nonce              The nonce. Default is the string REFINERY.

optional arguments:
  -s, --stateful     Do not reset the key stream while processing the chunks of one frame.
  -d, --discard N    Discard the first N bytes of the keystream, 0 by default.
  -m, --magic MAGIC  The magic constant; depends on the key size by default.
  -x, --offset N     Optionally specify the stream index, default is 0.
  -r, --rounds N     The number of rounds. Has to be an even number.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.
  -R, --reverse      Use the reverse operation; Specify twice to normalize (first decode, then
                     encode).

Expand source code Browse git

class salsa(LatinCipherUnit):
    """
    Salsa encryption and decryption. The nonce must be 8 bytes long. When 64 bytes are provided
    as the key, this data is interpreted as the initial state box and all other parameters are
    ignored.
    """
    def keystream(self) -> Iterable[int]:
        key = self.args.key
        if len(key) == 64:
            it = SalsaCipher.FromState(key)
        else:
            it = SalsaCipher(
                key,
                self.args.nonce,
                self.args.magic,
                self.args.rounds,
                self.args.offset,
            )
        yield from it

class salsa20 (key, nonce=b'REFINERY')

This unit is implemented in refinery.units.crypto.cipher.salsa and has the following commandline Interface:

usage: salsa20 [-h] [-L] [-Q] [-0] [-v] [-R] key [nonce]

Salsa20 encryption and decryption. This unit is functionally equivalent to salsa with 20 rounds,
but it uses the PyCryptodome library C implementation rather than the pure Python implementation
used by salsa.

positional arguments:
  key            The encryption key.
  nonce          The nonce. Default is the string REFINERY.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class salsa20(LatinCipherStandardUnit, cipher=PyCryptoFactoryWrapper(Salsa20)):
    """
    Salsa20 encryption and decryption. This unit is functionally equivalent to `refinery.salsa`
    with 20 rounds, but it uses the PyCryptodome library C implementation rather than the pure
    Python implementation used by `refinery.salsa`.
    """
    pass

class scope (*slice, visible=True)

This unit is implemented in refinery.units.meta.scope and has the following commandline Interface:

usage: scope [-h] [-L] [-Q] [-0] [-v] [-n] [start:end:step [start:end:step ...]]

After using scope within in a frame, all the following operations will be applied only to the
selected indices. All remaining chunks still exist, they are just not operated on. When the frame
closes or the frame is being rescoped by a second application of this unit, they become visible
again.

positional arguments:
  start:end:step  Specify start:end:step in Python slice syntax. The default is :.

optional arguments:
  -n, --not       Hide the given chunks instead of making them the only ones visible.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Allow partial results as output.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class scope(FrameSlicer):
    """
    After using `refinery.scope` within in a `refinery.lib.frame`, all the
    following operations will be applied only to the selected indices. All
    remaining chunks still exist, they are just not operated on. When the
    frame closes or the frame is being rescoped by a second application of
    this unit, they become visible again.
    """
    def __init__(self, *slice, visible: Arg.Switch('-n', '--not', off=True, help=(
        'Hide the given chunks instead of making them the only ones visible.')) = True
    ):
        super().__init__(*slice, visible=visible)
        # Sort any slices with negative arguments to the back so we check
        # them last. This delays potential consumption of the chunks iterator
        # as much as possible.
        self.args.slice.sort(
            key=lambda s: (s.start or 0, s.stop or 0), reverse=True)

    def filter(self, chunks):
        it = iter(chunks)
        consumed = None
        size = None

        def buffered() -> Generator[Chunk, None, None]:
            yield from it
            while consumed:
                yield consumed.popleft()

        def shift(offset, default):
            nonlocal consumed, it, size
            if offset is None:
                return default
            if offset >= 0:
                return offset
            if consumed is None:
                from collections import deque
                self.log_info(F'consuming iterator to compute negative offset {offset}.')
                consumed = deque(it)
                size = len(consumed) + k + 1
            return max(0, offset + size)

        for k, chunk in enumerate(buffered()):
            for s in self.args.slice:
                if k in range(shift(s.start, 0), shift(s.stop, k + 1), s.step or 1):
                    chunk.visible = self.args.visible
                    break
            else:
                chunk.visible = not self.args.visible
            self.log_debug(chunk)
            yield chunk

class seal (key, discard=0, stateful=False)

This unit is implemented in refinery.units.crypto.cipher.seal and has the following commandline Interface:

usage: seal [-h] [-L] [-Q] [-0] [-v] [-R] [-d N] [-s] key

SEAL encryption and decryption.

positional arguments:
  key              The encryption key.

optional arguments:
  -d, --discard N  Discard the first N bytes of the keystream, 0 by default.
  -s, --stateful   Do not reset the key stream while processing the chunks of one frame.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation; Specify twice to normalize (first decode, then
                   encode).

Expand source code Browse git

class seal(StreamCipherUnit):
    """
    SEAL encryption and decryption.
    """
    key_size = {20}

    def keystream(self) -> Iterable[bytes]:
        return SEAL_Cipher(self.args.key)

class secstr (key=b'\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10', iv=None)

This unit is implemented in refinery.units.crypto.cipher.secstr and has the following commandline Interface:

usage: secstr [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [key]

Implements the AES-based encryption scheme used by the PowerShell commands ConvertFrom-
SecureString and ConvertTo-SecureString.

positional arguments:
  key            Secure string encryption 16-byte AES key; the default are the bytes from 1 to
                 16.

optional arguments:
  -i, --iv IV    Optionally specify an IV to use for encryption.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class secstr(Unit):
    """
    Implements the AES-based encryption scheme used by the PowerShell commands
    `ConvertFrom-SecureString` and `ConvertTo-SecureString`.
    """

    # This is a magic header value used for PowerShell secure strings.
    _MAGIC = bytes((
        0xEF, 0xAE, 0x3D, 0xD9, 0xDD, 0x75, 0xD7, 0xAE, 0xF8, 0xDD, 0xFD, 0x38,
        0xDB, 0x7E, 0x35, 0xDD, 0xBD, 0x7A, 0xD3, 0x9D, 0x1A, 0xE7, 0x7E, 0x39))

    # Secure strings include a decimal number formatted as a string directly
    # following the header. Presumably, this is the PowerShell version.
    _PSVER = 2

    def __init__(
        self, key: Arg(
            help='Secure string encryption 16-byte AES key; the default are the bytes from 1 to 16.'
        ) = bytes(range(1, 17)),
        iv: Arg('-i', help='Optionally specify an IV to use for encryption.') = None
    ):
        super().__init__(key=key, iv=iv)

    @property
    def key(self):
        key = self.args.key
        if len(key) not in (0x10, 0x18, 0x20):
            raise ValueError('The encryption key has to be 16 bytes long.')
        return key

    @property
    def iv(self):
        iv = self.args.iv
        if iv is not None and len(iv) != 0x10:
            raise ValueError('The IV has to be 16 bytes long.')
        return iv

    def reverse(self, data):
        ivec = self.iv or urandom(0x10)
        if len(ivec) != 0x10:
            raise ValueError(self._IVERR)
        cipher = AES.new(self.key, AES.MODE_CBC, ivec)
        data = data.decode('latin-1').encode('utf-16LE')
        data = cipher.encrypt(pad(data, block_size=0x10))
        data = base64.b16encode(data).lower().decode('ascii')
        ivec = base64.b64encode(ivec).decode('ascii')
        data = '|'.join(('%d' % self._PSVER, ivec, data)).encode('utf-16LE')
        return base64.b64encode(self._MAGIC + data)

    def process(self, data):
        head, ivec, data = base64.b64decode(data).split(b'|\0')
        self.log_info('head:', head.hex())
        ivec = base64.b64decode(ivec.decode('utf-16LE'))
        self.log_info('ivec:', ivec.hex())
        data = base64.b16decode(data.decode('utf-16LE'), casefold=True)
        if len(data) % 0x10 != 0:
            self.log_info('data not block-aligned, padding with zeros')
            data += B'\0' * (0x10 - len(data) % 0x10)
        cipher = AES.new(self.key, AES.MODE_CBC, ivec)
        data = cipher.decrypt(data)
        try:
            data = unpad(data, block_size=0x10)
        except Exception:
            self.log_warn('decrypted data does not have PKCS7 padding')
        for p in range(0x10):
            try:
                return data[-p:].decode('utf-16LE').encode('latin-1')
            except UnicodeDecodeError:
                pass
            except UnicodeEncodeError:
                pass
        self.log_warn('result is not a padded unicode string, key is likely wrong')
        return data

class sep (separator=b'\n', scoped=False)

This unit is implemented in refinery.units.meta.sep and has the following commandline Interface:

usage: sep [-h] [-L] [-Q] [-0] [-v] [-s] [separator]

Multiple inputs are joined along a specified separator. If any of the input Chunks is currently
out of scope, sep turns makes them visible by default. This can be prevented by using the -s
flag.

positional arguments:
  separator      Separator; the default is a line break.

optional arguments:
  -s, --scoped   Maintain chunk scope; i.e. do not turn all input chunks visible.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class sep(Unit):
    """
    Multiple inputs are joined along a specified separator. If any of the input
    `refinery.lib.frame.Chunk`s is currently out of scope, `refinery.sep` turns
    makes them visible by default. This can be prevented by using the `-s` flag.
    """

    def __init__(
        self, separator: Arg(help='Separator; the default is a line break.') = B'\n',
        scoped: Arg.Switch('-s', help=(
            'Maintain chunk scope; i.e. do not turn all input chunks visible.')) = False
    ):
        super().__init__(separator=separator, scoped=scoped)
        self.separate = False

    def filter(self, chunks):
        it = iter(chunks)
        try:
            chunk = next(it)
        except StopIteration:
            return
        self.separate = True
        for upcoming in it:
            if not self.args.scoped:
                chunk.visible = True
            yield chunk
            chunk = upcoming
        self.separate = False
        yield chunk

    def process(self, data):
        yield data
        if self.separate:
            yield self.args.separator

class serpent (key, iv=b'', padding=None, mode=None, raw=False, swap=False)

This unit is implemented in refinery.units.crypto.cipher.serpent and has the following commandline Interface:

usage: serpent [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-s] key

Serpent encryption and decryption. Some Serpent implementations read the bytes of each block in
one direction, some in the other. When decryption results with this unit do not yield the
expected result, try using the --swap (or -s) option to swap the bytes in each block.
Furthermore, it is sometimes necessary to swap the bytes of the input key, which can be done by
prefixing the input key by the multibin handler snip[::-1].

positional arguments:
  key              The encryption key.

optional arguments:
  -i, --iv IV      Specifies the initialization vector. If none is specified, then a block of
                   zero bytes is used.
  -p, --padding P  Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does
                   nothing. By default, all other algorithms are attempted. In most cases, the
                   data was not correctly decrypted if none of these work.
  -m, --mode M     Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB, OFB,
                   PCBC. By default, the CBC mode is used when an IV is is provided, and ECB
                   otherwise.
  -r, --raw        Set the padding to raw; ignored when a padding is specified.
  -s, --swap       Read the bytes in each block in reverse order.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation; Specify twice to normalize (first decode, then
                   encode).

Expand source code Browse git

class serpent(StandardBlockCipherUnit, cipher=BlockCipherFactory(Serpent)):
    """
    Serpent encryption and decryption. Some Serpent implementations read the bytes of each block
    in one direction, some in the other. When decryption results with this unit do not yield the
    expected result, try using the `--swap` (or `-s`) option to swap the bytes in each block.
    Furthermore, it is sometimes necessary to swap the bytes of the input key, which can be done
    by prefixing the input key by the multibin handler `snip[::-1]`.
    """
    def __init__(
        self, key, iv=b'', padding=None, mode=None, raw=False,
        swap: Arg.Switch('-s', help='Read the bytes in each block in reverse order.') = False
    ):
        super().__init__(key, iv, padding=padding, mode=mode, raw=raw, swap=swap)

    def _new_cipher(self, **optionals) -> CipherInterface:
        instance: Serpent = super()._new_cipher()
        instance.swap = self.args.swap
        return instance

class sha1 (text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: sha1 [-h] [-L] [-Q] [-0] [-v] [-t]

Returns the SHA1 hash of the input data.

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class sha1(HashUnit):
    """
    Returns the SHA1 hash of the input data.
    """
    def _algorithm(self, data):
        return hashlib.sha1(data)

class sha224 (text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: sha224 [-h] [-L] [-Q] [-0] [-v] [-t]

Returns the SHA224 hash of the input data.

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class sha224(HashUnit):
    """
    Returns the SHA224 hash of the input data.
    """
    def _algorithm(self, data):
        return hashlib.sha224(data)

class sha256 (text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: sha256 [-h] [-L] [-Q] [-0] [-v] [-t]

Returns the SHA256 hash of the input data.

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class sha256(HashUnit):
    """
    Returns the SHA256 hash of the input data.
    """
    def _algorithm(self, data):
        return hashlib.sha256(data)

class sha384 (text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: sha384 [-h] [-L] [-Q] [-0] [-v] [-t]

Returns the SHA384 hash of the input data.

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class sha384(HashUnit):
    """
    Returns the SHA384 hash of the input data.
    """
    def _algorithm(self, data):
        return hashlib.sha384(data)

class sha512 (text=False)

This unit is implemented in refinery.units.crypto.hash.cryptographic and has the following commandline Interface:

usage: sha512 [-h] [-L] [-Q] [-0] [-v] [-t]

Returns the SHA512 hash of the input data.

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class sha512(HashUnit):
    """
    Returns the SHA512 hash of the input data.
    """
    def _algorithm(self, data):
        return hashlib.sha512(data)

class shl (argument, bigendian=False, blocksize=None)

This unit is implemented in refinery.units.blockwise.shl and has the following commandline Interface:

usage: shl [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] argument

Shift the bits of each block left, filling with zero bits.

positional arguments:
  argument           A single numeric expression which provides the right argument to the
                     operation, where the left argument is each block in the input data. This
                     argument can also contain a sequence of bytes which is then split into
                     blocks of the same size as the input data and used cyclically.

optional arguments:
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes, default is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class shl(BinaryOperation):
    """
    Shift the bits of each block left, filling with zero bits.
    """
    @staticmethod
    def operate(a, b): return a << b
    @staticmethod
    def inplace(a, b): a <<= b

class shr (argument, bigendian=False, blocksize=None)

This unit is implemented in refinery.units.blockwise.shr and has the following commandline Interface:

usage: shr [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] argument

Shift the bits of each block right, filling with zero bits.

positional arguments:
  argument           A single numeric expression which provides the right argument to the
                     operation, where the left argument is each block in the input data. This
                     argument can also contain a sequence of bytes which is then split into
                     blocks of the same size as the input data and used cyclically.

optional arguments:
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes, default is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class shr(BinaryOperation):
    """
    Shift the bits of each block right, filling with zero bits.
    """
    @staticmethod
    def operate(a, b): return a >> b
    @staticmethod
    def inplace(a, b): a >>= b

class sm4 (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)

This unit is implemented in refinery.units.crypto.cipher.sm4 and has the following commandline Interface:

usage: sm4 [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] key

The SM4 symmetric blockcipher algorithm published as GB/T 32907-2016 by the State Cryptography
Administration of China (SCA).

positional arguments:
  key                   The encryption key.

optional arguments:
  -i, --iv IV           Specifies the initialization vector. If none is specified, then a block
                        of zero bytes is used.
  -p, --padding P       Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                        does nothing. By default, all other algorithms are attempted. In most
                        cases, the data was not correctly decrypted if none of these work.
  -m, --mode M          Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB,
                        OFB, PCBC. By default, the CBC mode is used when an IV is is provided,
                        and ECB otherwise.
  -r, --raw             Set the padding to raw; ignored when a padding is specified.
  -e, --little-endian   Only for CTR: Use a little endian counter instead of the default big
                        endian.
  -S, --segment-size N  Only for CFB: Number of bits into which data is segmented. It must be a
                        multiple of 8. The default of 0 means that the block size will be used as
                        the segment size.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Allow partial results as output.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation; Specify twice to normalize (first decode, then
                        encode).

Expand source code Browse git

class sm4(StandardBlockCipherUnit, cipher=BlockCipherFactory(SM4)):
    """
    The SM4 symmetric blockcipher algorithm published as GB/T 32907-2016 by the State Cryptography
    Administration of China (SCA).
    """
    pass

class snip (slices=[slice(None, None, None)], length=False, remove=False)

This unit is implemented in refinery.units.strings.snip and has the following commandline Interface:

usage: snip [-h] [-L] [-Q] [-0] [-v] [-l] [-r] [slices [slices ...]]

Snips the input data based on a Python slice expression. For example, the initialization slice
0::1 1::1 would yield a unit that first extracts every byte at an even position and then, every
byte at an odd position. In this case, multiple outputs are produced. The unit can be used in
reverse mode, in which case the specified ranges are deleted sequentially from the input.

positional arguments:
  slices         Specify start:stop:step in Python slice syntax.

optional arguments:
  -l, --length   Interpret the end of a slice as a length rather than as an offset.
  -r, --remove   Remove the slices from the input rather than selecting them.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class snip(Unit):
    """
    Snips the input data based on a Python slice expression. For example, the
    initialization `slice 0::1 1::1` would yield a unit that first extracts
    every byte at an even position and then, every byte at an odd position. In
    this case, multiple outputs are produced. The unit can be used in reverse
    mode, in which case the specified ranges are deleted sequentially from the
    input.
    """
    def __init__(
        self,
        slices: Arg(help='Specify start:stop:step in Python slice syntax.') = [slice(None, None)],
        length: Arg.Switch('-l',
            help='Interpret the end of a slice as a length rather than as an offset.') = False,
        remove: Arg.Switch('-r',
            help='Remove the slices from the input rather than selecting them.') = False,
    ):
        super().__init__(slices=slices, length=length, remove=remove)

    def process(self, data: bytearray):
        slices: list[slice] = list(self.args.slices)
        if self.args.length:
            for k, s in enumerate(slices):
                if s.stop is None:
                    continue
                slices[k] = slice(s.start, (s.start or 0) + s.stop, s.step)
        if self.args.remove:
            for last, bounds in lookahead(slices):
                chunk = data if last else copy(data)
                del chunk[bounds]
                yield chunk
        else:
            for bounds in slices:
                yield data[bounds]

class sorted (key=None, ascending=False)

This unit is implemented in refinery.units.meta.sorted and has the following commandline Interface:

usage: sorted [-h] [-L] [-Q] [-0] [-v] [-a] [key]

Sorts all elements of the input frame lexicographically. This unit is a nop on single inputs.

positional arguments:
  key              A meta variable expression to sort by instead of sorting the content.

optional arguments:
  -a, --ascending  Sort in ascending order, the default is descending.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class sorted(Unit):
    """
    Sorts all elements of the input `refinery.lib.frame` lexicographically.
    This unit is a `refinery.nop` on single inputs.
    """

    def __init__(
        self,
        key: Arg('key', type=str, help='A meta variable expression to sort by instead of sorting the content.') = None,
        ascending: Arg.Switch('-a', help='Sort in ascending order, the default is descending.') = False
    ):
        super().__init__(key=key, ascending=ascending)

    def filter(self, chunks):
        sortbuffer = []
        invisibles = []
        key = self.args.key
        rev = not self.args.ascending

        if key is not None:
            def _key(chunk):
                return expression(metavars(chunk)), chunk
            expression = PythonExpression(key, all_variables_allowed=True)
            key = _key

        def sorted():
            if not sortbuffer:
                return
            sortbuffer.sort(key=key, reverse=rev)
            yield from sortbuffer
            sortbuffer.clear()

        for chunk in chunks:
            if chunk.visible:
                yield from invisibles
                invisibles.clear()
                sortbuffer.append(chunk)
            else:
                yield from sorted()
                invisibles.append(chunk)

        yield from invisibles
        yield from sorted()

class sosemanuk (key, stateful=False, discard=0, nonce=b'')

This unit is implemented in refinery.units.crypto.cipher.sosemanuk and has the following commandline Interface:

usage: sosemanuk [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-d N] key [nonce]

positional arguments:
  key              The encryption key.
  nonce            The nonce. Default is empty, which is equivalent to 16 null bytes.

optional arguments:
  -s, --stateful   Do not reset the key stream while processing the chunks of one frame.
  -d, --discard N  Discard the first N bytes of the keystream, 0 by default.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation; Specify twice to normalize (first decode, then
                   encode).

Expand source code Browse git

class sosemanuk(StreamCipherUnit):

    def __init__(
        self, key, stateful=False, discard=0,
        nonce: Arg(help='The nonce. Default is empty, which is equivalent to 16 null bytes.') = B'',
    ):
        super().__init__(key=key, nonce=nonce, stateful=stateful, discard=discard)

    def keystream(self):
        yield from Sosemanuk(self.args.key, self.args.nonce)

class stego (transpose, split=False, parts='RGB')

This unit is implemented in refinery.units.formats.stego and has the following commandline Interface:

usage: stego [-h] [-L] [-Q] [-0] [-v] [-t] [-m] [parts]

Decodes the RGBA (red/green/blue/alpha) values of the pixels of a given image file and outputs
these values as bytes. By default, the pixels are converted left to right, top to bottom.

positional arguments:
  parts            A string containing any ordering of the letters R, G, B, and A (case-
                   insensitive). These pixel components will be extracted from every pixel in the
                   given order. The default value is RGB.

optional arguments:
  -t, --transpose  Return the columns of the image rather than the rows.
  -m, --split      Emit the individual rows or columns as separate outputs.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class stego(Unit):
    """
    Decodes the RGBA (red/green/blue/alpha) values of the pixels of a given image file and outputs
    these values as bytes. By default, the pixels are converted left to right, top to bottom.
    """
    def __init__(
        self,
        transpose: Arg.Switch('-t', help='Return the columns of the image rather than the rows.'),
        split: Arg.Switch('-m', help='Emit the individual rows or columns as separate outputs.') = False,
        parts: Arg('parts', nargs='?', type=str, help=(
            'A string containing any ordering of the letters R, G, B, and A (case-insensitive). '
            'These pixel components will be extracted from every pixel in the given order. The '
            'default value is {default}.'
        )) = 'RGB'
    ):
        super().__init__(
            transpose=transpose,
            split=split,
            parts=tuple(Arg.AsOption(p, PIXEL_PART) for p in parts)
        )

    @Unit.Requires('Pillow', 'formats')
    def _image():
        from PIL import Image
        return Image

    def process(self, data):
        split = self.args.split
        parts = self.args.parts
        image = self._image.open(MemoryFile(data))
        if self.args.transpose:
            image = image.transpose(self._image.Transpose.ROTATE_90)
        width, height = image.size
        chunk_size = len(parts)
        output = MemoryFile()
        buffer = bytearray(chunk_size * width)
        for y in range(height):
            offset = 0
            for x in range(width):
                pixel = image.getpixel((x, y))
                next_offset = offset + chunk_size
                buffer[offset:next_offset] = (pixel[p] for p in parts)
                offset = next_offset
            if split:
                yield buffer
            else:
                output.write(buffer)
        if not split:
            yield output.getvalue()

class stretch (*count)

This unit is implemented in refinery.units.strings.stretch and has the following commandline Interface:

usage: stretch [-h] [-L] [-Q] [-0] [-v] [-R] [count [count ...]]

Stretch the input data by repeating every byte a number of times.

positional arguments:
  count          The number of times every byte should be repeated. By default, every byte is
                 repeated once.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class stretch(Unit):
    """
    Stretch the input data by repeating every byte a number of times.
    """
    def __init__(self, *count: Arg.Number(metavar='count', help=(
        'The number of times every byte should be repeated. By default,  '
        'every byte is repeated once.'
    ))):
        count = count or (2,)
        if any(k <= 0 for k in count):
            raise ValueError('You can not use a stretching factor of less than 1.')
        super().__init__(count=count or (2,))

    def process(self, data):
        def stretched(it):
            factor = cycle(self.args.count)
            for byte in it:
                yield from repeat(byte, next(factor))
        return bytearray(stretched(iter(data)))

    def reverse(self, data):
        # one-sided inverse
        def clinched(it):
            factor = cycle(self.args.count)
            while True:
                try:
                    take = islice(it, next(factor))
                    yield next(take)
                    for _ in take: pass
                except StopIteration:
                    break
        return bytearray(clinched(iter(data)))

class struct (spec, *outputs, multi=False, count=∞, until=None, more=False)

This unit is implemented in refinery.units.pattern.struct_parser and has the following commandline Interface:

usage: struct [-h] [-L] [-Q] [-0] [-v] [-m] [-n N] [-u E] [-M] spec [output [output ...]]

Read structured data from the beginning of a chunk and store the extracted fields in chunk meta
variables. The structure format is specified in extended Python struct format, and all remaining
arguments to this unit are the names of the variables that receive the values from this struct.
The extended struct format supports all field types supported by Python, as well as the
following:

- a for null-terminated ASCII strings,
- u to read encoded, null-terminated UTF16 strings,
- w to read decoded, null-terminated UTF16 strings,
- g to read Microsoft GUID values,
- E to read 7-bit encoded integers.

For example, the string LLxxHaa will read two unsigned 32bit integers, then skip two bytes, then
read one unsigned 16bit integer, then two null-terminated ASCII strings. The unit defaults to
using native byte order with no alignment.

The spec parameter may additionally contain format expressions of the form {name:format}. Here,
format can either be an integer expression specifying a number of bytes to read, or any format
string. If name is specified for an extracted field, its value is made available as a meta
variable under the given name. For example, the expression LLxxH{foo:a}{bar:a} would be parsed in
the same way as the previous example, but the two ASCII strings would also be stored in meta
variables under the names foo and bar, respectively. The format string of a named field is itself
parsed as a foramt string expression, where all the previously parsed fields are already
available. For example, I{:{}} reads a single 32-bit integer length prefix and then reads as many
bytes as that prefix specifies.

A second format string expression is used to specify the output format. For example, the format
string LLxxH{foo:a}{bar:a} together with the output format {foo}/{bar} would parse data as
before, but the output body would be the concatnation of the field foo, a forward slash, and the
field bar. Variables used in the output expression are not included as meta variables. As format
fields in the output expression, one can also use {1}, {2} or {-1} to access extracted fields by
index. The value {0} represents the entire chunk of structured data. By default, the output
format {#} is used, which represents either the last byte string field that was extracted, or the
entire chunk of structured data if none of the fields were extracted.

Reverse multibin expressions can be used to post-process the fields included in any output
format. For example, {F:b64:zl} will be the base64-decoded and inflate- decompressed contents of
the data that was read as field F.

Finally, it is possible to specify a byte alignment by using the syntax {field!T:a:b:c} where the
letter T is either a single digit specifying the alignment, or a single letter variable that
holds the byte alignment value in the current metadata.

positional arguments:
  spec           Structure format as explained above.
  output         Output format as explained above.

optional arguments:
  -m, --multi    Read as many pieces of structured data as possible intead of just one.
  -n, --count N  A limit on the number of chunks to read in multi mode; default is ∞.
  -u, --until E  An expression evaluated on each chunk in multi mode. New chunks will be parsed
                 only if the result is nonzero.
  -M, --more     After parsing the struct, emit one chunk that contains the data that was left
                 over in the buffer. If no data was left over, this chunk will be empty.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class struct(Unit):
    """
    Read structured data from the beginning of a chunk and store the extracted fields in chunk meta
    variables. The structure format is specified in extended Python struct format, and all
    remaining arguments to this unit are the names of the variables that receive the values from
    this struct. The extended struct format supports all field types supported by Python, as well
    as the following:

    - `a` for null-terminated ASCII strings,
    - `u` to read encoded, null-terminated UTF16 strings,
    - `w` to read decoded, null-terminated UTF16 strings,
    - `g` to read Microsoft GUID values,
    - `E` to read 7-bit encoded integers.

    For example, the string `LLxxHaa` will read two unsigned 32bit integers, then skip two bytes,
    then read one unsigned 16bit integer, then two null-terminated ASCII strings. The unit defaults
    to using native byte order with no alignment.

    The `spec` parameter may additionally contain format expressions of the form `{name:format}`.
    Here, `format` can either be an integer expression specifying a number of bytes to read, or any
    format string. If `name` is specified for an extracted field, its value is made available as a
    meta variable under the given name. For example, the expression `LLxxH{foo:a}{bar:a}` would be
    parsed in the same way as the previous example, but the two ASCII strings would also be stored
    in meta variables under the names `foo` and `bar`, respectively. The `format` string of a named
    field is itself parsed as a foramt string expression, where all the previously parsed fields
    are already available. For example, `I{:{}}` reads a single 32-bit integer length prefix and
    then reads as many bytes as that prefix specifies.

    A second format string expression is used to specify the output format. For example, the format
    string `LLxxH{foo:a}{bar:a}` together with the output format `{foo}/{bar}` would parse data as
    before, but the output body would be the concatnation of the field `foo`, a forward slash, and
    the field `bar`. Variables used in the output expression are not included as meta variables. As
    format fields in the output expression, one can also use `{1}`, `{2}` or `{-1}` to access
    extracted fields by index. The value `{0}` represents the entire chunk of structured data. By
    default, the output format `{#}` is used, which represents either the last byte string field
    that was extracted, or the entire chunk of structured data if none of the fields were extracted.

    Reverse `refinery.lib.argformats.multibin` expressions can be used to post-process the fields
    included in any output format. For example, `{F:b64:zl}` will be the base64-decoded and inflate-
    decompressed contents of the data that was read as field `F`.

    Finally, it is possible to specify a byte alignment by using the syntax `{field!T:a:b:c}` where
    the letter `T` is either a single digit specifying the alignment, or a single letter variable
    that holds the byte alignment value in the current metadata.
    """

    def __init__(
        self,
        spec: Arg(type=str, help='Structure format as explained above.'),
        *outputs: Arg(metavar='output', type=str, help='Output format as explained above.'),
        multi: Arg.Switch('-m', help=(
            'Read as many pieces of structured data as possible intead of just one.')) = False,
        count: Arg.Number('-n', help=(
            'A limit on the number of chunks to read in multi mode; default is {default}.')) = INF,
        until: Arg('-u', metavar='E', type=str, help=(
            'An expression evaluated on each chunk in multi mode. New chunks will be parsed '
            'only if the result is nonzero.')) = None,
        more : Arg.Switch('-M', help=(
            'After parsing the struct, emit one chunk that contains the data that was left '
            'over in the buffer. If no data was left over, this chunk will be empty.')) = False
    ):
        outputs = outputs or [F'{{{_SHARP}}}']
        super().__init__(spec=spec, outputs=outputs, until=until, count=count, multi=multi, more=more)

    def process(self, data: Chunk):
        formatter = string.Formatter()
        until = self.args.until
        until = until and PythonExpression(until, all_variables_allowed=True)
        reader = StructReader(memoryview(data))
        checkpoint = 0
        mainspec = self.args.spec
        byteorder = mainspec[:1]
        if byteorder in '<@=!>':
            mainspec = mainspec[1:]
        else:
            byteorder = '='

        def fixorder(spec):
            if spec[0] not in '<@=!>':
                spec = byteorder + spec
            return spec

        previously_existing_variables = set(metavars(data).variable_names())

        it = itertools.count() if self.args.multi else (0,)
        for index in it:

            checkpoint = reader.tell()

            if reader.eof:
                break
            if index >= self.args.count:
                break

            meta = metavars(data)
            meta.ghost = True
            meta.update_index(index)

            args = []
            last = None
            self.log_debug(F'starting new read at: 0x{checkpoint:08X}')

            try:
                for prefix, name, spec, conversion in formatter.parse(mainspec):
                    name: str
                    spec: str = spec and spec.strip()
                    if prefix:
                        args.extend(reader.read_struct(fixorder(prefix)))
                    if name is None:
                        continue
                    if name and not name.isdecimal():
                        check_variable_name(name)
                    if conversion:
                        _aa = reader.tell()
                        reader.byte_align(PythonExpression.evaluate(conversion, meta))
                        _ab = reader.tell()
                        if _aa != _ab:
                            self.log_info(F'aligned from 0x{_aa:X} to 0x{_ab:X}')
                    spec, _, pipeline = spec.partition(':')
                    if spec:
                        spec = meta.format_str(spec, self.codec, args)
                    if spec:
                        try:
                            _exp = PythonExpression.evaluate(spec, meta)
                        except ParserError:
                            pass
                        else:
                            spec = _exp
                    if spec == '':
                        last = value = reader.read()
                    elif isinstance(spec, int):
                        if spec < 0:
                            spec += reader.remaining_bytes
                        if spec < 0:
                            raise ValueError(F'The specified negative read offset is {-spec} beyond the cursor.')
                        last = value = reader.read_bytes(spec)
                    else:
                        value = reader.read_struct(fixorder(spec))
                        if not value:
                            self.log_debug(F'field {name} was empty, ignoring.')
                            continue
                        if len(value) > 1:
                            self.log_info(F'parsing field {name} produced {len(value)} items reading a tuple')
                        else:
                            value = value[0]

                    if pipeline:
                        value = multibin(pipeline, reverse=True, seed=value)
                    args.append(value)

                    if name == _SHARP:
                        raise ValueError('Extracting a field with name # is forbidden.')
                    elif name.isdecimal():
                        index = int(name)
                        limit = len(args) - 1
                        if index > limit:
                            self.log_warn(F'cannot assign index field {name}, the highest index is {limit}')
                        else:
                            args[index] = value
                        continue
                    elif name:
                        meta[name] = value

                if until and until(meta):
                    self.log_info(F'the expression ({until}) evaluated to true; aborting.')
                    break

                with StreamDetour(reader, checkpoint) as detour:
                    full = reader.read(detour.cursor - checkpoint)
                if last is None:
                    last = full

                outputs = []

                for template in self.args.outputs:
                    used = set()
                    outputs.append(meta.format(template, self.codec, [full, *args], {_SHARP: last}, True, used=used))
                    for key in used:
                        if key in previously_existing_variables:
                            continue
                        meta.discard(key)

                for output in outputs:
                    chunk = Chunk(output)
                    chunk.meta.update(meta)
                    chunk.set_next_batch(index)
                    yield chunk

            except EOF:
                break

        leftover = len(reader) - checkpoint

        if not leftover:
            return
        elif self.args.more:
            reader.seekset(checkpoint)
            yield reader.read()
        else:
            leftover = repr(SizeInt(leftover)).strip()
            self.log_info(F'discarding {leftover} left in buffer')

class sub (argument, bigendian=False, blocksize=None)

This unit is implemented in refinery.units.blockwise.sub and has the following commandline Interface:

usage: sub [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] argument

Subtract the given argument from each block.

positional arguments:
  argument           A single numeric expression which provides the right argument to the
                     operation, where the left argument is each block in the input data. This
                     argument can also contain a sequence of bytes which is then split into
                     blocks of the same size as the input data and used cyclically.

optional arguments:
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes, default is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class sub(BinaryOperationWithAutoBlockAdjustment):
    """
    Subtract the given argument from each block.
    """
    @staticmethod
    def operate(a, b): return a - b
    @staticmethod
    def inplace(a, b): a -= b

class subfiles (memdump=False, recursive=False)

This unit is implemented in refinery.units.pattern.subfiles and has the following commandline Interface:

usage: subfiles [-h] [-L] [-Q] [-0] [-v] [-m] [-r]

Deploys carvers for ZIP, 7-Zip, PE-File, Windows Shortcuts (LNK files), JSON and XML documents
against the input data and generates one output chunk for each successfully carved subfile.

optional arguments:
  -m, --memdump    Assume that the input is a memdump for PE file carving.
  -r, --recursive  Extract files that are subfiles of other extracted files as separate chunks.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class subfiles(Unit):
    """
    Deploys carvers for ZIP, 7-Zip, PE-File, Windows Shortcuts (LNK files), JSON and XML documents against
    the input data and generates one output chunk for each successfully carved subfile.
    """

    _MINLENGTH = {
        'json': 300,
        'xml' : 300,
        'rtf' : 100,
    }

    def __init__(
        self,
        memdump  : Unit.Arg.Switch('-m',
            help='Assume that the input is a memdump for PE file carving.') = False,
        recursive: Unit.Arg.Switch('-r',
            help='Extract files that are subfiles of other extracted files as separate chunks.') = False,
    ):
        super().__init__(memdump=memdump, recursive=recursive)

    def process(self, data: bytearray):
        carvers = {
            'zip'  : carve_zip(),
            '7z'   : carve_7z(),
            'pe'   : carve_pe(memdump=self.args.memdump, fileinfo=True, recursive=True, keep_root=True),
            'lnk'  : carve_lnk(),
            'json' : carve_json(dictonly=True),
            'xml'  : carve_xml(),
            'rtf'  : carve_rtf(),
        }

        covered = []

        for extension, unit in carvers.items():
            self.log_info(F'carving {extension} files')
            for chunk in data | unit:
                if len(chunk) < self._MINLENGTH.get(extension, 1):
                    continue
                start = chunk['offset']
                end = start + len(chunk)
                if any(start > left and end < right for left, right in covered):
                    continue
                if not self.args.recursive:
                    covered.append((start, end))
                yield chunk

class swap (src, dst=None)

This unit is implemented in refinery.units.meta.swap and has the following commandline Interface:

usage: swap [-h] [-L] [-Q] [-0] [-v] src [dst]

Swap the contents of an existing variable with the contents of the chunk or with another meta
variable. When swapping with the chunk, the variable has to contain a binary string. When
swapping with a variable that does not exist, the original variable is cleared, essentially
renaming the variable.

positional arguments:
  src            The meta variable name.
  dst            Optional name of the second meta variable.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class swap(Unit):
    """
    Swap the contents of an existing variable with the contents of the chunk or with another meta variable.
    When swapping with the chunk, the variable has to contain a binary string. When swapping with a variable
    that does not exist, the original variable is cleared, essentially renaming the variable.
    """
    def __init__(
        self,
        src: Arg(type=str, help='The meta variable name.'),
        dst: Arg(type=str, help='Optional name of the second meta variable.') = None
    ):
        super().__init__(
            src=check_variable_name(src),
            dst=check_variable_name(dst)
        )

    def filter(self, chunks: Iterable[Chunk]):
        src = self.args.src
        dst = self.args.dst
        for chunk in chunks:
            if not chunk.visible:
                pass
            elif dst is None:
                try:
                    value = chunk.meta[src]
                except KeyError:
                    value = bytearray()
                if isinstance(value, str):
                    value = value.encode(self.codec)
                elif not isbuffer(value):
                    raise ValueError(F'Unable to swap data with variable {src} because it has type {type(value).__name__}.')
                if not chunk:
                    chunk.meta.discard(src)
                else:
                    chunk.meta[src] = bytes(chunk)
                chunk[:] = value
            else:
                try:
                    value = chunk.meta.pop(src)
                except KeyError:
                    raise KeyError(F'The variable {src} does not exist.')
                try:
                    swap = chunk.meta.pop(dst)
                except KeyError:
                    chunk.meta[dst] = value
                else:
                    chunk.meta[src], chunk.meta[dst] = swap, value
            yield chunk

class szdd

This unit is implemented in refinery.units.compression.szdd and has the following commandline Interface:

usage: szdd [-h] [-L] [-Q] [-0] [-v]

Extract files from SZDD archives.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class szdd(Unit):
    """
    Extract files from SZDD archives.
    """
    def process(self, data):
        with StructReader(data) as archive:
            if archive.read(8) != b'SZDD\x88\xF0\x27\x33':
                if not self.args.lenient:
                    raise ValueError('signature missing')
                self.log_warn('the header signature is invalid, this is likely not an SZDD archive')
            if archive.read_byte() != 0x41:
                raise ValueError('Unsupported compression mode')
            # ignore the missing file extension letter:
            archive.seekrel(1)
            output_len = archive.u32()
            window_pos = 0x1000 - 0x10
            output_pos = 0
            output = bytearray(output_len)
            window = bytearray(0x1000)
            for k in range(len(window)):
                window[k] = 0x20
            while not archive.eof:
                control = archive.read_byte()
                for cb in (0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80):
                    if archive.eof:
                        break
                    if control & cb:
                        output[output_pos] = window[window_pos] = archive.read_byte()
                        output_pos += 1
                        window_pos += 1
                        window_pos &= 0xFFF
                    else:
                        match_pos = archive.read_byte()
                        match_len = archive.read_byte()
                        match_pos |= (match_len & 0xF0) << 4
                        match_len = (match_len & 0x0F) + 3
                        match_pos &= 0xFFF
                        for _ in range(match_len):
                            window[window_pos] = window[match_pos]
                            output[output_pos] = window[window_pos]
                            output_pos += 1
                            window_pos += 1
                            match_pos += 1
                            window_pos &= 0xFFF
                            match_pos &= 0xFFF
            return output

    @classmethod
    def handles(self, data: bytearray):
        return data[:4] == B'SZDD'

class tea (key, iv=b'', padding=None, mode=None, raw=False, swap=False)

This unit is implemented in refinery.units.crypto.cipher.tea and has the following commandline Interface:

usage: tea [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-s] key

TEA encryption and decryption.

positional arguments:
  key              The encryption key.

optional arguments:
  -i, --iv IV      Specifies the initialization vector. If none is specified, then a block of
                   zero bytes is used.
  -p, --padding P  Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does
                   nothing. By default, all other algorithms are attempted. In most cases, the
                   data was not correctly decrypted if none of these work.
  -m, --mode M     Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB, OFB,
                   PCBC. By default, the CBC mode is used when an IV is is provided, and ECB
                   otherwise.
  -r, --raw        Set the padding to raw; ignored when a padding is specified.
  -s, --swap       Decode blocks as big endian rather than little endian.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation; Specify twice to normalize (first decode, then
                   encode).

Expand source code Browse git

class tea(TEAUnit, cipher=BlockCipherFactory(TEA)):
    """
    TEA encryption and decryption.
    """

class termfit (width=0, delta=0, tight=False)

This unit is implemented in refinery.units.strings.termfit and has the following commandline Interface:

usage: termfit [-h] [-L] [-Q] [-0] [-v] [-d N] [-t] [width]

Reformat incoming text data to fit a certain width.

positional arguments:
  width          Optionally specify the width, by default the current terminal width is used.

optional arguments:
  -d, --delta N  Subtract this number from the calculated width (0 by default).
  -t, --tight    Separate paragraphs by a single line break instead of two.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class termfit(Unit):
    """
    Reformat incoming text data to fit a certain width.
    """

    def __init__(
        self,
        width: Arg('width', help='Optionally specify the width, by default the current terminal width is used.') = 0,
        delta: Arg.Number('-d', help='Subtract this number from the calculated width (0 by default).') = 0,
        tight: Arg.Switch('-t', help='Separate paragraphs by a single line break instead of two.') = False,
    ):
        super().__init__(width=width, delta=delta, tight=tight)

    @unicoded
    def process(self, data: str) -> str:
        parsep = '\n' if self.args.tight else '\n\n'
        return terminalfit(data, self.args.delta, self.args.width, parsep)

class terminate (sentinel=b'\x00', blocksize=None, bigendian=False)

This unit is implemented in refinery.units.blockwise.terminate and has the following commandline Interface:

usage: terminate [-h] [-L] [-Q] [-0] [-v] [-R] [-B N] [-E] [sentinel]

The unit reads data from the incoming chunk in blocks of any given size until the sentinel value
is encountered. The output of the unit is all data that was read, excluding the sentinel. The
default block size is one and the default sentinel value is zero, which corresponds to reading a
null-terminated string from the input. If the sentinel value is not found anywhere in the
incoming data, the complete input is returned as output.

positional arguments:
  sentinel           sentinel value to look for; default is H:00

optional arguments:
  -B, --blocksize N  The size of each block in bytes, default is 1.
  -E, --bigendian    Read chunks in big endian.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.
  -R, --reverse      Use the reverse operation; Specify twice to normalize (first decode, then
                     encode).

Expand source code Browse git

class terminate(BlockTransformationBase):
    """
    The unit reads data from the incoming chunk in blocks of any given size until the
    sentinel value is encountered. The output of the unit is all data that was read,
    excluding the sentinel. The default block size is one and the default sentinel value
    is zero, which corresponds to reading a null-terminated string from the input.
    If the sentinel value is not found anywhere in the incoming data, the complete input
    is returned as output.
    """
    def __init__(
        self,
        sentinel: Arg(help='sentinel value to look for; default is {default}') = B'\0',
        blocksize=None, bigendian=False
    ):
        super().__init__(blocksize=blocksize, bigendian=bigendian, sentinel=sentinel)

    def process(self, data: bytearray):
        sentinel = self.args.sentinel
        position = 0
        blocksize = self.blocksize

        self.log_info('blocksize:', blocksize)
        self.log_debug('separator:', sentinel)

        while position >= 0:
            position = data.find(sentinel, position)
            if position < 0:
                self.log_info(F'The sentinel value {sentinel} was not found.')
                break
            q, r = divmod(position, blocksize)
            if r:
                position = (q + 1) * blocksize
                continue
            else:
                data[position:] = []
                break

        return data

    def reverse(self, data: bytearray):
        sentinel = self.args.sentinel
        position = 0
        while True:
            position = data.find(sentinel, position)
            if position < 0:
                data.extend(sentinel)
                break
            if position % self.blocksize == 0:
                self.log_warn('input string already contains the termination character; returning unmodified input')
                break
            position += 1
        return data

class tnetmtm (headers_as_meta_vars, list_header_names, header_filter)

This unit is implemented in refinery.units.formats.tnetmtm and has the following commandline Interface:

usage: tnetmtm [-h] [-L] [-Q] [-0] [-v] [--populate-headers] [--list-header-names]
               [--header-filter HEADER_FILTER]

Parses out payloads from tnetstring files generated by mitmproxy. The unit is also able to
populate HTTP headers as meta variables or emitting header values instead of actual payloads.

optional arguments:
  --populate-headers, -p
  --list-header-names, -l
  --header-filter, -f HEADER_FILTER

generic options:
  -h, --help                  Show this help message and exit.
  -L, --lenient               Allow partial results as output.
  -Q, --quiet                 Disables all log output.
  -0, --devnull               Do not produce any output.
  -v, --verbose               Specify up to two times to increase log level.

Expand source code Browse git

class tnetmtm(Unit):
    """
    Parses out payloads from tnetstring files generated by mitmproxy. The unit is also able to populate HTTP headers as
    meta variables or emitting header values instead of actual payloads.
    """

    def __init__(
            self,
            headers_as_meta_vars: Arg.Switch('--populate-headers', '-p'),
            list_header_names: Arg.Switch('--list-header-names', '-l'),
            header_filter: Arg('--header-filter', '-f'),
    ):
        ...

    @Unit.Requires('mitmproxy', 'all')
    def _tnetstring():
        from mitmproxy.io import tnetstring
        return tnetstring

    @staticmethod
    def _generate_errors(log_line: Dict) -> Iterator[str]:
        def _extract_error(d: Optional[Dict]) -> Optional[str]:
            return ((d or {}).get('error') or {}).get('msg')

        proxy_error = _extract_error(log_line.get('client_conn'))
        if proxy_error:
            yield proxy_error
        error = _extract_error(log_line)
        if error:
            yield error
        return error

    def _default_meta_vars(self, log_line, request: Dict, response: Dict) -> Dict[str, Union[str, int]]:
        ret = {
            'request_method': request.get('method').decode('utf-8'),
            'request_scheme': request.get('scheme').decode('utf-8'),
            'request_host': request.get('host'),
            'request_query_string': request.get('path').decode('utf-8'),
            'request_header_count': len(request.get('headers', [])),
            'response_status_code': response.get('status_code'),
            'response_header_count': len(response.get('headers', [])),
        }
        for num, error in enumerate(self._generate_errors(log_line)):
            ret[f'error_{num}'] = error
        request_http_version = request.get('http_version')
        if request_http_version:
            ret['request_http_version'] = request_http_version.decode('utf-8')
        response_http_version = response.get('http_version')
        if response_http_version:
            ret['response_http_version'] = response_http_version.decode('utf-8')
        return ret

    @staticmethod
    def _output_type(args) -> OutputType:
        if args.list_header_names:
            return OutputType.header_names

        if args.header_filter:
            return OutputType.header_value

        return OutputType.payloads

    def process(self, data: bytearray):
        args = self.args
        tnetstring = self._tnetstring
        output_type = self._output_type(args)

        with io.BytesIO(data) as fp:
            while True:
                try:
                    log_line = tnetstring.load(fp)
                    request = log_line.get('request') or {}
                    response = log_line.get('response') or {}
                    labels = {} if args.headers_as_meta_vars else self._default_meta_vars(log_line, request, response)
                    for header_name, header_value in request.get('headers', []) + response.get('headers', []):
                        if output_type == OutputType.header_names:
                            yield header_name

                        if output_type == OutputType.header_value:
                            if header_name == args.header_filter:
                                yield header_value

                        if args.headers_as_meta_vars:
                            labels[header_name.decode('utf-8').replace('-', '')] = header_value

                    if output_type == OutputType.payloads:
                        yield self.labelled(response.get('content'), **labels)
                except ValueError:
                    break

class transpose (padding=b'')

This unit is implemented in refinery.units.meta.transpose and has the following commandline Interface:

usage: transpose [-h] [-L] [-Q] [-0] [-v] [padding]

Interprets the chunks in the current frame as rows of a matrix and yields the columns of that
matrix. When chunks are not of even length, the matrix is considered to have empty entries in
some positions. Optionally, a padding sequence can be provided to pad all rows to the same
length.

positional arguments:
  padding        Optional byte sequence to use as padding for incomplete rows.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class transpose(Unit):
    """
    Interprets the chunks in the current frame as rows of a matrix and yields the columns
    of that matrix. When chunks are not of even length, the matrix is considered to have
    empty entries in some positions. Optionally, a padding sequence can be provided to pad
    all rows to the same length.
    """
    @Unit.Requires('numpy', 'speed', 'default', 'extended')
    def _numpy():
        import numpy
        return numpy

    def __init__(
        self,
        padding: Arg(help='Optional byte sequence to use as padding for incomplete rows.') = B'',
    ):
        super().__init__(bigendian=False, padding=padding)

    def filter(self, chunks: Iterable[Chunk]):
        rows = []
        for chunk in chunks:
            if not chunk.visible:
                yield chunk
                continue
            rows.append(chunk)
        if not rows:
            return
        matrix = rows[0]
        matrix.temp = rows
        yield matrix

    def process(self, data: Chunk):
        chunks: List[Chunk] = data.temp
        if not chunks:
            return
        length = [len(chunk) for chunk in chunks]
        n = min(length)
        m = max(length)
        pad = self.args.padding
        if pad:
            for chunk in chunks:
                while len(chunk) < m:
                    chunk.extend(pad)
                del chunk[m:]
        if n > 0:
            try:
                np = self._numpy
            except ImportError:
                pass
            else:
                t = [chunk[n:] for chunk in chunks if len(chunk) > n]
                for chunk in chunks:
                    del chunk[n:]
                a = np.array(chunks, dtype=np.uint8).transpose()
                for row in a:
                    yield row.tobytes('C')
                m = m - n
                chunks = t
        for i in range(m):
            yield bytes(chunk[i] for chunk in chunks if len(chunk) > i)

class trim (*junk, unpad=False, left=True, right=True, nocase=False)

This unit is implemented in refinery.units.strings.trim and has the following commandline Interface:

usage: trim [-h] [-L] [-Q] [-0] [-v] [-u] [-r | -l] [-i] [junk [junk ...]]

Removes byte sequences at beginning and end of input data.

positional arguments:
  junk              Binary strings to be removed, default are all whitespace characters.

optional arguments:
  -u, --unpad       Also trim partial occurrences of the junk string.
  -r, --right-only  Do not trim left.
  -l, --left-only   Do not trim right.
  -i, --nocase      Ignore capitalization for alphabetic characters.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Allow partial results as output.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class trim(Unit):
    """
    Removes byte sequences at beginning and end of input data.
    """

    def __init__(
        self, *junk: Arg(help='Binary strings to be removed, default are all whitespace characters.'),
        unpad: Arg.Switch('-u', help='Also trim partial occurrences of the junk string.') = False,
        left: Arg.Switch('-r', '--right-only', group='SIDE', help='Do not trim left.') = True,
        right: Arg.Switch('-l', '--left-only', group='SIDE', help='Do not trim right.') = True,
        nocase: Arg.Switch('-i', help='Ignore capitalization for alphabetic characters.') = False,
    ):
        super().__init__(junk=junk, left=left, right=right, unpad=unpad, nocase=nocase)

    def _trimfast(self, view: memoryview, *junks: bytes, right=False) -> Tuple[bool, memoryview]:
        done = False
        pos = 0
        while not done:
            done = True
            for junk in junks:
                temp = junk
                size = len(junk)
                if right and self.args.unpad:
                    for k in range(size):
                        n = size - k
                        if view[pos:pos + n] == junk[k:]:
                            pos += n
                            done = False
                            break
                if view[pos:pos + size] == temp:
                    m = len(temp)
                    while True:
                        mm = m << 1
                        if view[pos + m:pos + mm] != temp:
                            break
                        temp += temp
                        m = mm
                    temp = memoryview(temp)
                    while m >= size:
                        if view[pos:pos + m] == temp[:m]:
                            done = False
                            pos += m
                        m //= 2
                if right or not self.args.unpad:
                    continue
                while size > 0:
                    if view[pos:pos + size] == temp[:size]:
                        done = False
                        pos += size
                        break
                    size -= 1
        return pos

    def process(self, data: bytearray):
        junk = list(self.args.junk)
        if not junk:
            import string
            space = string.whitespace.encode('ascii')
            junk = [space[k - 1:k] for k in range(1, len(space))]
        lpos = 0
        rpos = 0
        if self.args.nocase:
            work = data.lower()
            junk = [j.lower() for j in junk]
        else:
            work = data
        if self.args.left:
            lpos = self._trimfast(memoryview(work), *junk)
        if self.args.right:
            work.reverse()
            junk = [bytes(reversed(j)) for j in junk]
            rpos = self._trimfast(memoryview(work), *junk, right=True)
            work.reverse()
        view = memoryview(data)
        if lpos:
            view = view[+lpos:]
        if rpos:
            view = view[:-rpos]
        return view

class u16

This unit is implemented in refinery.units.encoding.u16 and has the following commandline Interface:

usage: u16 [-h] [-L] [-Q] [-0] [-v] [-R]

Encodes and decodes UTF-16LE encoded string data.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class u16(Unit):
    """
    Encodes and decodes UTF-16LE encoded string data.
    """

    def reverse(self, data):
        return data.decode(self.codec).encode('utf-16LE')

    def process(self, data):
        return data.decode('utf-16LE').encode(self.codec)

class ucrypt (size=13, salt=b'AA')

This unit is implemented in refinery.units.crypto.keyderive.unixcrypt and has the following commandline Interface:

usage: ucrypt [-h] [-L] [-Q] [-0] [-v] [size] [salt]

Implements the classic Unix crypt algorithm.

positional arguments:
  size           The number of bytes to generate, default is 13.
  salt           Salt for the derivation, the default is "AA".

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class ucrypt(KeyDerivation):
    """
    Implements the classic Unix crypt algorithm.
    """
    def __init__(
        self,
        size: Arg(help='The number of bytes to generate, default is 13.') = 13,
        salt: Arg(help='Salt for the derivation, the default is "AA".') = B'AA'
    ):
        super().__init__(size=size, salt=salt)

    def process(self, data):
        crypted = bytes(UnixCrypt(data, salt=self.args.salt))
        if len(crypted) < self.args.size:
            raise RefineryPartialResult(
                F'unix crypt only provided {len(crypted)} bytes, but {self.args.size} '
                F'were requested.', partial=crypted
            )
        return crypted[:self.args.size]

class url (plus=False, hex=False)

This unit is implemented in refinery.units.encoding.url and has the following commandline Interface:

usage: url [-h] [-L] [-Q] [-0] [-v] [-R] [-p] [-x]

Decodes and encodes URL-Encoding, which preserves only alphanumeric characters and the symbols _,
., -, ~, \, and /. Every other character is escaped by hex-encoding it and prefixing it with a
percent symbol.

optional arguments:
  -p, --plus     also replace plus signs by spaces
  -x, --hex      hex encode every character in reverse mode

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class url(Unit):
    """
    Decodes and encodes URL-Encoding, which preserves only alphanumeric characters and the symbols `_`, `.`, `-`, `~`, `\\`, and `/`.
    Every other character is escaped by hex-encoding it and prefixing it with a percent symbol.
    """

    def __init__(
        self,
        plus: Arg.Switch('-p', help='also replace plus signs by spaces') = False,
        hex : Arg.Switch('-x', help='hex encode every character in reverse mode') = False
    ):
        super().__init__(plus=plus, hex=hex)

    def process(self, data):
        if self.args.plus:
            data = data.replace(B'+', B' ')
        return unquote_to_bytes(bytes(data))

    def reverse(self, data):
        if self.args.hex:
            result = bytearray(len(data) * 3)
            offset = 0
            for byte in data:
                result[offset + 0] = 0x25
                offset += 1
                result[offset:offset + 2] = B'%02X' % byte
                offset += 2
            return result
        elif self.args.plus:
            def replace(m):
                c = m[0][0]
                return b'+' if c == 0x20 else B'%%%02X' % c
        else:
            def replace(m):
                return B'%%%02X' % m[0][0]
        return re.sub(B'[^a-zA-Z0-9_.-~\\/]', replace, data)

class urlfix (keep=0)

This unit is implemented in refinery.units.misc.urlfix and has the following commandline Interface:

usage: urlfix [-h] [-L] [-Q] [-0] [-v] [-k]

Removes fragments, query strings, and parameters from input URLs. It also correctly escapes all
characters in the URL path component and normalizes the network location part to lowercase. Note
that URLs without a scheme will not be recognized as valid URLs; chunks that do not look like a
URL will be swallowed and not return any output.

optional arguments:
  -k, --keep     If specified once, keeps the it keeps the URL params and query string. If
                 specified twice, it keeps the URL fragment as well. At this level, the unit
                 still filters out anything that does not parse as a URL.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class urlfix(Unit):
    """
    Removes fragments, query strings, and parameters from input URLs. It also correctly escapes all
    characters in the URL path component and normalizes the network location part to lowercase. Note
    that URLs without a scheme will not be recognized as valid URLs; chunks that do not look like a
    URL will be swallowed and not return any output.
    """
    def __init__(
        self,
        keep: Arg('-k', action='count', help=(
            'If specified once, keeps the it keeps the URL params and query string. If specified '
            'twice, it keeps the URL fragment as well. At this level, the unit still filters out '
            'anything that does not parse as a URL.'
        )) = 0
    ):
        super().__init__(keep=keep)

    @unicoded
    def process(self, data: str) -> Optional[str]:
        def fix(string):
            return quote(unquote(string))
        keep = self.args.keep
        parsed = urlparse(data)
        if not parsed.scheme or not parsed.netloc:
            return None
        new_query = '&'.join(F'{key}={fix(value)}' for key, value in parse_qsl(parsed.query))
        replacements = dict(
            netloc=parsed.netloc.lower(),
            params=fix(parsed.params),
            path=fix(parsed.path),
            query=new_query,
            fragment=fix(parsed.fragment),
        )
        if keep < 2:
            replacements.update(fragment='')
            if keep < 1:
                replacements.update(params='', query='')
        return urlunparse(parsed._replace(**replacements))

class urlguards

This unit is implemented in refinery.units.pattern.urlguards and has the following commandline Interface:

usage: urlguards [-h] [-L] [-Q] [-0] [-v]

Restores the original URLs from their 'protected' versions as generated by Outlook protection and
ProofPoint.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class urlguards(Unit):
    """
    Restores the original URLs from their 'protected' versions as generated by
    Outlook protection and ProofPoint.
    """

    _PP3RLENC = {
        letter: rl for rl, letter in enumerate(
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
            'abcdefghijklmnopqrstuvwxyz'
            '0123456789-_', 2
        )
    }

    @unguard(r'https?://urldefense(?:\.proofpoint)?\.com/v([12])/url\?([:;/_=!?#&.,\w\%\-\+|]+)')
    def _proofpointV2(self, match):
        version = int(match[1])
        self.log_info('proofpoint match:', version)
        argmatch = re.match(
            R'^u=(.+?)&(?:amp;)?{}='.format('k' if version == 1 else '[dc]'),
            match[2],
            flags=re.DOTALL
        )
        if not argmatch:
            self.log_warn('not able to translate unexpected proofpoint format:', match)
            return match[0]
        encoded = argmatch[1]
        if match[1] == '2':
            encoded = encoded.translate(str.maketrans('-_', '%/'))
        return unescape(unquote(encoded))

    @unguard(r'https?://urldefense(?:\.proofpoint)?\.com/v3/__(.+?)__;(.*?)![-\w!?$]+')
    def _proofpointV3(self, match):
        data = unquote(match[1])
        cmap = match[2] + '=' * (-len(match[2]) % 4)
        cmap = urlsafe_b64decode(cmap).decode('UTF-8')
        cursor = 0
        result = ''
        for k in range(len(cmap)):
            ast = data.find('*', cursor)
            if ast < 0:
                break
            result += data[cursor:ast]
            if data[ast + 1] == '*':
                end = self._PP3RLENC[data[ast + 2]]
                result += cmap[k:end]
                ast += 2
            else:
                result += cmap[k]
            cursor = ast + 1
        self.log_debug(result)
        self.log_debug(data[cursor:])
        return result + data[cursor:]

    @unguard(r'https?://\w+.safelinks\.protection\.outlook\.com/([:;/_=!?#&.,\w\%\-\+|]+)')
    def _outlook(self, match):
        result = match[0]
        self.log_info('outlook match:', result)
        parsed = urlparse(result)
        params = parse_qs(parsed.query)
        try:
            result = unquote(params['url'][0])
        except Exception:
            pass
        return result

    @unguard(r'https?://outlook.office.com/actions/ei\?u=([:;/_=!?#&.,\w\%\-\+|]+)')
    def _outlook_image_proxy(self, match):
        return unquote(match[1])

    @unguard(r'https?://(?:[\w-]+\.)?trendmicro.com(?::\d+)?/wis/clicktime/v[12]/(?:query|clickthrough)[:;/_=!?#&.,\w\%\-\+|]+')
    def _trendmicro(self, match):
        result = match[0]
        self.log_info('trendmicro match:', result)
        parsed = urlparse(result)
        params = parse_qs(parsed.query)
        try:
            result = unquote(params['url'][0])
        except Exception:
            pass
        return result

    @unicoded
    def process(self, data: str) -> str:
        newsize, size = 0, len(data)
        while newsize != size:
            for handler in (
                self._proofpointV2,
                self._proofpointV3,
                self._outlook,
                self._outlook_image_proxy,
                self._trendmicro
            ):
                data = handler(data)
            size = newsize
            newsize = len(data)
        return data

class uuenc

This unit is implemented in refinery.units.encoding.uuenc and has the following commandline Interface:

usage: uuenc [-h] [-L] [-Q] [-0] [-v] [-R]

Unit for uuencode.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class uuenc(Unit):
    """
    Unit for uuencode.
    """
    def process(self, data):
        with MemoryFile(data) as stream:
            with MemoryFile() as output:
                uu.decode(stream, output, quiet=True)
                return output.getvalue()

    def reverse(self, data):
        meta = metavars(data)
        path = meta.get('path', None)
        name = path and pathlib.Path(path).name
        with MemoryFile(data) as stream:
            with MemoryFile() as output:
                uu.encode(stream, output, name, backtick=True)
                return output.getvalue()

class vaddr (*name, base=None)

This unit is implemented in refinery.units.formats.exe.vaddr and has the following commandline Interface:

usage: vaddr [-h] [-L] [-Q] [-0] [-v] [-R] [-b ADDR] [name [name ...]]

Converts a metadata variable holding a file offset to a virtual address. This unit only works
when the chunk body contains a PE, ELF, or MachO executable. The variable will be substituted in
place. If you would like to retain the original value, it is recommended to use the put unit
first to create a copy of an already existing variable, and then convert the copy.

positional arguments:
  name             The name of a metadata variable holding an integer.

optional arguments:
  -b, --base ADDR  Optionally specify a custom base address B.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation; Specify twice to normalize (first decode, then
                   encode).

Expand source code Browse git

class vaddr(Unit):
    """
    Converts a metadata variable holding a file offset to a virtual address. This unit only works when the
    chunk body contains a PE, ELF, or MachO executable. The variable will be substituted in place. If you
    would like to retain the original value, it is recommended to use the `refinery.put` unit first to create
    a copy of an already existing variable, and then convert the copy.
    """

    def __init__(
        self, *name: Arg(type=str, help='The name of a metadata variable holding an integer.'),
        base : Arg.Number('-b', metavar='ADDR', help='Optionally specify a custom base address B.') = None
    ):
        return super().__init__(names=name, base=base)

    def process(self, data):
        try:
            exe = Executable.Load(data, self.args.base)
        except Exception:
            self.log_warn('unable to parse input as executable; no variable conversion was performed')
            return data
        meta = metavars(data)
        for name in self.args.names:
            value = meta[name]
            meta[name] = exe.location_from_offset(value).virtual.position
        return data

    def reverse(self, data):
        try:
            exe = Executable.Load(data, self.args.base)
        except Exception:
            self.log_warn('unable to parse input as executable; no variable conversion was performed')
            return data
        meta = metavars(data)
        for name in self.args.names:
            value = meta[name]
            meta[name] = exe.location_from_address(value).physical.position
        return data

class vbapc (raw=False)

This unit is implemented in refinery.units.formats.office.vbapc and has the following commandline Interface:

usage: vbapc [-h] [-L] [-Q] [-0] [-v] [-r]

Extract VBA macro p-code from Office documents. By default, the unit also uses pcode2code to
decompile the disassembled p-code. This unit is specifically useful for macro documents that use
VBA code stomping, i.e. the embedded macro source code is stomped and does not represent the
p-code functionality that the document will actually execute.

optional arguments:
  -r, --raw      Return disassembled p-code, do not try to decompile.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class vbapc(Unit):
    """
    Extract VBA macro p-code from Office documents. By default, the unit also uses pcode2code to
    decompile the disassembled p-code. This unit is specifically useful for macro documents that
    use VBA code stomping, i.e. the embedded macro source code is stomped and does not represent
    the p-code functionality that the document will actually execute.
    """
    def __init__(self, raw: Unit.Arg.Switch('-r', help='Return disassembled p-code, do not try to decompile.') = False):
        super().__init__(raw=raw)

    @Unit.Requires('oletools', 'formats', 'office', 'extended')
    def _pcodedmp():
        with NoLogging():
            import pcodedmp.pcodedmp
            return pcodedmp.pcodedmp

    def process(self, data):
        class args:
            disasmOnly = True
            verbose = False
        with io.StringIO() as output:
            with VirtualFileSystem() as vfs:
                vf = vfs.new(data)
                self._pcodedmp.processFile(vf, args, output)
            code = output.getvalue()
            if not self.args.raw:
                from refinery.lib.thirdparty.pcode2code import Parser
                parser = Parser(code)
                parser.parseInput()
                parser.processInput(False)
                code = parser.getOutput()
                code = re.sub(R'(?m)^((?:Sub|Function).*?)$(?!\n[^\s])', r'\n\1', code)
            return code.encode(self.codec)

class vbastr (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.office.vbastr and has the following commandline Interface:

usage: vbastr [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]]

Extract VBA macro variables from Office documents. The items are extracted in a directory
hierarchy that specifies their corresponding OLE stream. The stem of their file name is the same
as the variable's name. The variable can define a caption, a control tip text, and a value; the
unit extracts these with the synthesized file extension "cap", "tip", and "val", respectively.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class vbastr(PathExtractorUnit):
    """
    Extract VBA macro variables from Office documents. The items are extracted in a directory
    hierarchy that specifies their corresponding OLE stream. The stem of their file name is the
    same as the variable's name. The variable can define a caption, a control tip text, and a
    value; the unit extracts these with the synthesized file extension "cap", "tip", and "val",
    respectively.
    """
    @PathExtractorUnit.Requires('oletools', 'formats', 'office')
    def _olevba():
        from oletools import olevba
        return olevba

    def unpack(self, value):
        try:
            parser = self._olevba.VBA_Parser('.', data=bytes(value), relaxed=True)
        except self._olevba.FileOpenError:
            raise ValueError('Input data not recognized by VBA parser')
        try:
            for path, name, vars in parser.extract_form_strings_extended():
                if not vars:
                    continue
                name = _txt(vars['name'])
                for ext, key in {
                    'cap': 'caption',
                    'tip': 'control_tip_text',
                    'val': 'value',
                }.items():
                    value = _bin(vars.get(key))
                    if not value:
                        continue
                    yield UnpackResult(F'{path!s}/{name!s}/{name}.{ext}', value)
        except self._olevba.oleform.OleFormParsingError as error:
            from collections import Counter
            self.log_debug(str(error))
            self.log_info('extended form extraction failed with error; falling back to simple method')
            form_strings = list(parser.extract_form_strings())
            name_counter = Counter(name for _, name, _ in form_strings)
            dedup = Counter()
            for path, name, string in form_strings:
                if string is None:
                    continue
                if name_counter[name] > 1:
                    dedup[name] += 1
                    name = F'{name!s}.v{dedup[name]}'
                yield UnpackResult(F'{path!s}/{name!s}.val', _bin(string))

class vigenere (key, alphabet=b'abcdefghijklmnopqrstuvwxyz', operator='add', case_sensitive=False, ignore_unknown=False)

This unit is implemented in refinery.units.crypto.cipher.vigenere and has the following commandline Interface:

usage: vigenere [-h] [-L] [-Q] [-0] [-v] [-R] [-: OP] [-c] [-i] key [alphabet]

Encryption and decryption using the Vigenère-Bellaso polyalphabetic cipher.

positional arguments:
  key                   The encryption key
  alphabet              The alphabet, by default the Latin one is used:
                        "abcdefghijklmnopqrstuvwxyz"

optional arguments:
  -:, --operator OP     Choose the vigenere block operation. The default is add, and the
                        available options are: add, sub, xor
  -c, --case-sensitive  Unless this option is set, the key will be case insensitive. Uppercase
                        letters from the input are transformed using the same shift as would be
                        the lowercase variant, but case is retained.
  -i, --ignore-unknown  Unless this option is set, the key stream will be iterated even for
                        letters that are not contained in the alphabet.

generic options:
  -h, --help            Show this help message and exit.
  -L, --lenient         Allow partial results as output.
  -Q, --quiet           Disables all log output.
  -0, --devnull         Do not produce any output.
  -v, --verbose         Specify up to two times to increase log level.
  -R, --reverse         Use the reverse operation; Specify twice to normalize (first decode, then
                        encode).

Expand source code Browse git

class vigenere(Unit):
    """
    Encryption and decryption using the Vigenère-Bellaso polyalphabetic cipher.
    """

    def __init__(
        self,
        key: Arg(help='The encryption key'),
        alphabet: Arg(
            help='The alphabet, by default the Latin one is used: "{default}"'
        ) = b'abcdefghijklmnopqrstuvwxyz',
        operator: Arg.Choice('-:', choices=['add', 'sub', 'xor'], metavar='OP', help=(
            'Choose the vigenere block operation. The default is {default}, and the available options are: {choices}')) = 'add',
        case_sensitive: Arg.Switch('-c', help=(
            'Unless this option is set, the key will be case insensitive. Uppercase letters from the input are transformed '
            'using the same shift as would be the lowercase variant, but case is retained.')) = False,
        ignore_unknown: Arg.Switch('-i', help=(
            'Unless this option is set, the key stream will be iterated even '
            'for letters that are not contained in the alphabet.'
        )) = False
    ):
        if not callable(operator):
            operator = {
                'add': __add__,
                'sub': __sub__,
                'xor': __xor__,
            }.get(operator.lower(), None)
            if operator is None:
                raise ValueError(F'The value {operator!r} is not valid as an operator.')
        self.superinit(super(), **vars())

    def _tabula_recta(self, data, reverse=True):
        key: str = self.args.key.decode(self.codec)
        alphabet: str = self.args.alphabet.decode(self.codec)
        operator = self.args.operator
        case_sensitive: bool = self.args.case_sensitive
        ignore_unknown: bool = self.args.ignore_unknown
        if not case_sensitive:
            key = key.lower()
            alphabet = alphabet.lower()
            if len(set(alphabet)) != len(alphabet):
                raise ValueError('Duplicate entries detected in alphabet.')
        if not set(key) <= set(alphabet):
            diff = set(key) - set(alphabet)
            diff = ', '.join(diff)
            raise ValueError(F'key contains letters which are not from the given alphabet: {diff}')
        self.log_info(F'using key {key} and alphabet {alphabet}')
        keystream = cycle(key)
        alph_size = len(alphabet)
        if reverse:
            operator = _opeator_inverse[operator]
        for letter in data:
            uppercase = not case_sensitive and letter.isupper()
            if uppercase:
                letter = letter.lower()
            try:
                position = alphabet.index(letter)
            except ValueError:
                yield letter
                if not ignore_unknown:
                    next(keystream)
                continue
            shift = alphabet.index(next(keystream))
            result = alphabet[operator(position, shift) % alph_size]
            yield result.upper() if uppercase else result

    @unicoded
    def process(self, data):
        return ''.join(self._tabula_recta(data, True))

    @unicoded
    def reverse(self, data):
        return ''.join(self._tabula_recta(data, False))

class vmemref (address, base=None)

This unit is implemented in refinery.units.formats.exe.vmemref and has the following commandline Interface:

usage: vmemref [-h] [-L] [-Q] [-0] [-v] [-b ADDR] ADDR

The unit expects an executable as input (PE/ELF/MachO) and scans a function at a given virtual
address for memory references. For each memory reference, the unit looks up the corresponding
section and file offset for the reference. It then returns all data from that section starting at
the given offset.

positional arguments:
  ADDR             Specify the address of a function to scan.

optional arguments:
  -b, --base ADDR  Optionally specify a custom base address B.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class vmemref(Unit):
    """
    The unit expects an executable as input (PE/ELF/MachO) and scans a function at a given virtual
    address for memory references. For each memory reference, the unit looks up the corresponding
    section and file offset for the reference. It then returns all data from that section starting
    at the given offset.
    """

    @Unit.Requires('angr', 'all')
    def _angr():
        import angr
        import angr.project
        import angr.engines
        return angr

    def _memory_references(
        self,
        function: Function,
        memory: Clemory,
        functions: Container[int],
        pointer_size: int,
        max_dereference: int = 1
    ):
        pointer_size //= 8
        references = []
        code = set()
        for block in function.blocks:
            code.update(block.instruction_addrs)
        try:
            constants = function.code_constants
        except Exception:
            pass
        else:
            def is_valid_data_address(address):
                if not isinstance(address, int):
                    return False
                if address not in memory:
                    return False
                if address in code:
                    return False
                if address in functions:
                    return False
                return True

            def dereference(address):
                data = bytes(memory[k] for k in range(address, address + pointer_size))
                return int.from_bytes(data, 'little')

            for address in constants:
                try:
                    address = int(address)
                except Exception:
                    continue
                times_dereferenced = 0
                while is_valid_data_address(address) and address not in references:
                    references.append(address)
                    times_dereferenced += 1
                    if max_dereference and max_dereference > 0 and times_dereferenced > max_dereference:
                        break
                    try:
                        address = dereference(address)
                    except Exception:
                        break

        return references

    def __init__(
        self,
        address: Arg.Number(metavar='ADDR', help='Specify the address of a function to scan.'),
        base: Arg.Number('-b', metavar='ADDR', help='Optionally specify a custom base address B.') = None,
    ):
        super().__init__(address=address, base=base)

    def process(self, data):
        address = self.args.address
        executable = Executable.Load(data, self.args.base)
        code = executable.location_from_address(address).virtual.box

        self.log_info(R'loading project into angr')
        with NoLogging():
            project: Project = self._angr.Project(MemoryFile(data), load_options={'auto_load_libs': False})

        self.log_info(F'scanning function at 0x{address:X}')
        with NoLogging():
            cfg: CFGEmulated = project.analyses.CFGEmulated(
                call_depth=0,
                starts=[address],
                enable_symbolic_back_traversal=True,
                address_whitelist=code.range(),
            )

        function = cfg.functions[address]
        code_addresses = cfg.functions

        if executable.type is ET.PE:
            code_addresses = code

        self.log_info(R'extracting memory references from lifted function')
        for ref in self._memory_references(
            function,
            project.loader.memory,
            code_addresses,
            executable.pointer_size
        ):
            try:
                yield executable[ref:]
            except CompartmentNotFound:
                self.log_info(F'memory reference could not be resolved: 0x{ref:0{executable.pointer_size // 4}X}')

class vsect (*paths, meta=False, synthetic=False, path=b'path', regex=False, exact=False, fuzzy=0, drop_path=False, join_path=False, list=False)

This unit is implemented in refinery.units.formats.exe.vsect and has the following commandline Interface:

usage: vsect [-h] [-L] [-Q] [-0] [-v] [-m] [-s] [-l] [-j | -d] [-z | -e] [-r] [-P NAME]
             [path [path ...]]

Extract sections/segments from PE, ELF, and MachO executables.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -m, --meta       Populates the metadata variables vaddr and vsize containing the virtual
                   address and size of each section, respectively.
  -s, --synthetic  Include synthesized sections: These represent data regions that are outside
                   the sections as listed by the executable metadata, such as headers and
                   overlays.
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class vsect(PathExtractorUnit):
    """
    Extract sections/segments from PE, ELF, and MachO executables.
    """
    def __init__(
        self, *paths,
        meta: Arg.Switch('-m', help=(
            'Populates the metadata variables vaddr and vsize containing the virtual address and size '
            'of each section, respectively.')) = False,
        synthetic: Arg.Switch('-s', help=(
            'Include synthesized sections: These represent data regions that are outside the sections '
            'as listed by the executable metadata, such as headers and overlays.')) = False,
        **keywords
    ):
        super().__init__(*paths, meta=meta, synthetic=synthetic, **keywords)

    def unpack(self, data):
        exe = Executable.Load(data)
        mv = memoryview(data)
        for k, section in enumerate(exe.sections()):
            if section.synthetic and not self.args.synthetic:
                continue
            start = section.physical.lower
            end = section.physical.upper
            va = section.virtual.lower
            vs = len(section.virtual)
            kwargs = {'offset': start}
            if self.args.meta:
                if va is not None:
                    kwargs['vaddr'] = va
                if vs is not None:
                    kwargs['vsize'] = vs
            name = section.name
            if not name:
                addr = F'{section.virtual.lower:0{exe.pointer_size // 4}X}'
                self.log_warn(F'section {k} had no name, synthesizing name from virtual address 0x{addr}')
                name = F'.{addr}'
            yield UnpackResult(name, mv[start:end], **kwargs)

class vsnip (*addresses, ascii=False, utf16=False, until=b'', base=None)

This unit is implemented in refinery.units.formats.exe.vsnip and has the following commandline Interface:

usage: vsnip [-h] [-L] [-Q] [-0] [-v] [-a | -u | -t B] [-b ADDR]
             [start:count:align [start:count:align ...]]

Extract data from PE, ELF, and MachO files based on virtual offsets.

positional arguments:
  start:count:align  Use Python slice syntax to describe an area of virtual memory to read. If a
                     chunksize is specified, then the unit will always read a multiple of that
                     number of bytes

optional arguments:
  -a, --ascii        Read ASCII strings; equivalent to -th:00
  -u, --utf16        Read UTF16 strings; equivalent to -th:0000 (also sets chunksize to 2)
  -t, --until B      Read until sequence B is read.
  -b, --base ADDR    Optionally specify a custom base address B.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class vsnip(Unit):
    """
    Extract data from PE, ELF, and MachO files based on virtual offsets.
    """

    def __init__(
        self, *addresses: Arg.Bounds(metavar='start:count:align', help=(
            'Use Python slice syntax to describe an area of virtual memory to read. If a chunksize is '
            'specified, then the unit will always read a multiple of that number of bytes')),
        ascii: Arg.Switch('-a', group='END', help='Read ASCII strings; equivalent to -th:00') = False,
        utf16: Arg.Switch('-u', group='END', help='Read UTF16 strings; equivalent to -th:0000 (also sets chunksize to 2)') = False,
        until: Arg.Binary('-t', group='END', help='Read until sequence {varname} is read.') = B'',
        base : Arg.Number('-b', metavar='ADDR', help='Optionally specify a custom base address B.') = None,
    ):
        if sum(1 for t in (until, utf16, ascii) if t) > 1:
            raise ValueError('Only one of utf16, ascii, and until can be specified.')
        return super().__init__(addresses=addresses, utf16=utf16, ascii=ascii, until=until, base=base)

    def process(self, data: bytearray):
        until = self.args.until
        addrs = self.args.addresses
        if self.args.ascii:
            until = B'\0'
        if self.args.utf16:
            until = B'\0\0'
            addrs = (slice(a.start, a.stop, 2) for a in addrs)

        exe = Executable.Load(data, self.args.base)

        for addr in addrs:
            area = MemoryArea(addr)
            location = exe.location_from_address(area.start)
            offset = location.physical.position
            max_offset = location.physical.box.upper
            if not until:
                end = max_offset
            else:
                end = offset - 1
                align = area.align
                while True:
                    end = data.find(until, end + 1)
                    if end not in range(offset, max_offset):
                        raise EndOfStringNotFound
                    if (end - offset) % align == 0:
                        break

            if area.count:
                end = min(end, offset + area.count)

            yield self.labelled(data[offset:end], offset=offset)

class vstack (*address, stop=None, base=None, arch=Arch.X32, meta_registers=False, timeout=None, patch_range=slice(5, None, None), write_range=slice(1, None, None), wait=10, wait_calls=False, skip_calls=0, stack_size=65536, block_size=4096, max_visits=4096, log_writes_in_calls=False, log_stack_addresses=False, log_other_addresses=False, log_zero_overwrites=False)

This unit is implemented in refinery.units.formats.exe.vstack and has the following commandline Interface:

usage: vstack [-h] [-L] [-Q] [-0] [-v] [-s stop] [-b Addr] [-a Arch] [-r] [-t N] [-p MIN:MAX]
              [-n MIN:MAX] [-w N] [-c | -C] [-S N] [-B N] [-V N] [-W] [-X] [-Y] [-Z]
              [start [start ...]]

The unit emulates instructions at a given address in the input executable (PE/ELF/MachO) and
extracts data patches that are written to the stack during emulation. Emulation is halted as soon
as a certain number of instructions has not performed any memory writes, or when an error occurs.
By default, most registers are set to the current location in the emulated stack. However, if you
want to initialize certain registers differently, you can set an environment variable to the
desired value.

positional arguments:
  start                      Specify the (virtual) addresses of a stack string instruction
                             sequences.

optional arguments:
  -s, --stop stop            Optional: Stop when reaching this address.
  -b, --base Addr            Optionally specify a custom base address B.
  -a, --arch Arch            Specify for blob inputs: x32, x64, arm32, arm64, mips16, mips32,
                             mips64, ppc32, ppc64, sparc32, sparc64
  -r, --meta-registers       Consume register initialization values from the chunk's metadata.
  -t, --timeout N            Optionally stop emulating after a given number of instructions.
  -p, --patch-range MIN:MAX  Extract only patches that are in the given range, default is 5:.
  -n, --write-range MIN:MAX  Log only writes whose size is in the given range, default is 1:.
  -w, --wait N               When this many instructions did not write to memory, emulation is
                             halted. The default is 10.
  -c, --wait-calls           Wait indefinitely when inside a function call.
  -C, --skip-calls           Skip function calls entirely. Use twice to treat each call as
                             allocating memory.
  -S, --stack-size N         Optionally specify the stack size. The default is 0x10000.
  -B, --block-size N         Standard memory block size for the emulator, 0x1000 by default.
  -V, --max-visits N         Maximum number of times a code address is visited. Default is 4096.
  -W, --log-writes-in-calls  Log writes of values that occur in functions calls.
  -X, --log-stack-addresses  Log writes of values that are stack addresses.
  -Y, --log-other-addresses  Log writes of values that are addresses to mapped segments.
  -Z, --log-zero-overwrites  Log writes of zeros to memory that contained nonzero values.

generic options:
  -h, --help                 Show this help message and exit.
  -L, --lenient              Allow partial results as output.
  -Q, --quiet                Disables all log output.
  -0, --devnull              Do not produce any output.
  -v, --verbose              Specify up to two times to increase log level.

Expand source code Browse git

class vstack(Unit):
    """
    The unit emulates instructions at a given address in the input executable (PE/ELF/MachO) and
    extracts data patches that are written to the stack during emulation. Emulation is halted as
    soon as a certain number of instructions has not performed any memory writes, or when an error
    occurs. By default, most registers are set to the current location in the emulated stack.
    However, if you want to initialize certain registers differently, you can set an environment
    variable to the desired value.
    """

    @Unit.Requires('intervaltree', 'default', 'extended')
    def _intervaltree():
        import intervaltree
        return intervaltree

    @Unit.Requires('unicorn', 'default', 'extended')
    def _unicorn():
        with NoLogging():
            import unicorn
        import unicorn.x86_const
        import unicorn.arm64_const
        import unicorn.mips_const
        import unicorn.sparc_const
        try:
            import unicorn.ppc_const
        except ImportError:
            pass
        return unicorn

    @Unit.Requires('capstone', 'default', 'extended')
    def _capstone():
        import capstone
        return capstone

    def __init__(
        self,
        *address: Arg.Number(metavar='start', help='Specify the (virtual) addresses of a stack string instruction sequences.'),
        stop: Arg.Number('-s', metavar='stop', help='Optional: Stop when reaching this address.') = None,
        base: Arg.Number('-b', metavar='Addr', help='Optionally specify a custom base address B.') = None,
        arch: Arg.Option('-a', help='Specify for blob inputs: {choices}', choices=Arch) = Arch.X32,
        meta_registers: Arg.Switch('-r', help='Consume register initialization values from the chunk\'s metadata.') = False,
        timeout: Arg.Number('-t', help='Optionally stop emulating after a given number of instructions.') = None,
        patch_range: Arg.Bounds('-p', metavar='MIN:MAX',
            help='Extract only patches that are in the given range, default is {default}.') = slice(5, None),
        write_range: Arg.Bounds('-n', metavar='MIN:MAX',
            help='Log only writes whose size is in the given range, default is {default}.') = slice(1, None),
        wait: Arg.Number('-w', help=(
            'When this many instructions did not write to memory, emulation is halted. The default is {default}.')) = 10,
        wait_calls: Arg.Switch('-c', group='CALL',
            help='Wait indefinitely when inside a function call.') = False,
        skip_calls: Arg.Counts('-C', group='CALL',
            help='Skip function calls entirely. Use twice to treat each call as allocating memory.') = 0,
        stack_size: Arg.Number('-S', help='Optionally specify the stack size. The default is 0x{default:X}.') = 0x10000,
        block_size: Arg.Number('-B', help='Standard memory block size for the emulator, 0x{default:X} by default.') = 0x1000,
        max_visits: Arg.Number('-V', help='Maximum number of times a code address is visited. Default is {default}.') = 0x1000,
        log_writes_in_calls: Arg.Switch('-W', help='Log writes of values that occur in functions calls.') = False,
        log_stack_addresses: Arg.Switch('-X', help='Log writes of values that are stack addresses.') = False,
        log_other_addresses: Arg.Switch('-Y', help='Log writes of values that are addresses to mapped segments.') = False,
        log_zero_overwrites: Arg.Switch('-Z', help='Log writes of zeros to memory that contained nonzero values.') = False,
    ):
        super().__init__(
            address=address or [0],
            stop=stop,
            base=base,
            arch=Arg.AsOption(arch, Arch),
            meta_registers=meta_registers,
            timeout=timeout,
            patch_range=patch_range,
            write_range=write_range,
            wait=wait,
            stack_size=stack_size,
            wait_calls=wait_calls,
            skip_calls=skip_calls,
            block_size=block_size,
            max_visits=max_visits,
            log_writes_in_calls=log_writes_in_calls,
            log_stack_addresses=log_stack_addresses,
            log_other_addresses=log_other_addresses,
            log_zero_overwrites=log_zero_overwrites,
        )

    def _find_stack_location(self, exe: Executable):
        stack_size = self.args.stack_size
        memory_max = 1 << exe.pointer_size
        space = exe.image_defined_address_space()
        aligned = align(stack_size, space.upper)
        if aligned + stack_size < memory_max:
            return aligned
        aligned = align(stack_size, space.lower - stack_size, down=True)
        if aligned > 0:
            return aligned
        raise RuntimeError('The primitive method used to map stack memory has failed.')

    def process(self, data):
        uc = self._unicorn
        blob = False
        try:
            exe = Executable.Load(data, self.args.base)
        except ValueError:
            exe = ExecutableCodeBlob(data, self.args.base, self.args.arch)
            blob = True
        arch = exe.arch()
        width = exe.pointer_size // 4
        block_size = self.args.block_size
        stack_size = self.args.stack_size
        stack_addr = self._find_stack_location(exe)
        self.log_info(F'mapping {SizeInt(stack_size)!r} of stack at 0x{stack_addr:X}')
        image = memoryview(data)
        disassembler = self._capstone.Cs(*self._cs_arch(arch, exe.byte_order()))
        register_values = {}

        if arch in (Arch.PPC32, Arch.PPC64):
            try:
                sp = uc.ppc_const.UC_PPC_REG_1
                rv = uc.ppc_const.UC_PPC_REG_3
                ip = uc.ppc_const.UC_PPC_REG_PC
            except AttributeError:
                raise RuntimeError('The installed unicorn version does not support the PPC architecture.')
        else:
            sp, ip, rv = {
                Arch.X32     : (
                    uc.x86_const.UC_X86_REG_ESP,
                    uc.x86_const.UC_X86_REG_EIP,
                    uc.x86_const.UC_X86_REG_EAX,
                ),
                Arch.X64     : (
                    uc.x86_const.UC_X86_REG_RSP,
                    uc.x86_const.UC_X86_REG_RIP,
                    uc.x86_const.UC_X86_REG_RAX,
                ),
                Arch.ARM32   : (
                    uc.arm_const.UC_ARM_REG_SP,
                    uc.arm_const.UC_ARM_REG_IP,
                    uc.arm_const.UC_ARM_REG_R0,
                ),
                Arch.ARM64   : (
                    uc.arm_const.UC_ARM_REG_SP,
                    uc.arm_const.UC_ARM_REG_IP,
                    uc.arm_const.UC_ARM_REG_R0,
                ),
                Arch.MIPS16  : (
                    uc.mips_const.UC_MIPS_REG_SP,
                    uc.mips_const.UC_MIPS_REG_PC,
                    uc.mips_const.UC_MIPS_REG_0,
                ),
                Arch.MIPS32  : (
                    uc.mips_const.UC_MIPS_REG_SP,
                    uc.mips_const.UC_MIPS_REG_PC,
                    uc.mips_const.UC_MIPS_REG_V0,
                ),
                Arch.MIPS64  : (
                    uc.mips_const.UC_MIPS_REG_SP,
                    uc.mips_const.UC_MIPS_REG_PC,
                    uc.mips_const.UC_MIPS_REG_V0,
                ),
                Arch.SPARC32 : (
                    uc.sparc_const.UC_SPARC_REG_SP,
                    uc.sparc_const.UC_SPARC_REG_PC,
                    uc.sparc_const.UC_SPARC_REG_O0,
                ),
                Arch.SPARC64 : (
                    uc.sparc_const.UC_SPARC_REG_SP,
                    uc.sparc_const.UC_SPARC_REG_PC,
                    uc.sparc_const.UC_SPARC_REG_O0,
                ),
            }[arch]

        if self.args.meta_registers:
            from refinery.lib.meta import metavars
            meta = metavars(data)
            for module in [uc.x86_const, uc.arm_const, uc.mips_const, uc.sparc_const]:
                md: Dict[str, Any] = module.__dict__
                for name, register in md.items():
                    try:
                        u, *_, kind, name = name.split('_')
                    except Exception:
                        continue
                    if kind != 'REG' or u != 'UC':
                        continue
                    for var, value in list(meta.items()):
                        if var.upper() != name:
                            continue
                        meta.discard(var)
                        register_values[register] = value
                        break

        for address in self.args.address:

            emulator = uc.Uc(*self._uc_arch(arch, exe.byte_order()))
            stack = Range(stack_addr, stack_addr + 3 * stack_size)

            emulator.mem_map(stack.lower, len(stack))
            emulator.reg_write(sp, stack.lower + 2 * len(stack) // 3)

            if arch is Arch.X32:
                for reg in [
                    uc.x86_const.UC_X86_REG_EAX,
                    uc.x86_const.UC_X86_REG_EBX,
                    uc.x86_const.UC_X86_REG_ECX,
                    uc.x86_const.UC_X86_REG_EDX,
                    uc.x86_const.UC_X86_REG_ESI,
                    uc.x86_const.UC_X86_REG_EDI,
                    uc.x86_const.UC_X86_REG_EBP,
                ]:
                    emulator.reg_write(reg, stack_addr + stack_size)
            if arch is Arch.X64:
                for reg in [
                    uc.x86_const.UC_X86_REG_RAX,
                    uc.x86_const.UC_X86_REG_RBX,
                    uc.x86_const.UC_X86_REG_RCX,
                    uc.x86_const.UC_X86_REG_RDX,
                    uc.x86_const.UC_X86_REG_RSI,
                    uc.x86_const.UC_X86_REG_RDI,
                    uc.x86_const.UC_X86_REG_RBP,
                    uc.x86_const.UC_X86_REG_R8,
                    uc.x86_const.UC_X86_REG_R9,
                    uc.x86_const.UC_X86_REG_R10,
                    uc.x86_const.UC_X86_REG_R11,
                    uc.x86_const.UC_X86_REG_R12,
                    uc.x86_const.UC_X86_REG_R13,
                    uc.x86_const.UC_X86_REG_R14,
                    uc.x86_const.UC_X86_REG_R15,
                ]:
                    emulator.reg_write(reg, stack_addr + stack_size)

            for reg, value in register_values.items():
                emulator.reg_write(reg, value)

            for segment in exe.segments():
                pmem = segment.physical
                vmem = segment.virtual
                try:
                    emulator.mem_map(vmem.lower, align(block_size, len(vmem)))
                    emulator.mem_write(vmem.lower, bytes(image[pmem.slice()]))
                except KeyboardInterrupt:
                    raise
                except Exception as error:
                    if address in vmem:
                        raise
                    self.log_info(F'error mapping segment [{vmem.lower:0{width}X}-{vmem.upper:0{width}X}]: {error!s}')

            tree = self._intervaltree.IntervalTree()
            state = EmuState(
                exe, tree, address, stack, blob, disassembler,
                stop=self.args.stop,
                sp_register=sp,
                ip_register=ip,
                rv_register=rv,
                allocations=[stack],
                max_wait=self.args.wait,
                max_loop=self.args.max_visits,
            )

            timeout = self.args.timeout
            if timeout is not None:
                self.log_info(F'setting timeout of {timeout} steps')
                state.ticks = timeout

            emulator.hook_add(uc.UC_HOOK_CODE, self._hook_code, user_data=state)
            emulator.hook_add(uc.UC_HOOK_MEM_WRITE, self._hook_mem_write, user_data=state, )
            emulator.hook_add(uc.UC_HOOK_INSN_INVALID, self._hook_insn_error, user_data=state)
            emulator.hook_add(uc.UC_HOOK_MEM_INVALID, self._hook_mem_error, user_data=state)

            end_of_code = exe.location_from_address(address).virtual.box.upper

            try:
                emulator.emu_start(address, end_of_code)
            except uc.UcError:
                pass

            it: Iterator[Interval] = iter(tree)
            for interval in it:
                size = interval.end - interval.begin - 1
                if size not in bounds[self.args.patch_range]:
                    continue
                try:
                    patch = emulator.mem_read(interval.begin, size)
                except uc.UcError as error:
                    self.log_info(F'error reading 0x{interval.begin:0{width}X}:{size}: {error!s}')
                    continue
                if not any(patch):
                    continue
                self.log_info(F'memory patch at {state.fmt(interval.begin)} of size {size}')
                yield patch

    def _hook_mem_write(self, emu: Uc, access: int, address: int, size: int, value: int, state: EmuState):
        try:
            mask = (1 << (size * 8)) - 1
            unsigned_value = value & mask

            if unsigned_value == state.expected_address:
                callstack = state.callstack
                state.retaddr = unsigned_value
                if not self.args.skip_calls:
                    if not callstack:
                        state.callstack_ceiling = emu.reg_read(state.sp_register)
                    callstack.append(unsigned_value)
                return
            else:
                state.retaddr = None

            skipped = False

            if size not in bounds[self.args.write_range]:
                skipped = 'size excluded'
            elif (
                state.callstack_ceiling > 0
                and not self.args.log_writes_in_calls
                and address in range(state.callstack_ceiling - 0x200, state.callstack_ceiling)
            ):
                skipped = 'inside call'
            elif not self.args.log_stack_addresses and unsigned_value in state.stack:
                skipped = 'stack address'
            elif not self.args.log_other_addresses and not state.blob:
                for s in state.executable.sections():
                    if address in s.virtual:
                        skipped = F'write to section {s.name}'
                        break
            if (
                not skipped
                and unsigned_value == 0
                and state.writes.at(address) is not None
                and self.args.log_zero_overwrites is False
            ):
                try:
                    if any(emu.mem_read(address, size)):
                        skipped = 'zero overwrite'
                except Exception:
                    pass

            if not skipped:
                state.writes.addi(address, address + size + 1)
                state.writes.merge_overlaps()
                state.waiting = 0

            def info():
                data = unsigned_value.to_bytes(size, state.executable.byte_order().value)
                ph = state.executable.pointer_size // 4
                pt = state.executable.pointer_size // 8
                h = data.hex().upper()
                t = re.sub('[^!-~]', '.', data.decode('latin1'))
                msg = state.log(F'{state.fmt(address)} <- {h:_<{ph}} {t:_<{pt}}')
                if skipped:
                    msg = F'{msg} (ignored: {skipped})'
                return msg

            self.log_info(info)

        except KeyboardInterrupt:
            emu.emu_stop()
            return False

    def _hook_insn_error(self, emu: Uc, state: EmuState):
        self.log_debug('aborting emulation; instruction error')
        emu.emu_stop()
        return False

    def _hook_mem_error(self, emu: Uc, access: int, address: int, size: int, value: int, state: EmuState):
        bs = self.args.block_size
        try:
            emu.mem_map(align(bs, address, down=True), 2 * bs)
        except Exception:
            self.log_info(state.log(F'{state.fmt(address)} :: MEMORY ERROR'))
            return False
        else:
            return True

    def _hook_code(self, emu: Uc, address: int, size: int, state: EmuState):
        try:
            state.ticks -= 1
            state.visits[address] += 1
            if state.visits[address] > state.max_loop:
                self.log_info(
                    F'aborting emulation: 0x{address:0{state.executable.pointer_size // 8}X}'
                    F' was visited more than {state.max_loop} times.')
                emu.emu_stop()
                return False
            if address == state.stop or state.ticks == 0:
                emu.emu_stop()
                return False
            waiting = state.waiting
            callstack = state.callstack
            depth = len(callstack)
            state.previous_address = address
            retaddr = state.retaddr
            state.retaddr = None

            if address != state.expected_address:
                if retaddr is not None and self.args.skip_calls:
                    if self.args.skip_calls > 1:
                        stack_size = self.args.stack_size
                        block_size = self.args.block_size
                        rv = state.rv_register
                        alloc_addr = align(block_size, state.allocations[-1].upper)
                        state.allocations.append(Range(alloc_addr, alloc_addr + stack_size))
                        emu.mem_map(alloc_addr, stack_size)
                        emu.reg_write(rv, alloc_addr)
                    ip = state.ip_register
                    sp = state.sp_register
                    ps = state.executable.pointer_size // 8
                    emu.reg_write(ip, retaddr)
                    emu.reg_write(sp, emu.reg_read(sp) + ps)
                    return
                if depth and address == callstack[-1]:
                    depth -= 1
                    state.callstack.pop()
                    if depth == 0:
                        state.callstack_ceiling = 0
                state.expected_address = address
            elif retaddr is not None and not self.args.skip_calls:
                # The present address was moved to the stack but we did not branch.
                # This is not quite accurate, of course: We could be calling the
                # next instruction. However, that sort of code is usually not really
                # a function call anyway, but rather a way to get the IP.
                callstack.pop()

            if waiting > self.args.wait:
                emu.emu_stop()
                return False
            if not depth or not self.args.wait_calls:
                state.waiting += 1
            state.expected_address += size

            instruction = state.disassemble(address, size)
            if instruction:
                instruction = F'{instruction.mnemonic} {instruction.op_str}'
                self.log_debug(state.log(instruction))
            else:
                self.log_debug(state.log('unrecognized instruction, aborting'))
                emu.emu_stop()

        except KeyboardInterrupt:
            emu.emu_stop()
            return False

    def _uc_arch(self, arch: Arch, bo: Optional[BO] = None) -> Tuple[int, int]:
        uc = self._unicorn
        arch, mode = {
            Arch.X32     : (uc.UC_ARCH_X86,   uc.UC_MODE_32),     # noqa
            Arch.X64     : (uc.UC_ARCH_X86,   uc.UC_MODE_64),     # noqa
            Arch.ARM32   : (uc.UC_ARCH_ARM,   uc.UC_MODE_ARM),    # noqa
            Arch.ARM64   : (uc.UC_ARCH_ARM,   uc.UC_MODE_THUMB),  # noqa
            Arch.MIPS16  : (uc.UC_ARCH_MIPS,  uc.UC_MODE_16),     # noqa
            Arch.MIPS32  : (uc.UC_ARCH_MIPS,  uc.UC_MODE_32),     # noqa
            Arch.MIPS64  : (uc.UC_ARCH_MIPS,  uc.UC_MODE_64),     # noqa
            Arch.PPC32   : (uc.UC_ARCH_PPC,   uc.UC_MODE_32),     # noqa
            Arch.PPC64   : (uc.UC_ARCH_PPC,   uc.UC_MODE_64),     # noqa
            Arch.SPARC32 : (uc.UC_ARCH_SPARC, uc.UC_MODE_32),     # noqa
            Arch.SPARC64 : (uc.UC_ARCH_SPARC, uc.UC_MODE_V9),     # noqa
        }[arch]
        if bo is not None:
            mode |= {
                BO.BE: uc.UC_MODE_BIG_ENDIAN,
                BO.LE: uc.UC_MODE_LITTLE_ENDIAN,
            }[bo]
        return arch, mode

    def _cs_arch(self, arch: Arch, bo: Optional[BO] = None) -> Tuple[int, int]:
        cs = self._capstone
        arch, mode = {
            Arch.X32     : (cs.CS_ARCH_X86,   cs.CS_MODE_32),     # noqa
            Arch.X64     : (cs.CS_ARCH_X86,   cs.CS_MODE_64),     # noqa
            Arch.ARM32   : (cs.CS_ARCH_ARM,   cs.CS_MODE_ARM),    # noqa
            Arch.ARM64   : (cs.CS_ARCH_ARM,   cs.CS_MODE_THUMB),  # noqa
            Arch.MIPS16  : (cs.CS_ARCH_MIPS,  cs.CS_MODE_16),     # noqa
            Arch.MIPS32  : (cs.CS_ARCH_MIPS,  cs.CS_MODE_32),     # noqa
            Arch.MIPS64  : (cs.CS_ARCH_MIPS,  cs.CS_MODE_64),     # noqa
            Arch.PPC32   : (cs.CS_ARCH_PPC,   cs.CS_MODE_32),     # noqa
            Arch.PPC64   : (cs.CS_ARCH_PPC,   cs.CS_MODE_64),     # noqa
            Arch.SPARC32 : (cs.CS_ARCH_SPARC, cs.CS_MODE_32),     # noqa
            Arch.SPARC64 : (cs.CS_ARCH_SPARC, cs.CS_MODE_V9),     # noqa
        }[arch]
        if bo is not None:
            mode |= {
                BO.BE: cs.CS_MODE_BIG_ENDIAN,
                BO.LE: cs.CS_MODE_LITTLE_ENDIAN,
            }[bo]
        return arch, mode

class winreg (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.winreg and has the following commandline Interface:

usage: winreg [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]]

Extract values from a Windows registry hive or from a registry export (.reg file).

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class winreg(PathExtractorUnit):
    """
    Extract values from a Windows registry hive or from a registry export (.reg file).
    """
    @PathExtractorUnit.Requires('python-registry', 'formats')
    def _registry():
        import Registry
        import Registry.Registry
        import Registry.RegistryParse
        return Registry

    @staticmethod
    def _walk(patterns: List[PathPattern], key: RegistryKey, *path: str):
        here = '/'.join(path)
        if not any(p.reach(here) for p in patterns):
            winreg.log_debug(F'pruning search at {here}')
            return
        for value in key.values():
            def raw(v: RegistryValue = value):
                return v.raw_data()
            vpath = F'{here}/{value.name()}'
            yield UnpackResult(vpath, raw)
        for subkey in key.subkeys():
            yield from winreg._walk(patterns, subkey, *path, subkey.name())

    def _unpack_hive(self, data: bytearray):
        try:
            with MemoryFile(data) as stream:
                root = self._registry.Registry.Registry(stream).root()
                yield from self._walk(self._patterns, root, root.name())
        except self._registry.RegistryParse.ParseException:
            raise ParseException

    def _decode_registry_export(self, data: str):
        def REG_BINARY(data: str) -> bytes:
            return bytes.fromhex(re.sub('[^a-f0-9]+', '', data))

        def REG_SZ(data: str) -> bytes:
            return data.encode(self.codec) | esc(quoted=True) | bytes

        def REG_EXPAND_SZ(data: str):
            return REG_BINARY(data).decode('UTF-16LE').rstrip('\0').encode(self.codec)

        def REG_MULTI_SZ(data: str):
            data = REG_BINARY(data).decode('UTF-16LE').split('\0')
            for string in data:
                if string:
                    yield string.encode(self.codec)

        def REG_DWORD(data: str):
            value = int(data, 16)
            return F'0x{value:X}'.encode(self.codec)

        def REG_QWORD(data: str):
            value = int.from_bytes(REG_BINARY(data), 'little')
            return F'0x{value:X}'.encode(self.codec)

        class Missing:
            def __init__(self, name: str): self.name = name
            def __str__(self): return self.name

        REG_NONE = REG_EXPAND_SZ
        REG_DWORD_BIG_ENDIAN = Missing('REG_DWORD_BIG_ENDIAN')
        REG_LINK = Missing('REG_LINK')
        REG_RESOURCE_LIST = Missing('REG_RESOURCE_LIST')
        REG_FULL_RESOURCE_DESCRIPTOR = Missing('REG_FULL_RESOURCE_DESCRIPTOR')
        REG_RESOURCE_REQUIREMENTS_LIST = Missing('REG_RESOURCE_REQUIREMENTS_LIST')

        prefix, _, encoded = data.partition(':')

        try:
            decoder = {
                'hex(0)' : REG_NONE,
                'hex(1)' : REG_SZ,
                'hex(2)' : REG_EXPAND_SZ,
                'hex(3)' : REG_BINARY,
                'hex'    : REG_BINARY,
                'hex(4)' : REG_DWORD,
                'dword'  : REG_DWORD,
                'hex(5)' : REG_DWORD_BIG_ENDIAN,
                'hex(6)' : REG_LINK,
                'hex(7)' : REG_MULTI_SZ,
                'hex(8)' : REG_RESOURCE_LIST,
                'hex(9)' : REG_FULL_RESOURCE_DESCRIPTOR,
                'hex(a)' : REG_RESOURCE_REQUIREMENTS_LIST,
                'hex(b)' : REG_QWORD,
            }[prefix]
        except KeyError:
            decoder = REG_SZ
            encoded = data

        if isinstance(decoder, Missing):
            self.log_warn(F'Found registry type {decoder!s}; no decoder implemented.')
            return
        self.log_debug(F'decoding as {decoder.__name__}: {encoded}')
        it = decoder(encoded)
        if not inspect.isgenerator(it):
            it = (it,)
        yield from it

    def _unpack_file(self, data: bytearray):
        for codec in ('utf16', 'utf-16le', 'utf8'):
            try:
                reg = data.decode(codec).splitlines(keepends=True)
            except UnicodeError:
                continue
            if reg[0].startswith('Windows Registry Editor'):
                break
        else:
            raise ParseException
        config = WinRegFileParser()
        config.read_string(''.join(reg[1:]))
        for key in config.sections():
            self.log_debug(key)
            for value in config[key]:
                name = next(iter(shlex.split(value)))
                path = Path(key) / Path(name)
                data = config[key][value]
                decoded = list(self._decode_registry_export(data))
                if len(decoded) == 1:
                    yield UnpackResult(str(path), decoded[0])
                    continue
                for k, d in enumerate(decoded):
                    yield UnpackResult(F'{path!s}.{k}', d)

    def unpack(self, data):
        with contextlib.suppress(ParseException):
            yield from self._unpack_hive(data)
            return
        yield from self._unpack_file(data)

class wshenc (marker=True)

This unit is implemented in refinery.units.encoding.wshenc and has the following commandline Interface:

usage: wshenc [-h] [-L] [-Q] [-0] [-v] [-R] [-m]

Windows Scripting Host encoding and decoding of VBScript (VBS/VBE) and JScript (JS/JSE).

optional arguments:
  -m, --no-marker  Do not require magic marker when encoding and do not search for marker when
                   decoding.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation; Specify twice to normalize (first decode, then
                   encode).

Expand source code Browse git

class wshenc(Unit):
    """
    Windows Scripting Host encoding and decoding of VBScript (VBS/VBE) and JScript (JS/JSE).
    """

    _MARKER_INIT = RB'#@~^BINREF=='
    _MARKER_STOP = RB'BINREF==^#~@'

    _CHUNKS = (
        0x57, 0x6E, 0x7B, 0x4A, 0x4C, 0x41, 0x0B, 0x0B, 0x0B, 0x0C, 0x0C, 0x0C, 0x4A, 0x4C, 0x41,
        0x0E, 0x0E, 0x0E, 0x0F, 0x0F, 0x0F, 0x10, 0x10, 0x10, 0x11, 0x11, 0x11, 0x12, 0x12, 0x12,
        0x13, 0x13, 0x13, 0x14, 0x14, 0x14, 0x15, 0x15, 0x15, 0x16, 0x16, 0x16, 0x17, 0x17, 0x17,
        0x18, 0x18, 0x18, 0x19, 0x19, 0x19, 0x1A, 0x1A, 0x1A, 0x1B, 0x1B, 0x1B, 0x1C, 0x1C, 0x1C,
        0x1D, 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F, 0x2E, 0x2D, 0x32, 0x47, 0x75, 0x30,
        0x7A, 0x52, 0x21, 0x56, 0x60, 0x29, 0x42, 0x71, 0x5B, 0x6A, 0x5E, 0x38, 0x2F, 0x49, 0x33,
        0x26, 0x5C, 0x3D, 0x49, 0x62, 0x58, 0x41, 0x7D, 0x3A, 0x34, 0x29, 0x35, 0x32, 0x36, 0x65,
        0x5B, 0x20, 0x39, 0x76, 0x7C, 0x5C, 0x72, 0x7A, 0x56, 0x43, 0x7F, 0x73, 0x38, 0x6B, 0x66,
        0x39, 0x63, 0x4E, 0x70, 0x33, 0x45, 0x45, 0x2B, 0x6B, 0x68, 0x68, 0x62, 0x71, 0x51, 0x59,
        0x4F, 0x66, 0x78, 0x09, 0x76, 0x5E, 0x62, 0x31, 0x7D, 0x44, 0x64, 0x4A, 0x23, 0x54, 0x6D,
        0x75, 0x43, 0x71, 0x4A, 0x4C, 0x41, 0x7E, 0x3A, 0x60, 0x4A, 0x4C, 0x41, 0x5E, 0x7E, 0x53,
        0x40, 0x4C, 0x40, 0x77, 0x45, 0x42, 0x4A, 0x2C, 0x27, 0x61, 0x2A, 0x48, 0x5D, 0x74, 0x72,
        0x22, 0x27, 0x75, 0x4B, 0x37, 0x31, 0x6F, 0x44, 0x37, 0x4E, 0x79, 0x4D, 0x3B, 0x59, 0x52,
        0x4C, 0x2F, 0x22, 0x50, 0x6F, 0x54, 0x67, 0x26, 0x6A, 0x2A, 0x72, 0x47, 0x7D, 0x6A, 0x64,
        0x74, 0x39, 0x2D, 0x54, 0x7B, 0x20, 0x2B, 0x3F, 0x7F, 0x2D, 0x38, 0x2E, 0x2C, 0x77, 0x4C,
        0x30, 0x67, 0x5D, 0x6E, 0x53, 0x7E, 0x6B, 0x47, 0x6C, 0x66, 0x34, 0x6F, 0x35, 0x78, 0x79,
        0x25, 0x5D, 0x74, 0x21, 0x30, 0x43, 0x64, 0x23, 0x26, 0x4D, 0x5A, 0x76, 0x52, 0x5B, 0x25,
        0x63, 0x6C, 0x24, 0x3F, 0x48, 0x2B, 0x7B, 0x55, 0x28, 0x78, 0x70, 0x23, 0x29, 0x69, 0x41,
        0x28, 0x2E, 0x34, 0x73, 0x4C, 0x09, 0x59, 0x21, 0x2A, 0x33, 0x24, 0x44, 0x7F, 0x4E, 0x3F,
        0x6D, 0x50, 0x77, 0x55, 0x09, 0x3B, 0x53, 0x56, 0x55, 0x7C, 0x73, 0x69, 0x3A, 0x35, 0x61,
        0x5F, 0x61, 0x63, 0x65, 0x4B, 0x50, 0x46, 0x58, 0x67, 0x58, 0x3B, 0x51, 0x31, 0x57, 0x49,
        0x69, 0x22, 0x4F, 0x6C, 0x6D, 0x46, 0x5A, 0x4D, 0x68, 0x48, 0x25, 0x7C, 0x27, 0x28, 0x36,
        0x5C, 0x46, 0x70, 0x3D, 0x4A, 0x6E, 0x24, 0x32, 0x7A, 0x79, 0x41, 0x2F, 0x37, 0x3D, 0x5F,
        0x60, 0x5F, 0x4B, 0x51, 0x4F, 0x5A, 0x20, 0x42, 0x2C, 0x36, 0x65, 0x57)
    _OFFSETS = (
        0, 1, 2, 0, 1, 2, 1, 2, 2, 1, 2, 1, 0, 2, 1, 2, 0, 2, 1, 2, 0, 0, 1, 2, 2, 1, 0, 2, 1, 2, 2, 1,
        0, 0, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 2, 0, 2, 1, 0, 2, 1, 2, 0, 0, 1, 2, 2, 0, 0, 1, 2, 0, 2, 1)
    _ENCODER = {
        0x09 : [0x37, 0x69, 0x64], 0x0B : [0x0B, 0x0B, 0x0B], 0x0C : [0x0C, 0x0C, 0x0C],
        0x0E : [0x0E, 0x0E, 0x0E], 0x0F : [0x0F, 0x0F, 0x0F], 0x10 : [0x10, 0x10, 0x10],
        0x11 : [0x11, 0x11, 0x11], 0x12 : [0x12, 0x12, 0x12], 0x13 : [0x13, 0x13, 0x13],
        0x14 : [0x14, 0x14, 0x14], 0x15 : [0x15, 0x15, 0x15], 0x16 : [0x16, 0x16, 0x16],
        0x17 : [0x17, 0x17, 0x17], 0x18 : [0x18, 0x18, 0x18], 0x19 : [0x19, 0x19, 0x19],
        0x1A : [0x1A, 0x1A, 0x1A], 0x1B : [0x1B, 0x1B, 0x1B], 0x1C : [0x1C, 0x1C, 0x1C],
        0x1D : [0x1D, 0x1D, 0x1D], 0x1E : [0x1E, 0x1E, 0x1E], 0x1F : [0x1F, 0x1F, 0x1F],
        0x20 : [0x7E, 0x2C, 0x50], 0x21 : [0x5A, 0x65, 0x22], 0x22 : [0x45, 0x72, 0x4A],
        0x23 : [0x3A, 0x5B, 0x61], 0x24 : [0x79, 0x66, 0x5E], 0x25 : [0x59, 0x75, 0x5D],
        0x26 : [0x27, 0x4C, 0x5B], 0x27 : [0x76, 0x45, 0x42], 0x28 : [0x63, 0x76, 0x60],
        0x29 : [0x62, 0x2A, 0x23], 0x2A : [0x4D, 0x43, 0x65], 0x2B : [0x51, 0x33, 0x5F],
        0x2C : [0x53, 0x42, 0x7E], 0x2D : [0x52, 0x20, 0x4F], 0x2E : [0x20, 0x63, 0x52],
        0x2F : [0x26, 0x4A, 0x7A], 0x30 : [0x54, 0x5A, 0x21], 0x31 : [0x71, 0x38, 0x46],
        0x32 : [0x2B, 0x79, 0x20], 0x33 : [0x66, 0x32, 0x26], 0x34 : [0x2A, 0x57, 0x63],
        0x35 : [0x58, 0x6C, 0x2A], 0x36 : [0x7F, 0x2B, 0x76], 0x37 : [0x7B, 0x46, 0x47],
        0x38 : [0x30, 0x52, 0x25], 0x39 : [0x31, 0x4F, 0x2C], 0x3A : [0x6C, 0x3D, 0x29],
        0x3B : [0x49, 0x70, 0x69], 0x3D : [0x78, 0x7B, 0x27], 0x3F : [0x5F, 0x51, 0x67],
        0x40 : [0x40, None, 0x40], 0x41 : [0x29, 0x7A, 0x62], 0x42 : [0x24, 0x7E, 0x41],
        0x43 : [0x2F, 0x3B, 0x5A], 0x44 : [0x39, 0x47, 0x66], 0x45 : [0x33, 0x41, 0x32],
        0x46 : [0x6F, 0x77, 0x73], 0x47 : [0x21, 0x56, 0x4D], 0x48 : [0x75, 0x5F, 0x43],
        0x49 : [0x28, 0x26, 0x71], 0x4A : [0x42, 0x78, 0x39], 0x4B : [0x46, 0x6E, 0x7C],
        0x4C : [0x4A, 0x64, 0x53], 0x4D : [0x5C, 0x74, 0x48], 0x4E : [0x48, 0x67, 0x31],
        0x4F : [0x36, 0x7D, 0x72], 0x50 : [0x4B, 0x68, 0x6E], 0x51 : [0x7D, 0x35, 0x70],
        0x52 : [0x5D, 0x22, 0x49], 0x53 : [0x6A, 0x55, 0x3F], 0x54 : [0x50, 0x3A, 0x4B],
        0x55 : [0x69, 0x60, 0x6A], 0x56 : [0x23, 0x6A, 0x2E], 0x57 : [0x09, 0x71, 0x7F],
        0x58 : [0x70, 0x6F, 0x28], 0x59 : [0x65, 0x49, 0x35], 0x5A : [0x74, 0x5C, 0x7D],
        0x5B : [0x2C, 0x5D, 0x24], 0x5C : [0x77, 0x27, 0x2D], 0x5D : [0x44, 0x59, 0x54],
        0x5E : [0x3F, 0x25, 0x37], 0x5F : [0x6D, 0x7C, 0x7B], 0x60 : [0x7C, 0x23, 0x3D],
        0x61 : [0x43, 0x6D, 0x6C], 0x62 : [0x38, 0x28, 0x34], 0x63 : [0x5E, 0x31, 0x6D],
        0x64 : [0x5B, 0x39, 0x4E], 0x65 : [0x6E, 0x7F, 0x2B], 0x66 : [0x57, 0x36, 0x30],
        0x67 : [0x4C, 0x54, 0x6F], 0x68 : [0x34, 0x34, 0x74], 0x69 : [0x72, 0x62, 0x6B],
        0x6A : [0x25, 0x4E, 0x4C], 0x6B : [0x56, 0x30, 0x33], 0x6C : [0x73, 0x5E, 0x56],
        0x6D : [0x68, 0x73, 0x3A], 0x6E : [0x55, 0x09, 0x78], 0x6F : [0x47, 0x4B, 0x57],
        0x70 : [0x32, 0x61, 0x77], 0x71 : [0x35, 0x24, 0x3B], 0x72 : [0x2E, 0x4D, 0x44],
        0x73 : [0x64, 0x6B, 0x2F], 0x74 : [0x4F, 0x44, 0x59], 0x75 : [0x3B, 0x21, 0x45],
        0x76 : [0x2D, 0x37, 0x5C], 0x77 : [0x41, 0x53, 0x68], 0x78 : [0x61, 0x58, 0x36],
        0x79 : [0x7A, 0x48, 0x58], 0x7A : [0x22, 0x2E, 0x79], 0x7B : [0x60, 0x50, 0x09],
        0x7C : [0x6B, 0x2D, 0x75], 0x7D : [0x4E, 0x29, 0x38], 0x7E : [0x3D, 0x3F, 0x55],
        0x7F : [0x67, 0x2F, 0x51]
    }

    _ESCAPE = {
        0x40: B'@$',
        0x3C: B'@!',
        0x3E: B'@*',
        0x0D: B'@#',
        0x0A: B'@&',
    }

    _UNESCAPE = {
        B'@$': B'@',
        B'@!': B'<',
        B'@*': B'>',
        B'@#': B'\r',
        B'@&': B'\n',
    }

    def __init__(
        self,
        marker: Arg.Switch('-m', '--no-marker', off=True, help=(
            'Do not require magic marker when encoding and do not search for '
            'marker when decoding.')
        ) = True
    ):
        super().__init__(marker=marker)

    @classmethod
    def _chunk(cls, byte, index):
        k = byte - 9
        c = cls._CHUNKS[k * 3 : k * 3 + 3]
        return c[cls._OFFSETS[index % 64]]

    def _escape(self, iterable):
        escapes = bytes(self._ESCAPE)
        if self.args.marker:
            yield from self._MARKER_INIT
        for byte in iterable:
            if byte in escapes:
                yield from self._ESCAPE[byte]
            else:
                yield byte
        if self.args.marker:
            yield from self._MARKER_STOP

    def _unescape(self, data):
        def unescaper(m): return self._UNESCAPE[m[0]]
        return re.sub(RB'@[$!*#&]', unescaper, data)

    @classmethod
    def _decoded(cls, data):
        index = -1
        for byte in data:
            if byte < 128:
                index += 1
            if (byte == 9 or 31 < byte < 128) and byte != 60 and byte != 62 and byte != 64:
                byte = cls._chunk(byte, index)
            yield byte

    @classmethod
    def _encoded(cls, data):
        for i, byte in enumerate(data):
            try:
                sequence = cls._ENCODER[byte]
            except KeyError:
                yield byte
            else:
                offset = cls._OFFSETS[i % 0x40]
                yield sequence[offset]

    def reverse(self, data):
        return bytearray(self._escape(self._encoded(data)))

    def process(self, data):
        if self.args.marker:
            match = formats.wshenc.search(data)
            if not match:
                raise ValueError('Encoded script marker was not found.')
            data = match[0][12:-12]
        return bytearray(self._decoded(self._unescape(data)))

class xfcc (variable='count', relative=False)

This unit is implemented in refinery.units.meta.xfcc and has the following commandline Interface:

usage: xfcc [-h] [-L] [-Q] [-0] [-v] [-r] [variable]

The cross frame chunk count unit! It computes the number of times a chunk occurs across several
frames of input. It consumes all frames in the current and counts the number of times each item
occurs. It converts a frame tree of depth 2 into a new frame tree of depth 2 where the parent of
every leaf has this leaf as its only child. The leaves of this tree have been enriched with a
meta variable containing the number of times the corresponding chunk has occurred in the input
frame tree.

positional arguments:
  variable        The variable which is used as the accumulator

optional arguments:
  -r, --relative  Normalize the accumulator to a number between 0 and 1.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Allow partial results as output.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class xfcc(Unit):
    """
    The cross frame chunk count unit! It computes the number of times a chunk occurs across several frames
    of input. It consumes all frames in the current and counts the number of times each item occurs. It
    converts a frame tree of depth 2 into a new frame tree of depth 2 where the parent of every leaf has
    this leaf as its only child. The leaves of this tree have been enriched with a meta variable containing
    the number of times the corresponding chunk has occurred in the input frame tree.
    """
    def __init__(
        self,
        variable: Arg(help='The variable which is used as the accumulator') = 'count',
        relative: Arg.Switch('-r', help='Normalize the accumulator to a number between 0 and 1.') = False
    ):
        super().__init__(variable=variable, relative=relative)
        self._trunk = None
        self._store: Dict[Chunk, int] = defaultdict(int)

    def finish(self):
        vn = self.args.variable
        rc = self.args.relative
        if rc and self._store:
            maximum = max(self._store.values())
        for index, (chunk, count) in enumerate(self._store.items()):
            if rc:
                count /= maximum
            chunk.path[-2] = 0
            chunk.path[-1] = index
            chunk.meta[vn] = count
            yield chunk
        self._store.clear()

    def _getcount(self, chunk):
        try:
            count = int(chunk.meta[self.args.variable])
        except (AttributeError, KeyError, TypeError):
            return 1
        else:
            return count

    def filter(self, chunks: Iterable[Chunk]):
        it = iter(chunks)
        try:
            head = next(it)
        except StopIteration:
            return
        if len(head.path) < 2:
            self.log_warn(F'the current frame is nested {len(head.path)} layers deep, at least two layers are required.')
            yield head
            yield from it
            return
        trunk = head.path[:-2]
        store = self._store
        if trunk != self._trunk:
            yield from self.finish()
            self._trunk = trunk
        store[head] += self._getcount(head)
        for chunk in it:
            store[chunk] += self._getcount(chunk)

class xj0 (key=None, raw=False, all=False)

This unit is implemented in refinery.units.formats.json and has the following commandline Interface:

usage: xj0 [-h] [-L] [-Q] [-0] [-v] [-r | -a] [key]

Extracts a single field from a JSON document at depth 0. By default, the unit applies a heuristic
to extract remaining fields as metadata: String values are extracted only if they do not exceed
80 characters in length and do not contain any line breaks. Floating-point, integer, boolean
values, and lists of the latter are also extracted.

positional arguments:
  key            Optional key of a value to become the main body of the chunk.

optional arguments:
  -r, --raw      Do not extract any other fields as metadata.
  -a, --all      Extract all other fields as metadata.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class xj0(Unit):
    """
    Extracts a single field from a JSON document at depth 0. By default, the unit applies a heuristic to
    extract remaining fields as metadata: String values are extracted only if they do not exceed 80
    characters in length and do not contain any line breaks. Floating-point, integer, boolean values, and
    lists of the latter are also extracted.
    """
    def __init__(
        self,
        key: Unit.Arg.Binary(help='Optional key of a value to become the main body of the chunk.') = None,
        raw: Unit.Arg.Switch('-r', group='META', help='Do not extract any other fields as metadata.') = False,
        all: Unit.Arg.Switch('-a', group='META', help='Extract all other fields as metadata.') = False
    ):
        super().__init__(key=key, raw=raw, all=all)

    def process(self, data):

        def acceptable(key, value, inside_list=False):
            if not is_valid_variable_name(key):
                return False
            if isinstance(value, dict):
                return False
            if isinstance(value, (float, int, bool)):
                return True
            if inside_list:
                return False
            if isinstance(value, list):
                return all(acceptable(key, t, True) for t in value)
            if isinstance(value, str):
                if self.args.all:
                    return True
                return len(value) in range(1, 80) and '\n' not in value

        doc: dict = json.loads(data)
        if not isinstance(doc, dict):
            raise ValueError('The input must be a JSON dictionary.')
        key = self.args.key
        result = key and doc.pop(key.decode(self.codec), '').encode(self.codec)
        if self.args.raw:
            return result
        else:
            return self.labelled(result, **{
                key: value for key, value in doc.items() if acceptable(key, value)
            })

class xjl

This unit is implemented in refinery.units.formats.json and has the following commandline Interface:

usage: xjl [-h] [-L] [-Q] [-0] [-v]

Returns all JSON elements from a JSON iterable as individual outputs.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class xjl(Unit):
    """
    Returns all JSON elements from a JSON iterable as individual outputs.
    """

    def process(self, data):
        try:
            doc: Union[list, dict] = json.loads(data)
        except Exception:
            from refinery.units.pattern.carve_json import carve_json
            doc = data | carve_json | json.loads
        try:
            it = doc.values()
        except AttributeError:
            it = doc
        for item in it:
            yield json.dumps(item, indent=4).encode(self.codec)

class xkey (range=slice(1, 32, None))

This unit is implemented in refinery.units.misc.xkey and has the following commandline Interface:

usage: xkey [-h] [-L] [-Q] [-0] [-v] [start:end:step]

The unit expects encrypted input which was encrypted byte-wise with a polyalphabetic key, and
where the plaintext also has one letter that occurs with overwhelming frequency. This is often
the case for the zero byte in binary formats such as PE files, and the space character in text
files. Based on this assumption, the unit computes the most likely key. This can be useful to
decrypt PE and uncompressed text files that were encrypted byte-wise using a short key.

positional arguments:
  start:end:step  range of length values to try in Python slice syntax, the default is 1:32.

generic options:
  -h, --help      Show this help message and exit.
  -L, --lenient   Allow partial results as output.
  -Q, --quiet     Disables all log output.
  -0, --devnull   Do not produce any output.
  -v, --verbose   Specify up to two times to increase log level.

Expand source code Browse git

class xkey(Unit):
    """
    The unit expects encrypted input which was encrypted byte-wise with a polyalphabetic key, and
    where the plaintext also has one letter that occurs with overwhelming frequency. This is often
    the case for the zero byte in binary formats such as PE files, and the space character in text
    files. Based on this assumption, the unit computes the most likely key. This can be useful to
    decrypt PE and uncompressed text files that were encrypted byte-wise using a short key.
    """
    def __init__(
        self,
        range: Arg.Bounds(help='range of length values to try in Python slice syntax, the default is {default}.') = slice(1, 32),
    ):
        super().__init__(range=range)

    def process(self, data: bytearray):
        score = 0
        guess = None
        bounds: slice = self.args.range
        data = memoryview(data)

        n = len(data)

        start = bounds.start or 1
        stop = min(bounds.stop or n, n)

        if bounds.step is not None:
            step = bounds.step
            if bounds.start is None:
                start *= step
        else:
            step = 1

        self.log_debug(F'received input range [{bounds.start}:{bounds.stop}:{bounds.step}], using [{start}:{stop}:{step}]')

        for _count in range(start, stop, step):
            _guess = [Counter(data[j::_count]).most_common(1)[0] for j in range(_count)]
            _score = sum(letter_count for _, letter_count in _guess) / n

            # This scaling accounts for the smaller probability of larger keys. The scaling power is arbitrary and has
            # been chosen as 5 based on in-the-wild examples. For illustration, consider that a key of length equal to
            # the input can be chosen such that the input is annihilated entirely, so that would always yield the best
            # score. However, we are looking for an annihilating sequence of relatively small length.
            _score = _score * ((n - _count) / (n - 1)) ** 5

            logmsg = F'got score {_score * 100:5.2f}% for length {_count}'
            if _score > score:
                self.log_info(logmsg)
                score = _score
                guess = bytearray(value for value, _ in _guess)
            else:
                self.log_debug(logmsg)

        return guess

class xlmdeobf (extract_only=False, sort_formulas=False, with_ms_excel=False, day=-1, output_formula_format='CELL:[[CELL-ADDR]], [[STATUS]], [[INT-FORMULA]]', extract_formula_format='CELL:[[CELL-ADDR]], [[CELL-FORMULA]], [[CELL-VALUE]]', no_indent=False, start_point='', password='', output_level=0, timeout=0)

This unit is implemented in refinery.units.formats.office.xlmdeobf and has the following commandline Interface:

usage: xlmdeobf [-h] [-L] [-Q] [-0] [-v] [-x] [-s] [-X] [-d N] [-O FMT] [-E FMT] [-I] [-c CELL]
                [-p PASSWORD] [-o N] [-t N]

Wrapper around XLMMacroDeobfuscator to decode obfuscated Excel v4.0 (XLM) macros.

optional arguments:
  -x, --extract-only        Only extract cells without any emulation.
  -s, --sort-formulas       Sort extracted formulas based on their cell address (implies -x).
  -X, --with-ms-excel       Use MS Excel to process XLS files.
  -d, --day N               Specify the day of month
  -O, --output-format FMT   Specify the format for output formulas (using [[CELL-ADDR]], [[INT-
                            FORMULA]], and [[STATUS]])
  -E, --extract-format FMT  Specify the format for extracted formulas (using [[CELL-ADDR]],
                            [[CELL-FORMULA]], and [[CELL-VALUE]])
  -I, --no-indent           Do not show indent before formulas
  -c, --start-point CELL    Start interpretation from a specific cell address
  -p, --password PASSWORD   Password to decrypt the protected document
  -o, --output-level N      Set the level of details to be shown (0:all commands, 1: commands no
                            jump 2:important commands 3:strings in important commands).
  -t, --timeout N           Stop emulation after N seconds (0: not interruption N>0: stop
                            emulation after N seconds)

generic options:
  -h, --help                Show this help message and exit.
  -L, --lenient             Allow partial results as output.
  -Q, --quiet               Disables all log output.
  -0, --devnull             Do not produce any output.
  -v, --verbose             Specify up to two times to increase log level.

Expand source code Browse git

class xlmdeobf(Unit):
    """
    Wrapper around XLMMacroDeobfuscator to decode obfuscated Excel v4.0 (XLM) macros.
    """

    def __init__(
        self,
        extract_only: Unit.Arg.Switch(
            '-x', help='Only extract cells without any emulation.'
        ) = False,
        sort_formulas: Unit.Arg.Switch(
            '-s', '--sort-formulas',
            help='Sort extracted formulas based on their cell address (implies -x).',
        ) = False,
        with_ms_excel: Unit.Arg.Switch(
            '-X', '--with-ms-excel', help='Use MS Excel to process XLS files.'
        ) = False,
        day: Unit.Arg.Number(
            '-d',
            '--day',
            help='Specify the day of month',
        ) = -1,
        output_formula_format: Unit.Arg(
            '-O', '--output-format',
            type=str,
            metavar='FMT',
            help='Specify the format for output formulas (using [[CELL-ADDR]], [[INT-FORMULA]], and [[STATUS]])',
        ) = 'CELL:[[CELL-ADDR]], [[STATUS]], [[INT-FORMULA]]',
        extract_formula_format: Unit.Arg(
            '-E', '--extract-format',
            metavar='FMT',
            type=str,
            help='Specify the format for extracted formulas (using [[CELL-ADDR]], [[CELL-FORMULA]], and [[CELL-VALUE]])',
        ) = 'CELL:[[CELL-ADDR]], [[CELL-FORMULA]], [[CELL-VALUE]]',
        no_indent: Unit.Arg.Switch(
            '-I', '--no-indent',
            help='Do not show indent before formulas',
        ) = False,
        start_point: Unit.Arg(
            '-c', '--start-point',
            type=str,
            help='Start interpretation from a specific cell address',
            metavar='CELL',
        ) = '',
        password: Unit.Arg(
            '-p',
            '--password',
            type=str,
            help='Password to decrypt the protected document',
        ) = '',
        output_level: Unit.Arg.Number(
            '-o',
            '--output-level',
            help=(
                'Set the level of details to be shown (0:all commands, 1: commands no jump 2:important '
                'commands 3:strings in important commands).'
            ),
        ) = 0,
        timeout: Unit.Arg.Number(
            '-t',
            '--timeout',
            help='Stop emulation after N seconds (0: not interruption N>0: stop emulation after N seconds)',
        ) = 0,
    ):
        extract_only = sort_formulas or extract_only
        self.superinit(super(), **vars())

    @Unit.Requires('XLMMacroDeobfuscator', 'formats', 'office')
    def _process_file():
        from XLMMacroDeobfuscator.configs import settings
        settings.SILENT = True
        from XLMMacroDeobfuscator.deobfuscator import process_file
        return process_file

    def process(self, data: bytearray):
        with VirtualFileSystem() as vfs, NoLogging():
            result = self._process_file(
                file=vfs.new(data),
                noninteractive=True,
                return_deobfuscated=True,
                extract_only=self.args.extract_only,
                silent=True,
                sort_formulas=self.args.sort_formulas,
                defined_names=False,
                with_ms_excel=self.args.with_ms_excel,
                start_with_shell=False,
                day=self.args.day,
                output_formula_format=self.args.output_formula_format,
                extract_formula_format=self.args.extract_formula_format,
                no_indent=self.args.no_indent,
                start_point=self.args.start_point,
                password=self.args.password,
                output_level=self.args.output_level,
                timeout=self.args.timeout,
            )
        return '\n'.join(result).encode(self.codec)

class xlxtr (*references)

This unit is implemented in refinery.units.formats.office.xlxtr and has the following commandline Interface:

usage: xlxtr [-h] [-L] [-Q] [-0] [-v] [reference [reference ...]]

Extract data from Microsoft Excel documents, both Legacy and new XML type documents. A sheet
reference is of the form B1 or 1.2, both specifying the first cell of the second column. A cell
range can be specified as B1:C12, or 1.2:C12, or 1.2:12.3. Finally, the unit will always refer to
the first sheet in the document and to change this, specify the sheet name or index separated by
a hashtag, i.e. sheet#B1:C12 or 1#B1:C12. Note that indices are 1-based. To get all elements of
one sheet, use sheet#. The unit If parsing a sheet reference fails, the script will assume that
the given reference specifies a sheet.

positional arguments:
  reference      A sheet reference to be extracted. If no sheet references are given, the unit
                 lists all sheet names.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class xlxtr(Unit):
    """
    Extract data from Microsoft Excel documents, both Legacy and new XML type documents. A sheet reference is of the form `B1` or `1.2`,
    both specifying the first cell of the second column. A cell range can be specified as `B1:C12`, or `1.2:C12`, or `1.2:12.3`. Finally,
    the unit will always refer to the first sheet in the document and to change this, specify the sheet name or index separated by a
    hashtag, i.e. `sheet#B1:C12` or `1#B1:C12`. Note that indices are 1-based. To get all elements of one sheet, use `sheet#`. The unit
    If parsing a sheet reference fails, the script will assume that the given reference specifies a sheet.
    """
    def __init__(self, *references: Arg(metavar='reference', type=SheetReference, help=(
        'A sheet reference to be extracted. '
        'If no sheet references are given, the unit lists all sheet names.'
    ))):
        if not references:
            references = [SheetReference('*')]
        super().__init__(references=references)

    @Unit.Requires('xlrd2', 'formats', 'office', 'extended')
    def _xlrd():
        import xlrd2
        return xlrd2

    @Unit.Requires('openpyxl', 'formats', 'office', 'extended')
    def _openpyxl():
        import openpyxl
        return openpyxl

    @Unit.Requires('pyxlsb2', 'formats', 'office', 'extended')
    def _pyxlsb2():
        import pyxlsb2
        return pyxlsb2

    def _rcmatch(self, sheet_index, sheet_name, row, col):
        assert row > 0
        assert col > 0
        if not self.args.references:
            return True
        for ref in self.args.references:
            ref: SheetReference
            if not ref.match(sheet_index, sheet_name):
                continue
            if (row, col) in ref:
                return True
        else:
            return False

    def _get_value(self, sheet_index, sheet, callable, row, col):
        if col <= 0 or row <= 0:
            raise ValueError(F'invalid cell reference ({row}, {col}) - indices must be positive numbers')
        if not self._rcmatch(sheet_index, sheet, row, col):
            return
        try:
            value = callable(row - 1, col - 1)
        except IndexError:
            return
        if not value:
            return
        if isinstance(value, float):
            if float(int(value)) == value:
                value = int(value)
        yield self.labelled(
            str(value).encode(self.codec),
            row=row,
            col=col,
            ref=_rc2ref(row, col),
            sheet=sheet
        )

    def _process_pyxlsb2(self, data):
        with self._pyxlsb2.open_workbook(MemoryFile(data)) as wb:
            for ref in self.args.references:
                ref: SheetReference
                for k, rec in enumerate(wb.sheets):
                    rec: SheetRecord
                    self.log_info(rec)
                    name = rec.name
                    if not ref.match(k, name):
                        continue
                    sheet = wb.get_sheet_by_name(name)
                    rows = list(sheet.rows())
                    nrows = len(rows)
                    ncols = max((len(r) for r in rows), default=0)
                    for row, col in ref.cells(nrows, ncols):
                        def get(row, col):
                            return rows[row][col].v
                        yield from self._get_value(k, name, get, row, col)

    def _process_xlrd(self, data):
        with io.StringIO() as logfile:
            vb = max(self.log_level.verbosity - 1, 0)
            wb = self._xlrd.open_workbook(file_contents=data, logfile=logfile, verbosity=vb, on_demand=True)
            logfile.seek(0)
            for entry in logfile:
                entry = entry.strip()
                if re.search(R'^[A-Z]+:', entry) or '***' in entry:
                    self.log_info(entry)
        for ref in self.args.references:
            ref: SheetReference
            for k, name in enumerate(wb.sheet_names()):
                if not ref.match(k, name):
                    continue
                sheet = wb.sheet_by_name(name)
                self.log_info(F'iterating {sheet.ncols} columns and {sheet.nrows} rows')
                for row, col in ref.cells(sheet.nrows, sheet.ncols):
                    yield from self._get_value(k, name, sheet.cell_value, row, col)

    def _process_openpyxl(self, data):
        with NoLogging():
            workbook = self._openpyxl.load_workbook(MemoryFile(data), read_only=True)
        for ref in self.args.references:
            ref: SheetReference
            for k, name in enumerate(workbook.sheetnames):
                if not ref.match(k, name):
                    continue
                sheet = workbook[name]
                with NoLogging():
                    cells = [row for row in sheet.iter_rows(values_only=True)]
                nrows = len(cells)
                ncols = max((len(row) for row in cells), default=0)
                for row, col in ref.cells(nrows, ncols):
                    yield from self._get_value(k, name, lambda r, c: cells[r][c], row, col)

    def process(self, data):
        last_error = None
        for name, processor in (
            ('openpyxl', self._process_openpyxl),
            ('xlrd', self._process_xlrd),
            ('pyxlsb2', self._process_pyxlsb2),
        ):
            try:
                yield from processor(data)
            except Exception as e:
                last_error = e
                self.log_debug(F'failed processing with {name}: {e!s}')
            else:
                break
        else:
            raise last_error

class xor (argument, bigendian=False, blocksize=None)

This unit is implemented in refinery.units.blockwise.xor and has the following commandline Interface:

usage: xor [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] argument

Form the exclusive or of the input data with the given argument.

positional arguments:
  argument           A single numeric expression which provides the right argument to the
                     operation, where the left argument is each block in the input data. This
                     argument can also contain a sequence of bytes which is then split into
                     blocks of the same size as the input data and used cyclically.

optional arguments:
  -E, --bigendian    Read chunks in big endian.
  -B, --blocksize N  The size of each block in bytes, default is 1.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.

Expand source code Browse git

class xor(BinaryOperationWithAutoBlockAdjustment):
    """
    Form the exclusive or of the input data with the given argument.
    """
    @staticmethod
    def operate(a, b): return a ^ b
    @staticmethod
    def inplace(a, b): a ^= b

    def _fastblock(self, data):
        try:
            return super()._fastblock(data)
        except FastBlockError as E:
            try:
                from Cryptodome.Util.strxor import strxor
            except ModuleNotFoundError:
                raise E
            else:
                from itertools import islice
                size = len(data)
                arg0 = self._normalize_argument(*self._argument_parse_hook(self.args.argument[0]))
                take = len(data) // self.blocksize + 1
                argb = self.unchunk(islice(arg0, take))
                del argb[size:]
                return strxor(data, argb)

class xt (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xt and has the following commandline Interface:

usage: xt [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD]
          [path [path ...]]

Extract files from archives. The unit tries to identify the archive format and use the correct
extractor.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xt(ArchiveUnit):
    """
    Extract files from archives. The unit tries to identify the archive format and use the
    correct extractor.
    """
    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        out = False
        for engine in cls.handlers():
            engine_verdict = engine.handles(data)
            if engine_verdict is True:
                return True
            if engine_verdict is None:
                out = None
        return out

    @staticmethod
    def handlers():
        """
        Returns all archive handlers supported by the unit.
        """
        from refinery.units.formats.office.xtone import xtone
        yield xtone
        from refinery.units.formats.archive.xtgz import xtgz
        yield xtgz
        from refinery.units.formats.email import xtmail
        yield xtmail
        from refinery.units.formats.pdf import xtpdf
        yield xtpdf
        from refinery.units.formats.archive.xtasar import xtasar
        yield xtasar
        from refinery.units.formats.office.xtrtf import xtrtf
        yield xtrtf
        from refinery.units.formats.archive.xtzpaq import xtzpaq
        yield xtzpaq
        from refinery.units.formats.pe.dotnet.dnsfx import dnsfx
        yield dnsfx
        from refinery.units.formats.archive.xtnsis import xtnsis
        yield xtnsis
        from refinery.units.formats.archive.xtnode import xtnode
        yield xtnode
        from refinery.units.formats.archive.xtace import xtace
        yield xtace
        from refinery.units.formats.archive.xtcab import xtcab
        yield xtcab
        from refinery.units.formats.archive.xtcpio import xtcpio
        yield xtcpio
        from refinery.units.formats.archive.xtiso import xtiso
        yield xtiso
        from refinery.units.formats.archive.xtpyi import xtpyi
        yield xtpyi
        from refinery.units.formats.archive.xttar import xttar
        yield xttar
        from refinery.units.formats.archive.xtiss import xtiss
        yield xtiss
        from refinery.units.formats.archive.xtzip import xtzip
        yield xtzip
        from refinery.units.formats.archive.xt7z import xt7z
        yield xt7z
        from refinery.units.formats.msi import xtmsi
        yield xtmsi
        from refinery.units.formats.archive.xtmacho import xtmacho
        yield xtmacho
        from refinery.units.formats.archive.xtnuitka import xtnuitka
        yield xtnuitka
        from refinery.units.formats.office.xtdoc import xtdoc
        yield xtdoc
        from refinery.units.formats.json import xtjson
        yield xtjson
        from refinery.units.formats.exe.vsect import vsect
        yield vsect

    def unpack(self, data):
        fallback: List[Type[ArchiveUnit]] = []
        errors = {}
        pos_args = self.args.paths
        key_args = dict(
            list=self.args.list,
            path=self.args.path,
            date=self.args.date,
            join_path=self.args.join,
            drop_path=self.args.drop,
        )
        if self.args.pwd:
            key_args.update(pwd=self.args.pwd)
        if self.args.regex:
            key_args.update(regex=self.args.regex)

        class unpacker:
            unit = self

            def __init__(self, handler: Type[ArchiveUnit], fallback: bool):
                self.success = False
                self.handler = handler
                self.fallback = fallback

            def __iter__(self):
                handler = self.handler
                if self.fallback:
                    verdict = True
                else:
                    verdict = handler.handles(data)
                if verdict is False:
                    self.unit.log_info(F'rejected: {handler.name}')
                elif verdict is True:
                    if not self.fallback:
                        self.unit.log_info(F'accepted: {handler.name}')
                    try:
                        unit = handler(*pos_args, **key_args)
                        unit.args.lenient = self.unit.args.lenient
                        unit.args.quiet = self.unit.args.quiet
                    except TypeError as error:
                        self.unit.log_debug('handler construction failed:', error)
                        return
                    try:
                        for item in unit.unpack(data):
                            item.get_data()
                            yield item
                    except Exception as error:
                        if not self.fallback:
                            errors[handler.name] = error
                        if isinstance(error, MultipleArchives):
                            self.unit.log_warn(error)
                        else:
                            self.unit.log_debug('handler unpacking failed:', error)
                    else:
                        self.success = True
                elif verdict is None:
                    fallback.append(handler)

        for handler in self.handlers():
            self._custom_path_separator = handler._custom_path_separator
            it = unpacker(handler, fallback=False)
            yield from it
            if it.success:
                return

        self.log_debug('fallback order:', lambda: ', '.join(h.name for h in fallback))

        for handler in fallback:
            it = unpacker(handler, fallback=True)
            yield from it
            if it.success:
                return

        if not errors:
            raise ValueError('input data did not match any known archive format')
        for name, error in errors.items():
            self.log_info(F'error when trying to unpack with {name}:', error)
        raise RefineryException('none of the available unpackers could handle this data')

Static methods

def handlers()

Returns all archive handlers supported by the unit.

Expand source code Browse git

@staticmethod
def handlers():
    """
    Returns all archive handlers supported by the unit.
    """
    from refinery.units.formats.office.xtone import xtone
    yield xtone
    from refinery.units.formats.archive.xtgz import xtgz
    yield xtgz
    from refinery.units.formats.email import xtmail
    yield xtmail
    from refinery.units.formats.pdf import xtpdf
    yield xtpdf
    from refinery.units.formats.archive.xtasar import xtasar
    yield xtasar
    from refinery.units.formats.office.xtrtf import xtrtf
    yield xtrtf
    from refinery.units.formats.archive.xtzpaq import xtzpaq
    yield xtzpaq
    from refinery.units.formats.pe.dotnet.dnsfx import dnsfx
    yield dnsfx
    from refinery.units.formats.archive.xtnsis import xtnsis
    yield xtnsis
    from refinery.units.formats.archive.xtnode import xtnode
    yield xtnode
    from refinery.units.formats.archive.xtace import xtace
    yield xtace
    from refinery.units.formats.archive.xtcab import xtcab
    yield xtcab
    from refinery.units.formats.archive.xtcpio import xtcpio
    yield xtcpio
    from refinery.units.formats.archive.xtiso import xtiso
    yield xtiso
    from refinery.units.formats.archive.xtpyi import xtpyi
    yield xtpyi
    from refinery.units.formats.archive.xttar import xttar
    yield xttar
    from refinery.units.formats.archive.xtiss import xtiss
    yield xtiss
    from refinery.units.formats.archive.xtzip import xtzip
    yield xtzip
    from refinery.units.formats.archive.xt7z import xt7z
    yield xt7z
    from refinery.units.formats.msi import xtmsi
    yield xtmsi
    from refinery.units.formats.archive.xtmacho import xtmacho
    yield xtmacho
    from refinery.units.formats.archive.xtnuitka import xtnuitka
    yield xtnuitka
    from refinery.units.formats.office.xtdoc import xtdoc
    yield xtdoc
    from refinery.units.formats.json import xtjson
    yield xtjson
    from refinery.units.formats.exe.vsect import vsect
    yield vsect

class xt7z (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xt7z and has the following commandline Interface:

usage: xt7z [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD]
            [path [path ...]]

Extract files from a 7zip archive.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xt7z(ArchiveUnit):
    """
    Extract files from a 7zip archive.
    """
    @ArchiveUnit.Requires('py7zr', 'arc', 'default', 'extended')
    def _py7zr():
        import py7zr
        import py7zr.exceptions
        return py7zr

    def unpack(self, data: bytearray):
        for match in re.finditer(re.escape(B'7z\xBC\xAF\x27\x1C'), data):
            start = match.start()
            if start != 0:
                self.log_info(F'found a header at offset 0x{start:X}, trying to extract from there.')
            try:
                yield from self._unpack_from(data, start)
            except self._py7zr.Bad7zFile:
                continue
            else:
                break

    def _unpack_from(self, data: bytearray, zp: int = 0):
        def mk7z(**keywords):
            return self._py7zr.SevenZipFile(MemoryFile(mv[zp:]), **keywords)

        pwd = self.args.pwd
        mv = memoryview(data)

        if pwd:
            try:
                archive = mk7z(password=pwd.decode(self.codec))
            except self._py7zr.Bad7zFile:
                raise ValueError('corrupt archive; the password is likely invalid.')
        else:
            def passwords():
                yield None
                yield from self._COMMON_PASSWORDS
            for pwd in passwords():
                try:
                    archive = mk7z(password=pwd)
                    problem = archive.testzip()
                except self._py7zr.PasswordRequired:
                    problem = True
                except self._py7zr.UnsupportedCompressionMethodError as E:
                    raise ValueError(E.message)
                except self._py7zr.exceptions.InternalError:
                    # ignore internal errors during testzip
                    break
                except SystemError:
                    problem = True
                except Exception:
                    if pwd is None:
                        raise
                    problem = True
                if not problem:
                    break
                if pwd is not None:
                    self.log_debug(F'trying password: {pwd}')
            else:
                raise ValueError('a password is required and none of the default passwords worked.')

        for info in archive.list():
            def extract(archive: SevenZipFile = archive, info: FileInfo = info):
                archive.reset()
                return archive.read([info.filename]).get(info.filename).read()
            if info.is_directory:
                continue
            yield self._pack(info.filename, info.creationtime, extract, crc32=info.crc32)

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        return B'7z\xBC\xAF\x27\x1C' in data

class xtace (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xtace and has the following commandline Interface:

usage: xtace [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD]
             [path [path ...]]

Extract files from an ACE archive.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtace(ArchiveUnit):
    """
    Extract files from an ACE archive.
    """
    def unpack(self, data):
        ace = acefile.open(MemoryFile(data, read_as_bytes=True))
        for member in ace.getmembers():
            member: acefile.AceMember
            comment = {} if not member.comment else {'comment': member.comment}
            yield self._pack(
                member.filename,
                member.datetime,
                lambda a=ace, m=member: a.read(m, pwd=self.args.pwd),
                **comment
            )

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        return b'**ACE**' in data[:0x100]

class xtasar (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xtasar and has the following commandline Interface:

usage: xtasar [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD]
              [path [path ...]]

Extract files from a ASAR archive.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtasar(ArchiveUnit):
    """
    Extract files from a ASAR archive.
    """
    def unpack(self, data: bytearray):
        def _unpack(dir: JSONDict, *path):
            for name, listing in dir.get('files', {}).items():
                yield from _unpack(listing, *path, name)
            try:
                offset = dir['offset']
                size = dir['size']
            except KeyError:
                return
            try:
                offset = int(offset) + header.base
                end = int(size) + offset
            except TypeError:
                self.log_warn(F'unable to convert offset "{offset}" and size "{size}" to integers')
                return
            if not path:
                self.log_warn(F'not processing item at root with offset {offset} and size {size}')
                return
            yield UnpackResult(
                '/'.join(path),
                lambda a=offset, b=end: data[a:b],
                offset=offset
            )

        header = AsarHeader(data)
        self.log_debug(F'header read successfully, base offset is {header.base}.')
        yield from _unpack(header.directory)

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        return data.startswith(b'\04\0\0\0') and data[0x10:0x18] == B'{"files"'

class xtcab (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xtcab and has the following commandline Interface:

usage: xtcab [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD]
             [path [path ...]]

Extract files from CAB (cabinet) archives.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtcab(ArchiveUnit):
    """
    Extract files from CAB (cabinet) archives.
    """
    @ArchiveUnit.Requires('cabarchive', 'arc', 'default', 'extended')
    def _cabarchive():
        import cabarchive
        return cabarchive

    def unpack(self, data: bytearray):
        arc = self._cabarchive.CabArchive(data)
        for item in arc.find_files('*'):
            yield self._pack(item.filename, datetime.combine(item.date, item.time), item.buf)

    @classmethod
    def handles(cls, data: bytearray):
        return data.startswith(B'MSCF')

class xtcpio (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xtcpio and has the following commandline Interface:

usage: xtcpio [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD]
              [path [path ...]]

Extract files from a CPIO archive.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtcpio(ArchiveUnit):
    """
    Extract files from a CPIO archive.
    """
    def unpack(self, data):
        def cpio():
            with suppress(EOF): return CPIOEntry(reader)
        reader = StructReader(memoryview(data))
        for entry in iter(cpio, None):
            if entry.name == 'TRAILER!!!':
                break
            yield self._pack(entry.name, entry.mtime, entry.data)

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        for signature in (B'\x71\xC7', B'\xC7\x71', B'0707'):
            if data.startswith(signature):
                if B'TRAILER!!' in data:
                    return True
                else:
                    return None
        return False

class xtdoc (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.office.xtdoc and has the following commandline Interface:

usage: xtdoc [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]]

Extract files from an OLE document such as a Microsoft Word DOCX file.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtdoc(PathExtractorUnit):
    """
    Extract files from an OLE document such as a Microsoft Word DOCX file.
    """

    @PathExtractorUnit.Requires('olefile', 'formats', 'office', 'extended')
    def _olefile():
        import olefile
        return olefile

    def unpack(self, data):
        with MemoryFile(data) as stream:
            try:
                oledoc = self._olefile.OleFileIO(stream)
            except OSError as error:
                self.log_info(F'error, {error}, treating input as zip file')
                yield from xtzip().unpack(data)
                return
            for item in oledoc.listdir():
                if not item or not item[-1]:
                    continue
                path = '/'.join(item)
                olestream = oledoc.openstream(path)
                c0 = ord(item[-1][:1])
                if c0 < 20:
                    item[-1] = F'[{c0:d}]{item[-1][1:]}'
                    path = '/'.join(item)
                path = convert_msi_name(path)
                self.log_debug('exploring:', path)
                yield UnpackResult(path, olestream.read())

    @classmethod
    def handles(self, data: bytearray) -> Optional[bool]:
        if data.startswith(B'\xD0\xCF\x11\xE0'):
            return True
        if xtzip.handles(data):
            return sum(1 for marker in [
                B'[Content_Types].xml',
                B'word/document.xml',
                B'docProps/core.xml',
            ] if marker in data) >= 2

class xtea (key, iv=b'', padding=None, mode=None, raw=False, swap=False)

This unit is implemented in refinery.units.crypto.cipher.xtea and has the following commandline Interface:

usage: xtea [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-s] key

XTEA encryption and decryption.

positional arguments:
  key              The encryption key.

optional arguments:
  -i, --iv IV      Specifies the initialization vector. If none is specified, then a block of
                   zero bytes is used.
  -p, --padding P  Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does
                   nothing. By default, all other algorithms are attempted. In most cases, the
                   data was not correctly decrypted if none of these work.
  -m, --mode M     Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB, OFB,
                   PCBC. By default, the CBC mode is used when an IV is is provided, and ECB
                   otherwise.
  -r, --raw        Set the padding to raw; ignored when a padding is specified.
  -s, --swap       Decode blocks as big endian rather than little endian.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.
  -R, --reverse    Use the reverse operation; Specify twice to normalize (first decode, then
                   encode).

Expand source code Browse git

class xtea(TEAUnit, cipher=BlockCipherFactory(XTEA)):
    """
    XTEA encryption and decryption.
    """
    pass

class xtgz (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xtgz and has the following commandline Interface:

usage: xtgz [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD]
            [path [path ...]]

Extract a file from a GZip archive.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtgz(ArchiveUnit):
    """
    Extract a file from a GZip archive.
    """
    def unpack(self, data: bytearray):
        archive = GzipHeader(data)
        path = archive.name
        date = archive.mtime
        date = date and datetime.fromtimestamp(date) or None
        if path is None:
            try:
                meta = metavars(data)
                path = Path(meta['path'])
            except KeyError:
                path = 'ungz'
            else:
                self.log_warn(path)
                suffix = path.suffix
                if suffix.lower() == '.gz':
                    path = path.with_suffix('')
                else:
                    path = path.with_suffix(F'{suffix}.ungz')
                path = path.as_posix()
        yield self._pack(path, date, archive.data)

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        return data.startswith(B'\x1F\x8B')

class xthtml (*paths, outer, attributes, path=b'path', regex=False, exact=False, fuzzy=0, drop_path=False, join_path=False, list=False, format=None)

This unit is implemented in refinery.units.formats.html and has the following commandline Interface:

usage: xthtml [-h] [-L] [-Q] [-0] [-v] [-o] [-a] [-f F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME]
              [path [path ...]]

The unit processes an HTML document and extracts the contents of all elemnts in the DOM of the
given tag. The main purpose is to extract scripts from HTML documents.

positional arguments:
  path              Wildcard pattern for the path of the item to be extracted. Each item is
                    returned as a separate output of this unit. Paths may contain wildcards; The
                    default argument is a single wildcard, which means that every item will be
                    extracted. If a given path yields no results, the unit performs increasingly
                    fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -o, --outer       Include the HTML tags for an extracted element.
  -a, --attributes  Populate chunk metadata with HTML tag attributes.
  -f, --format F    A format expression to be applied for computing the path of an item. This
                    must use metadata that is available on the item. The current tag can be
                    accessed as {0}. If no format is specified, the unit attempts to derive a
                    good attribute from the XML tree to use for generating paths.
  -l, --list        Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path   Join path names from container with previous path names.
  -d, --drop-path   Do not modify the path variable for output chunks.
  -z, --fuzzy       Specify once to add a leading wildcard to each patterns, twice to also add a
                    trailing wildcard.
  -e, --exact       Path patterns never match on substrings.
  -r, --regex       Use regular expressions instead of wildcard patterns.
  -P, --path NAME   Name of the meta variable to receive the extracted path. The default value is
                    "path".

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Allow partial results as output.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class xthtml(XMLToPathExtractorUnit):
    """
    The unit processes an HTML document and extracts the contents of all elemnts in the DOM of the
    given tag. The main purpose is to extract scripts from HTML documents.
    """
    def __init__(
        self, *paths,
        outer: Arg.Switch('-o', help='Include the HTML tags for an extracted element.'),
        attributes: Arg.Switch('-a', help='Populate chunk metadata with HTML tag attributes.'),
        **keywords
    ):
        super().__init__(*paths, outer=outer, attributes=attributes, **keywords)

    def unpack(self, data):
        html = HTMLTreeParser()
        html.feed(data.decode(self.codec))
        root = html.tos
        meta = metavars(data)
        path = self._make_path_builder(meta, root)

        while root.parent:
            self.log_info(F'tag was not closed: {root.tag}')
            root = root.parent

        while len(root.children) == 1 and root.children[0].tag == root.tag:
            root, = root.children

        def tree(root: HTMLNode, *parts: str):

            def outer(root: HTMLNode = root):
                return root.recover(inner=False).encode(self.codec)

            def inner(root: HTMLNode = root):
                return root.recover().encode(self.codec)

            tagpath = '/'.join(parts)
            meta = {}

            if self.args.attributes:
                meta.update(root.attributes)

            if root.root:
                yield UnpackResult(tagpath, inner, **meta)
            elif self.args.outer:
                yield UnpackResult(tagpath, outer, **meta)
            else:
                yield UnpackResult(tagpath, inner, **meta)

            tag_pre_count = Counter()
            tag_run_count = Counter()
            for child in root.children:
                if child.textual:
                    continue
                tag_pre_count[child.tag] += 1

            for child in root.children:
                if child.textual:
                    continue
                if tag_pre_count[child.tag] == 1:
                    yield from tree(child, *parts, path(child))
                    continue
                tag_run_count[child.tag] += 1
                index = tag_run_count[child.tag]
                yield from tree(child, *parts, path(child, index))

        yield from tree(root, path(root))

    @classmethod
    def handles(self, data: bytearray):
        from refinery.lib import mime
        info = mime.get_cached_file_magic_info(data)
        if info.extension == 'html':
            return True
        if info.mime.endswith('html'):
            return True
        return False

class xtiso (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', fs='auto')

This unit is implemented in refinery.units.formats.archive.xtiso and has the following commandline Interface:

usage: xtiso [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-s TYPE]
             [path [path ...]]

Extract files from a ISO archive.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -s, --fs TYPE    Specify a file system (udf, joliet, rr, iso, auto) extension to use. The
                   default setting auto will automatically detect the first of the other
                   available options and use it.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtiso(ArchiveUnit):
    """
    Extract files from a ISO archive.
    """
    def __init__(
        self,
        *paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False,
        path=b'path', date=b'date',
        fs: Arg.Choice('-s', metavar='TYPE', choices=_ISO_FILE_SYSTEMS, help=(
            'Specify a file system ({choices}) extension to use. The default setting {default} will automatically '
            'detect the first of the other available options and use it.')) = 'auto'
    ):
        if fs not in _ISO_FILE_SYSTEMS:
            raise ValueError(F'invalid file system {fs}: must be udf, joliet, rr, iso, or auto.')
        super().__init__(
            *paths,
            list=list,
            join_path=join_path,
            drop_path=drop_path,
            fuzzy=fuzzy,
            exact=exact,
            regex=regex,
            path=path,
            date=date,
            fs=fs
        )

    @ArchiveUnit.Requires('pycdlib', 'arc', 'default', 'extended')
    def _pycdlib():
        import pycdlib
        import pycdlib.dates

        def fixed_parse(self, datestr):
            datestr = datestr[:-3] + b'00\0'
            return original_parse(self, datestr)

        original_parse = pycdlib.dates.VolumeDescriptorDate.parse
        pycdlib.dates.VolumeDescriptorDate.parse = fixed_parse
        return pycdlib

    @staticmethod
    def _strip_revision(name: str):
        base, split, revision = name.partition(';')
        return base if split and revision.isdigit() else name

    def unpack(self, data):
        if not self.handles(data):
            self.log_warn('The data does not look like an ISO file.')
        with MemoryFile(data, read_as_bytes=True) as stream:
            iso = self._pycdlib.PyCdlib()
            iso.open_fp(stream)
            fs = self.args.fs
            if fs != 'auto':
                mkfacade = {
                    'iso'    : iso.get_iso9660_facade,
                    'udf'    : iso.get_udf_facade,
                    'joliet' : iso.get_joliet_facade,
                    'rr'     : iso.get_rock_ridge_facade,
                }
                facade = mkfacade[fs]()
            elif iso.has_udf():
                self.log_info('using format: udf')
                facade = iso.get_udf_facade()
            elif iso.has_joliet():
                self.log_info('using format: joliet')
                facade = iso.get_joliet_facade()
            elif iso.has_rock_ridge():
                self.log_info('using format: rr')
                facade = iso.get_rock_ridge_facade()
            else:
                self.log_info('using format: iso')
                facade = iso.get_iso9660_facade()

            for root, _, files in facade.walk('/'):
                root = root.rstrip('/')
                for name in files:
                    name = name.lstrip('/')
                    path = F'{root}/{name}'
                    try:
                        info = facade.get_record(path)
                        date = info.date
                    except Exception:
                        info = None
                        date = None
                    else:
                        date = datetime.datetime(
                            date.years_since_1900 + 1900,
                            date.month,
                            date.day_of_month,
                            date.hour,
                            date.minute,
                            date.second,
                            tzinfo=datetime.timezone(datetime.timedelta(minutes=15 * date.gmtoffset))
                        )

                    def extract(info=info, path=path):
                        if info:
                            buffer = MemoryFile(bytearray(info.data_length))
                        else:
                            buffer = MemoryFile(bytearray())
                        facade.get_file_from_iso_fp(buffer, path)
                        return buffer.getvalue()

                    yield self._pack(self._strip_revision(path), date, extract)

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        return any(data[k] == B'CD001' for k in (
            slice(0x8001, 0x8006),
            slice(0x8801, 0x8806),
            slice(0x9001, 0x9006),
        ))

class xtiss (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xtiss and has the following commandline Interface:

usage: xtiss [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD]
             [path [path ...]]

Extracts files from Install Shield Setup files.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtiss(ArchiveUnit):
    """
    Extracts files from Install Shield Setup files.
    """
    def unpack(self, data: bytearray):
        offset = max(data.rfind(magic) for magic in ISSReader.MAGIC)
        if offset < 0:
            raise ValueError('ISS magic not found.')
        data[:offset] = []

        reader = ISSReader(data)
        count = reader.iss_archive_header()

        self.log_info(F'archive contains {count} files according to header')

        for _ in range(count):
            name, data = reader.iss_file()
            yield self._pack(name, None, data)

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        return data.startswith(B'MZ') and any(data.find(m) > 0 for m in ISSReader.MAGIC)

class xtjson (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.json and has the following commandline Interface:

usage: xtjson [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]]

Extract values from a JSON document.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtjson(PathExtractorUnit):
    """
    Extract values from a JSON document.
    """
    _custom_path_separator = '.'

    def unpack(self, data):

        def crawl(path, cursor):
            if isinstance(cursor, dict):
                for key, value in cursor.items():
                    yield from crawl(F'{path}/{key}', value)
            elif isinstance(cursor, list):
                for key, value in enumerate(cursor):
                    yield from crawl(F'{path}/{key:d}', value)
            if path:
                yield path, cursor, cursor.__class__.__name__

        for path, item, typename in crawl('', json.loads(data)):
            def extract(item=item):
                if isinstance(item, (list, dict)):
                    dumped = json.dumps(item, indent=4)
                else:
                    dumped = str(item)
                return dumped.encode(self.codec)
            yield UnpackResult(path, extract, type=typename)

    @classmethod
    def handles(self, data: bytearray) -> Optional[bool]:
        return bool(checks.json.fullmatch(data))

class xtmacho (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xtmacho and has the following commandline Interface:

usage: xtmacho [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME]
               [-p PWD]
               [path [path ...]]

Extract the individual executables from a MachO universal binary (sometimes called a MachO fat
file)."

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtmacho(ArchiveUnit):
    """
    Extract the individual executables from a MachO universal binary (sometimes called a MachO fat file)."
    """
    _SIGNATURE_BE = B'\xCA\xFE\xBA\xBE'
    _SIGNATURE_LE = B'\xBE\xBA\xFE\xCA'

    def unpack(self, data: bytearray):
        view = memoryview(data)
        signature = bytes(view[:4])
        try:
            reader = StructReader(view, bigendian={
                self._SIGNATURE_BE: True,
                self._SIGNATURE_LE: False,
            }[signature])
        except KeyError as KE:
            raise ValueError('Not a MachO universal binary; invalid magic header bytes.') from KE
        else:
            reader.seekset(4)
        count = reader.u32()
        self.log_info(F'reading {count} embedded executables')
        while count > 0:
            fa = FatArch(reader)
            self.log_info(F'reading item of size 0x{len(fa.data):08X}, arch {fa.cputype.name}')
            yield self._pack(fa.cputype.name, None, fa.data)
            count -= 1

    @classmethod
    def handles(cls, data: bytearray):
        return data[:4] in (
            cls._SIGNATURE_BE,
            cls._SIGNATURE_LE,
        )

class xtmagtape

This unit is implemented in refinery.units.formats.archive.xtmagtape and has the following commandline Interface:

usage: xtmagtape [-h] [-L] [-Q] [-0] [-v]

Extract files from SIMH magtape files.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class xtmagtape(Unit):
    """
    Extract files from SIMH magtape files.
    """
    def process(self, data: bytearray):
        reader = StructReader(data)

        for r in itertools.count():
            buffer = MemoryFile()

            for k in itertools.count():
                try:
                    head = reader.peek(4)
                    size = reader.read_integer(24)
                    mark = reader.read_byte()
                except EOFError:
                    self.log_info('end of file while reading chunk header, terminating')
                    return
                if not any(head):
                    if k == 0:
                        return
                    break
                if mark != 0:
                    self.log_warn(F'error code 0x{mark:02X} in record {r}.{k}')
                buffer.write(reader.read(size))
                if reader.peek(4) != head:
                    if reader.tell() % 2 and reader.peek(5)[1:] == head:
                        padding = reader.read_byte()
                        if padding != 0:
                            self.log_info(F'nonzero padding byte in record {r}.{k}')
                    else:
                        raise ValueError('Invalid footer, data is corrupted.')
                reader.seekrel(4)

            yield buffer.getbuffer()

class xtmail (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.email and has the following commandline Interface:

usage: xtmail [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]]

Extract files and body from EMail messages. The unit supports both the Outlook message format and
regular MIME documents.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtmail(PathExtractorUnit):
    """
    Extract files and body from EMail messages. The unit supports both the Outlook message format
    and regular MIME documents.
    """
    def _get_headparts(self, head):
        mw = mimewords()
        mw = partial(mw.process.__wrapped__.__wrapped__, mw)
        jh = defaultdict(list)
        for key, value in head:
            jh[key].append(mw(''.join(t.lstrip() for t in value.splitlines(False))))
        jh = {k: v[0] if len(v) == 1 else [t for t in v if t] for k, v in jh.items()}
        yield UnpackResult('headers.txt',
            lambda h=head: '\n'.join(F'{k}: {v}' for k, v in h).encode(self.codec))
        yield UnpackResult('headers.json',
            lambda jsn=jh: json.dumps(jsn, indent=4).encode(self.codec))

    @PathExtractorUnit.Requires('extract-msg<=0.41.0', 'formats', 'office', 'default', 'extended')
    def _extract_msg():
        import extract_msg.message
        import extract_msg.enums
        return extract_msg

    def _get_parts_outlook(self, data):
        def ensure_bytes(data):
            return data if isinstance(data, bytes) else data.encode(self.codec)

        def make_message(name, msg):
            with NoLogging():
                try:
                    htm = msg.htmlBody
                except Exception:
                    htm = None
                try:
                    txt = msg.body
                except Exception:
                    txt = None
            if txt:
                yield UnpackResult(F'{name}.txt', ensure_bytes(txt))
            if htm:
                yield UnpackResult(F'{name}.htm', ensure_bytes(htm))

        msgcount = 0

        with NoLogging():
            class ForgivingMessage(self._extract_msg.message.Message):
                """
                If parsing the input bytes fails early, the "__open" private attribute may not
                yet exist. This hack prevents an exception to occur in the destructor.
                """
                def __getattr__(self, key: str):
                    if key.endswith('_open'):
                        return False
                    raise AttributeError(key)
            msg = ForgivingMessage(bytes(data))

        yield from self._get_headparts(msg.header.items())
        yield from make_message('body', msg)

        def attachments(msg):
            for attachment in getattr(msg, 'attachments', ()):
                yield attachment
                if attachment.type == 'data':
                    continue
                yield from attachments(attachment.data)

        for attachment in attachments(msg):
            at = attachment.type
            if at is self._extract_msg.enums.AttachmentType.MSG:
                msgcount += 1
                yield from make_message(F'attachments/msg_{msgcount:d}', attachment.data)
                continue
            if not isbuffer(attachment.data):
                self.log_warn(F'unknown attachment of type {at}, please report this!')
                continue
            path = attachment.longFilename or attachment.shortFilename
            yield UnpackResult(F'attachments/{path}', attachment.data)

    @PathExtractorUnit.Requires('chardet', 'default', 'extended')
    def _chardet():
        import chardet
        return chardet

    def _get_parts_regular(self, data: bytes):
        try:
            info = self._chardet.detect(data)
            msg = data.decode(info['encoding'])
        except UnicodeDecodeError:
            raise ValueError('This is not a plaintext email message.')
        else:
            msg = Parser().parsestr(msg)

        yield from self._get_headparts(msg.items())

        for k, part in enumerate(msg.walk()):
            path = part.get_filename()
            elog = None
            if path is None:
                extension = file_extension(part.get_content_type(), 'txt')
                path = F'body.{extension}'
            else:
                path = path | mimewords | str
                path = F'attachments/{path}'
            try:
                data = part.get_payload(decode=True)
            except Exception as E:
                try:
                    data = part.get_payload(decode=False)
                except Exception as E:
                    elog = str(E)
                    data = None
                else:
                    from refinery import carve
                    self.log_warn(F'manually decoding part {k}, data might be corrupted: {path}')
                    if isinstance(data, str):
                        data = data.encode('latin1')
                    if isbuffer(data):
                        data = next(data | carve('b64', stripspace=True, single=True, decode=True))
                    else:
                        elog = str(E)
                        data = None
            if not data:
                if elog is not None:
                    self.log_warn(F'could not get content of message part {k}: {elog!s}')
                continue
            yield UnpackResult(path, data)

    def unpack(self, data):
        try:
            yield from self._get_parts_outlook(data)
        except Exception:
            self.log_debug('failed parsing input as Outlook message')
            yield from self._get_parts_regular(data)

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        markers = [
            b'\nReceived:\x20from'
            b'\nSubject:\x20',
            b'\nTo:\x20',
            b'\nBcc:\x20',
            b'\nContent-Transfer-Encoding:\x20',
            b'\nContent-Type:\x20',
            b'\nReturn-Path:\x20',
        ]
        if data.startswith(B'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'):
            markers = [marker.decode('latin1').encode('utf-16le') for marker in markers]
        return sum(1 for marker in markers if marker in data) >= 3

class xtmsi (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.msi and has the following commandline Interface:

usage: xtmsi [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]]

Extract files and metadata from Microsoft Installer (MSI) archives. The synthetic file
MsiTables.json contains parsed MSI table information, similar to the output of the Orca tool.
Binary streams are placed in a virtual folder called "Binary", and extracted scripts from custom
actions are separately extracted in a virtual folder named "Action".

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtmsi(xtdoc):
    """
    Extract files and metadata from Microsoft Installer (MSI) archives. The synthetic file {FN} contains
    parsed MSI table information, similar to the output of the Orca tool. Binary streams are placed in a
    virtual folder called "Binary", and extracted scripts from custom actions are separately extracted in
    a virtual folder named "Action".
    """

    _SYNTHETIC_STREAMS_FILENAME = 'MsiTables.json'

    # https://learn.microsoft.com/en-us/windows/win32/msi/summary-list-of-all-custom-action-types
    _CUSTOM_ACTION_TYPES = {
        0x01: 'DLL file stored in a Binary table stream.',
        0x02: 'EXE file stored in a Binary table stream.',
        0x05: 'JScript file stored in a Binary table stream.',
        0x06: 'VBScript file stored in a Binary table stream.',
        0x11: 'DLL file that is installed with a product.',
        0x12: 'EXE file that is installed with a product.',
        0x13: 'Displays a specified error message and returns failure, terminating the installation.',
        0x15: 'JScript file that is installed with a product.',
        0x16: 'VBScript file that is installed with a product.',
        0x22: 'EXE file having a path referencing a directory.',
        0x23: 'Directory set with formatted text.',
        0x25: 'JScript text stored in this sequence table.',
        0x26: 'VBScript text stored in this sequence table.',
        0x32: 'EXE file having a path specified by a property value.',
        0x33: 'Property set with formatted text.',
        0x35: 'JScript text specified by a property value.',
        0x36: 'VBScript text specified by a property value.',
    }

    def unpack(self, data):
        streams = {result.path: result for result in super().unpack(data)}

        def stream(name: str):
            return streams.pop(name).get_data()

        def column_formats(table: Dict[str, MSITableColumnInfo]) -> str:
            return ''.join(v.struct_format for v in table.values())

        def stream_to_rows(data: ByteStr, row_format: str):
            row_size = struct.calcsize(F'<{row_format}')
            row_count = int(len(data) / row_size)
            reader = StructReader(data)
            columns = [reader.read_struct(F'<{sc * row_count}') for sc in row_format]
            for i in range(row_count):
                yield [c[i] for c in columns]

        tables: Dict[str, Dict[str, MSITableColumnInfo]] = collections.defaultdict(collections.OrderedDict)
        strings = MSIStringData(stream('!_StringData'), stream('!_StringPool'))

        for tbl_name_id, col_number, col_name_id, col_attributes in stream_to_rows(stream('!_Columns'), 'HHHH'):
            tbl_name = strings.ref(tbl_name_id)
            col_name = strings.ref(col_name_id)
            tables[tbl_name][col_name] = MSITableColumnInfo(col_number, col_attributes)

        table_names_given = {strings.ref(k) for k in chunks.unpack(stream('!_Tables'), 2, False)}
        table_names_known = set(tables)

        for name in table_names_known - table_names_given:
            self.log_warn(F'table name known but not given: {name}')
        for name in table_names_given - table_names_known:
            self.log_warn(F'table name given but not known: {name}')

        class ScriptItem(NamedTuple):
            row_index: int
            extension: Optional[str]

        processed_table_data: Dict[str, List[Dict[str, str]]] = {}
        tbl_properties: Dict[str, str] = {}
        tbl_files: Dict[str, str] = {}
        tbl_components: Dict[str, str] = {}
        postprocessing: List[ScriptItem] = []

        def format_string(string: str):
            # https://learn.microsoft.com/en-us/windows/win32/msi/formatted
            def _replace(match: re.Match[str]):
                _replace.done = False
                prefix, name = match.groups()
                if not prefix:
                    tbl = tbl_properties
                elif prefix in '%':
                    name = name.rstrip('%').upper()
                    return F'%{name}%'
                elif prefix in '!#':
                    tbl = tbl_files
                elif prefix in '$':
                    tbl = tbl_components
                else:
                    raise ValueError
                return tbl.get(name, '')
            while True:
                _replace.done = True
                string = re.sub(R'''(?x)
                    \[             # open square brackent
                      (?![~\\])    # not followed by escapes
                      ([%$!#]?)    # any of the valid prefix characters
                      ([^[\]{}]+)  # no brackets or braces
                    \]''', _replace, string)
                if _replace.done:
                    break
            string = re.sub(r'\[\\(.)\]', r'\1', string)
            string = string.replace('[~]', '\0')
            return string

        for table_name, table in tables.items():
            stream_name = F'!{table_name}'
            if stream_name not in streams:
                continue
            processed = []
            info = list(table.values())
            for r, row in enumerate(stream_to_rows(stream(stream_name), column_formats(table))):
                values = []
                for index, value in enumerate(row):
                    vt = info[index].type
                    if vt is MsiType.Long:
                        if value != 0:
                            value -= 0x80000000
                    elif vt is MsiType.Short:
                        if value != 0:
                            value -= 0x8000
                    elif value in strings:
                        value = strings.ref(value)
                    elif not info[index].is_integer:
                        value = ''
                    values.append(value)
                if table_name == 'Property':
                    tbl_properties[values[0]] = values[1]
                if table_name == 'File':
                    tbl_properties[values[0]] = values[2]
                if table_name == 'Component':
                    tbl_properties[values[0]] = F'%{values[2]}%'
                entry = dict(zip(table, values))
                einfo = {t: i for t, i in zip(table, info)}
                if table_name == 'MsiFileHash':
                    entry['Hash'] = struct.pack(
                        '<IIII',
                        row[2] ^ 0x80000000,
                        row[3] ^ 0x80000000,
                        row[4] ^ 0x80000000,
                        row[5] ^ 0x80000000,
                    ).hex()
                if table_name == 'CustomAction':
                    code = row[1] & 0x3F
                    try:
                        entry['Comment'] = self._CUSTOM_ACTION_TYPES[code]
                    except LookupError:
                        pass
                    t = einfo.get('Target')
                    c = {0x25: 'js', 0x26: 'vbs', 0x33: None}
                    if code in c and t and not t.is_integer:
                        postprocessing.append(ScriptItem(r, c[code]))
                processed.append(entry)
            if processed:
                processed_table_data[table_name] = processed

        ca = processed_table_data.get('CustomAction', None)
        for item in postprocessing:
            entry = ca[item.row_index]
            try:
                path: str = entry['Action']
                data: str = entry['Target']
            except KeyError:
                continue
            root = F'Action/{path}'
            if item.extension:
                path = F'{root}.{item.extension}'
                streams[path] = UnpackResult(path, data.encode(self.codec))
                continue
            data = format_string(data)
            parts = [part.partition('\x02') for part in data.split('\x01')]
            if not all(part[1] == '\x02' for part in parts):
                continue
            for name, _, script in parts:
                if not name.lower().startswith('script'):
                    continue
                if not script:
                    continue
                path = F'{root}.{name}'
                streams[path] = UnpackResult(path, script.encode(self.codec))

        for ignored_stream in [
            '[5]SummaryInformation',
            '[5]DocumentSummaryInformation',
            '[5]DigitalSignature',
            '[5]MsiDigitalSignatureEx'
        ]:
            streams.pop(ignored_stream, None)

        inconsistencies = 0
        for k in range(len(strings)):
            c = strings.computed_ref_count[k]
            p = strings.provided_ref_count[k]
            if c != p and not self.log_debug(F'string reference count computed={c} provided={p}:', strings.ref(k + 1, False)):
                inconsistencies += 1
        if inconsistencies:
            self.log_info(F'found {inconsistencies} incorrect string reference counts')

        def fix_msi_path(path: str):
            prefix, dot, name = path.partition('.')
            if dot == '.' and prefix.lower() == 'binary':
                path = F'{prefix}/{name}'
            return path

        streams = {fix_msi_path(path): item for path, item in streams.items()}
        ds = UnpackResult(self._SYNTHETIC_STREAMS_FILENAME,
                json.dumps(processed_table_data, indent=4).encode(self.codec))
        streams[ds.path] = ds

        for path in sorted(streams):
            streams[path].path = path
            yield streams[path]

    @classmethod
    def handles(self, data: bytearray):
        if not data.startswith(B'\xD0\xCF\x11\xE0'):
            return False
        return FileMagicInfo(data).extension == 'msi'

class xtnode (*paths, entry=False, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date')

This unit is implemented in refinery.units.formats.archive.xtnode and has the following commandline Interface:

usage: xtnode [-h] [-L] [-Q] [-0] [-v] [-u] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME]
              [path [path ...]]

Extracts and decompiles files from compiled Node.Js applications. Supports both nexe and pkg, two
utilities that are commonly used to generate stand-alone executables.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -u, --entry      Only extract the entry point.
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtnode(ArchiveUnit):
    """
    Extracts and decompiles files from compiled Node.Js applications. Supports both nexe and pkg, two
    utilities that are commonly used to generate stand-alone executables.
    """

    _NEXE_SENTINEL = B'<nexe~~sentinel>'
    _PKG_PAYLOAD_P = B'PAYLOAD_POSITION'
    _PKG_PAYLOAD_S = B'PAYLOAD_SIZE'
    _PKG_PRELUDE_P = B'PRELUDE_POSITION'
    _PKG_PRELUDE_S = B'PRELUDE_SIZE'
    _PKG_COMMON_JS = B'sourceMappingURL=common.js.map'

    def __init__(
        self, *paths, entry: Arg.Switch('-u', help='Only extract the entry point.') = False,
        list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False,
        path=b'path', date=b'date',
    ):
        super().__init__(*paths, entry=entry,
            list=list, join_path=join_path, drop_path=drop_path, fuzzy=fuzzy, exact=exact, regex=regex,
            path=path, date=date)

    def unpack(self, data: ByteStr) -> Iterable[UnpackResult]:
        if self._is_nexe(data):
            self.log_info('unpacking as nexe')
            yield from self._unpack_nexe(data)
            return
        if self._is_pkg(data):
            self.log_info('unpacking as pkg')
            yield from self._unpack_pkg(data)
            return

    def _unpack_nexe(self, data: ByteStr):
        try:
            ep = re.compile(
                RB"entry\s*=\s*path\.resolve\(path\.dirname\(process\.execPath\),\s*(%s)\)" % formats.string)
            ep, = ep.finditer(data)
        except Exception:
            ep = None
            self.log_info('could not identify entry point')
        else:
            ep = ep.group(1) | esc(quoted=True) | str
            self.log_info(F'entry point: {ep}')
        view = memoryview(data)
        for marker in re.finditer(re.escape(self._NEXE_SENTINEL), data):
            end = marker.end() + 16
            sizes = data[marker.end():end]
            if sizes.startswith(b"')"):
                continue
            reader = StructReader(sizes)
            code_size = int(reader.f64())
            blob_size = int(reader.f64())
            start = marker.start() - code_size - blob_size
            try:
                reader = StructReader(view[start:end])
                code = reader.read_exactly(code_size)
                blob = reader.read_exactly(blob_size)
            except EOF:
                self.log_debug(F'found marker at 0x{marker.start():X}, but failed to read data')
                continue
            else:
                self.log_debug(F'found marker at 0x{marker.start():X}, data start at {start:X}')
            for rsrc in re.finditer(RB'process\.__nexe\s*=', code):
                rsrc = JSONReader(code[rsrc.end():])
                rsrc = rsrc.read_json()
                if len(rsrc) == 1:
                    _, rsrc = rsrc.popitem()
                for path, (offset, length) in rsrc.items():
                    end = offset + length
                    if ep and self.args.entry and path != ep:
                        continue
                    yield UnpackResult(path, blob[offset:end])

    def _unpack_pkg(self, data: ByteStr):
        def _extract_coordinates(*v: bytes):
            for name in v:
                pattern = name + BR'''\s{0,3}=\s{0,3}(['"])([\s\d]+)\1'''
                value, = re.finditer(pattern, data)
                yield int(value.group(2).decode('utf8').strip(), 0)

        def _extract_data(*v: bytes):
            try:
                offset, length = _extract_coordinates(*v)
            except Exception:
                return None
            return data[offset:offset + length]

        payload = _extract_data(self._PKG_PAYLOAD_P, self._PKG_PAYLOAD_S)
        if not payload:
            raise ValueError('unable to extract payload')
        prelude = _extract_data(self._PKG_PRELUDE_P, self._PKG_PRELUDE_S)
        if not prelude:
            raise ValueError('unable to extract prelude')
        mapping = re.search(re.escape(self._PKG_COMMON_JS) + BR'\s*\},\s*\{', prelude)
        if not mapping:
            raise ValueError('unable to find common.js mapping')

        reader = JSONReader(prelude[mapping.end() - 1:])

        files: Dict[str, dict] = reader.read_json()

        if files is None:
            raise ValueError('failed to read file list')

        entry = reader.skip_comma().read_string()
        links = reader.skip_comma().read_json()

        # _unknown1 = reader.skip_comma().read_json()
        # _unknown2 = reader.skip_comma().read_terminated_array(B')').strip()

        root = next(iter(files))
        skip = 0
        view = memoryview(payload)

        for k in range(len(root) + 1):
            test = root[:k].rstrip('/').rstrip('\\')
            if not all(path.startswith(test) for path in files):
                root = test[:-1]
                skip = k - 1
                break

        entry = entry[skip:]
        self.log_info(F'detected root directory {root}, entry point is {entry}')

        for src, dst in links.items():
            new_files = {}
            self.log_info('link src:', src[skip:])
            self.log_info('link dst:', dst[skip:])
            for path, location in files.items():
                if not path.startswith(src):
                    continue
                new_path = dst + path[len(src):]
                new_files[new_path] = location
                self.log_debug('synthesizing linked file:', new_path)
            files.update(new_files)

        for path, location in files.items():
            path = path[skip:]
            if entry and self.args.entry and path != entry:
                continue
            data = None
            for kind, (offset, length) in location.items():
                stop = offset + length
                if kind == '3':  # metadata
                    continue
                if kind == '2':  # unknown
                    continue
                if kind in '01':
                    data = view[offset:stop]
            if data is not None:
                yield UnpackResult(path, data)

    @classmethod
    def _is_nexe(cls, data: ByteStr) -> bool:
        return cls._NEXE_SENTINEL in data

    @classmethod
    def _is_pkg(cls, data: ByteStr) -> bool:
        if cls._PKG_PAYLOAD_P not in data:
            return False
        if cls._PKG_PAYLOAD_S not in data:
            return False
        if cls._PKG_PRELUDE_P not in data:
            return False
        if cls._PKG_PRELUDE_S not in data:
            return False
        if cls._PKG_COMMON_JS not in data:
            return False
        return True

    @classmethod
    def handles(cls, data: ByteStr) -> Optional[bool]:
        return cls._is_nexe(data) or cls._is_pkg(data)

class xtnsis (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xtnsis and has the following commandline Interface:

usage: xtnsis [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD]
              [path [path ...]]

Extract files from NSIS archives.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtnsis(ArchiveUnit):
    """
    Extract files from NSIS archives.
    """

    @classmethod
    def _find_archive_offset(cls, data: bytearray, before: int = -1, flawmax=2):
        def signatures(*magics):
            for changes in range(flawmax + 1):
                for magic in magics:
                    if not changes:
                        yield 0, magic
                        continue
                    for positions in itertools.permutations(range(len(magic)), r=changes):
                        signature = bytearray(magic)
                        for p in positions:
                            signature[p] = 0x2E
                        yield changes, bytes(signature)
        best_guess = None
        search_space = memoryview(data)
        for flaws, sig in signatures(*NSArchive.MAGICS):
            if flaws > 1:
                search_space = search_space[:0x20_000]
            matches = [m.start() - 4 for m in re.finditer(sig, search_space, flags=re.DOTALL)]
            if before >= 0:
                matches = [match for match in matches if match < before]
            matches.reverse()
            archive = None
            for match in matches:
                if match % 0x200 == 0:
                    archive = match
                    break
            if not archive:
                if matches and not best_guess:
                    best_guess = matches[-1]
            else:
                msg = F'Archive signature was found at offset 0x{archive:X}'
                if flaws > 0:
                    msg = F'{msg}; it has {flaws} imperfections and was likely modified'
                cls.log_info(F'{msg}.')
                return archive
        if best_guess:
            cls.log_info(F'A signature was found at offset 0x{best_guess:08X}; it is not properly aligned.')
            return best_guess
        return None

    def unpack(self, data):
        memory = memoryview(data)
        before = -1
        _error = None
        while True:
            offset = self._find_archive_offset(data, before)
            if offset is None:
                _error = _error or ValueError('Unable to find an NSIS archive marker.')
                raise _error
            try:
                arc = NSArchive(memory[offset:])
            except Exception as e:
                _error = e
                before = offset
            else:
                break

        def info():
            yield F'{arc.header.type.name} archive'
            yield F'{arc.method.name.lower()} compression'
            yield F'mystery value 0x{arc.header.unknown_value:X}'
            yield 'solid archive' if arc.solid else 'fragmented archive'
            yield '64-bit header' if arc.header.is64bit else '32-bit header'
            yield 'unicode' if arc.header.unicode else 'ascii'

        self.log_info(', '.join(info()))

        for item in arc.header.items:
            yield self._pack(item.path, item.mtime, lambda i=item: arc._extract_item(i).data)
        yield self._pack('setup.nsis', None, arc.script.encode(self.codec))

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        return any(magic in data for magic in NSArchive.MAGICS)

class xtnuitka (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.archive.xtnuitka and has the following commandline Interface:

usage: xtnuitka [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME]
                [path [path ...]]

Extracts files packed by Nuitka using the --onefile option.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtnuitka(PathExtractorUnit):
    """
    Extracts files packed by Nuitka using the --onefile option.
    """
    _MAGIC = B'KA'

    @PathExtractorUnit.Requires('pyzstd', 'arc')
    def _pyzstd():
        import pyzstd
        return pyzstd

    def unpack(self, data: ByteStr) -> Iterable[UnpackResult]:
        class NuitkaData(Struct):
            unit = self

            def __init__(self, reader: StructReader):
                self.magic = reader.read_exactly(2)
                self.compression_flag = reader.read_exactly(1)
                if self.compressed:
                    zd = self.unit._pyzstd.ZstdDecompressor()
                    reader = StructReader(zd.decompress(reader.read()))
                self.files = {}
                self.truncated = False
                while not reader.eof:
                    path = reader.read_w_string('utf-16')
                    if not path:
                        break
                    size = reader.u64()
                    data = reader.read(size)
                    if len(data) == size:
                        self.files[path] = data
                    else:
                        self.truncated = True

            @property
            def compressed(self):
                return self.compression_flag == b'Y'

        if data.startswith(b'MZ'):
            arcs = list(self._pe_candidates(data))
        else:
            arcs = [data]

        for arc in arcs:
            archive = NuitkaData(arc)
            if archive.truncated:
                self.log_warn('the archive is truncated')
            if archive.magic != self._MAGIC:
                self.log_warn('the archive data does not start with the correct magic sequence')
            for path, data in archive.files.items():
                yield UnpackResult(path, data)

    @classmethod
    def handles(cls, data: ByteStr) -> Optional[bool]:
        if data.startswith(b'MZ'):
            try:
                next(cls._pe_candidates(data))
            except StopIteration:
                return False
        else:
            return data.startswith(cls._MAGIC)

    @classmethod
    def _pe_candidates(cls, data: ByteStr):

        from refinery.units.formats.pe.peoverlay import peoverlay
        blob = data | peoverlay | bytearray
        if blob.startswith(cls._MAGIC):
            yield blob

        from refinery.units.formats.pe.perc import perc
        for blob in data | perc:
            if blob.startswith(cls._MAGIC):
                yield blob

class xtone (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.office.xtone and has the following commandline Interface:

usage: xtone [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]]

Extract embedded files from OneNote documents.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtone(PathExtractorUnit):
    """
    Extract embedded files from OneNote documents.
    """
    @PathExtractorUnit.Requires('pyonenote', 'formats', 'office', 'extended')
    def _pyOneNote():
        import pyOneNote
        import pyOneNote.OneDocument
        return pyOneNote.OneDocument

    def unpack(self, data: bytearray):
        with MemoryFile(memoryview(data)) as stream:
            one = self._pyOneNote.OneDocment(stream)
        for guid, file in one.get_files().items():
            chunk = file['content']
            try:
                extension = file['extension']
            except KeyError:
                extension = F'.{get_cached_file_magic_info(chunk).extension}'
            yield UnpackResult(F'{guid}{extension}', chunk)

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        return UUID('e4525c7b-8cd8-a74d-aeb1-5378d02996d3').bytes in data

class xtp (*pattern, filter=0, min=1, max=None, len=None, stripspace=False, duplicates=False, longest=False, take=None)

This unit is implemented in refinery.units.pattern.xtp and has the following commandline Interface:

usage: xtp [-h] [-L] [-Q] [-0] [-v] [-f] [-n N] [-m N] [-e N] [-x] [-r] [-l] [-t N]
           [pattern [pattern ...]]

Extract Patterns: Uses regular expressions to extract indicators from the input data and
optionally filters these results heuristically. The unit is designed to extract indicators such
as domain names and IP addresses, see below for a complete list. To extract data formats such as
hex-encoded data, use carve.

positional arguments:
  pattern           Choose the pattern to extract. The unit uses ('hostname', 'url', 'email') by
                    default. Use an asterix character to select all available patterns. The
                    available patterns are: domain, email, guid, ipv4, ipv6, md5, sha1, sha256,
                    hostname, socket, subdomain, url, btc, pem, xmr, path, winpath, nixpath,
                    environment-variable

optional arguments:
  -f, --filter      If this setting is enabled, the xtp unit will attempt to reduce the number of
                    false positives by certain crude heuristics. Specify multiple times to make
                    the filtering more aggressive.
  -n, --min N       Matches must have length at least N.
  -m, --max N       Matches must have length at most N.
  -e, --len N       Matches must be of length N.
  -x, --stripspace  Strip all whitespace from input data.
  -r, --duplicates  Yield every (transformed) Match, even when it was found before.
  -l, --longest     Sort results by length.
  -t, --take N      Return only the first N occurrences in order of appearance.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Allow partial results as output.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class xtp(PatternExtractor):
    """
    Extract Patterns: Uses regular expressions to extract indicators from the input data and
    optionally filters these results heuristically. The unit is designed to extract indicators
    such as domain names and IP addresses, see below for a complete list. To extract data
    formats such as hex-encoded data, use `refinery.carve`.
    """

    def __init__(
        self,
        *pattern: Arg('pattern', type=str,
            default=(
                indicators.hostname.name,
                indicators.url.name,
                indicators.email.name,
            ), help=(
                'Choose the pattern to extract. The unit uses {{default}} by default. Use an '
                'asterix character to select all available patterns. The available patterns '
                'are: {}'.format(', '.join(p.display for p in indicators))
            )
        ),
        filter: Arg('-f', dest='filter', action='count',
            help=(
                'If this setting is enabled, the xtp unit will attempt to reduce the number '
                'of false positives by certain crude heuristics. Specify multiple times to '
                'make the filtering more aggressive.'
            )
        ) = 0,
        min=1, max=None, len=None, stripspace=False, duplicates=False, longest=False, take=None
    ):
        self.superinit(super(), **vars(), ascii=True, utf16=True)

        patterns = {
            p for name in pattern for p in indicators if fnmatch(p.display, name)
        }
        # if indicators.hostname in patterns:
        #     patterns.remove(indicators.hostname)
        #     patterns.add(indicators.ipv4)
        #     patterns.add(indicators.domain)
        patterns = [F'(?P<{p.name}>{p.value})' for p in patterns]
        if not patterns:
            raise RefineryCriticalException('The given mask does not match any known indicator pattern.')
        pattern = '|'.join(patterns)
        self.args.pattern = re.compile(pattern.encode(self.codec), flags=re.DOTALL)
        self.args.filter = filter

    _ALPHABETIC = ascii_letters.encode('ASCII')
    _LEGITIMATE_HOSTS = {
        'acm.org'                 : 1,
        'adobe.com'               : 1,
        'aka.ms'                  : 1,
        'android.com'             : 1,
        'apache.org'              : 1,
        'apple.com'               : 1,
        'archive.org'             : 2,
        'azure.com'               : 1,
        'baidu.com'               : 2,
        'bootstrapcdn.com'        : 2,
        'cdnjs.cloudflare.com'    : 4,
        'comodo.net'              : 1,
        'comodoca.com'            : 1,
        'curl.haxx.se'            : 1,
        'curl.se'                 : 1,
        'digicert.com'            : 1,
        'dublincore.org'          : 1,
        'fontawesome.com'         : 1,
        'facebook.com'            : 4,
        'github.com'              : 3,
        'globalsign.com'          : 1,
        'globalsign.net'          : 1,
        'godaddy.com'             : 1,
        'google.com'              : 4,
        'googleapis.com'          : 5,
        'googleusercontent.com'   : 5,
        'gov'                     : 2,
        'gstatic.com'             : 2,
        'iana.org'                : 1,
        'intel.com'               : 1,
        'jquery.com'              : 1,
        'jsdelivr.net'            : 2,
        'live.com'                : 1,
        'microsoft.com'           : 1,
        'msdn.com'                : 1,
        'msn.com'                 : 1,
        'office.com'              : 1,
        'office365.com'           : 2,
        'openssl.org'             : 1,
        'openxmlformats.org'      : 1,
        'purl.org'                : 1,
        'python.org'              : 1,
        'schema.org'              : 2,
        'sectigo.com'             : 1,
        'skype.com'               : 1,
        'sourceforge.net'         : 4,
        'sway-cdn.com'            : 1,
        'sway-extensions.com'     : 1,
        'symantec.com'            : 1,
        'symauth.com'             : 1,
        'symcb.com'               : 1,
        'symcd.com'               : 1,
        'thawte.com'              : 1,
        'usertrust.com'           : 1,
        'verisign.com'            : 1,
        'w3.org'                  : 1,
        'wikipedia.org'           : 1,
        'wolfram.com'             : 1,
        'xml.org'                 : 1,
        'xmlsoap.org'             : 1,
        'yahoo.com'               : 1,
    }

    _DOMAIN_WHITELIST = [
        'system.net',
        'wscript.shell',
    ]

    _BRACKETING = {
        B"'"[0]: B"'",
        B'"'[0]: B'"',
        B'('[0]: B')',
        B'{'[0]: B'}',
        B'['[0]: B']',
        B'<'[0]: B'>',
    }

    def _check_match(self, data: Union[memoryview, bytearray], pos: int, name: str, value: bytes):
        term = self._BRACKETING.get(data[pos - 1], None)
        if term:
            pos = value.find(term)
            if pos > 0:
                value = value[:pos]
        if not self.args.filter:
            return value
        if name == indicators.hostname.name:
            if all(part.isdigit() for part in value.split(B'.')):
                name = indicators.ipv4.name
            elif B'.' not in value:
                name = indicators.ipv6.name
            else:
                name = indicators.domain.name
        if name == indicators.ipv4.name:
            ocets = [int(x) for x in value.split(B'.')]
            if ocets.count(0) >= 3:
                return None
            if self.args.filter > 2 and sum(ocets) < 10:
                return None
            for area in (
                bytes(data[pos - 20 : pos + 20]),
                bytes(data[pos * 2 - 40 : pos * 2 + 40 : 2]),
                bytes(data[pos * 2 - 41 : pos * 2 + 39 : 2]),
            ):
                if B'version' in area.lower():
                    return None
            ip = ip_address(value.decode(self.codec))
            if not ip.is_global:
                if self.args.filter >= 3 or not ip.is_private:
                    return None
        elif name in {
            indicators.url.name,
            indicators.socket.name,
            indicators.hostname.name,
            indicators.domain.name,
            indicators.subdomain.name
        }:
            if self.args.filter >= 2:
                if LetterWeights.IOC(value) < 0.6:
                    self.log_info(value)
                    self.log_info('excluding indicator because of low IOC score')
                    return None
                if name != indicators.url.name and len(value) > 0x100:
                    self.log_info(value)
                    self.log_info('excluding indicator because it is too long')
                    return None
            ioc = value.decode(self.codec)
            if '://' not in ioc: ioc = F'tcp://{ioc}'
            parts = urlparse(ioc)
            host, _, _ = parts.netloc.partition(':')
            hl = host.lower()
            for white, level in self._LEGITIMATE_HOSTS.items():
                if self.args.filter >= level and (hl == white or hl.endswith(F'.{white}')):
                    self.log_info(value)
                    self.log_info(
                        F'excluding indicator because domain {hl} is whitelisted via {white}; '
                        F'reduce level below {level} to allow, current level is {self.args.filter}'
                    )
                    return None
            if name == indicators.url.name:
                scheme = parts.scheme.lower()
                for p in ('http', 'https', 'ftp', 'file', 'mailto'):
                    if scheme.endswith(p):
                        pos = scheme.find(p)
                        value = value[pos:]
                        break
            if any(hl == w for w in self._DOMAIN_WHITELIST):
                self.log_info(value)
                self.log_info(F'excluding indicator because domain {hl} is whitelisted')
                return None
            if name in {
                indicators.hostname.name,
                indicators.domain.name,
                indicators.subdomain.name
            }:
                if data[pos - 1] in b'/\\' and self.args.filter >= 2:
                    return None
                hostparts = host.split('.')
                if self.args.filter >= 2:
                    if not all(p.isdigit() for p in hostparts) and all(len(p) < 4 for p in hostparts):
                        self.log_info(value)
                        self.log_info('excluding host with too many short parts')
                        return None
                if self.args.filter >= 3:
                    if len(hostparts) <= sum(3 for p in hostparts if p != p.lower() and p != p.upper()):
                        self.log_info(value)
                        self.log_info('excluding host with too many mixed case parts')
                        return None
                # These heuristics attempt to filter out member access to variables in
                # scripts which can be mistaken for domains because of the TLD inflation
                # we've had.
                uppercase = sum(1 for c in host if c.isalpha() and c.upper() == c)
                lowercase = sum(1 for c in host if c.isalpha() and c.lower() == c)
                if lowercase and uppercase:
                    caseratio = uppercase / lowercase
                    if 0.1 < caseratio < 0.9:
                        self.log_info(value)
                        self.log_info('excluding indicator with too much uppercase letters')
                        return None
                if all(x.isidentifier() for x in hostparts):
                    if len(hostparts) == 2 and hostparts[0] in ('this', 'self'):
                        self.log_info(value)
                        self.log_info('excluding host that looks like a code snippet')
                        return None
                    if len(hostparts[-2]) < 3:
                        self.log_info(value)
                        self.log_info('excluding host with too short root domain name')
                        return None
                    if any(x.startswith('_') for x in hostparts):
                        self.log_info(value)
                        self.log_info('excluding host with underscores')
                        return None
                    if len(hostparts[-1]) > 3:
                        prefix = '.'.join(hostparts[:-1])
                        seen_before = len(set(re.findall(
                            R'{}(?:\.\w+)+'.format(prefix).encode('ascii'), data)))
                        if seen_before > 2:
                            self.log_debug(value)
                            self.log_debug('excluding indicator that was already seen')
                            return None
        elif name == indicators.email.name:
            at = value.find(B'@')
            ix = 0
            while value[ix] not in self._ALPHABETIC:
                ix += 1
            return None if at - ix < 3 else value[ix:]
        elif name in (
            indicators.path.name,
            indicators.winpath.name,
            indicators.nixpath.name,
        ):
            if len(value) < 8:
                return None
            if len(value) > 16 and len(re.findall(RB'\\x\d\d', value)) > len(value) // 10:
                return None
            try:
                path_string = value.decode(self.codec)
            except Exception:
                return None
            try:
                path = Path(path_string)
            except Exception as E:
                self.log_debug(F'error parsing path "{path}": {E!s}')
                return None
            path_likeness = sum(v for v, x in [
                (1, path.suffix),
                (1, path_string.startswith('/')),
                (2, path_string.startswith('%')),
                (2, path_string.startswith('\\\\')),
                (2, path_string[1:3] == ':\\'),
            ] if x)
            if path_likeness < min(self.args.filter, 2):
                return None
            if self.args.filter >= 2:
                for k, part in enumerate(path.parts):
                    if not k:
                        drive, colon, slash = part.partition(':')
                        if colon and len(drive) == 1 and len(slash) <= 1:
                            continue
                        if part[0] == part[~0] == '%':
                            continue
                    if LetterWeights.Path(part) < 0.4 + (min(self.args.filter, 5) * 0.1):
                        return None
        return value

    def process(self, data):
        whitelist = set()

        def check(match: re.Match):
            for name, value in match.groupdict().items():
                if value is not None:
                    break
            else:
                raise RefineryCriticalException('Received empty match.')
            if value in whitelist:
                return None
            result = self._check_match(match.string, match.start(), name, value)
            if result is not None:
                return self.labelled(result, pattern=name)
            whitelist.add(value)

        transforms = [check]
        yield from self.matches_filtered(memoryview(data), self.args.pattern, *transforms)

class xtpdf (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.pdf and has the following commandline Interface:

usage: xtpdf [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]]

Extract objects from PDF documents.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtpdf(PathExtractorUnit):
    """
    Extract objects from PDF documents.
    """
    @PathExtractorUnit.Requires('pypdf>=3.1.0', 'formats', 'default', 'extended')
    def _pypdf2():
        import pypdf
        import pypdf.generic
        return pypdf

    def _walk(self, blob, memo: Optional[Set[int]] = None, *path):
        while isinstance(blob, self._pypdf2.generic.IndirectObject):
            try:
                blob = blob.get_object()
            except Exception:
                break
        if memo is None:
            memo = {id(blob)}
        elif id(blob) in memo:
            return
        else:
            memo.add(id(blob))
        try:
            name = blob['/F']
            blob = blob['/EF']['/F']
        except Exception:
            pass
        else:
            path = *path[:-1], F'/{name}'
        try:
            if TYPE_CHECKING:
                blob: EncodedStreamObject = cast(EncodedStreamObject, blob)
            extract = blob.get_data
        except AttributeError:
            pass
        else:
            yield UnpackResult(''.join(path), extract, kind='object')
            return

        if isinstance(blob, self._pypdf2.generic.ByteStringObject):
            yield UnpackResult(''.join(path), blob, kind='bytes')
            return
        if isinstance(blob, self._pypdf2.generic.TextStringObject):
            yield UnpackResult(''.join(path), blob.encode(self.codec), kind='string')
            return

        if isinstance(blob, (
            self._pypdf2.generic.BooleanObject,
            self._pypdf2.generic.ByteStringObject,
            self._pypdf2.generic.FloatObject,
            self._pypdf2.generic.NameObject,
            self._pypdf2.generic.NullObject,
            self._pypdf2.generic.NumberObject,
            self._pypdf2.generic.RectangleObject,
        )):
            # unhandled PDF objects
            return

        if isinstance(blob, self._pypdf2.generic.TreeObject):
            blob = list(blob)

        pdf = self._pypdf2.generic.PdfObject

        if isinstance(blob, list):
            if (
                len(blob) % 2 == 0
                and all(isinstance(key, str) for key in islice(iter(blob), 0, None, 2))
                and all(isinstance(key, pdf) for key in islice(iter(blob), 1, None, 2))
            ):
                blob = dict(zip(*([iter(blob)] * 2)))
            else:
                for key, value in enumerate(blob):
                    yield from self._walk(value, memo, *path, F'/{key}')
                return

        if not isdict(blob):
            return

        for key, value in blob.items():
            if not isinstance(key, str):
                continue
            if not key.startswith('/'):
                key = F'/{key}'
            yield from self._walk(value, memo, *path, key)

    def unpack(self, data):
        with MemoryFile(data, read_as_bytes=True) as stream:
            with NoLogging():
                pdf = self._pypdf2.PdfReader(stream)
                catalog = pdf.trailer['/Root']
                yield from self._walk(catalog)

    @classmethod
    def handles(self, data: bytearray) -> Optional[bool]:
        return data.startswith(B'%PDF-')

class xtpyi (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', user_code=False, unmarshal=0)

This unit is implemented in refinery.units.formats.archive.xtpyi and has the following commandline Interface:

usage: xtpyi [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-u | -y]
             [path [path ...]]

Extracts and decompiles files from a Python Installer (aka PyInstaller) archive.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -u, --user-code  Extract only source code files from the root of the archive. These usually
                   implement the actual domain logic.
  -y, --unmarshal  (DANGEROUS) Unmarshal embedded PYZ archives. Warning: Maliciously crafted
                   packages can potentially exploit this to execute code. It is advised to only
                   use this option inside an isolated environment. Specify twice to decompile
                   unmarshalled Python bytecode.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtpyi(ArchiveUnit):
    """
    Extracts and decompiles files from a Python Installer (aka PyInstaller) archive.
    """
    def __init__(
        self, *paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False,
        path=b'path', date=b'date',
        user_code: Arg.Switch('-u', group='FILTER', help=(
            'Extract only source code files from the root of the archive. These usually implement '
            'the actual domain logic.')) = False,
        unmarshal: Arg('-y', action='count', group='FILTER', help=(
            '(DANGEROUS) Unmarshal embedded PYZ archives. Warning: Maliciously crafted packages can '
            'potentially exploit this to execute code. It is advised to only use this option inside '
            'an isolated environment. Specify twice to decompile unmarshalled Python bytecode.'
        )) = 0
    ):
        super().__init__(
            *paths,
            list=list,
            join_path=join_path,
            drop_path=drop_path,
            fuzzy=fuzzy,
            exact=exact,
            regex=regex,
            path=path,
            date=date,
            unmarshal=unmarshal,
            user_code=user_code,
        )

    @ArchiveUnit.Requires('xdis', 'arc', 'python', 'extended')
    def _xdis():
        import xdis.load
        import xdis.magics
        import xdis.marsh
        import xdis.op_imports
        import xdis
        A, B, C, *_ = sys.version_info
        version = F'{A}.{B}.{C}'
        canonic = F'{A}.{B}'
        if version not in xdis.magics.canonic_python_version:
            class opcode_dummy:
                version = float(canonic)
                def __init__(self, name): self.name = name
                def __getattr__(self, key): return opcode_dummy(F'{self.name}.{key}')
                def __call__(self, *a, **k): return None
                def __str__(self): return self.name
                def __repr__(self): return self.name
            import importlib
            magic = importlib.util.MAGIC_NUMBER
            xdis.magics.add_magic_from_int(xdis.magics.magic2int(magic), version)
            xdis.magics.by_magic.setdefault(magic, set()).add(version)
            xdis.magics.by_version[version] = magic
            xdis.magics.magics[canonic] = magic
            xdis.magics.canonic_python_version[canonic] = canonic
            xdis.magics.add_canonic_versions(version, canonic)
            xdis.op_imports.op_imports.setdefault(canonic, opcode_dummy('dummy'))
        del A, B, C, version
        import xdis.std
        return xdis

    @ArchiveUnit.Requires('uncompyle6', 'arc', 'python', 'extended')
    def _uncompyle6():
        import uncompyle6
        import uncompyle6.main
        return uncompyle6

    @ArchiveUnit.Requires('decompyle3', 'arc', 'python')
    def _decompyle3():
        import decompyle3
        import decompyle3.main
        return decompyle3

    def unpack(self, data):
        view = memoryview(data)
        positions = [m.start() for m in re.finditer(re.escape(PyInstallerArchiveEpilogue.MagicSignature), view)]
        mode = Unmarshal(min(2, int(self.args.unmarshal)))
        self.log_debug(F'unmarshal mode: {mode.name}')
        if not positions:
            raise LookupError('unable to find PyInstaller signature')
        if len(positions) > 2:
            # first position is expected to be the sentinel value in the unpacker stub
            width = max(len(F'{p:X}') for p in positions)
            for position in positions:
                self.log_info(F'magic signature found at offset 0x{position:0{width}X}')
            self.log_warn(F'found {len(positions) - 1} potential PyInstaller epilogue markers; using last one.')
        archive = PyInstallerArchiveEpilogue(view, positions[-1], mode)
        for name, file in archive.files.items():
            if self.args.user_code:
                if file.type != PiType.USERCODE:
                    continue
                if name.startswith('pyiboot'):
                    continue
            yield self._pack(name, None, file.data, type=file.type.name)

    @classmethod
    def handles(cls, data: ByteString) -> Optional[bool]:
        return PyInstallerArchiveEpilogue.MagicSignature in data

class xtrtf (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.office.xtrtf and has the following commandline Interface:

usage: xtrtf [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]]

Extract embedded objects in RTF documents.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtrtf(PathExtractorUnit):
    """
    Extract embedded objects in RTF documents.
    """
    @PathExtractorUnit.Requires('oletools', 'formats', 'office', 'extended')
    def _oletools():
        import oletools
        import oletools.rtfobj
        import oletools.oleobj
        return oletools

    def unpack(self, data):
        parser = self._oletools.rtfobj.RtfObjParser(data)
        parser.parse()
        width = len(str(len(parser.objects)))
        for k, item in enumerate(parser.objects):
            item: RtfObject
            path = item.filename or F'carve{k:0{width}}.bin'
            data = item.rawdata
            meta = {}
            if item.is_ole:
                if item.format_id == self._oletools.oleobj.OleObject.TYPE_EMBEDDED:
                    meta['ole_type'] = 'EMBEDDED'
                elif item.format_id == self._oletools.oleobj.OleObject.TYPE_LINKED:
                    meta['ole_type'] = 'LINKED'
                if item.is_package:
                    meta['src_path'] = item.src_path
                    meta['tmp_path'] = item.temp_path
                if item.clsid is not None:
                    meta['ole_info'] = item.clsid_desc
                    meta['ole_guid'] = item.clsid
                meta['ole_name'] = item.class_name
            if item.oledata:
                data = item.oledata
                pos = item.rawdata.find(data)
                if pos > 0:
                    meta['raw_header'] = item.rawdata[:pos]
                if item.olepkgdata:
                    data = item.olepkgdata
                    pos = item.oledata.find(data)
                    if pos >= 0:
                        meta['ole_header'] = item.oledata[:pos]
            yield UnpackResult(path, data, **meta)

    @classmethod
    def handles(self, data: bytearray) -> bool:
        return data[:500].lower().lstrip().startswith(b'{\\rtf')

class xttar (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xttar and has the following commandline Interface:

usage: xttar [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD]
             [path [path ...]]

Extract files from a Tar archive.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xttar(ArchiveUnit):
    """
    Extract files from a Tar archive.
    """
    def unpack(self, data: bytearray):
        with MemoryFile(data) as stream:
            try:
                archive = tarfile.open(fileobj=stream)
            except Exception:
                ustar = data.find(B'ustar')
                if ustar < 257:
                    raise
                stream.seek(ustar - 257)
                archive = tarfile.open(fileobj=stream)
        for info in archive.getmembers():
            if not info.isfile():
                continue
            extractor = archive.extractfile(info)
            if extractor is None:
                continue
            date = datetime.datetime.fromtimestamp(info.mtime)
            yield self._pack(info.name, date, lambda e=extractor: e.read())

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        ustar = data.find(B'ustar')
        if ustar >= 0:
            return ustar == 257 or data[ustar:ustar + 3] in (B'\x00\x30\x30', B'\x20\x20\x00')
        return False

class xtvba (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.office.xtvba and has the following commandline Interface:

usage: xtvba [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]]

Extract VBA macro code from Office documents.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtvba(PathExtractorUnit):
    """
    Extract VBA macro code from Office documents.
    """
    @PathExtractorUnit.Requires('oletools', 'formats', 'office', 'extended')
    def _olevba():
        from oletools import olevba
        return olevba

    def unpack(self, data):
        sentinel = uuid4()
        try:
            parser = self._olevba.VBA_Parser(sentinel, data=bytes(data), relaxed=True)
        except self._olevba.FileOpenError:
            raise ValueError('Input data not recognized by VBA parser')
        for p1, stream_path, p2, code in parser.extract_all_macros():
            if not stream_path:
                if p1 == sentinel:
                    continue
                if p2 == sentinel:
                    continue
            yield UnpackResult(stream_path, code.encode(self.codec))

class xtw (stripspace=False, duplicates=False, longest=False, take=None)

This unit is implemented in refinery.units.pattern.xtw and has the following commandline Interface:

usage: xtw [-h] [-L] [-Q] [-0] [-v] [-x] [-r] [-l] [-t N]

Extract Wallets: Extracts anything that looks like a cryptocurrency wallet address. This works
similar to the xtp unit.

optional arguments:
  -x, --stripspace  Strip all whitespace from input data.
  -r, --duplicates  Yield every (transformed) Match, even when it was found before.
  -l, --longest     Sort results by length.
  -t, --take N      Return only the first N occurrences in order of appearance.

generic options:
  -h, --help        Show this help message and exit.
  -L, --lenient     Allow partial results as output.
  -Q, --quiet       Disables all log output.
  -0, --devnull     Do not produce any output.
  -v, --verbose     Specify up to two times to increase log level.

Expand source code Browse git

class xtw(PatternExtractor):
    """
    Extract Wallets: Extracts anything that looks like a cryptocurrency wallet address.
    This works similar to the `refinery.xtp` unit.
    """

    def __init__(self, stripspace=False, duplicates=False, longest=False, take=None):
        self.superinit(super(), **vars(), ascii=True, utf16=True)

    def process(self, data):
        pattern = '|'.join(FR'(?P<{p.name}>\b{p.value}\b)' for p in wallets)
        pattern = FR'\b{pattern}\b'.encode('latin1')

        def check(match: re.Match[bytes]):
            for name, value in match.groupdict().items():
                if value is not None:
                    break
            else:
                raise RefineryCriticalException('Received empty match.')
            return self.labelled(value, kind=name)

        yield from self.matches_filtered(memoryview(data), pattern, check)

class xtxml (*paths, format=None, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit is implemented in refinery.units.formats.xml and has the following commandline Interface:

usage: xtxml [-h] [-L] [-Q] [-0] [-v] [-f F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME]
             [path [path ...]]

Extract values from an XML document.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -f, --format F   A format expression to be applied for computing the path of an item. This must
                   use metadata that is available on the item. The current tag can be accessed as
                   {0}. If no format is specified, the unit attempts to derive a good attribute
                   from the XML tree to use for generating paths.
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtxml(XMLToPathExtractorUnit):
    """
    Extract values from an XML document.
    """
    def unpack(self, data):
        root = xml.parse(data.strip())
        meta = metavars(data)
        path = self._make_path_builder(meta, root)

        def walk(node: xml.XMLNode, *parts: str):
            def extract(node: xml.XMLNode = node):
                if not node.children:
                    return node.content.encode(self.codec)
                with MemoryFile() as stream:
                    node.write(stream)
                    return bytes(stream.getbuffer() | ppxml)
            tag_pre_count = Counter()
            tag_run_count = Counter()
            for child in node.children:
                tag_pre_count[child.tag] += 1
            yield UnpackResult('/'.join(parts), extract, **node.attributes)
            for child in node.children:
                if tag_pre_count[child.tag] == 1:
                    yield from walk(child, *parts, path(child))
                    continue
                tag_run_count[child.tag] += 1
                index = tag_run_count[child.tag]
                yield from walk(child, *parts, path(child, index))

        yield from walk(root, path(root))

class xtzip (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit is implemented in refinery.units.formats.archive.xtzip and has the following commandline Interface:

usage: xtzip [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD]
             [path [path ...]]

Extract files from a Zip archive.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtzip(ArchiveUnit):
    """
    Extract files from a Zip archive.
    """
    @ArchiveUnit.Requires('chardet', 'default', 'extended')
    def _chardet():
        import chardet
        return chardet

    @ArchiveUnit.Requires('pyzipper', 'arc', 'default', 'extended')
    def _pyzipper():
        import pyzipper
        return pyzipper

    @classmethod
    def _carver(cls):
        return carve_zip

    def unpack(self, data: bytearray):
        from zipfile import ZipFile, ZipInfo

        password = bytes(self.args.pwd)
        archive = ZipFile(MemoryFile(data))

        if password:
            archive.setpassword(password)
        else:
            def password_invalid(pwd: Optional[str], pyzipper=False):
                nonlocal archive
                if pwd is not None:
                    archive.setpassword(pwd.encode(self.codec))
                try:
                    archive.testzip()
                except NotImplementedError:
                    if pyzipper:
                        raise
                    self.log_debug('compression method unsupported, switching to pyzipper')
                    archive = self._pyzipper.AESZipFile(MemoryFile(data))
                    return password_invalid(pwd, True)
                except RuntimeError as E:
                    if 'password' not in str(E):
                        raise
                    return True
                else:
                    if pwd:
                        self.log_debug('using password:', pwd)
                    return False
            for pwd in [None, *self._COMMON_PASSWORDS]:
                if not password_invalid(pwd):
                    break
            else:
                raise RuntimeError('Archive is password-protected.')

        for info in archive.infolist():
            def xt(archive: ZipFile = archive, info: ZipInfo = info):
                try:
                    return archive.read(info.filename)
                except RuntimeError as E:
                    if 'password' not in str(E):
                        raise
                    if not password:
                        raise RuntimeError('archive is password-protected')
                    else:
                        raise RuntimeError(F'invalid password: {password.decode(self.codec)}') from E
            if info.filename:
                if info.is_dir():
                    continue

            # courtesy of https://stackoverflow.com/a/37773438/9130824
            filename = info.filename
            if info.flag_bits & ZIP_FILENAME_UTF8_FLAG == 0:
                filename_bytes = filename.encode('437')
                try:
                    guessed_encoding = self._chardet.detect(filename_bytes)['encoding']
                except ImportError:
                    guessed_encoding = None
                guessed_encoding = guessed_encoding or 'cp1252'
                filename = filename_bytes.decode(guessed_encoding, 'replace')

            try:
                date = datetime(*info.date_time)
            except Exception as e:
                self.log_info(F'{e!s} - unable to determine date from tuple {info.date_time} for: {filename}')
                date = None

            yield self._pack(filename, date, xt)

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        return data.rfind(ZipEndOfCentralDirectory.SIGNATURE) > 0

class xtzpaq (*paths, index=False, pwd=b'', date=b'date', path=b'path', regex=False, exact=False, fuzzy=0, drop_path=False, join_path=False, list=False)

This unit is implemented in refinery.units.formats.archive.xtzpaq and has the following commandline Interface:

usage: xtzpaq [-h] [-L] [-Q] [-0] [-v] [-i] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME]
              [-p PWD]
              [path [path ...]]

Extract files from a ZPAQ archive.

positional arguments:
  path             Wildcard pattern for the path of the item to be extracted. Each item is
                   returned as a separate output of this unit. Paths may contain wildcards; The
                   default argument is a single wildcard, which means that every item will be
                   extracted. If a given path yields no results, the unit performs increasingly
                   fuzzy searches with it. This can be disabled using the --exact switch.

optional arguments:
  -i, --index      Archive is an index (no d-blocks).
  -l, --list       Return all matching paths as UTF8-encoded output chunks.
  -j, --join-path  Join path names from container with previous path names.
  -d, --drop-path  Do not modify the path variable for output chunks.
  -z, --fuzzy      Specify once to add a leading wildcard to each patterns, twice to also add a
                   trailing wildcard.
  -e, --exact      Path patterns never match on substrings.
  -r, --regex      Use regular expressions instead of wildcard patterns.
  -P, --path NAME  Name of the meta variable to receive the extracted path. The default value is
                   "path".
  -D, --date NAME  Name of the meta variable to receive the extracted file date. The default
                   value is "date".
  -p, --pwd PWD    Optionally specify an extraction password.

generic options:
  -h, --help       Show this help message and exit.
  -L, --lenient    Allow partial results as output.
  -Q, --quiet      Disables all log output.
  -0, --devnull    Do not produce any output.
  -v, --verbose    Specify up to two times to increase log level.

Expand source code Browse git

class xtzpaq(ArchiveUnit):
    """
    Extract files from a ZPAQ archive.
    """

    _MAGIC = B'\x37\x6B\x53\x74\xA0\x31\x83\xD3\x8C\xB2\x28\xB0\xD3\x7A\x50\x51'

    def __init__(
        self, *paths,
        index: Arg.Switch('-i', help='Archive is an index (no d-blocks).') = False,
        **more
    ):
        for _code, _size in {
            _TCU32: 4,
            _TCI32: 4,
            _TCU16: 2,
            _TCI16: 2,
        }.items():
            _item_size = array(_code).itemsize
            if _item_size == _size:
                continue
            raise RuntimeError(
                F'Expected array type "{_code}" to have entries of size {_size}, but the API '
                F'reports a size of {_item_size}.')

        super().__init__(*paths, index=index, **more)

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        return cls._MAGIC in data

    def unpack(self, archive: bytearray):
        def mkdate(date) -> datetime:
            date = int(date)
            year = date // 1000000 // 10000
            month = date // 100000000 % 100
            day = date // 1000000 % 100
            hour = date // 10000 % 100
            minute = date // 100 % 100
            second = date % 100
            return datetime(year, month, day, hour, minute, second, 0)

        @dataclass
        class DT:
            date: int = 0
            attr: int = 0
            name: str = ""
            frag: List[int] = field(default_factory=list)

            @property
            def dt(self) -> Optional[datetime]:
                if self.date > 0:
                    return mkdate(self.date)

        # TODO: implement password-protected archives
        # key = self.args.pwd
        index = self.args.index
        bsize: Dict[int, int] = {}  # frag ID -> d block compressed size
        dt: Dict[str, DT] = {}      # filename -> date, attr, frags
        frag: List[bytes] = []      # ID -> hash[20] size[4] data
        csize = 0                   # expected offset of next non d block
        streaming = False
        journaling = False

        done = False
        dc = Decompressor()
        src = dc.set_input(archive)

        while not done and dc.read_block():
            while not done:
                filename = dc.read_filename()
                if filename is None:
                    break
                self.log_info('reading file', filename)
                comment = dc.read_comment()
                jsize = 0
                if len(comment) >= 4 and comment[-4:] == "jDC\x01":
                    num = re.search('^\\d+', comment)
                    if not num:
                        raise RuntimeError('missing size in comment')
                    jsize = int(num[0])
                    if streaming:
                        raise RuntimeError('journaling block after streaming one')
                    journaling = True
                    self.log_info('archive type is journaling')
                else:
                    if journaling:
                        raise RuntimeError('streaming block after journaling one')
                    if index:
                        raise RuntimeError('streaming block in index')
                    streaming = True
                    self.log_info('archive type is streaming')

                # Test journaling filename. The format must be
                # jDC[YYYYMMDDHHMMSS][t][NNNNNNNNNN]
                # where YYYYMMDDHHMMSS is the date, t is the type {c,d,h,i}, and
                # NNNNNNNNNN is the 10 digit first fragment ID for types c,d,h.
                # They must be in ascending lexicographical order.

                frag_id = 0
                block_type = None

                if journaling:
                    if len(filename) != 28:
                        raise RuntimeError('filename size not 28')
                    if filename[:3] != 'jDC':
                        raise RuntimeError('filename not jDC')
                    block_type = filename[17]
                    if block_type not in 'cdhi':
                        raise RuntimeError('type not c,d,h,i')
                    try:
                        mkdate(filename[3:17])
                    except Exception as E:
                        raise RuntimeError('invalid date') from E
                    frag_id = int(filename[18:28])
                    if not 1 <= frag_id <= 4294967295:
                        raise RuntimeError('fragment ID out of range')

                seg = MemoryFile(size_limit=jsize)
                dc.set_output(seg)
                sha1 = hashlib.sha1()
                dc.set_hasher(sha1)
                dc.decompress_data()

                if journaling and len(seg) != jsize:
                    raise RuntimeError('incomplete output')

                checksum = dc.read_segment_end()
                if checksum is None:
                    self.log_debug('no checksum')
                elif checksum != sha1.digest():
                    raise RuntimeError('SHA1 mismatch')

                # check csize at first non-d block
                if csize and block_type in 'chi':
                    if csize != offset:
                        raise RuntimeError(F'csize={csize} does not point to offset={offset}')
                    csize = 0

                # get csize from c block
                seglen = len(seg)
                seg = StructReader(seg.getbuffer())
                if block_type == 'c':
                    if seglen < 8:
                        raise RuntimeError("c block too small")
                    csize = seg.u64()
                    offset = src.tell() + 1
                    self.log_debug(F'csize={csize} at offset={offset}')
                    if csize >> 63:
                        self.log_warn('incomplete transaction at end of archive')
                        done = True
                    elif index and csize != 0:
                        raise RuntimeError('nonzero csize in index')
                    # Set csize to expected offset of first non d block
                    # assuming 1 more byte for unread end of block marker.
                    csize += offset

                if block_type == 'd':
                    if index:
                        raise RuntimeError('d block in index')
                    bsize[frag_id] = src.tell() + 1 - offset  # compressed size
                    self.log_debug(F' {bsize[frag_id]} -> {len(seg)}')
                    # Test frag size list at end. The format is f[id..id+n-1] fid n
                    # where fid may be id or 0. sizes must sum to the rest of block.
                    if seglen < 8:
                        raise RuntimeError('d block too small')
                    seg.seekset(-8)
                    fid = seg.u32() or frag_id
                    n = seg.u32()
                    if fid != frag_id:
                        raise RuntimeError('missing ID')
                    if n > (seglen - 8) // 4:
                        raise RuntimeError('frag list too big')
                    fragsum = 0  # computed sum of frag sizes
                    seg.seekset(-4 * (n + 2))
                    for _ in range(n):
                        fragsum += seg.u32()
                    if fragsum + n * 4 + 8 != seglen:
                        raise RuntimeError('bad frag size list')
                    # Save frag hashes and sizes. For output, save data too.
                    seg.seekset(fragsum)
                    data = memoryview(seg.getbuffer())
                    assert seg.remaining_bytes == n * 4 + 8
                    for i in range(n):
                        while len(frag) <= frag_id + i:
                            frag.append(B'')
                        if frag[frag_id + i]:
                            raise RuntimeError('duplicate frag ID')
                        f = seg.u32()
                        h = hashlib.sha1(data[:f]).digest()
                        frag[frag_id + i] = h + f.to_bytes(4, 'little') + data[:f]
                        data = data[f:]

                    assert len(data) == n * 4 + 8
                    assert seg.remaining_bytes == 8

                # Test and save h block. Format is: bsize (sha1[20] size)...
                # where bsize is the compressed size of the d block with the same id,
                # and each size corresonds to a fragment in that block. The list
                # must match the list in the d block if present.

                if block_type == 'h':
                    if seglen % 24 != 4:
                        raise RuntimeError('bad h block size')
                    b = seg.u32()
                    self.log_debug(F'[{frag_id}..{frag_id + seglen // 24}[ {b}')
                    fragsum = 0 # uncompressed size of all frags
                    for i in range(seglen // 24):
                        fd = seg.read(24)
                        if index:
                            while len(frag) <= frag_id + i:
                                frag.append(B'')
                            if frag[frag_id + i]:
                                raise RuntimeError('data in index')
                            frag[frag_id + i] = fd
                        elif frag_id + i >= len(frag) or len(frag[frag_id + i]) < 24:
                            raise RuntimeError('no matching d block')
                        elif frag[frag_id + i][:24] != fd:
                            raise RuntimeError('frag size or hash mismatch')
                        fragsum += int.from_bytes(fd[20:24], 'little')

                # Test i blocks and save files to extract. Format is:
                #   date filename 0 na attr[0..na) ni ptr[0..ni)   (to update)
                #   0    filename                                  (to delete)
                # Date is 64 bits in YYYYMMDDHHMMSS format.

                if block_type == 'i':
                    while not seg.eof:
                        f = DT(seg.u64())
                        f.name = seg.read_c_string('utf8')
                        if f.date > 0:
                            na = seg.u32()
                            if na > 65535:
                                raise ValueError('attr size > 65535')
                            f.attr = seg.read_integer(na * 8)
                            ni = seg.u32()
                            for i in range(ni):
                                a = seg.u32()
                                f.frag.append(a)
                                if index:
                                    continue
                                elif not 1 <= a < len(frag):
                                    raise RuntimeError('frag ID out of range')
                                elif not frag[a]:
                                    raise LookupError('missing frag data')
                        dt[f.name] = f

                if streaming:
                    yield self._pack(filename, None, seg.getvalue())

            offset = src.tell()

        self.log_debug(F'{offset} bytes of archive tested')

        if not journaling:
            return

        for name, f in dt.items():
            if not f.date:
                continue
            size = sum(
                int.from_bytes(frag[fp][20:24], 'little')
                for fp in f.frag
                if 0 < fp < len(frag) and len(frag[fp]) >= 24
            )
            out = MemoryFile()
            for fp in f.frag:
                if fp < len(frag):
                    out.write(memoryview(frag[fp])[24:])
            if len(out) != size:
                self.log_warn('invalid size during unpacking')
            yield self._pack(name, f.dt, out.getvalue())

class xxh (seed=0, text=False)

This unit is implemented in refinery.units.crypto.hash.xxhash and has the following commandline Interface:

usage: xxh [-h] [-L] [-Q] [-0] [-v] [-t] [seed]

Implements the xxHash hashing algorithm.

positional arguments:
  seed           specify the seed value; the default is 0

optional arguments:
  -t, --text     Output a hexadecimal representation of the hash.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.

Expand source code Browse git

class xxh(HashUnit):
    """
    Implements the xxHash hashing algorithm.
    """
    def __init__(
        self,
        seed: HashUnit.Arg.Number(metavar='seed', help='specify the seed value; the default is {default}') = 0,
        text=False
    ):
        super().__init__(text, seed=seed)

    def _algorithm(self, data):
        return xxhash(data, self.args.seed)

class xxtea (key, iv=b'', padding=None, mode=None, raw=False, swap=False, block_size=1)

This unit is implemented in refinery.units.crypto.cipher.xxtea and has the following commandline Interface:

usage: xxtea [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-s] [-b N] key

positional arguments:
  key                 The encryption key.

optional arguments:
  -i, --iv IV         Specifies the initialization vector. If none is specified, then a block of
                      zero bytes is used.
  -p, --padding P     Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm
                      does nothing. By default, all other algorithms are attempted. In most
                      cases, the data was not correctly decrypted if none of these work.
  -m, --mode M        Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB,
                      OFB, PCBC. By default, the CBC mode is used when an IV is is provided, and
                      ECB otherwise.
  -r, --raw           Set the padding to raw; ignored when a padding is specified.
  -s, --swap          Decode blocks as big endian rather than little endian.
  -b, --block-size N  Cipher block size in 32-bit words. The default value 1 implies that the
                      input is treated as a single block, which is common behaviour of many
                      implementations.

generic options:
  -h, --help          Show this help message and exit.
  -L, --lenient       Allow partial results as output.
  -Q, --quiet         Disables all log output.
  -0, --devnull       Do not produce any output.
  -v, --verbose       Specify up to two times to increase log level.
  -R, --reverse       Use the reverse operation; Specify twice to normalize (first decode, then
                      encode).

Expand source code Browse git

class xxtea(TEAUnit, cipher=BlockCipherFactory(XXTEA)):

    block_size: int = 8

    def __init__(
        self, key, iv=b'', padding=None, mode=None, raw=False, swap=False,
        block_size: Arg.Number('-b', help=(
            'Cipher block size in 32-bit words. The default value {default} implies that the input '
            'is treated as a single block, which is common behaviour of many implementations.')) = 1
    ):
        super().__init__(key, iv, padding, mode, raw, swap=swap, block_size=block_size)

    def _prepare_block(self, data: bytes):
        if self.args.block_size < 2:
            blocks, remainder = divmod(len(data), 4)
            if remainder:
                blocks += 1
            self.block_size = blocks * 4
        else:
            self.block_size = self.args.block_size * 4

    def encrypt(self, data: bytes) -> bytes:
        self._prepare_block(data)
        return super().encrypt(data)

    def decrypt(self, data: bytes) -> bytes:
        self._prepare_block(data)
        return super().decrypt(data)

    def _new_cipher(self, **optionals) -> CipherInterface:
        return super()._new_cipher(block_size=self.block_size, **optionals)

class zl (level=9, window=15, force=False, zlib_header=False, gzip_header=False)

This unit is implemented in refinery.units.compression.zl and has the following commandline Interface:

usage: zl [-h] [-L] [-Q] [-0] [-v] [-R] [-l N] [-w N] [-f] [-z | -g]

ZLib compression and decompression.

optional arguments:
  -l, --level N      Specify a compression level between 0 and 9.
  -w, --window N     Manually specify the window size between 8 and 15.
  -f, --force        Decompress as far as possible, even if all known methods fail.
  -z, --zlib-header  Use a ZLIB header.
  -g, --gzip-header  Use a GZIP header.

generic options:
  -h, --help         Show this help message and exit.
  -L, --lenient      Allow partial results as output.
  -Q, --quiet        Disables all log output.
  -0, --devnull      Do not produce any output.
  -v, --verbose      Specify up to two times to increase log level.
  -R, --reverse      Use the reverse operation; Specify twice to normalize (first decode, then
                     encode).

Expand source code Browse git

class zl(Unit):
    """
    ZLib compression and decompression.
    """

    def __init__(
        self,
        level  : Arg.Number('-l', bound=(0, 0X9), help='Specify a compression level between 0 and 9.') = 9,
        window : Arg.Number('-w', bound=(8, 0XF), help='Manually specify the window size between 8 and 15.') = 15,
        force  : Arg.Switch('-f', help='Decompress as far as possible, even if all known methods fail.') = False,
        zlib_header: Arg.Switch('-z', group='MODE', help='Use a ZLIB header.') = False,
        gzip_header: Arg.Switch('-g', group='MODE', help='Use a GZIP header.') = False
    ):
        if zlib_header and gzip_header:
            raise ValueError('You can only specify one header type (ZLIB or GZIP).')
        return super().__init__(level=level, window=window, force=force, zlib_header=zlib_header, gzip_header=gzip_header)

    def _force_decompress(self, data, mode):
        z = zlib.decompressobj(mode)

        def as_many_as_possible():
            for k in range(len(data)):
                try: yield z.decompress(data[k : k + 1])
                except zlib.error: break

        return B''.join(as_many_as_possible())

    def process(self, data):
        if data[0] == 0x78 or data[0:2] == B'\x1F\x8B' or self.args.zlib_header or self.args.gzip_header:
            mode_candidates = [self.args.window | 0x20, -self.args.window]
        else:
            mode_candidates = [-self.args.window, self.args.window | 0x20]
        mode_candidates.extend([0x10 | self.args.window, 0])
        for mode in mode_candidates:
            self.log_debug(F'using mode {mode} for decompression')
            try:
                z = zlib.decompressobj(mode)
                return z.decompress(data)
            except zlib.error:
                pass
        if self.args.force:
            return self._force_decompress(data, mode_candidates[0])
        raise ValueError('could not detect any zlib stream.')

    def reverse(self, data):
        mode = -self.args.window
        if self.args.zlib_header:
            mode = -mode
        if self.args.gzip_header:
            mode = -mode | 0x10
        self.log_debug(F'using mode {mode:+2d} for compression')
        zl = zlib.compressobj(self.args.level, zlib.DEFLATED, mode)
        zz = zl.compress(data)
        return zz + zl.flush(zlib.Z_FINISH)

    @classmethod
    def handles(self, data: bytearray):
        for sig in (
            B'\x1F\x8B',  # gzip header
            B'\x78\x01',  # zlib low compression
            B'\x78\x9C',  # zlib medium compression
            B'\x78\xDA',  # zlib high compression
        ):
            if data[:2] == sig:
                return True

class zstd

This unit is implemented in refinery.units.compression.zstd and has the following commandline Interface:

usage: zstd [-h] [-L] [-Q] [-0] [-v] [-R]

ZStandard (ZSTD) compression and decompression.

generic options:
  -h, --help     Show this help message and exit.
  -L, --lenient  Allow partial results as output.
  -Q, --quiet    Disables all log output.
  -0, --devnull  Do not produce any output.
  -v, --verbose  Specify up to two times to increase log level.
  -R, --reverse  Use the reverse operation; Specify twice to normalize (first decode, then
                 encode).

Expand source code Browse git

class zstd(Unit):
    """
    ZStandard (ZSTD) compression and decompression.
    """
    @Unit.Requires('pyzstd', 'all')
    def _pyzstd():
        import pyzstd
        return pyzstd

    def process(self, data):
        return self._pyzstd.ZstdDecompressor().decompress(data)

    def reverse(self, data):
        zc = self._pyzstd.ZstdCompressor()
        zc.compress(data)
        return zc.flush()