Module refinery
__ __ High Octane Triage Analysis __
|| _||______ __ __________ _____ ||
|| \||___ \__| ____/ ______/___ / ____\ ||
========||=====|| | __/ |/ \ /==| / __ \ __\===]|
'======|| | \ | | \_ _| \ ___/| | ||
||____ /__|___|__/ / | \____]| | ||
===============''====\/=========/ /==|__|=====|__|======'
\ /
\/
This is the binary refinery package documentation; see GitHub and PyPi for more information.
The package refinery exports all Units which are of type Entry;
this marker implies that the unit exposes a shell command. The command line interface for each of
these units is given below; this is the same text as would be available by executing the command
with the -h or --help option. The documentation for this module only lists the classes that
correspond to exported refinery units, but for convenience, the refinery module also exports the
classes Unit and Arg.
To better understand how the command line parameters are parsed, it is also recommended to study
the module documentation of the following library modules, as their content is relevant for how the
various Units can be combined.
1. refinery.lib.frame: framing syntax for working on lists of binary chunks
2. refinery.lib.argformats: the multibin syntax for refinery arguments
3. refinery.lib.meta: defining and using metadata variables within frames
4. refinery.units: writing custom units, adding command-line arguments, and using refinery units within Python code.
Expand source code Browse git
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
R"""
----------------------------------------------------------
__ __ High Octane Triage Analysis __
|| _||______ __ __________ _____ ||
|| \||___ \__| ____/ ______/___ / ____\ ||
========||=====|| | __/ |/ \ /==| / __ \ __\===]|
'======|| | \ | | \_ _| \ ___/| | ||
||____ /__|___|__/ / | \____]| | ||
===============''====\/=========/ /==|__|=====|__|======'
\ /
\/
This is the binary refinery package documentation; see
[GitHub](https://github.com/binref/refinery/) and
[PyPi](https://pypi.org/project/binary-refinery/)
for more information.
The package `refinery` exports all `refinery.units.Unit`s which are of type `refinery.units.Entry`;
this marker implies that the unit exposes a shell command. The command line interface for each of
these units is given below; this is the same text as would be available by executing the command
with the `-h` or `--help` option. The documentation for this module only lists the classes that
correspond to exported refinery units, but for convenience, the `refinery` module also exports the
classes `refinery.units.Unit` and `refinery.units.Arg`.
To better understand how the command line parameters are parsed, it is also recommended to study
the module documentation of the following library modules, as their content is relevant for how the
various `refinery.units.Unit`s can be combined.
1. `refinery.lib.frame`: framing syntax for working on lists of binary chunks
2. `refinery.lib.argformats`: the multibin syntax for refinery arguments
3. `refinery.lib.meta`: defining and using metadata variables within frames
4. `refinery.units`: writing custom units, adding command-line arguments, and using refinery
units within Python code.
"""
# Version of the package. It is also written into the pickled unit map, and a map
# that was stored with a different version string is discarded and rebuilt.
__version__ = '0.8.1'
# Distribution name; matches the PyPI project name linked in the module docstring.
__distribution__ = 'binary-refinery'
from typing import Dict, List, Optional, Type, TypeVar, Iterable
from importlib import resources
from datetime import datetime
from threading import RLock
import pickle
from refinery.units import Arg, Unit
_T = TypeVar('_T')


def _singleton(cls: Type[_T]) -> _T:
    """
    Class decorator which replaces the decorated class by an instance of it. The class is
    called without any arguments and the resulting object is returned.
    """
    instance = cls()
    return instance
@_singleton
class __unit_loader__:
    """
    Every unit can be imported from the refinery base module. The import is performed on demand to
    reduce import times. The library ships with a pickled dictionary that maps unit names to their
    corresponding module path. This data is expected to be stored as `__init__.pkl` in the package
    directory.

    The loader can be used as a context manager: entering it acquires a re-entrant lock that is
    shared by all users of the loader, leaving it releases that lock again.
    """
    # Maps the name of each known unit to the dotted path of the module that defines it.
    units: Dict[str, str]
    # Maps the name of each unit that was already imported to the imported object.
    cache: Dict[str, Type[Unit]]
    _lock: RLock = RLock()

    def __init__(self):
        with resources.path(__name__, '__init__.py') as current_file:
            # This is an annoying hack to allow this to work when __init__.pkl does not
            # yet exist during setup. Starting with Python 3.9, we could use the slightly
            # less awkward: resources.files(__name__).joinpath('__init__.pkl')
            self.path = current_file.parent / '__init__.pkl'
        self.reloading = False
        self.loaded = False
        self.units = {}
        self.cache = {}
        # Initialized to a fixed date in the past; nothing in this class updates it.
        self.last_reload = datetime(1985, 8, 5)
        self.load()

    def __enter__(self):
        self._lock.__enter__()
        return self

    def __exit__(self, et, ev, tb):
        return self._lock.__exit__(et, ev, tb)

    def load(self):
        """
        Read the unit map from the pickle file. If the file is missing, unreadable, malformed, or
        was written by a different package version, the map is rebuilt via `reload` instead.
        """
        try:
            # NOTE: pickle executes code on load; the file is read from the package directory
            # and is therefore trusted to the same degree as the package itself.
            with self.path.open('rb') as stream:
                stored = pickle.load(stream)
        except (OSError, EOFError, pickle.UnpicklingError, AttributeError, ImportError, IndexError):
            # These are the errors that reading and unpickling a missing, truncated, or corrupt
            # file can raise. A broken cache must not prevent importing the package.
            stored = None
        units = None
        if isinstance(stored, dict) and stored.get('version') == __version__:
            units = stored.get('units')
        if isinstance(units, dict):
            self.units = units
            self.loaded = True
        else:
            self.reload()

    def clear(self):
        """
        Forget all known units and mark the loader as not loaded.
        """
        self.loaded = False
        self.units.clear()
        self.cache.clear()

    def save(self):
        """
        Write the unit map and the current package version to the pickle file. Failures are
        ignored; on success, the loader is marked as loaded.
        """
        try:
            with self.path.open('wb') as stream:
                pickle.dump({
                    'units': self.units,
                    'version': __version__,
                }, stream)
        except Exception:
            # Saving is best effort: failing to persist the map (for example because the
            # location is not writable) must not make the in-memory map unusable.
            pass
        else:
            self.loaded = True

    def reload(self):
        """
        Rebuild the unit map by enumerating all entry points and store it on disk. The call does
        nothing if a reload is already in progress.
        """
        if self.reloading:
            return
        from refinery.lib.loader import get_all_entry_points
        self.reloading = True
        try:
            self.clear()
            for executable in get_all_entry_points():
                name = executable.__qualname__
                self.units[name] = executable.__module__
                self.cache[name] = executable
            # The in-memory map is complete at this point, regardless of whether it can be
            # written to disk. Without this, a failing save would cause every later call to
            # resolve to enumerate all entry points again.
            self.loaded = True
            self.save()
        finally:
            # Always reset the guard; otherwise one failed reload would disable all later ones.
            self.reloading = False

    def resolve(self, name) -> Optional[Type[Unit]]:
        """
        Return the unit with the given name, importing its module if necessary. The return value
        is None if the name is unknown, if the module that should contain it cannot be found, or
        if that module has no attribute of this name.
        """
        if not self.loaded:
            self.load()
        try:
            # Units that were resolved before (or discovered by reload) need no further import.
            return self.cache[name]
        except KeyError:
            pass
        try:
            module_path = self.units[name]
            module = __import__(module_path, None, None, [name])
        except (KeyError, ModuleNotFoundError):
            return None
        entry = getattr(module, name, None)
        if entry is not None:
            self.cache[name] = entry
        return entry
@_singleton
class __pdoc__(dict):
    """
    A dictionary whose entries are computed when `items` is called for the first time. It maps
    the names `Unit` and `Arg` as well as some method names of units to `False`, and it maps the
    name of each unit to a text that contains the command line help of that unit.
    """

    def __init__(self, *a, **kw):
        # All constructor arguments are ignored; the dictionary always starts out empty.
        super().__init__()
        self._loaded = False

    def _strip_globals(self, hlp: str):
        """
        Remove every line that starts with "global options:" (in any letter case) together with
        all lines following it that begin with white space.
        """
        kept = []
        skipping = False
        for line in hlp.splitlines(keepends=True):
            if skipping and line.lstrip() != line:
                # Still inside the section that is being removed.
                continue
            skipping = line.lower().startswith('global options:')
            if not skipping:
                kept.append(line)
        return ''.join(kept)

    @staticmethod
    def _overridden_abstract_methods(unit):
        """
        Walk the MRO of the unit until a class without `__abstractmethods__` is encountered.
        Generate the name of every public abstract method for which the unit has a truthy
        attribute that is not identical to the one of its direct parent class.
        """
        for ancestor in unit.mro():
            try:
                abstract_names: List[str] = ancestor.__abstractmethods__
            except AttributeError:
                return
            for method in abstract_names:
                if method.startswith('_'):
                    continue
                own = getattr(unit, method, None)
                inherited = getattr(unit.mro()[1], method, None)
                if own and own is not inherited:
                    yield method

    def _load(self):
        """
        Populate the dictionary; this is performed only once.
        """
        if self._loaded:
            return
        from .explore import get_help_string
        self['Unit'] = False
        self['Arg'] = False
        with __unit_loader__ as loader:
            for name in loader.units:
                unit = loader.resolve(name)
                if unit is None:
                    continue
                for method in self._overridden_abstract_methods(unit):
                    self[F'{name}.{method}'] = False
                hlp = get_help_string(unit, width=97)
                # Remove all backticks from the help text.
                hlp = hlp.replace('\x60', '')
                hlp = self._strip_globals(hlp).strip()
                self[name] = (
                    F'This unit is implemented in `{unit.__module__}` and has the following '
                    F'commandline Interface:\n```text\n{hlp}\n```'
                )
        self._loaded = True

    def items(self):
        # Make sure that all entries exist before they are enumerated.
        self._load()
        return super().items()
# All known unit names in case-insensitive order, followed by the additional exports.
__all__ = [
    *sorted(__unit_loader__.units, key=str.lower),
    Unit.__name__,
    Arg.__name__,
    '__unit_loader__',
    '__pdoc__',
]
def load(name) -> Optional[Unit]:
    """
    Look up the unit with the given name while holding the lock of the unit loader. The return
    value is whatever the loader resolves the name to, which is None when it cannot be resolved.
    """
    with __unit_loader__ as loader:
        unit = loader.resolve(name)
    return unit
def __getattr__(name):
    """
    Module-level attribute hook: names that are not defined in this module are looked up via
    the unit loader. An AttributeError is raised for names that the loader cannot resolve.
    """
    with __unit_loader__ as loader:
        unit = loader.resolve(name)
    if unit is not None:
        return unit
    raise AttributeError(name)
def __dir__():
    """
    Module-level hook for `dir`: returns the list `__all__`.
    """
    return __all__
Sub-modules
refinery.data
-
This module contains data resources.
refinery.explore
-
A commandline script to search for binary refinery units based on keywords.
refinery.lib
-
Library functions used by various refinery units.
refinery.shell
-
Shell-Like Unit Interface …
refinery.units
-
This package contains all refinery units. To write an executable refinery unit, it is sufficient to write a class that inherits from …
Units
class a3x (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
This unit is implemented in
refinery.units.formats.a3x
and has the following commandline Interface: usage: a3x [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]] Extracts embedded resources from compiled AutoIt scripts and decompiles the embedded script bytecode. The unit also works on compiled AutoIt executables. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; the default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each pattern, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class a3x(PathExtractorUnit):
    """
    Extracts embedded resources from compiled AutoIt scripts and decompiles the embedded script
    bytecode. The unit also works on compiled AutoIt executables.
    """
    def unpack(self, data: bytearray):
        """
        Scan the input for scripts and generate one `UnpackResult` for every record that has a
        path. Records of truncated scripts are only generated at the end, and only those which
        did not also occur in an intact script. If not a single intact script was found, the
        last parsing error that occurred (if any) is raised.
        """
        view = memoryview(data)
        cursor = 0
        # Parsing errors, indexed by the offset at which parsing was attempted.
        errors: Dict[int, Exception] = {}
        script_count = 0
        truncated: Set[A3xRecord] = set()
        intact: Set[A3xRecord] = set()

        def _package(records: Iterable[A3xRecord]) -> Generator[UnpackResult, None, None]:
            # Logs type and source path of each record; records without a path are skipped.
            for k, record in enumerate(records, 1):
                self.log_info(F'record {k} type:', record.type)
                self.log_info(F'record {k} path:', record.src_path)
                if record.path is None:
                    continue
                yield UnpackResult(
                    record.path,
                    record.extract,
                    srcpath=record.src_path,
                    created=record.created.isoformat(' ', 'seconds'),
                    written=record.written.isoformat(' ', 'seconds'),
                )

        while cursor < len(view):
            self.log_debug(F'searching at offset 0x{cursor:08X}')
            nc = data.find(A3xScript.MAGIC, cursor)
            if nc >= 0:
                cursor = nc
            else:
                # No script magic ahead: look for a record magic instead and attempt to parse
                # a script that starts A3xScript.WIDTH bytes before it. The scan ends when this
                # does not advance the cursor, which includes the case that nothing was found.
                rp = data.find(A3xRecord.MAGIC, cursor) - A3xScript.WIDTH
                if rp <= cursor:
                    break
                cursor = rp
            try:
                script = A3xScript(view[cursor:])
            except Exception as E:
                # Remember the error and retry one byte further.
                errors[cursor] = E
                cursor += 1
                continue
            else:
                valid = script.has_valid_magic()
                if valid:
                    _m = 'correct'
                else:
                    _m = 'invalid'
            if not script.body:
                # Nothing to extract here; skip ahead and keep searching.
                cursor += A3xScript.WIDTH
                if not script.has_valid_magic():
                    cursor += len(A3xRecord.MAGIC)
                continue
            if script.truncated:
                _a = 'truncated'
                truncated.update(script.body)
            else:
                script_count += 1
                _a = 'intact'
                intact.update(script.body)
            self.log_info(
                F'{_a} script of type', script.type,
                F'and length 0x{len(script):08X}',
                F'with {len(script.body)} records and {_m} magic:', script.magic
            )
            cursor += len(script)
            if script.truncated:
                # Records of truncated scripts are held back until the scan is complete.
                if not script.has_valid_magic():
                    cursor += len(A3xRecord.MAGIC)
                continue
            yield from _package(script.body)

        remaining = truncated - intact
        if remaining:
            self.log_warn('emitting records from truncated scripts')
            yield from _package(remaining)
            return
        elif truncated:
            self.log_debug('good news: intact scripts contained all records from truncated scripts')

        if script_count == 0:
            error = None
            # All errors are logged; after the loop, the name error refers to the last one.
            for offset, error in errors.items():
                self.log_warn(F'error at offset 0x{offset:08X}:', error)
            if error:
                raise error

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        """
        Returns whether the script magic or the record magic occurs anywhere in the data.
        """
        return A3xScript.MAGIC in data or A3xRecord.MAGIC in data
class add (argument, bigendian=False, blocksize=None)
-
This unit is implemented in
refinery.units.blockwise.add
and has the following commandline Interface:usage: add [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] argument Add the given argument to each block. positional arguments: argument A single numeric expression which provides the right argument to the operation, where the left argument is each block in the input data. This argument can also contain a sequence of bytes which is then split into blocks of the same size as the input data and used cyclically. optional arguments: -E, --bigendian Read chunks in big endian. -B, --blocksize N The size of each block in bytes, default is 1. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class add(BinaryOperationWithAutoBlockAdjustment):
    """
    Add the given argument to each block.
    """

    @staticmethod
    def operate(a, b):
        # Out-of-place variant: the sum is returned as a new value.
        total = a + b
        return total

    @staticmethod
    def inplace(a, b):
        # In-place variant: uses augmented assignment and returns nothing.
        a += b
class adler32 (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.checksums
and has the following commandline Interface:usage: adler32 [-h] [-L] [-Q] [-0] [-v] [-t] Returns the Adler32 Hash of the input data. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class adler32(HashUnit):
    """
    Returns the Adler32 Hash of the input data.
    """

    def _algorithm(self, data: bytes) -> bytes:
        # zlib.adler32 returns an unsigned 32-bit integer, which is emitted as 4 bytes in
        # big endian byte order.
        checksum = zlib.adler32(data)
        return checksum.to_bytes(4, 'big')
class aes (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)
-
This unit is implemented in
refinery.units.crypto.cipher.aes
and has the following commandline Interface: usage: aes [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-M N] [-A N] key AES encryption and decryption. positional arguments: key The encryption key. optional arguments: -i, --iv IV Specifies the initialization vector. If none is specified, then a block of zero bytes is used. -p, --padding P Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does nothing. By default, all other algorithms are attempted. In most cases, the data was not correctly decrypted if none of these work. -m, --mode M Choose cipher mode to be used. Possible values are: CBC, CCM, CFB, CTR, EAX, ECB, GCM, OCB, OFB. By default, the CBC mode is used when an IV is provided, and ECB otherwise. -r, --raw Set the padding to raw; ignored when a padding is specified. -e, --little-endian Only for CTR: Use a little endian counter instead of the default big endian. -S, --segment-size N Only for CFB: Number of bits into which data is segmented. It must be a multiple of 8. The default of 0 means that the block size will be used as the segment size. -M, --mac-len N Only for EAX, GCM, OCB, and CCM: Length of the authentication tag, in bytes. -A, --assoc-len N Only for CCM: Length of the associated data. If not specified, all associated data is buffered internally. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class aes(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(AES)):
    """
    AES encryption and decryption.
    """
    # The class body is intentionally empty: everything is derived from the base class and
    # the cipher that is passed as a class keyword argument.
class alu (operator, *argument, seed=0, prologue=None, epilogue=None, inc=False, dec=False, cbc=False, bigendian=False, blocksize=None, precision=None)
-
This unit is implemented in
refinery.units.blockwise.alu
and has the following commandline Interface:usage: alu [-h] [-L] [-Q] [-0] [-v] [-s SEED] [-p E] [-e E | -I | -D | -X] [-E] [-B N] [-P N] operator [argument [argument ...]] The arithmetic-logical unit. It allows you to specify a custom Python expression where the following variables are allowed: - the variable A: same as V[0] - the variable B: current block - the variable E: block value of encoded input (not changed after update) - the variable N: number of bytes in the input - the variable K: current index in the input - the variable S: the internal state value - the variable V: the vector of arguments - the variable I: function that casts to a signed int in current precision - the variable U: function that casts to unsigned int in current precision - the variable R: function; R(x,4) rotates x by 4 to the right - the variable L: function; L(x,4) rotates x by 4 to the left - the variable M: function; M(x,8) picks the lower 8 bits of x - the variable X: function that negates the bits of the input (The rotation operations are interpreted as shifts when arbitrary precision is used.) Each block of the input is replaced by the value of this expression. Additionally, it is possible to specify prologue and epilogue expressions which are used to update the state variable S before and after the update of each block, respectively. positional arguments: operator A Python expression defining the operation. argument A single numeric expression which provides the right argument to the operation, where the left argument is each block in the input data. This argument can also contain a sequence of bytes which is then split into blocks of the same size as the input data and used cyclically. optional arguments: -s, --seed SEED Optional seed value for the state variable S. The default is zero. This can be an expression involving the variable N. -p, --prologue E Optional expression with which the state variable S is updated before a block is operated on. 
-e, --epilogue E Optional expression with which the state variable S is updated after a block was operated on. -I, --inc equivalent to --epilogue=S+1 -D, --dec equivalent to --epilogue=S-1 -X, --cbc equivalent to --epilogue=(B) -E, --bigendian Read chunks in big endian. -B, --blocksize N The size of each block in bytes, default is 1. -P, --precision N The size of the variables used for computing the result. By default, this is equal to the block size. The value may be zero, indicating that arbitrary precision is required. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class alu(ArithmeticUnit):
    """
    The arithmetic-logical unit. It allows you to specify a custom Python expression where the
    following variables are allowed:

    - the variable `A`: same as `V[0]`
    - the variable `B`: current block
    - the variable `E`: block value of encoded input (not changed after update)
    - the variable `N`: number of bytes in the input
    - the variable `K`: current index in the input
    - the variable `S`: the internal state value
    - the variable `V`: the vector of arguments
    - the variable `I`: function that casts to a signed int in current precision
    - the variable `U`: function that casts to unsigned int in current precision
    - the variable `R`: function; `R(x,4)` rotates x by 4 to the right
    - the variable `L`: function; `L(x,4)` rotates x by 4 to the left
    - the variable `M`: function; `M(x,8)` picks the lower 8 bits of x
    - the variable `X`: function that negates the bits of the input

    (The rotation operations are interpreted as shifts when arbitrary precision is used.)

    Each block of the input is replaced by the value of this expression. Additionally, it is
    possible to specify prologue and epilogue expressions which are used to update the state
    variable `S` before and after the update of each block, respectively.
    """

    @staticmethod
    def _parse_op(definition, default=None):
        # Falls back to the default for empty definitions; raises when both are empty.
        definition = definition or default
        if not definition:
            raise ValueError('No definition given')
        return definition

    def __init__(
        self, operator: Arg(type=str, help='A Python expression defining the operation.'), *argument,
        seed: Arg('-s', type=str, help=(
            'Optional seed value for the state variable S. The default is zero. This can be an expression '
            'involving the variable N.')) = 0,
        prologue: Arg('-p', type=str, metavar='E', help=(
            'Optional expression with which the state variable S is updated before a block is operated on.')) = None,
        epilogue: Arg('-e', type=str, metavar='E', group='EPI', help=(
            'Optional expression with which the state variable S is updated after a block was operated on.')) = None,
        inc: Arg('-I', group='EPI', help='equivalent to --epilogue=S+1') = False,
        dec: Arg('-D', group='EPI', help='equivalent to --epilogue=S-1') = False,
        cbc: Arg('-X', group='EPI', help='equivalent to --epilogue=(B)') = False,
        bigendian=False, blocksize=None, precision=None
    ):
        # Each of the three switches is a shorthand for one epilogue expression. It is an
        # error to specify one of them when an epilogue was already given.
        for flag, flag_is_set, expression in [
            ('--cbc', cbc, '(B)'),
            ('--inc', inc, 'S+1'),
            ('--dec', dec, 'S-1'),
        ]:
            if flag_is_set:
                if epilogue is not None:
                    raise ValueError(
                        F'Ambiguous specification; epilogue was already set to {epilogue} '
                        F'when {flag} was parsed.'
                    )
                epilogue = expression

        self._index = IndexCounter()

        super().__init__(
            self._index,
            *argument,
            bigendian=bigendian,
            blocksize=blocksize,
            precision=precision,
            seed=seed,
            operator=self._parse_op(operator),
            # Without an explicit prologue or epilogue, the state is left unchanged.
            prologue=self._parse_op(prologue, 'S'),
            epilogue=self._parse_op(epilogue, 'S'),
        )

    @property
    def _is_ecb(self):
        return not self.args.epilogue and not self.args.prologue

    def _fastblock(self, _):
        raise FastBlockError

    def process(self, data):
        """
        Evaluate prologue, operator, and epilogue expression for each block of the input. The
        actual block iteration is performed by the parent class; for the duration of that call,
        the `operate` method of this instance is replaced by a closure that evaluates the
        expressions.
        """
        context = dict(metavars(data))
        seed = self.args.seed
        fbits = self.fbits
        fmask = self.fmask
        if isinstance(seed, str):
            seed = PythonExpression(seed, 'N', constants=metavars(data), mask=fmask)
        if callable(seed):
            seed = seed(context, N=len(data))
        self._index.init(self.fmask)

        def _expression(definition: str):
            return PythonExpression(definition, *'IBEASMNVRLX', all_variables_allowed=True, mask=fmask)

        prologue = _expression(self.args.prologue).expression
        epilogue = _expression(self.args.epilogue).expression
        operator = _expression(self.args.operator).expression

        def cast_unsigned(n) -> int:
            return int(n) & fmask

        def cast_signed(n) -> int:
            n = int(n) & fmask
            if n >> (fbits - 1):
                # The top bit is set: return the negative two's complement value.
                return -((~n + 1) & fmask)
            else:
                return n

        if fbits is INF:
            # With arbitrary precision there is nothing to rotate around; shift instead.
            def rotate_r(n, k):
                return n >> k

            def rotate_l(n, k):
                return n << k
        else:
            # The operator & binds tighter than |, so the complete expression has to be put in
            # parentheses before masking: otherwise the part (n << k) of the left rotation is
            # never reduced to the current precision. The input is masked first so that bits
            # beyond the precision cannot be shifted into the result.
            def rotate_r(n, k):
                n = n & fmask
                return ((n >> k) | (n << (fbits - k))) & fmask

            def rotate_l(n, k):
                n = n & fmask
                return ((n << k) | (n >> (fbits - k))) & fmask

        def negate_bits(n):
            return n ^ fmask

        def mask_to_bits(x, b):
            return x & ((1 << b) - 1)

        context.update(
            N=len(data),
            S=seed,
            I=cast_signed,
            U=cast_unsigned,
            R=rotate_r,
            L=rotate_l,
            X=negate_bits,
            M=mask_to_bits,
        )

        def operate(block, index, *args):
            context.update(K=index, B=block, E=block, V=args)
            if args:
                context['A'] = args[0]
            # NOTE(review): eval runs the expressions that were passed to this unit as
            # arguments. Whether PythonExpression restricts what they may contain is not
            # visible here; do not feed expressions from untrusted sources without checking.
            context['S'] = eval(prologue, None, context)
            context['B'] = eval(operator, None, context)
            context['S'] = eval(epilogue, None, context)
            return context['B']

        placeholder = self.operate
        self.operate = operate

        try:
            result = super().process(data)
        finally:
            # Always restore the original method, also when processing fails.
            self.operate = placeholder

        return result

    @staticmethod
    def operate(block, index, *args):
        raise RuntimeError('This operate method cannot be called.')

    def inplace(self, block, *args) -> None:
        super().inplace(block, *args)
class aplib
-
This unit is implemented in
refinery.units.compression.ap
and has the following commandline Interface:usage: aplib [-h] [-L] [-Q] [-0] [-v] [-R] [-F] APLib compression and decompression. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class aplib(Unit):
    """
    APLib compression and decompression.
    """

    def reverse(self, buf):
        packer = compressor(buf)
        return packer.compress()

    def process(self, buf):
        view = memoryview(buf)
        skip = 0
        if view[:4] == B'AP32':
            # The four bytes after the signature are read as the little endian header size.
            # Values above 0x80 are ignored, in which case nothing is skipped.
            size = int.from_bytes(buf[4:8], 'little')
            if size <= 0x80:
                self.log_info(F'detected aPLib header of size {size}')
                skip = size
        return decompressor(view[skip:]).decompress()

    @classmethod
    def handles(cls, data: bytearray):
        # Affirmative only when the signature is present; otherwise no statement is made.
        return True if data[:4] == B'AP32' else None
class asm (mode='x32', *, count=None, until=None, no_address=False, no_hexdump=False)
-
This unit is implemented in
refinery.units.sinks.asm
and has the following commandline Interface:usage: asm [-h] [-L] [-Q] [-0] [-v] [-c N] [-u STR] [-A] [-H] [[x32|x64|..]] Disassembles the input data using capstone and produces a human-readable disassembly listing. It internally uses the opc unit for this, which is an alternative option if you are looking for more programmatic disassembly. positional arguments: [x32|x64|..] Machine code architecture, default is x32. Select from the following list: x16, x32, x64, ppc32, ppc64, mips32, mips64. optional arguments: -c, --count N Maximum number of bytes to disassemble, infinite by default. -u, --until STR Disassemble until the given string appears among the disassembly. -A, --no-address Disable address display. -H, --no-hexdump Disable opcodes hexdump. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
# The source of this unit is reproduced below exactly as it was extracted, on a single line.
# __init__ forwards its arguments to the parent constructor and additionally passes the fixed
# values nvar='_name', avar='_addr' and ovar='_arg'.
# process collects all outputs of the parent's process method and produces no output if there
# are none. Otherwise, it computes column widths from the entries '_name', '_args' and '_addr'
# of these outputs and yields one text line per output, encoded with self.codec. Each line
# contains name and arguments, followed by a hexdump unless no_hexdump is set, and preceded by
# the address unless no_address is set.
# NOTE(review): ovar is passed as '_arg' while process reads '_args'; confirm against opc that
# this is intended.
# NOTE(review): the flattened text does not show whether "memo_width = memo_width + 2" belongs
# to the body of "if no_address:"; confirm against the original file before reformatting.
class asm(opc): """ Disassembles the input data using capstone and produces a human-readable disassembly listing. It internally uses the `refinery.opc` unit for this, which is an alternative option if you are looking for more programmatic disassembly. """ def __init__( self, mode='x32', *, count=None, until=None, no_address: Arg.Switch('-A', help='Disable address display.') = False, no_hexdump: Arg.Switch('-H', help='Disable opcodes hexdump.') = False, ): super().__init__( mode=mode, nvar='_name', avar='_addr', ovar='_arg', count=count, until=until, no_address=no_address, no_hexdump=no_hexdump, ) def process(self, data): insns = list(super().process(data)) if not insns: return no_address = self.args.no_address no_hexdump = self.args.no_hexdump def _hl(x): return len(hex(x)) args_width = max(len(insn['_args']) for insn in insns) memo_width = max(len(insn['_name']) for insn in insns) addr_width = max(_hl(insn['_addr']) for insn in insns) if no_address: addr_width = 0 memo_width = memo_width + 2 max_data_bytes_count = max(len(c) for c in insns) padding = addr_width + memo_width + args_width + 10 metrics_opc = HexDumpMetrics(max_data_bytes_count, padding=padding) for insn in insns: hd = one(hexdump(insn, metrics_opc)) name = insn.meta.pop('_name') args = insn.meta.pop('_args') addr = insn.meta.pop('_addr') msg = F' {name:<{memo_width}} {args:<{args_width}}' if not no_hexdump: msg = F'{msg} ; {hd}' if not no_address: msg = F'{addr:0{addr_width}X}: {msg}' yield msg.encode(self.codec)
class atbash
-
This unit is implemented in
refinery.units.encoding.atbash
and has the following commandline Interface:usage: atbash [-h] [-L] [-Q] [-0] [-v] [-R] https://en.wikipedia.org/wiki/Atbash Atbash encoding and decoding. Fairly useless in the 21st century, except for picking out crypto nerds. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class atbash(Unit):
    """
    https://en.wikipedia.org/wiki/Atbash
    Atbash encoding and decoding. Fairly useless in the 21st century, except
    for picking out crypto nerds.
    """

    def process(self, data: bytearray):
        upper_a, upper_z = B'AZ'
        lower_a, lower_z = B'az'
        # The input is modified in place: each letter is mirrored within its alphabet, so
        # that the first letter becomes the last one and vice versa. Other bytes are kept.
        for position, byte in enumerate(data):
            if upper_a <= byte <= upper_z:
                data[position] = upper_a + upper_z - byte
            elif lower_a <= byte <= lower_z:
                data[position] = lower_a + lower_z - byte
        return data

    # The substitution is its own inverse.
    reverse = process
class autoxor (range=slice(1, 32, None))
-
This unit is implemented in
refinery.units.misc.autoxor
and has the following commandline Interface:usage: autoxor [-h] [-L] [-Q] [-0] [-v] [start:end:step] Assumes a XOR-encoded input and automatically attempts to find the correct XOR key. The method is based on the assumption that the plaintext input contains one letter that occurs with a much higher frequency than all other letters; this is the case for the null byte in PEs, and also for the space character in many text files. positional arguments: start:end:step range of length values to try in Python slice syntax, the default is 1:32. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class autoxor(xkey):
    """
    Assumes a XOR-encoded input and automatically attempts to find the correct XOR key. The method
    is based on the assumption that the plaintext input contains one letter that occurs with a much
    higher frequency than all other letters; this is the case for the null byte in PEs, and also
    for the space character in many text files.
    """

    def process(self, data: bytearray):
        key = super().process(data)
        if not key:
            self.log_warn('No key was found; returning original data.')
            return data
        [decoded] = data | xor(key)
        [flipped] = decoded | xor(0x20)
        # If the result, additionally XORed with 0x20, consists exclusively of white space and
        # printable ASCII characters (but not only of white space), then that variant and the
        # correspondingly adjusted key are used instead.
        looks_like_text = re.fullmatch(BR'[\s!-~]+', flipped) and not flipped.isspace()
        if looks_like_text:
            key = bytes(key | xor(0x20))
            decoded = flipped
        return self.labelled(decoded, key=key)
class b32
-
This unit is implemented in
refinery.units.encoding.b32
and has the following commandline Interface:usage: b32 [-h] [-L] [-Q] [-0] [-v] [-R] Base32 encoding and decoding. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class b32(Unit):
    """
    Base32 encoding and decoding.
    """

    def reverse(self, data):
        return base64.b32encode(data)

    def process(self, data: bytearray):
        """
        Decode the input after repairing its padding: missing padding characters are appended
        and excess padding characters are removed, both in place.
        """
        # Find the length of the input without its trailing padding characters. The else
        # branch covers the input that consists of nothing but padding (and the empty input):
        # without it, the loop variable would be left at 1 and one padding character would be
        # treated as data.
        for before_padding in range(len(data), 0, -1):
            if data[before_padding - 1:before_padding] != B'=':
                break
        else:
            before_padding = 0
        # Number of padding characters required to reach a multiple of 8 characters.
        padding_size = -before_padding % 8
        missing = before_padding + padding_size - len(data)
        if missing > 0:
            self.log_info(F'detected incorrect padding: added {missing} padding characters')
            data.extend(B'=' * missing)
        if missing < 0:
            self.log_info(F'detected incorrect padding: removed {-missing} padding characters')
            data[padding_size + before_padding:] = []
        return base64.b32decode(data, casefold=True)
class b58
-
This unit is implemented in
refinery.units.encoding.b58
and has the following commandline Interface:usage: b58 [-h] [-L] [-Q] [-0] [-v] [-R] Base58 encoding and decoding. It is famously used as an encoding in Bitcoin addresses because the alphabet omits digits and letters that look similar. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class b58(base):
    """
    Base58 encoding and decoding. It is famously used as an encoding in Bitcoin addresses
    because the alphabet omits digits and letters that look similar.
    """

    def __init__(self):
        # The alphabet is passed to the parent class, which implements the conversion.
        alphabet = b'123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
        super().__init__(alphabet)
class b62
-
This unit is implemented in
refinery.units.encoding.b62
and has the following commandline Interface:usage: b62 [-h] [-L] [-Q] [-0] [-v] [-R] Base62 encoding and decoding. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class b62(base):
    """
    Base62 encoding and decoding.
    """
    def __init__(self):
        # Digits first, then upper case letters, then lower case letters.
        alphabet = (
            b'0123456789'
            b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
            b'abcdefghijklmnopqrstuvwxyz'
        )
        super().__init__(alphabet)
class b64 (urlsafe=False)
-
This unit is implemented in
refinery.units.encoding.b64
and has the following commandline Interface:usage: b64 [-h] [-L] [-Q] [-0] [-v] [-R] [-F] [-u] Base64 encoding and decoding. optional arguments: -u, --urlsafe use URL-safe alphabet generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class b64(Unit):
    """
    Base64 encoding and decoding.
    """
    def __init__(self, urlsafe: Arg.Switch('-u', help='use URL-safe alphabet') = False):
        super().__init__(urlsafe=urlsafe)

    def reverse(self, data):
        """
        Encode the input as Base64, using the URL-safe alphabet if it was requested.
        """
        altchars = B'-_' if self.args.urlsafe else None
        return base64.b64encode(data, altchars=altchars)

    def process(self, data: bytearray):
        """
        Decode Base64 input. Excess padding is appended to the input buffer so that inputs with
        missing padding still decode. The URL-safe alphabet is selected when it was requested or
        when the data contains its special characters and none of the standard ones.
        """
        if not data:
            return data
        if len(data) == 1:
            raise ValueError('single byte can not be base64-decoded.')
        data.extend(B'===')
        has_urlsafe_chars = B'-' in data or B'_' in data
        has_standard_chars = B'+' in data or B'/' in data
        if has_urlsafe_chars and not has_standard_chars or self.args.urlsafe:
            altchars = B'-_'
        else:
            altchars = None
        return base64.b64decode(data, altchars=altchars)

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        """
        Heuristically decide whether the input looks like Base64: it has to match the spaced
        Base64 pattern and must not be dominated by a single category of characters.
        """
        from refinery.lib.patterns import formats
        if not formats.spaced_b64.value.fullmatch(data):
            return False
        seen = set()
        lower = upper = digit = other = 0
        # Whitespace does not count towards the total.
        relevant = len(data)
        for value in data:
            seen.add(value)
            if len(seen) > 60:
                # Enough distinct symbols were observed to accept the input right away.
                return True
            if 0x61 <= value < 0x7B:
                lower += 1
            elif 0x41 <= value < 0x5B:
                upper += 1
            elif 0x30 <= value < 0x40:
                digit += 1
            elif value in B'\v\f\t\r\n\x20':
                relevant -= 1
            else:
                other += 1
        # Call this a false positive if more than two thirds of the data consist of a single
        # category of letters.
        return not any(3 * count > 2 * relevant for count in (lower, upper, digit, other))
class b65536
-
This unit is implemented in
refinery.units.encoding.b65536
and has the following commandline Interface:usage: b65536 [-h] [-L] [-Q] [-0] [-v] [-R] Base65536 encoding and decoding. A relatively esoteric encoding scheme utilizing the UTF-16 / UTF-32 character set. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class b65536(Unit):
    """
    Base65536 encoding and decoding. A relatively esoteric encoding scheme utilizing the UTF-16 /
    UTF-32 character set.
    """
    def reverse(self, data):
        """
        Encode the input two bytes at a time: the second byte of each pair selects a block start
        from the lookup table and the first byte is added as an offset. A trailing single byte
        uses the table entry for -1.
        """
        if not data:
            return B''
        output = MemoryFile()
        total = len(data)
        for position in range(0, total, 2):
            low = data[position]
            successor = position + 1
            high = data[successor] if successor < total else -1
            output.write(chr(_BLOCK_START[high] + low).encode())
        return output.getvalue()

    def process(self, data):
        """
        Decode the input one character at a time. Characters whose block is unknown are skipped
        with a log message; a second single-byte character is an error.
        """
        if not data:
            return B''
        output = MemoryFile()
        final_byte_seen = False
        for character in data.decode():
            code_point = ord(character)
            low = code_point & 0xFF
            try:
                high = _B2[code_point - low]
            except KeyError:
                self.log_info('Invalid base65536 code point: %d, skipping' % code_point)
                continue
            decoded = low.to_bytes(1, "little")
            if high == -1:
                # A character that encodes a single byte may only occur once.
                if final_byte_seen:
                    raise ValueError('base65536 sequence continued after final byte')
                final_byte_seen = True
            else:
                decoded += high.to_bytes(1, "little")
            output.write(decoded)
        return output.getvalue()
class b85
-
This unit is implemented in
refinery.units.encoding.b85
and has the following commandline Interface:usage: b85 [-h] [-L] [-Q] [-0] [-v] [-R] [-F] Base85 encoding and decoding. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class b85(Unit):
    """
    Base85 encoding and decoding.
    """
    def reverse(self, data):
        """
        Return the Base85 encoding of the input.
        """
        encoded = base64.b85encode(data)
        return encoded

    def process(self, data):
        """
        Decode Base85 input; any whitespace is removed before decoding.
        """
        contains_whitespace = re.search(BR'\s', data) is not None
        if contains_whitespace:
            data = re.sub(BR'\s+', B'', data)
        return base64.b85decode(data)

    @classmethod
    def handles(cls, data: bytearray):
        """
        Return the match of the spaced Base85 pattern against the entire input, or None.
        """
        from refinery.lib.patterns import formats
        pattern = formats.spaced_b85.value
        return pattern.fullmatch(data)
class b92
-
This unit is implemented in
refinery.units.encoding.b92
and has the following commandline Interface:usage: b92 [-h] [-L] [-Q] [-0] [-v] [-R] [-F] Base92 encoding and decoding. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class b92(Unit):
    """
    Base92 encoding and decoding. The encoder consumes the input in blocks of 13 bits, each of
    which is written as two alphabet characters; a short remainder of at most 6 bits is written
    as a single character. Empty input is represented by a single tilde.
    """
    def reverse(self, data):
        # Empty input has the dedicated encoding "~".
        if not data:
            return B'~'
        reader = StructReader(data, bigendian=True)
        output = MemoryFile()
        while reader.remaining_bits > 0:
            try:
                block = reader.read_integer(13)
            except EOFError:
                # Fewer than 13 bits are left: read whatever remains.
                count = reader.remaining_bits
                block = reader.read_integer(count)
                self.log_debug(F'reading {count} remaining bits: {block:0{count}b}')
                shift = 6 - count
                if shift >= 0:
                    # Up to 6 bits fit into one character; pad with zero bits on the right.
                    block <<= shift
                    self.log_debug(F'encoding block: {block:06b}')
                    output.write_byte(_B92_ALPHABET[block])
                    break
                # Between 7 and 12 bits: pad to a full 13-bit block and encode it regularly.
                block <<= 13 - count
                self.log_debug(F'encoding block: {block:013b}')
            # A 13-bit block is written as two characters, quotient and remainder modulo 91.
            hi, lo = divmod(block, 91)
            output.write_byte(_B92_ALPHABET[hi])
            output.write_byte(_B92_ALPHABET[lo])
        return output.getvalue()

    def process(self, data):
        # A single tilde is the encoding of empty data.
        if data == B'~':
            return B''
        output = MemoryFile()
        # Bit accumulator and the number of bits currently held in it.
        buffer = 0
        length = 0
        view = memoryview(data)
        q, r = divmod(len(view), 2)
        # The final block is handled separately because it carries the padding: It is either a
        # single character worth 6 bits, or a pair of characters worth 13 bits.
        if r > 0:
            bits = 6
            tail = _B92_DECODING[data[~0]]
        else:
            bits = 13
            tail = _B92_DECODING[data[~1]] * 91 + _B92_DECODING[data[~0]]
            view = view[:(q - 1) * 2]
        # Zipping an iterator with itself yields consecutive pairs; a trailing unpaired character
        # is not yielded by the loop.
        it = iter(view)
        for a, b in zip(it, it):
            block = _B92_DECODING[a] * 91 + _B92_DECODING[b]
            assert length < 8
            buffer <<= 13
            buffer |= block
            length += 13
            # Flush all complete bytes and keep the remaining low bits in the accumulator.
            size, length = divmod(length, 8)
            assert size > 0
            output.write((buffer >> length).to_bytes(size, 'big'))
            buffer &= (1 << length) - 1
        # Number of bits required from the tail block to complete the pending byte.
        missing = 8 - length
        shift = bits - missing
        if shift < 8:
            bytecount = 1
        else:
            # The tail block holds enough bits for one additional full byte.
            bytecount = 2
            shift -= 8
            missing += 8
        if shift < 0:
            raise RefineryPartialResult(
                F'Invalid padding, missing {-shift} bits.',
                output.getvalue())
        buffer <<= missing
        buffer |= tail >> shift
        length += missing
        output.write(buffer.to_bytes(bytecount, 'big'))
        # The bits of the tail block that were not consumed are padding and have to be zero.
        if tail & ((1 << shift) - 1) != 0:
            raise RefineryPartialResult(
                F'Invalid padding, lower {shift} bits of {tail:0{bits}b} are not zero.',
                output.getvalue())
        return output.getvalue()

    @classmethod
    def handles(self, data: bytearray):
        # Returns the match object of the Base92 pattern against the entire input, or None.
        from refinery.lib.patterns import formats
        return formats.b92.value.fullmatch(data)
class base (base=0, strip_padding=False, little_endian=False, strict_digits=False)
-
This unit is implemented in
refinery.units.encoding.base
and has the following commandline Interface:usage: base [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-e] [-d] [base|alphabet] Encodes and decodes integers in arbitrary base. positional arguments: base|alphabet Either the base to be used or an alphabet. If an explicit alphabet is given, its length determines the base. The default base 0 treats the input as a Python integer literal. If a numeric base is given, digits from the alphabet "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" are used. optional arguments: -s, --strip-padding Do not add leading zeros to the output. -e, --little-endian Use little endian byte order instead of big endian. -d, --strict-digits Check that all input digits are part of the alphabet. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class base(Unit):
    """
    Encodes and decodes integers in arbitrary base. The input is interpreted as one large integer
    using the selected byte order; the base is given either as a number or implicitly through an
    explicit alphabet.
    """
    def __init__(
        self,
        base: Arg(type=numseq, metavar='base|alphabet', help=(
            R'Either the base to be used or an alphabet. If an explicit alphabet is given, its length '
            R'determines the base. The default base 0 treats the input as a Python integer literal. If '
            F'a numeric base is given, digits from the alphabet "{_DEFAULT_ALPH_STR}" are used. ')) = 0,
        strip_padding: Arg.Switch('-s', help='Do not add leading zeros to the output.') = False,
        little_endian: Arg.Switch('-e', help='Use little endian byte order instead of big endian.') = False,
        strict_digits: Arg.Switch('-d', help='Check that all input digits are part of the alphabet.') = False,
    ):
        super().__init__(
            base=base,
            strip_padding=strip_padding,
            little_endian=little_endian,
            strict_digits=strict_digits,
        )

    @property
    def _args(self):
        # Returns the pair (base, alphabet) that is derived from the base argument.
        base = self.args.base
        if isinstance(base, int):
            # Base 0 has no alphabet; it selects Python integer literal syntax.
            if not base:
                return 0, B''
            if base in _LARGER_ALPHABETS:
                return base, _LARGER_ALPHABETS[base]
            if base not in range(2, len(_DEFAULT_ALPHABET) + 1):
                raise ValueError(F'base may only be an integer between 2 and {len(_DEFAULT_ALPHABET)}')
            return base, _DEFAULT_ALPHABET[:base]
        # Otherwise, the argument is an explicit alphabet whose length is the base.
        if len(set(base)) != len(base):
            raise ValueError('the given alphabet contains duplicate letters')
        return len(base), bytearray(base)

    @property
    def byteorder(self):
        # The byte order name as expected by int.from_bytes and int.to_bytes.
        return 'little' if self.args.little_endian else 'big'

    def reverse(self, data):
        # Converts the input bytes to an integer and writes it out in the selected base.
        base, alphabet = self._args
        self.log_info('using byte order', self.byteorder)
        number = int.from_bytes(data, byteorder=self.byteorder)
        if base == 0:
            return B'0x%X' % number
        if base > len(alphabet):
            raise ValueError(F'Only {len(alphabet)} available; not enough to encode base {base}')
        # Each emitted digit accounts for log2(base) bits; digits are produced until all input
        # bits are covered, which yields leading zero digits unless padding is stripped.
        data_bits = len(data) * 8
        base_bits = math.log2(base)
        result = bytearray()
        while data_bits >= 1:
            number, k = divmod(number, base)
            result.append(alphabet[k])
            if not number and self.args.strip_padding:
                break
            data_bits -= base_bits
        # Digits were generated starting with the least significant one.
        result.reverse()
        return result

    def process(self, data: bytearray):
        # Parses the input digits as an integer in the selected base and returns its bytes.
        base, alphabet = self._args
        be_lenient = not self.args.strict_digits
        # If the alphabet has no lower case letters and the input has no upper case letters,
        # the input is converted to upper case.
        if be_lenient and alphabet.upper() == alphabet:
            lcased = (c + 0x20 if 0x41 <= c <= 0x5a else c for c in data)
            if all(x == y for x, y in zip(data, lcased)):
                data = data.upper()
        if base and base != 64 and be_lenient:
            # Remove all bytes that are not part of the alphabet, in place: The first loop skips
            # the leading run of valid digits, the second one moves every later valid digit to
            # the front. Everything after the last moved digit is then deleted.
            check = set(alphabet)
            index = 0
            it = iter(data)
            for b in it:
                if b not in check:
                    break
                index += 1
            for b in it:
                if b in check:
                    data[index] = b
                    index += 1
            self.log_info(F'stripped {len(data) - index} invalid digits from input data')
            del data[index:]
        if len(alphabet) <= len(_DEFAULT_ALPHABET):
            # Small bases are parsed by the int builtin, after translating a custom alphabet to
            # the default one.
            defaults = _DEFAULT_ALPHABET[:base]
            if alphabet != defaults:
                self.log_info('translating input data to a default alphabet for faster conversion')
                data_translated = data.translate(bytes.maketrans(alphabet, defaults))
                result = int(data_translated, base)
            else:
                result = int(data, base)
        elif len(alphabet) == 64:
            # Alphabets of length 64 and 85 are delegated to the base64 module and return its
            # result directly.
            import base64
            _b64_alphabet = _LARGER_ALPHABETS[64]
            if alphabet != _b64_alphabet:
                data = data.translate(bytes.maketrans(alphabet, _b64_alphabet))
            return base64.b64decode(data + b'===', validate=self.args.strict_digits)
        elif len(alphabet) == 85:
            import base64
            _b85_alphabet = _LARGER_ALPHABETS[85]
            if alphabet != _b85_alphabet:
                data = data.translate(bytes.maketrans(alphabet, _b85_alphabet))
            return base64.b85decode(data)
        else:
            self.log_warn('very long alphabet, unable to use built-ins; reverting to (slow) fallback.')
            result = 0
            lookup = {digit: k for k, digit in enumerate(alphabet)}
            for digit in data:
                result *= base
                result += lookup[digit]
        if not base or self.args.strip_padding:
            bits = result.bit_length()
        else:
            # Estimate the bit size from the number of digits and the value of the first digit
            # rather than from the integer itself.
            bits = (len(data) - 1) * math.log2(base) + math.log2(alphabet.index(data[0]) + 1)
            bits = math.ceil(bits)
        # Round the bit count up to a whole number of bytes.
        size, rest = divmod(bits, 8)
        size += int(bool(rest))
        return result.to_bytes(size, byteorder=self.byteorder)
class bat (keep_all=False, keep_comment=False, keep_definitions=False, keep_echo=False)
-
This unit is implemented in
refinery.units.formats.bat
and has the following commandline Interface:usage: bat [-h] [-L] [-Q] [-0] [-v] [-a] [-c] [-d] [-e] Deobfuscates batch files, based on the batch deobfuscator by DissectMalware. The input script is interpreted, variables are substituted for previously defined values, including commonly defined operating system environment variables. Variable definitions that are later evaluated are removed from the script, as are all echo commands and comments. optional arguments: -a, --keep-all Do not strip anything after deobfuscation. -c, --keep-comment Do not strip comments from the script. -d, --keep-definitions Do not strip variable definitions. -e, --keep-echo Do not strip echo calls in the script. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class bat(Unit):
    """
    Deobfuscates batch files, based on the batch deobfuscator by DissectMalware. The input script
    is interpreted, variables are substituted for previously defined values, including commonly
    defined operating system environment variables. Variable definitions that are later evaluated
    are removed from the script, as are all echo commands and comments.
    """
    def __init__(
        self,
        keep_all         : Unit.Arg.Switch('-a', help='Do not strip anything after deobfuscation.') = False,
        keep_comment     : Unit.Arg.Switch('-c', help='Do not strip comments from the script.') = False,
        keep_definitions : Unit.Arg.Switch('-d', help='Do not strip variable definitions.') = False,
        keep_echo        : Unit.Arg.Switch('-e', help='Do not strip echo calls in the script.') = False,
    ):
        ...

    @unicoded
    def process(self, data: str) -> str:
        """
        Deobfuscate the script and strip comments, variable definitions, and echo calls, except
        for those categories that were requested to be kept.
        """
        if self.args.keep_all:
            mode = STRIP.NONE
        else:
            mode = STRIP.ALL
            # The keep switches are independent of each other and may be combined; each of them
            # removes exactly one category from the set of elements to be stripped.
            if self.args.keep_comment:
                mode ^= STRIP.COMMENT
            if self.args.keep_definitions:
                mode ^= STRIP.DEFINITION
            if self.args.keep_echo:
                mode ^= STRIP.ECHO
        return BatchDeobfuscator().deobfuscate(data, mode)
class bitrev (bigendian=False, blocksize=None)
-
This unit is implemented in
refinery.units.blockwise.bitrev
and has the following commandline Interface:usage: bitrev [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] Reverse the bits of every block. Any excess bytes at the end of the input that are not an integer multiple of the block size are ignored. optional arguments: -E, --bigendian Read chunks in big endian. -B, --blocksize N The size of each block in bytes, default is 1. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class bitrev(UnaryOperation):
    """
    Reverse the bits of every block. Any excess bytes at the end of the input that are not an
    integer multiple of the block size are ignored.
    """
    @staticmethod
    def operate(arg):
        raise RuntimeError('operate was called before the unit was initialized')

    def __init__(self, bigendian=False, blocksize=None):
        """
        Unreadable bit reversal operations due to:
        https://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64BitsDiv
        https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
        """
        super().__init__(bigendian=bigendian, blocksize=blocksize, _truncate=1)

        def reverse_byte(v):
            # Reverses the bits of a single byte with one multiplication and one modulo.
            return ((v * 0x202020202) & 0x10884422010) % 1023

        def reverse_parallel(v):
            # Swaps halves of successively smaller bit groups until single bits are swapped.
            width = self.fbits
            mask = self.fmask
            value = v
            while width > 1:
                width >>= 1
                mask = mask ^ (mask << width)
                value = ((value << width) & ~mask) | ((value >> width) & mask)
            return value

        def reverse_bitwise(v):
            # Moves every bit to its mirrored position, one at a time.
            value = v & 0
            for position in range(self.fbits):
                value |= ((v >> position) & 1) << (self.fbits - position - 1)
            return value

        if self.bytestream:
            self.operate = reverse_byte
        elif self.blocksize in (2, 4, 8):
            self.operate = reverse_parallel
        else:
            self.operate = reverse_bitwise
class bitsnip (slices=[slice(0, 1, None)], bigendian=False, blocksize=None)
-
This unit is implemented in
refinery.units.blockwise.bitsnip
and has the following commandline Interface:usage: bitsnip [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] [slices [slices ...]] Pick a certain range of bits from each block of the input. The extracted ranges of bits are concatenated. Leftover bits that do not form at least one full byte are discarded. Bits are indexed from least significant at index 0 to most significant in each block. When the unit operates in big endian mode, the internal bit buffer is shifted left in each step and new bits are inserted as the least significant portion. Conversely, in default (little endian) mode, newly extracted bits are added as the now most significant ones. After concatenating all bit slices into a large integer, this integer is converted into a byte string according to the given byte ordering. positional arguments: slices Specify start:stop:size, where size can be used to pad or truncate the extracted bits. If size is omitted, it defaults to (stop-start). If no slice is specified, it defaults to 0, which corresponds to 0:1:1, i.e. extracting the lowest bit. optional arguments: -E, --bigendian Read chunks in big endian. -B, --blocksize N The size of each block in bytes, default is 1. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class bitsnip(BlockTransformationBase):
    """
    Pick a certain range of bits from each block of the input. The extracted ranges of bits are
    concatenated. Leftover bits that do not form at least one full byte are discarded. Bits are
    indexed from least significant at index 0 to most significant in each block. When the unit
    operates in big endian mode, the internal bit buffer is shifted left in each step and new
    bits are inserted as the least significant portion. Conversely, in default (little endian)
    mode, newly extracted bits are added as the now most significant ones. After concatenating
    all bit slices into a large integer, this integer is converted into a byte string according
    to the given byte ordering.
    """
    def __init__(
        self,
        slices: Arg(help=(
            'Specify start:stop:size, where size can be used to pad or truncate the extracted '
            'bits. If size is omitted, it defaults to (stop-start). If no slice is specified, '
            'it defaults to 0, which corresponds to 0:1:1, i.e. extracting the lowest bit.')
        ) = [slice(0, 1)],
        bigendian=False, blocksize=None
    ):
        super().__init__(slices=slices, bigendian=bigendian, blocksize=blocksize)

    def process(self, data: bytearray):
        """
        Extract the configured bit ranges from every block and return them as a byte string.
        """
        maxbits = 8 * self.blocksize
        bigendian: bool = self.args.bigendian

        # Translate each slice into a triple (shift, mask, size) once, up front.
        selections: List[Tuple[int, int, int]] = []
        for selection in self.args.slices:
            lower = 0 if selection.start is None else selection.start
            upper = selection.stop
            if upper is None:
                upper = maxbits
            elif upper > maxbits:
                raise ValueError(F'the selection {lower}:{upper} is out of bounds for the block size {self.blocksize}')
            if lower >= upper:
                # Empty selections contribute nothing.
                continue
            width = upper - lower
            selections.append((lower, (1 << width) - 1, selection.step or width))

        collected = 0
        bitcount = 0
        for block in self.chunk(data):
            for shift, mask, width in selections:
                bits = (block >> shift) & mask
                if bigendian:
                    collected = (collected << width) | bits
                else:
                    collected |= bits << bitcount
                bitcount += width

        length, remainder = divmod(bitcount, 8)
        if remainder:
            self.log_info(F'discarding {bitcount % 8} bits')
            if bigendian:
                collected >>= remainder
            else:
                collected &= (1 << (8 * length)) - 1
        return collected.to_bytes(length, 'big' if bigendian else 'little')
class blabla (key, nonce=b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', rounds=10, discard=0, stateful=False)
-
This unit is implemented in
refinery.units.crypto.cipher.blabla
and has the following commandline Interface:usage: blabla [-h] [-L] [-Q] [-0] [-v] [-R] [-r N] [-d N] [-s] key [nonce] Implements the BlaBla cipher, a 256-bit stream cipher designed by Jean-Philippe Aumasson. It is similar to ChaCha in design but operates on 64-bit blocks. positional arguments: key The encryption key. nonce The 16-byte nonce. The default are 16 null bytes. optional arguments: -r, --rounds N The number of rounds, default is 10. -d, --discard N Discard the first N bytes of the keystream, 0 by default. -s, --stateful Do not reset the key stream while processing the chunks of one frame. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class blabla(StreamCipherUnit):
    """
    Implements the BlaBla cipher, a 256-bit stream cipher designed by Jean-Philippe Aumasson. It
    is similar to ChaCha in design but operates on 64-bit blocks.
    """
    key_size = {32}

    def __init__(
        self, key,
        nonce: Arg(help='The 16-byte nonce. The default are 16 null bytes.') = bytes(16),
        rounds: Arg.Number('-r', help='The number of rounds, default is {default}.') = 10,
        discard=0, stateful=False
    ):
        super().__init__(key=key, nonce=nonce, rounds=rounds, discard=discard, stateful=stateful)

    def keystream(self):
        """
        Generate the key stream, one byte at a time, in blocks of sixteen 64-bit words.
        """
        rounds = self.args.rounds
        nonce = self.args.nonce
        key_words = struct.unpack('<4Q', self.args.key)
        try:
            nonce_words = struct.unpack('<2Q', nonce)
        except Exception:
            raise ValueError(F'The given nonce has invalid length of {len(nonce)}, it must be 16 bytes in size.')
        state = [
            0x6170786593810fab,  # 0x0
            0x3320646ec7398aee,  # 0x1
            0x79622d3217318274,  # 0x2
            0x6b206574babadada,  # 0x3
            *key_words,          # 0x4 .. 0x7
            0x2ae36e593e46ad5f,  # 0x8
            0xb68f143029225fc9,  # 0x9
            0x8da1e08468303aa6,  # 0xA
            0xa48a209acd50a4a7,  # 0xB
            0x7fdc12f23f90778c,  # 0xC
            1,                   # 0xD
            *nonce_words,        # 0xE .. 0xF
        ]
        # Each round mixes the four columns first and then the four diagonals of the state.
        schedule = (
            (0x0, 0x4, 0x8, 0xC),
            (0x1, 0x5, 0x9, 0xD),
            (0x2, 0x6, 0xA, 0xE),
            (0x3, 0x7, 0xB, 0xF),
            (0x0, 0x5, 0xA, 0xF),
            (0x1, 0x6, 0xB, 0xC),
            (0x2, 0x7, 0x8, 0xD),
            (0x3, 0x4, 0x9, 0xE),
        )
        while True:
            work = list(state)
            for _ in range(rounds):
                for a, b, c, d in schedule:
                    work[a] = (work[a] + work[b]) & _M64
                    work[d] = rotr64(work[d] ^ work[a], 32)
                    work[c] = (work[c] + work[d]) & _M64
                    work[b] = rotr64(work[b] ^ work[c], 24)
                    work[a] = (work[a] + work[b]) & _M64
                    work[d] = rotr64(work[d] ^ work[a], 16)
                    work[c] = (work[c] + work[d]) & _M64
                    work[b] = rotr64(work[b] ^ work[c], 63)
            # Feed the input state forward into the mixed state.
            block = [(x + y) & _M64 for x, y in zip(state, work)]
            # Word 0xD is the block counter.
            state[0xD] += 1
            yield from struct.pack('<16Q', *block)
class blk224 (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.cryptographic
and has the following commandline Interface:usage: blk224 [-h] [-L] [-Q] [-0] [-v] [-t] Returns the BLK224 hash of the input data. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
class blk256 (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.cryptographic
and has the following commandline Interface:usage: blk256 [-h] [-L] [-Q] [-0] [-v] [-t] Returns the BLK256 hash of the input data. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
class blk384 (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.cryptographic
and has the following commandline Interface:usage: blk384 [-h] [-L] [-Q] [-0] [-v] [-t] Returns the BLK384 hash of the input data. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
class blk512 (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.cryptographic
and has the following commandline Interface:usage: blk512 [-h] [-L] [-Q] [-0] [-v] [-t] Returns the BLK512 hash of the input data. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
class blowfish (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)
-
This unit is implemented in
refinery.units.crypto.cipher.blowfish
and has the following commandline Interface:usage: blowfish [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-M N] key Blowfish encryption and decryption. positional arguments: key The encryption key. optional arguments: -i, --iv IV Specifies the initialization vector. If none is specified, then a block of zero bytes is used. -p, --padding P Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does nothing. By default, all other algorithms are attempted. In most cases, the data was not correctly decrypted if none of these work. -m, --mode M Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, EAX, ECB, OFB. By default, the CBC mode is used when an IV is provided, and ECB otherwise. -r, --raw Set the padding to raw; ignored when a padding is specified. -e, --little-endian Only for CTR: Use a little endian counter instead of the default big endian. -S, --segment-size N Only for CFB: Number of bits into which data is segmented. It must be a multiple of 8. The default of 0 means that the block size will be used as the segment size. -M, --mac-len N Only for EAX, GCM, OCB, and CCM: Length of the authentication tag, in bytes. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class blowfish(
    StandardBlockCipherUnit,
    cipher=PyCryptoFactoryWrapper(Blowfish),
):
    """
    Blowfish encryption and decryption.
    """
class blz
-
This unit is implemented in
refinery.units.compression.blz
and has the following commandline Interface:usage: blz [-h] [-L] [-Q] [-0] [-v] [-R] BriefLZ compression and decompression. The compression algorithm uses a pure Python suffix tree implementation: It requires a lot of time & memory. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class blz(Unit):
    """
    BriefLZ compression and decompression. The compression algorithm uses a pure Python suffix tree
    implementation: It requires a lot of time & memory.
    """
    def _begin(self, data):
        """
        Set up the source reader for the given data and a fresh, empty output buffer.
        """
        self._src = StructReader(memoryview(data))
        self._dst = MemoryFile(bytearray())
        return self

    def _reset(self):
        """
        Rewind the source reader and discard everything that was written to the output buffer.
        """
        self._src.seek(0)
        self._dst.seek(0)
        self._dst.truncate()
        return self

    def _decompress(self):
        """
        Decompress data that starts with the regular 24-byte header: Six big endian 32-bit values
        for signature, version, compressed size, compressed CRC, decompressed size, and
        decompressed CRC. Checksum mismatches are logged as warnings, not raised.
        """
        (
            signature,
            version,
            src_count,
            src_crc32,
            dst_count,
            dst_crc32,
        ) = self._src.read_struct('>6L')
        if signature != 0x626C7A1A:
            raise ValueError(F'Invalid BriefLZ signature: {signature:08X}, should be 626C7A1A.')
        if version > 10:
            raise ValueError(F'Invalid version number {version}, should be at most 10.')
        self.log_debug(F'signature: 0x{signature:08X} V{version}')
        self.log_debug(F'src count: 0x{src_count:08X}')
        self.log_debug(F'src crc32: 0x{src_crc32:08X}')
        self.log_debug(F'dst count: 0x{dst_count:08X}')
        self.log_debug(F'dst crc32: 0x{dst_crc32:08X}')
        src = self._src.getbuffer()
        src = src[24:24 + src_count]
        if len(src) < src_count:
            self.log_warn(F'Only {len(src)} bytes in buffer, but header announced a length of {src_count}.')
        # A checksum of zero is treated as absent and skips the verification.
        if src_crc32:
            check = zlib.crc32(src)
            if check != src_crc32:
                self.log_warn(F'Invalid source data CRC {check:08X}, should be {src_crc32:08X}.')
        dst = self._decompress_chunk(dst_count)
        if not dst_crc32:
            return dst
        check = zlib.crc32(dst)
        if check != dst_crc32:
            self.log_warn(F'Invalid result data CRC {check:08X}, should be {dst_crc32:08X}.')
        return dst

    def _decompress_modded(self):
        """
        Decompress the modified format: After skipping 8 bytes, two 64-bit values specify the
        total size and the chunk size; the data is then decompressed chunk by chunk into the
        shared output buffer.
        """
        self._src.seekrel(8)
        total_size = self._src.u64()
        chunk_size = self._src.u64()
        remaining = total_size
        self.log_debug(F'total size: 0x{total_size:016X}')
        self.log_debug(F'chunk size: 0x{chunk_size:016X}')
        while remaining > chunk_size:
            self._decompress_chunk(chunk_size)
            remaining -= chunk_size
        return self._decompress_chunk(remaining)

    def _decompress_chunk(self, size=None):
        """
        Decompress one chunk from the current source position and append it to the output buffer.
        Returns the buffer of the entire output. If the source ends prematurely or a back
        reference points outside of the output, a partial result is raised.
        """
        bitcount = 0
        bitstore = 0
        # The first byte of every chunk is always a literal.
        decompressed = 1

        def readbit():
            # Control bits are read from little endian 16-bit words, highest bit first.
            nonlocal bitcount, bitstore
            if not bitcount:
                bitstore = int.from_bytes(self._src.read_exactly(2), 'little')
                bitcount = 0xF
            else:
                bitcount = bitcount - 1
            return (bitstore >> bitcount) & 1

        def readint():
            # Variable length integer: data bits alternate with continuation flags.
            result = 2 + readbit()
            while readbit():
                result <<= 1
                result += readbit()
            return result

        self._dst.write(self._src.read_exactly(1))

        try:
            while not size or decompressed < size:
                if readbit():
                    # Back reference: copy length bytes starting delta bytes before the end.
                    length = readint() + 2
                    sector = readint() - 2
                    offset = self._src.read_byte() + 1
                    delta = offset + 0x100 * sector
                    available = self._dst.tell()
                    if delta not in range(available + 1):
                        raise RefineryPartialResult(
                            F'Requested rewind by 0x{delta:08X} bytes with only 0x{available:08X} bytes in output buffer.',
                            partial=self._dst.getvalue())
                    # The copy may overlap with its own output, in which case the last delta
                    # bytes are repeated as often as required.
                    quotient, remainder = divmod(length, delta)
                    replay = memoryview(self._dst.getbuffer())
                    replay = bytes(replay[-delta:] if quotient else replay[-delta:length - delta])
                    replay = quotient * replay + replay[:remainder]
                    self._dst.write(replay)
                    decompressed += length
                else:
                    # Literal byte.
                    self._dst.write(self._src.read_exactly(1))
                    decompressed += 1
        except EOFError as E:
            raise RefineryPartialResult(str(E), partial=self._dst.getbuffer())
        dst = self._dst.getbuffer()
        if decompressed < size:
            raise RefineryPartialResult(
                F'Attempted to decompress {size} bytes, got only {len(dst)}.', dst)
        if decompressed > size:
            raise RuntimeError('Decompressed buffer contained more bytes than expected.')
        return dst

    def _compress(self):
        """
        Compress the source buffer into the output buffer, using a suffix tree of the source to
        search for back references, and return the output including the 24-byte header.
        """
        from refinery.lib.suffixtree import SuffixTree

        self.log_info('computing suffix tree')
        tree = SuffixTree(self._src.getbuffer())

        bitstore = 0  # The bit stream to be written
        bitcount = 0  # The number of bits in the bit stream
        buffer = MemoryFile(bytearray())

        # Write empty header and first byte of source
        self._dst.write(bytearray(24))
        self._dst.write(self._src.read_exactly(1))

        def writeint(n: int) -> None:
            """
            Write an integer to the bit stream.
            """
            nonlocal bitstore, bitcount
            nbits = n.bit_length()
            if nbits < 2:
                raise ValueError
            # The highest bit is implicitly assumed:
            n ^= 1 << (nbits - 1)
            remaining = nbits - 2
            while remaining:
                remaining -= 1
                bitstore <<= 2
                bitcount += 2
                bitstore |= ((n >> remaining) & 3) | 1
            bitstore <<= 2
            bitcount += 2
            bitstore |= (n & 1) << 1

        src = self._src.getbuffer()
        remaining = len(src) - 1
        self.log_info('compressing data')

        while True:
            cursor = len(src) - remaining
            rest = src[cursor:]
            if bitcount >= 0x10:
                block_count, bitcount = divmod(bitcount, 0x10)
                info_channel = bitstore >> bitcount
                bitstore = info_channel << bitcount ^ bitstore
                # The decompressor will read bits from top to bottom, and each 16 bit block has to be
                # little-endian encoded. The bit stream is encoded top to bottom bit in the bitstore
                # variable, and by encoding it as a big endian integer, the stream is in the correct
                # order. However, we need to swap adjacent bytes to achieve little endian encoding for
                # each of the blocks:
                info_channel = bytearray(info_channel.to_bytes(block_count * 2, 'big'))
                for k in range(block_count):
                    k0 = 2 * k + 0
                    k1 = 2 * k + 1
                    info_channel[k0], info_channel[k1] = info_channel[k1], info_channel[k0]
                info_channel = memoryview(info_channel)
                data_channel = memoryview(buffer.getbuffer())
                self._dst.write(info_channel[:2])
                self._dst.write(data_channel[:-1])
                self._dst.write(info_channel[2:])
                data_channel = bytes(data_channel[-1:])
                buffer.truncate(0)
                store = buffer if bitcount else self._dst
                store.write(data_channel)
            if remaining + bitcount < 0x10:
                # Not enough input is left to fill another control word: flush and finish.
                buffer = buffer.getbuffer()
                if rest or buffer:
                    bitstore <<= 0x10 - bitcount
                    self._dst.write(bitstore.to_bytes(2, 'little'))
                    self._dst.write(buffer)
                    self._dst.write(rest)
                elif bitcount:
                    raise RuntimeError('Bitbuffer Overflow')
                break
            # Walk down the suffix tree along the remaining input for as long as the matched
            # nodes start before the cursor, i.e. refer to data that was already emitted.
            node = tree.root
            length = 0
            offset = 0
            sector = None
            while node.children and length < len(rest):
                for child in node.children.values():
                    if tree.data[child.start] == rest[length]:
                        node = child
                        break
                if node.start >= cursor:
                    break
                offset = node.start - length
                length = node.end + 1 - offset
            length = min(remaining, length)
            # Only matches of at least four bytes are encoded as back references.
            if length >= 4:
                sector, offset = divmod(cursor - offset - 1, 0x100)
            bitcount += 1
            bitstore <<= 1
            if sector is None:
                buffer.write(rest[:1])
                remaining -= 1
                continue
            bitstore |= 1
            buffer.write(bytes((offset,)))
            writeint(length - 2)
            writeint(sector + 2)
            remaining -= length

        # Fill in the header now that the size and checksum of the compressed data are known.
        self._dst.seek(24)
        dst = self._dst.peek()
        self._dst.seek(0)
        self._dst.write(struct.pack('>6L', 0x626C7A1A, 1, len(dst), zlib.crc32(dst), len(src), zlib.crc32(src)))
        return self._dst.getbuffer()

    def process(self, data):
        """
        Decompress the input as regular BriefLZ first; if that fails with a ValueError, retry
        with the modified format. When both attempts fail and the first one produced a partial
        result, that partial result is raised.
        """
        self._begin(data)
        partial = None
        try:
            return self._decompress()
        except ValueError as error:
            if isinstance(error, RefineryPartialResult):
                partial = error
            self.log_warn(F'Reverting to modified BriefLZ after decompression error: {error!s}')
            self._reset()

        try:
            return self._decompress_modded()
        except RefineryPartialResult:
            raise
        except Exception as error:
            if not partial:
                raise
            raise partial from error

    def reverse(self, data):
        """
        Compress the input with BriefLZ.
        """
        return self._begin(data)._compress()
class brotli
-
This unit is implemented in
refinery.units.compression.brotli
and has the following commandline Interface:usage: brotli [-h] [-L] [-Q] [-0] [-v] [-R] Brotli compression and decompression. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class brotli(Unit):
    """
    Brotli compression and decompression.
    """

    @Unit.Requires('brotlipy', 'all')
    def _brotli():
        # The import is deferred into this function body rather than performed at module level.
        # NOTE(review): presumably Unit.Requires reports the missing package - confirm.
        import brotli as module
        return module

    def process(self, data):
        """
        Decompress the input chunk; the chunk is converted to an immutable bytes object first.
        """
        compressed = bytes(data)
        return self._brotli.decompress(compressed)

    def reverse(self, data):
        """
        Compress the input chunk; the chunk is converted to an immutable bytes object first.
        """
        plaintext = bytes(data)
        return self._brotli.compress(plaintext)
class bruteforce (name, length=slice(1, None, None), format=None, alphabet=None, pattern=None, printable=False, digits=False, identifier=False, letters=False)
-
This unit is implemented in
refinery.units.strings.bruteforce
and has the following commandline Interface:usage: bruteforce [-h] [-L] [-Q] [-0] [-v] [-a B | -r REGEX | -p | -d | -i | -l] name [length] [format] Generates all possible combinations of letters in a given alphabet. For each generated string, one copy of each input chunk is generated and populated with a meta variable containing that string. This can be used for simple brute forcing checks. positional arguments: name Name of the meta variable to be populated. length Specifies the range of characters to brute force, default is 1:. format Optional format expression for the output string. The format sequence "{0}" is the current brute force string, the sequence "{1}" represents the input data. optional arguments: -a, --alphabet B The alphabet from which to choose the letters. Entire byte range by default. -r, --pattern REGEX Provide a regular expression pattern to define the alphabet. -p, --printable Equivalent to --pattern=[\s\x20-\x7E] -d, --digits Equivalent to --pattern=\d -i, --identifier Equivalent to --pattern=\w -l, --letters Equivalent to --pattern=[a-zA-Z] generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class bruteforce(Unit):
    """
    Generates all possible combinations of letters in a given alphabet. For each generated string,
    one copy of each input chunk is generated and populated with a meta variable containing that
    string. This can be used for simple brute forcing checks.
    """

    def __init__(
        self,
        name: Arg.String(help='Name of the meta variable to be populated.'),
        length: Arg.Bounds(metavar='length', help=(
            'Specifies the range of characters to brute force, default is {default}.'
        )) = slice(1, None),
        format: Arg.String(help=(
            'Optional format expression for the output string. The format sequence "{0}" is the '
            'current brute force string, the sequence "{1}" represents the input data.'
        )) = None,
        alphabet: Arg.Binary('-a', group='ALPH', help=(
            'The alphabet from which to choose the letters. Entire byte range by default.'
        )) = None,
        pattern: Arg.RegExp('-r', group='ALPH', help=(
            'Provide a regular expression pattern to define the alphabet.'
        )) = None,
        printable: Arg.Switch('-p', group='ALPH', help=(
            'Equivalent to --pattern=[\\s\\x20-\\x7E]'
        )) = False,
        digits: Arg.Switch('-d', group='ALPH', help=(
            'Equivalent to --pattern=\\d'
        )) = False,
        identifier: Arg.Switch('-i', group='ALPH', help=(
            'Equivalent to --pattern=\\w'
        )) = False,
        letters: Arg.Switch('-l', group='ALPH', help=(
            'Equivalent to --pattern=[a-zA-Z]'
        )) = False,
    ):
        # At most one of the shortcut switches may be given, and none of them may be combined
        # with an explicit pattern.
        shortcuts = (printable, digits, identifier, letters)
        selected = sum(1 for switch in shortcuts if switch)
        if selected > 1 or (selected and pattern):
            raise ValueError('Invalid selection.')
        # After the validation above, no more than one of these branches can apply.
        if printable:
            pattern = b'[\\s\\x20-\\x7E]'
        elif digits:
            pattern = b'\\d'
        elif identifier:
            pattern = b'\\w'
        elif letters:
            pattern = b'[a-zA-Z]'
        super().__init__(
            name=name,
            length=length,
            format=format,
            alphabet=alphabet,
            pattern=pattern,
        )

    def _alphabet(self) -> bytes:
        """
        Compute the alphabet to draw letters from: an explicitly given alphabet wins, otherwise
        the full byte range is used, optionally reduced to the bytes matched by the pattern.
        Raises a ValueError when the pattern matches no byte at all.
        """
        explicit = self.args.alphabet
        if explicit:
            return explicit
        every_byte = bytes(range(0x100))
        pattern = self.args.pattern
        if not pattern:
            return every_byte
        matched = B''.join(re.findall(pattern, every_byte, flags=re.DOTALL))
        if not matched:
            raise ValueError(F'Invalid regular expression: {pattern}')
        return matched

    def process(self, data: bytearray):
        """
        Yield one labelled copy of the input for every string over the alphabet, for each of
        the lengths produced from the length argument.
        """
        spec: str = self.args.format
        meta = metavars(data)
        name = self.args.name
        for count in integers_of_slice(self.args.length):
            self.log_info(F'generating {count} digits')
            if not isinstance(count, int) or count < 0:
                raise ValueError(F'Unable to brute force {count} characters.')
            for combination in itertools.product(self._alphabet(), repeat=count):
                candidate = bytes(combination)
                if spec:
                    # Placeholder {0} is the brute force string, {1} is the input data.
                    candidate = meta.format_bin(spec, self.codec, [candidate, data])
                yield self.labelled(data, **{name: candidate})
class byteswap (size=4)
-
This unit is implemented in
refinery.units.blockwise.byteswap
and has the following commandline Interface:usage: byteswap [-h] [-L] [-Q] [-0] [-v] [N] Reverses the order of bytes in each block. Excess bytes that are not an integer multiple of the block size are discarded. positional arguments: N the block size in bytes; the default is 4. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class byteswap(UnaryOperation):
    """
    Reverses the order of bytes in each block. Excess bytes that are not an integer multiple of the
    block size are discarded.
    """

    def __init__(
        self,
        size: Arg.Number(help='the block size in bytes; the default is {default}.') = 4
    ):
        super().__init__(blocksize=size, _truncate=2)

    def inplace(self, block: ndarray) -> None:
        # Passing True makes the array swap its bytes in place.
        block.byteswap(True)

    operate = NotImplemented

    def process(self, data):
        """
        Try the fast block implementation first; when it raises FastBlockError, reverse each
        block of the input buffer manually and drop the incomplete trailing block.
        """
        try:
            return self._fastblock(data)
        except FastBlockError:
            pass
        size = self.blocksize
        total = len(data)
        usable = total - total % size
        if size == 1:
            self.log_warn('running this unit with a block size of 1 does not have any effect')
            return data
        view = memoryview(data)
        for lower in range(0, usable, size):
            upper = lower + size
            # A reversed slice that runs down to index 0 needs an open stop, since a stop
            # value of -1 would be interpreted relative to the end of the buffer.
            stop = None if lower == 0 else lower - 1
            data[lower:upper] = view[upper - 1:stop:-1]
        if usable < total:
            # Drop the view before truncating; a bytearray cannot be resized while a
            # memoryview still exports its buffer.
            del view
            del data[usable:]
        return data
class bz2 (level=9)
-
This unit is implemented in
refinery.units.compression.bz2
and has the following commandline Interface:usage: bz2 [-h] [-L] [-Q] [-0] [-v] [-R] [-F] [-l LEVEL] BZip2 compression and decompression. optional arguments: -l, --level LEVEL compression level preset between 1 and 9 generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class bz2(Unit):
    """
    BZip2 compression and decompression.
    """

    def __init__(
        self,
        level: Arg('-l', type=number[1:9], help='compression level preset between 1 and 9') = 9
    ):
        super().__init__(level=level)

    def process(self, data):
        """
        Decompress the input chunk.
        """
        decompressed = bz2_.decompress(data)
        return decompressed

    def reverse(self, data):
        """
        Compress the input chunk using the configured compression level.
        """
        level = self.args.level
        return bz2_.compress(data, level)

    @classmethod
    def handles(cls, data: bytearray):
        """
        Return whether the input starts with the three bytes `BZh`.
        """
        magic = B'BZh'
        return data[:3] == magic
class camellia (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)
-
This unit is implemented in
refinery.units.crypto.cipher.camellia
and has the following commandline Interface: usage: camellia [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] key Camellia encryption and decryption. positional arguments: key The encryption key. optional arguments: -i, --iv IV Specifies the initialization vector. If none is specified, then a block of zero bytes is used. -p, --padding P Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does nothing. By default, all other algorithms are attempted. In most cases, the data was not correctly decrypted if none of these work. -m, --mode M Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB, OFB, PCBC. By default, the CBC mode is used when an IV is provided, and ECB otherwise. -r, --raw Set the padding to raw; ignored when a padding is specified. -e, --little-endian Only for CTR: Use a little endian counter instead of the default big endian. -S, --segment-size N Only for CFB: Number of bits into which data is segmented. It must be a multiple of 8. The default of 0 means that the block size will be used as the segment size. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class camellia(
    StandardBlockCipherUnit,
    cipher=BlockCipherFactory(Camellia),
):
    """
    Camellia encryption and decryption.
    """
    # This class adds no members of its own; the only customization is the cipher keyword
    # argument passed in the class header.
class carve (format, unique=False, decode=False, single=False, min=1, max=None, len=None, stripspace=False, longest=False, take=None, utf16=True, ascii=True)
-
This unit is implemented in
refinery.units.pattern.carve
and has the following commandline Interface:usage: carve [-h] [-L] [-Q] [-0] [-v] [-q] [-d] [-s] [-n N] [-m N] [-e N] [-x] [-l] [-t N] [-a | -u] format Extracts patches of data in particular formats from the input. positional arguments: format Specify one of the following formats: integer, float, number, string, multiline-string, cmdstr, ps1str, vbastr, vbaint, printable, urlquote, urlquote-coarse, urlquote-narrow, intarray, numarray, word, letters, wshenc, alphanumeric, b32, b64, b85, b92, b64any, b64url, hex, uppercase-hex, spaced- hex, spaced-b64, spaced-b85, utf8, hexdump, hexarray, uuencode optional arguments: -q, --unique Yield every match only once. -d, --decode Automatically decode known patterns. -s, --single Only get the biggest match; equivalent to -qlt1 -n, --min N Matches must have length at least N. -m, --max N Matches must have length at most N. -e, --len N Matches must be of length N. -x, --stripspace Strip all whitespace from input data. -l, --longest Sort results by length. -t, --take N Return only the first N occurrences in order of appearance. -a, --no-utf16 Search for ASCII encoded patterns only. -u, --no-ascii Search for UTF16 encoded patterns only. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class carve(PatternExtractor):
    """
    Extracts patches of data in particular formats from the input.
    """

    def __init__(
        self,
        format: Arg.Choice(
            choices=[p.display for p in formats],
            metavar='format',
            help='Specify one of the following formats: {choices}',
        ),
        unique: Arg.Switch('-q', help='Yield every match only once.') = False,
        decode: Arg.Switch('-d', help='Automatically decode known patterns.') = False,
        single: Arg.Switch('-s', help='Only get the biggest match; equivalent to -qlt1') = False,
        min=1, max=None, len=None,
        stripspace=False, longest=False, take=None, utf16=True, ascii=True
    ):
        if single:
            # The single switch is a shortcut for unique + longest + take=1.
            take, longest, unique = 1, True, True
        super().__init__(
            min=min,
            max=max,
            len=len,
            stripspace=stripspace,
            duplicates=not unique,
            longest=longest,
            take=take,
            ascii=ascii,
            utf16=utf16,
            format=formats.from_dashname(format)
        )
        # NotImplemented marks the absence of a decoder; process() then yields raw matches.
        decoder = NotImplemented
        if decode:
            selected = self.args.format
            # The decoder units are imported only in the branch that needs them.
            if selected in (formats.multiline_string, formats.string):
                from ..encoding.esc import esc
                decoder = esc(unicode=True, quoted=True)
            elif selected is formats.integer:
                from ..encoding.base import base
                decoder = base()
            elif selected in (formats.uppercase_hex, formats.spaced_hex, formats.hex):
                from ..encoding.hex import hex
                decoder = hex()
            elif selected is formats.hexdump:
                from ..formats.hexload import hexload
                decoder = hexload()
            elif selected is formats.intarray:
                from ..blockwise.pack import pack
                decoder = pack()
            elif selected in (formats.b64, formats.b64any, formats.spaced_b64):
                from ..encoding.b64 import b64
                decoder = b64()
            elif selected in (formats.b85, formats.spaced_b85):
                from ..encoding.b85 import b85
                decoder = b85()
            elif selected is formats.b64url:
                from ..encoding.b64 import b64
                decoder = b64(urlsafe=True)
            elif selected is formats.b32:
                from ..encoding.b32 import b32
                decoder = b32()
            elif selected is formats.ps1str:
                from ..encoding.ps1str import ps1str
                decoder = ps1str()
            elif selected is formats.vbastr:
                # NOTE(review): VBA strings are decoded with the ps1str unit - confirm intended.
                from ..encoding.ps1str import ps1str
                decoder = ps1str()
            elif selected is formats.hexarray:
                from ..blockwise.pack import pack
                decoder = pack(0x10)
            elif selected is formats.wshenc:
                from ..encoding.wshenc import wshenc
                decoder = wshenc()
            elif selected is formats.uuencode:
                from ..encoding.uuenc import uuenc
                decoder = uuenc()
            elif selected in (
                formats.urlquote,
                formats.urlquote_coarse,
                formats.urlquote_narrow,
            ):
                from ..encoding.url import url
                decoder = url()
        self.decoder = decoder

    def process(self, data):
        """
        Yield all filtered matches of the selected pattern; when a decoder was configured, each
        match is passed through it and matches that fail to decode are logged and dropped.
        """
        pattern = self.args.format.value.bin_compiled
        matches = iter(self.matches_filtered(memoryview(data), pattern))
        if self.decoder is NotImplemented:
            yield from matches
            return
        for chunk in matches:
            try:
                yield self.decoder(chunk)
            except Exception as E:
                self.log_info(F'decoder failure: {E!s}')
class carve_7z
-
This unit is implemented in
refinery.units.pattern.carve_7z
and has the following commandline Interface:usage: carve-7z [-h] [-L] [-Q] [-0] [-v] Extracts anything from the input data that looks like a 7zip archive file. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class carve_7z(Unit):
    """
    Extracts anything from the input data that looks like a 7zip archive file.
    """

    @Unit.Requires('py7zr', 'arc', 'default', 'extended')
    def _py7zr():
        # The import is deferred into this function body rather than performed at module level.
        import py7zr as module
        return module

    HEADER_SIGNATURE = B'7z\xBC\xAF\x27\x1C'

    def process(self, data: bytearray):
        """
        Scan the input for the 7zip header signature and yield every candidate that py7zr can
        open and test, labelled with the offset at which it was found.
        """
        view = memoryview(data)
        cursor = 0
        while True:
            start = data.find(self.HEADER_SIGNATURE, cursor)
            if start < cursor:
                # find returns -1 when there is no further signature.
                break
            self.log_debug(F'found header at offset: 0x{start:08X}')
            success = False
            try:
                # The recorder's max_cursor is used below as the size of the archive.
                mf = MemoryFileRecorder(view[start:])
                self.log_debug('attempting to read archive')
                archive = self._py7zr.SevenZipFile(mf)
                self.log_debug('attempting to test archive')
                success = archive.test() is not False
            except ImportError:
                # A missing dependency must not be mistaken for a parsing failure.
                raise
            except Exception as error:
                self.log_debug('parsing archive failed:', error)
                success = False
            if not success:
                # Resume the search inside the rejected signature.
                cursor = start + 5
                continue
            self.log_info(F'identified archive of size 0x{mf.max_cursor:08X} at offset 0x{start:08X}')
            cursor = start + mf.max_cursor
            yield self.labelled(view[start:cursor], offset=start)
class carve_json (dictonly=False)
-
This unit is implemented in
refinery.units.pattern.carve_json
and has the following commandline Interface:usage: carve-json [-h] [-L] [-Q] [-0] [-v] [-d] Extracts anything from the input data that looks like JSON. optional arguments: -d, --dictonly only extract JSON dictionaries, do not extract lists. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class carve_json(Unit):
    """
    Extracts anything from the input data that looks like JSON.
    """

    def __init__(
        self,
        dictonly: Arg.Switch('-d', help='only extract JSON dictionaries, do not extract lists.') = False
    ):
        super().__init__(dictonly=dictonly)

    def process(self, data):
        """
        Yield every chunk produced by the JSON carver, labelled with its start offset.
        """
        carver = JSONCarver(data, dictonly=self.args.dictonly)
        for offset, document in carver:
            yield self.labelled(document, offset=offset)
class carve_lnk
-
This unit is implemented in
refinery.units.pattern.carve_lnk
and has the following commandline Interface:usage: carve-lnk [-h] [-L] [-Q] [-0] [-v] Extracts anything from the input data that looks like a Windows shortcut (i.e. an LNK file) generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class carve_lnk(Unit):
    """
    Extracts anything from the input data that looks like a Windows shortcut (i.e. an LNK file)
    """

    @Unit.Requires('LnkParse3>=1.4.0', 'formats', 'extended')
    def _LnkParse3():
        # The import is deferred into this function body rather than performed at module level.
        # The extra_factory submodule is imported explicitly because process() accesses it as
        # an attribute of the package.
        import LnkParse3
        import LnkParse3.extra_factory
        return LnkParse3

    def process(self, data: bytearray):
        """
        Scan the input for the LNK header signature, parse each candidate with LnkParse3 to
        compute its size, and yield every candidate whose computed end is followed by a
        four-byte zero terminal block. Each output is labelled with its offset.
        """
        pos = 0
        mem = memoryview(data)
        sig = B'\x4C\x00\x00\x00\x01\x14\x02\x00'
        lnk = self._LnkParse3

        while True:
            pos = data.find(sig, pos)
            if pos < 0:
                break
            try:
                parsed = lnk.lnk_file(indata=mem[pos:])
            except Exception:
                # Not a parseable shortcut: resume the search one byte further.
                pos += 1
                continue
            # Accumulate the sizes of all structures reported by the parser.
            end = pos + parsed.header.size() + parsed.string_data.size()
            if parsed.has_target_id_list():
                end += parsed.targets.size()
            if parsed.has_link_info() and not parsed.force_no_link_info():
                with suppress(AttributeError):
                    end += parsed.info.size()
            with NoLogging():
                # Walk the extra data blocks until one cannot be identified.
                while end < len(mem):
                    extra = lnk.extra_factory.ExtraFactory(mem[end:])
                    try:
                        ec = extra.extra_class()
                    except Exception:
                        break
                    if ec is None:
                        break
                    if 'UNKNOWN' in ec().name():
                        break
                    end += extra.item_size()
            terminal_block = mem[end:end + 4]
            if terminal_block != B'\0\0\0\0':
                self.log_warn(F'detected LNK at offset 0x{pos:X}, but size calculation did not end on a terminal block')
                # Advance past this candidate; without this, the next search would find the
                # very same signature again and the loop would never terminate.
                pos += 1
                continue
            end += 4
            yield self.labelled(mem[pos:end], offset=pos)
            pos = end
class carve_pe (*paths, list=False, join_path=False, drop_path=False, path=b'name', recursive=False, keep_root=False, memdump=False, fileinfo=False)
-
This unit is implemented in
refinery.units.pattern.carve_pe
and has the following commandline Interface:usage: carve-pe [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-P NAME] [-r] [-k] [-m] [-f] [path [path ...]] Extracts anything from the input data that looks like a Portable Executable (PE) file. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "name". -r, --recursive Extract PE files that are contained in already extracted PEs. -k, --keep-root If the input chunk is itself a PE, include it as an output chunk. -m, --memdump Use the virtual memory layout of a PE file to calculate its size. -f, --fileinfo Use the PE meta information to deduce a file name meta variable. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class carve_pe(PathExtractorUnit):
    """
    Extracts anything from the input data that looks like a Portable Executable (PE) file.
    """

    def __init__(
        self, *paths, list=False, join_path=False, drop_path=False, path=b'name',
        recursive: Arg.Switch('-r', help='Extract PE files that are contained in already extracted PEs.') = False,
        keep_root: Arg.Switch('-k', help='If the input chunk is itself a PE, include it as an output chunk.') = False,
        memdump  : Arg.Switch('-m', help='Use the virtual memory layout of a PE file to calculate its size.') = False,
        fileinfo : Arg.Switch('-f', help='Use the PE meta information to deduce a file name meta variable.') = False
    ):
        super().__init__(
            *paths,
            list=list,
            join_path=join_path,
            drop_path=drop_path,
            path=path,
            recursive=recursive,
            keep_root=keep_root,
            memdump=memdump,
            fileinfo=fileinfo,
        )

    def unpack(self, data):
        """
        Scan the input for `MZ` signatures and yield an UnpackResult for every candidate that has
        a valid NT header signature and can be parsed as a PE file. A PE at offset zero is only
        emitted when the keep_root option is set.
        """
        cursor = 0
        mv = memoryview(data)

        while True:
            offset = data.find(B'MZ', cursor)
            if offset < cursor:
                break
            cursor = offset + 2
            # The 16-bit value at offset 0x3C of the DOS header locates the NT headers.
            ntoffset = mv[offset + 0x3C:offset + 0x3E]
            if len(ntoffset) < 2:
                # The input ends before this field; any later candidate would be truncated too.
                return
            # PE headers are little-endian regardless of the host, so the byte order is made
            # explicit instead of relying on the native order of the machine.
            # NOTE(review): assumes the module-level name `unpack` is struct.unpack - confirm.
            ntoffset, = unpack('<H', ntoffset)
            if mv[offset + ntoffset:offset + ntoffset + 2] != B'PE':
                self.log_debug(F'invalid NT header signature for candidate at 0x{offset:08X}')
                continue
            try:
                pe = PE(data=data[offset:], fast_load=True)
            except PEFormatError as err:
                self.log_debug(F'parsing of PE header at 0x{offset:08X} failed:', err)
                continue

            pesize = get_pe_size(pe, memdump=self.args.memdump)
            pedata = mv[offset:offset + pesize]
            info = {}
            if self.args.fileinfo:
                pe_meta_parser = pemeta()
                try:
                    info = pe_meta_parser.parse_version(pe) or {}
                except Exception as error:
                    self.log_warn(F'Unable to obtain file information: {error!s}')
                try:
                    info.update(pe_meta_parser.parse_header(pe) or {})
                except Exception:
                    # Header information is best-effort; the fallback names below are used
                    # when it is unavailable.
                    pass
            # Name preference: OriginalFilename, then ExportName, then a generated name.
            try:
                path = info['OriginalFilename']
            except KeyError:
                try:
                    path = info['ExportName']
                except KeyError:
                    extension = 'exe' if pe.is_exe() else 'dll' if pe.is_dll() else 'sys'
                    path = F'carve-0x{offset:08X}.{extension}'

            if offset > 0 or self.args.keep_root:
                yield UnpackResult(path, pedata, offset=offset)
                self.log_info(F'extracted PE file of size 0x{pesize:08X} from 0x{offset:08X}')
            else:
                self.log_info(F'ignored root file of size 0x{pesize:08X} from 0x{offset:08X}')
                continue

            if not offset or self.args.recursive:
                # Only skip the headers so that PE files embedded in this one are still found.
                cursor += pe.OPTIONAL_HEADER.SizeOfHeaders
            else:
                # Skip the entire file; the cursor was already advanced by 2 for the signature.
                cursor += pesize - 2
class carve_rtf
-
This unit is implemented in
refinery.units.pattern.carve_rtf
and has the following commandline Interface:usage: carve-rtf [-h] [-L] [-Q] [-0] [-v] Extracts anything from the input data that looks like an RTF document. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class carve_rtf(Unit):
    """
    Extracts anything from the input data that looks like an RTF document.
    """

    def process(self, data: bytearray):
        """
        Find each case-insensitive occurrence of the RTF header and yield the span up to its
        matching closing brace, labelled with the start offset. The scan stops at the first
        document whose braces are not balanced before the end of the input.
        """
        view = memoryview(data)
        size = len(view)
        header = re.escape(b'{\\rtf')
        cursor = 0
        while True:
            hit = re.search(header, view[cursor:], flags=re.IGNORECASE)
            if hit is None:
                break
            begin = cursor + hit.start()
            # The opening brace of the header accounts for the initial nesting depth of one.
            nesting = 1
            end = begin + 1
            while nesting and end < size:
                byte = view[end]
                if byte == 0x7B:  # {
                    nesting += 1
                elif byte == 0x7D:  # }
                    nesting -= 1
                end += 1
            if nesting > 0:
                break
            yield self.labelled(view[begin:end], offset=begin)
            cursor = end
class carve_xml
-
This unit is implemented in
refinery.units.pattern.carve_xml
and has the following commandline Interface:usage: carve-xml [-h] [-L] [-Q] [-0] [-v] Extracts anything from the input data that looks like XML. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class carve_xml(Unit):
    """
    Extracts anything from the input data that looks like XML.
    """

    def process(self, data):
        """
        Yield every chunk produced by the XML carver, labelled with its offset.
        """
        carver = XMLCarver(data)
        for position, document in carver:
            yield self.labelled(document, offset=position)
class carve_zip
-
This unit is implemented in
refinery.units.pattern.carve_zip
and has the following commandline Interface:usage: carve-zip [-h] [-L] [-Q] [-0] [-v] Extracts anything from the input data that looks like a zip archive file. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class carve_zip(Unit):
    """
    Extracts anything from the input data that looks like a zip archive file.
    """

    def process(self, data: bytearray):
        """
        Search the input backwards for end of central directory records. For each record that
        parses, the central directory and the start of the archive are located relative to it,
        and the archive is recorded. Archives are then yielded in order of increasing offset,
        each labelled with its start offset.
        """
        signature = ZipEndOfCentralDirectory.SIGNATURE
        end = len(data)
        mem = memoryview(data)
        rev = []

        while True:
            end = data.rfind(signature, 0, end)
            if end < 0:
                break
            try:
                end_marker = ZipEndOfCentralDirectory(mem[end:])
            except ValueError as e:
                self.log_info(F'error parsing end of central directory at 0x{end:X}: {e!s}')
                continue
            else:
                self.log_info(F'successfully parsed end of central directory at 0x{end:X}')
            # The central directory is located directly in front of the end marker. The shift
            # is the difference between where it actually is and where the record claims it to
            # be, i.e. the amount of data that precedes the archive in the input.
            start = end - end_marker.directory_size
            shift = start - end_marker.directory_offset
            if start < 0:
                self.log_debug('end of central directory size is invalid')
                continue
            try:
                central_directory = ZipCentralDirectory(mem[start:])
            except ValueError:
                self.log_debug('computed location of central directory is invalid')
                # Clamp at zero: a negative end would be interpreted by rfind as relative to
                # the end of the buffer and restart the search from the tail.
                end = max(0, end - len(signature))
                continue
            start = central_directory.header_offset + shift
            if start < 0:
                # A negative start would produce a wrapped-around slice below and a
                # wrapped-around search boundary in the next iteration.
                self.log_debug('computed start of ZIP archive is invalid')
                continue
            if mem[start:start + 4] not in (B'PK\x03\x04', B'\0\0\0\0'):
                # SFX payloads seem to have a nulled header, so we permit this.
                self.log_debug('computed start of ZIP archive does not have the correct signature bytes')
                continue
            # The recorded end offset already includes the size of the end marker.
            rev.append((start, end + len(end_marker)))
            end = start

        for start, end in reversed(rev):
            # Do not add the marker length a second time here: it is part of the stored end
            # offset, and the last parsed marker need not belong to this archive.
            archive = mem[start:end]
            yield self.labelled(archive, offset=start)
class cast (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)
-
This unit is implemented in
refinery.units.crypto.cipher.cast
and has the following commandline Interface: usage: cast [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-M N] key CAST encryption and decryption. positional arguments: key The encryption key. optional arguments: -i, --iv IV Specifies the initialization vector. If none is specified, then a block of zero bytes is used. -p, --padding P Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does nothing. By default, all other algorithms are attempted. In most cases, the data was not correctly decrypted if none of these work. -m, --mode M Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, EAX, ECB, OFB. By default, the CBC mode is used when an IV is provided, and ECB otherwise. -r, --raw Set the padding to raw; ignored when a padding is specified. -e, --little-endian Only for CTR: Use a little endian counter instead of the default big endian. -S, --segment-size N Only for CFB: Number of bits into which data is segmented. It must be a multiple of 8. The default of 0 means that the block size will be used as the segment size. -M, --mac-len N Only for EAX, GCM, OCB, and CCM: Length of the authentication tag, in bytes. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class cast(
    StandardBlockCipherUnit,
    cipher=PyCryptoFactoryWrapper(CAST),
):
    """
    CAST encryption and decryption.
    """
    # This class adds no members of its own; the only customization is the cipher keyword
    # argument passed in the class header.
class cca (data)
-
This unit is implemented in
refinery.units.strings.cca
and has the following commandline Interface:usage: cca [-h] [-L] [-Q] [-0] [-v] data Short for ConCatAppend: This unit concatenates the input data with its argument by appending the latter to the former. See also ccp for the unit that prepends instead. positional arguments: data Binary string to be appended to the input. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class cca(Unit):
    """
    Short for ConCatAppend: This unit concatenates the input data with its argument by appending
    the latter to the former. See also `refinery.ccp` for the unit that prepends instead.
    """

    def __init__(
        self,
        data: Arg(help='Binary string to be appended to the input.')
    ):
        super().__init__(data=data)

    def process(self, data: bytearray):
        """
        Extend the input buffer in place with the configured suffix and return it.
        """
        suffix = self.args.data
        data.extend(suffix)
        return data
class ccp (data)
-
This unit is implemented in
refinery.units.strings.ccp
and has the following commandline Interface:usage: ccp [-h] [-L] [-Q] [-0] [-v] data Short for ConCatPrepend: This unit concatenates the input data with its argument by prepending the latter to the former. See also cca for the unit that appends instead. positional arguments: data Binary string to be prepended to the input. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class ccp(Unit):
    """
    Short for ConCatPrepend: This unit concatenates the input data with its argument by prepending
    the latter to the former. See also `refinery.cca` for the unit that appends instead.
    """

    def __init__(
        self,
        data: Arg(help='Binary string to be prepended to the input.')
    ):
        super().__init__(data=data)

    def process(self, data: bytearray):
        """
        Insert the configured prefix at the start of the input buffer in place and return it.
        """
        prefix = self.args.data
        # Assigning to the empty slice at index zero inserts without replacing any byte.
        data[:0] = prefix
        return data
class cfmt (*formats, variable=None, separator=' ', multiplex=False, binary=False, unescape=False)
-
This unit is implemented in
refinery.units.strings.cfmt
and has the following commandline Interface:usage: cfmt [-h] [-L] [-Q] [-0] [-v] [-n N] [-s S | -m] [-b] [-e] [format [format ...]] Stands for "Convert to ForMaT": Transform a given chunk by applying a format string operation. The positional format string placeholder {} will be replaced by the incoming data, named placeholders have to exist as meta variables in the current chunk. For example, the following pipeline can be used to print all files in a given directory with their corresponding SHA-256 hash: ef ** [| sha256 -t | cfmt {} {path} ]] By default, format string arguments are simply joined along a space character to form a single format string. positional arguments: format Format strings. optional arguments: -n, --variable N Store the formatted string in a meta variable. -s, --separator S Separator to insert between format strings. The default is a space character. -m, --multiplex Do not join the format strings along the separator, generate one output for each. -b, --binary Use the binary formatter instead of the string formatter. -e, --unescape Interpret escape sequences in format strings. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class cfmt(Unit):
    """
    Stands for "Convert to ForMaT": Transform a given chunk by applying a format string operation.
    The positional format string placeholder `{}` will be replaced by the incoming data, named
    placeholders have to exist as meta variables in the current chunk. For example, the following
    pipeline can be used to print all files in a given directory with their corresponding SHA-256
    hash:

        ef ** [| sha256 -t | cfmt {} {path} ]]

    By default, format string arguments are simply joined along a space character to form a single
    format string.
    """

    def __init__(
        self,
        *formats : Arg(help='Format strings.', type=str, metavar='format'),
        variable : Arg('-n', type=str, metavar='N', help='Store the formatted string in a meta variable.') = None,
        separator: Arg('-s', group='SEP', metavar='S',
            help='Separator to insert between format strings. The default is a space character.') = ' ',
        multiplex: Arg.Switch('-m', group='SEP',
            help='Do not join the format strings along the separator, generate one output for each.') = False,
        binary   : Arg.Switch('-b', help='Use the binary formatter instead of the string formatter.') = False,
        unescape : Arg.Switch('-e', help='Interpret escape sequences in format strings.') = False,
    ):
        def normalize(fmt):
            # Turn one format argument (text or binary) into a text format string.
            if unescape:
                # The unicode-escape codec works on bytes, so text is first mapped to bytes
                # through latin1 before the escape sequences are interpreted.
                raw = fmt.encode('latin1') if isinstance(fmt, str) else fmt
                return raw.decode('unicode-escape')
            if isinstance(fmt, str):
                return fmt
            return fmt.decode(self.codec)

        specs = [normalize(fmt) for fmt in formats]
        if not multiplex:
            # Without multiplexing, all format strings collapse into a single one.
            specs = [normalize(separator).join(specs)]
        super().__init__(formats=specs, variable=variable, binary=binary)

    def process(self, data):
        meta = metavars(data)
        meta.ghost = True
        positional = [data]
        label = self.args.variable

        if self.args.binary:
            def render(spec):
                return meta.format_bin(spec, codec=self.codec, args=positional)
        else:
            def render(spec):
                return meta.format_str(spec, self.codec, positional).encode(self.codec)

        for spec in self.args.formats:
            output = render(spec)
            if label is not None:
                # With a variable name given, the formatted value is attached to the input
                # under that name via self.labelled and the labelled input is emitted.
                output = self.labelled(data, **{label: output})
            yield output
class chacha (key, stateful=False, discard=0, nonce=b'REFINERY', magic=b'', offset=0, rounds=20)
-
This unit is implemented in
refinery.units.crypto.cipher.chacha
and has the following commandline Interface:usage: chacha [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-d N] [-m MAGIC] [-x N] [-r N] key [nonce] ChaCha encryption and decryption. The nonce must be 8 bytes long as currently, only the original Bernstein algorithm is implemented. When 64 bytes are provided as the key, this data is interpreted as the initial state box and all other parameters are ignored. positional arguments: key The encryption key. nonce The nonce. Default is the string REFINERY. optional arguments: -s, --stateful Do not reset the key stream while processing the chunks of one frame. -d, --discard N Discard the first N bytes of the keystream, 0 by default. -m, --magic MAGIC The magic constant; depends on the key size by default. -x, --offset N Optionally specify the stream index, default is 0. -r, --rounds N The number of rounds. Has to be an even number. Default is 20. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class chacha(LatinCipherUnit):
    """
    ChaCha encryption and decryption. The nonce must be 8 bytes long as currently, only the
    original Bernstein algorithm is implemented. When 64 bytes are provided as the key, this data
    is interpreted as the initial state box and all other parameters are ignored.
    """

    def keystream(self) -> Iterable[int]:
        args = self.args
        key = args.key
        if len(key) == 64:
            # A 64 byte key is taken to be a complete initial state.
            yield from ChaChaCipher.FromState(key)
            return
        yield from ChaChaCipher(key, args.nonce, args.magic, args.rounds, args.offset)
class chacha20 (key, nonce=b'REFINERY')
-
This unit is implemented in
refinery.units.crypto.cipher.chacha
and has the following commandline Interface:usage: chacha20 [-h] [-L] [-Q] [-0] [-v] [-R] key [nonce] ChaCha20 and XChaCha20 encryption and decryption. For ChaCha20, the IV (nonce) must be 8 or 12 bytes long; for XChaCha20, choose an IV which is 24 bytes long. Invoking this unit for ChaCha20 is functionally equivalent to chacha with 20 rounds, but this unit uses the PyCryptodome library C implementation rather than the pure Python implementation used by chacha. positional arguments: key The encryption key. nonce The nonce. Default is the string REFINERY. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class chacha20(LatinCipherStandardUnit, cipher=PyCryptoFactoryWrapper(ChaCha20)):
    """
    ChaCha20 and XChaCha20 encryption and decryption. For ChaCha20, the IV (nonce) must be 8 or 12
    bytes long; for XChaCha20, choose an IV which is 24 bytes long. Invoking this unit for ChaCha20
    is functionally equivalent to `refinery.chacha` with 20 rounds, but this unit uses the
    PyCryptodome library C implementation rather than the pure Python implementation used by
    `refinery.chacha`.
    """
class chacha20poly1305 (key, nonce=b'REFINERY')
-
This unit is implemented in
refinery.units.crypto.cipher.chacha
and has the following commandline Interface:usage: chacha20poly1305 [-h] [-L] [-Q] [-0] [-v] [-R] key [nonce] ChaCha20-Poly1305 and XChaCha20-Poly1305 encryption and decryption. For the ChaCha20 variant, the nonce must be 8 or 12 bytes long; for XChaCha20, provide a 24 bytes nonce instead. positional arguments: key The encryption key. nonce The nonce. Default is the string REFINERY. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class chacha20poly1305(LatinCipherStandardUnit, cipher=PyCryptoFactoryWrapper(ChaCha20_Poly1305)):
    """
    ChaCha20-Poly1305 and XChaCha20-Poly1305 encryption and decryption. For the ChaCha20 variant,
    the nonce must be 8 or 12 bytes long; for XChaCha20, provide a 24 bytes nonce instead.
    """

    def _get_cipher(self, reset_cache=False):
        instance = super()._get_cipher(reset_cache)
        # NOTE(review): presumably a block size of one byte tells the base class that no
        # block alignment or padding applies to this cipher; confirm against the base class.
        instance.block_size = 1
        return instance
class chaskey (key, iv=b'', padding=None, mode=None, raw=False, rounds=12, swap=False, *, assoc_len=0, mac_len=0, segment_size=0, little_endian=False)
-
This unit is implemented in
refinery.units.crypto.cipher.chaskey
and has the following commandline Interface:usage: chaskey [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-k N] [-s] [-e] [-S N] key This implements a block cipher based on the Chaskey algorithm. No subkeys are computed and the default Chaskey operation is performed on all blocks. Notably, the Donut framework uses Chaskey with 16 rounds and in CTR mode. positional arguments: key The encryption key. optional arguments: -i, --iv IV Specifies the initialization vector. If none is specified, then a block of zero bytes is used. -p, --padding P Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does nothing. By default, all other algorithms are attempted. In most cases, the data was not correctly decrypted if none of these work. -m, --mode M Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB, OFB, PCBC. By default, the CBC mode is used when an IV is provided, and ECB otherwise. -r, --raw Set the padding to raw; ignored when a padding is specified. -k, --rounds N Number of rounds to use, the default is 12 -s, --swap Use big endian byte order for all blocks. -e, --little-endian Only for CTR: Use a little endian counter instead of the default big endian. -S, --segment-size N Only for CFB: Number of bits into which data is segmented. It must be a multiple of 8. The default of 0 means that the block size will be used as the segment size. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class chaskey(StandardBlockCipherUnit, cipher=BlockCipherFactory(Chaskey)):
    """
    This implements a block cipher based on the Chaskey algorithm. No subkeys are computed and the
    default Chaskey operation is performed on all blocks. Notably, the Donut framework uses Chaskey
    with 16 rounds and in CTR mode.
    """

    def __init__(
        self, key, iv=b'', padding=None, mode=None, raw=False,
        rounds: Arg.Number('-k', help='Number of rounds to use, the default is {default}') = _R,
        swap: Arg.Switch('-s', help='Use big endian byte order for all blocks.') = False,
        **more
    ):
        super().__init__(
            key,
            iv,
            padding=padding,
            mode=mode,
            raw=raw,
            rounds=rounds,
            swap=swap,
            **more
        )

    def _new_cipher(self, **optionals) -> CipherInterface:
        # The two Chaskey-specific settings are forwarded on top of whatever the base class
        # already passes along.
        args = self.args
        return super()._new_cipher(swap=args.swap, rounds=args.rounds, **optionals)
class chop (size, step=None, truncate=False)
-
This unit is implemented in
refinery.units.meta.chop
and has the following commandline Interface:usage: chop [-h] [-L] [-Q] [-0] [-v] [-t] N [N] Reinterprets the input as a sequence of equally sized chunks and outputs this sequence. positional arguments: N Chop data into chunks of this size N Optionally specify a step size (which is equal to the size by default) which indicates the number of bytes by which the cursor will be increased after extracting a chunk. optional arguments: -t, --truncate Truncate possible excess bytes at the end of the input, by default they are appended as a single chunk. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class chop(Unit):
    """
    Reinterprets the input as a sequence of equally sized chunks and outputs this sequence.
    """

    def __init__(
        self,
        size: Arg.Number('size', help='Chop data into chunks of this size'),
        step: Arg.Number('step', help=(
            'Optionally specify a step size (which is equal to the size by default) which indicates the number of bytes by '
            'which the cursor will be increased after extracting a chunk.')) = None,
        truncate: Arg.Switch('-t', help=(
            'Truncate possible excess bytes at the end of the input, by default they are appended as a single chunk.')) = False,
    ):
        super().__init__(size=size, step=step, truncate=truncate)

    def process(self, data):
        args = self.args
        chunk_size = args.size
        stride = args.step
        # A memoryview is handed to the splitter rather than the raw input object.
        window = memoryview(data)
        if chunk_size < 1:
            raise ValueError('The chunk size has to be a positive integer value.')
        yield from splitchunks(window, chunk_size, stride, args.truncate)
class clower
-
This unit is implemented in
refinery.units.strings.clower
and has the following commandline Interface:usage: clower [-h] [-L] [-Q] [-0] [-v] Stands for "Convert to LOWER case"; The unit simply converts all latin alphabet characters in the input to lowercase. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class clower(Unit):
    """
    Stands for "Convert to LOWER case"; The unit simply converts all latin alphabet characters in
    the input to lowercase.
    """

    def process(self, data):
        lowered = data.lower()
        return lowered
class cm (invert=False, all=False, reset=False, size=False, ext=False, entropy=False, ic=False, magic=False, sha1=False, sha256=False, crc32=False, md5=False, hashes=False, *names)
-
This unit is implemented in
refinery.units.meta.cm
and has the following commandline Interface:usage: cm [-h] [-L] [-Q] [-0] [-v] [-x | -a] [-r] [-S] [-X] [-E] [-C] [-M] [-1] [-2] [-3] [-5] [-H] [name [name ...]] The Common Meta variables unit populates the set of meta variables of the current chunk with commonly used metadata. The unit has no effect outside a frame. positional arguments: name A variable name that can include the common properties: mime, ext, magic, size, entropy, ic, crc32, sha1, sha256, sha512, md5. If none is given, the size variable is populated. For most of these, an optional argument is available that can be used as a shorthand: optional arguments: -x, --invert populate only options that have not been specified -a, --all populate all options -r, --reset discard all meta variables that were not explicitly specified -S, --size size of the chunk -X, --ext guess file extension -E, --entropy compute data entropy -C, --ic compute the index of coincidence -M, --magic compute file magic -1, --sha1 compute hash: SHA-1 -2, --sha256 compute hash: SHA-256 -3, --crc32 compute hash: CRC32 -5, --md5 compute hash: MD5 -H, --hashes compute all common hashes generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class cm(Unit):
    """
    The Common Meta variables unit populates the set of meta variables of the current chunk with
    commonly used metadata. The unit has no effect outside a frame.
    """

    def __init__(
        self,
        invert  : Arg.Switch('-x', group='ALL', help='populate only options that have not been specified') = False,
        all     : Arg.Switch('-a', group='ALL', help='populate all options') = False,
        reset   : Arg.Switch('-r', help='discard all meta variables that were not explicitly specified') = False,
        size    : Arg.Switch('-S', help='size of the chunk') = False,
        ext     : Arg.Switch('-X', help='guess file extension') = False,
        entropy : Arg.Switch('-E', help='compute data entropy') = False,
        ic      : Arg.Switch('-C', help='compute the index of coincidence') = False,
        magic   : Arg.Switch('-M', help='compute file magic') = False,
        sha1    : Arg.Switch('-1', help='compute hash: SHA-1') = False,
        sha256  : Arg.Switch('-2', help='compute hash: SHA-256') = False,
        crc32   : Arg.Switch('-3', help='compute hash: CRC32') = False,
        md5     : Arg.Switch('-5', help='compute hash: MD5') = False,
        hashes  : Arg.Switch('-H', help='compute all common hashes') = False,
        *names  : Arg(metavar='name', help=(
            F'A variable name that can include the common properties: {_COMMON_PROPERTIES_LIST}.'
            R' If none is given, the size variable is populated. For most of these, an optional '
            R'argument is available that can be used as a shorthand:'))
    ):
        def stringify(name):
            # Variable names may arrive as text or as binary strings.
            return name if isinstance(name, str) else name.decode(self.codec)

        selected = {stringify(name) for name in names}

        if hashes:
            # The catch-all hash switch enables each individual hash switch.
            md5 = sha256 = sha1 = crc32 = True

        shorthands = (
            ('size', size),
            ('ext', ext),
            ('entropy', entropy),
            ('ic', ic),
            ('magic', magic),
            ('sha1', sha1),
            ('sha256', sha256),
            ('crc32', crc32),
            ('md5', md5),
        )
        selected.update(label for label, enabled in shorthands if enabled)

        if not (selected or reset):
            # Nothing was requested and nothing is to be discarded: fall back to the size.
            selected.add('size')

        if all and invert:
            raise ValueError('invert and all are both enabled, resulting in empty configuration.')
        if all:
            selected = set(LazyMetaOracle.derivations)
        elif invert:
            selected = set(LazyMetaOracle.derivations) - selected

        super().__init__(names=selected, reset=reset)

    def process(self, data):
        # The chunk contents are passed through unchanged.
        return data

    def filter(self, chunks):
        wanted = self.args.names
        discard = self.args.reset
        for chunk in chunks:
            if chunk.visible:
                meta = metavars(chunk)
                if discard:
                    chunk.meta.clear()
                for name in wanted:
                    chunk[name] = meta[name]
            # Invisible chunks are forwarded untouched.
            yield chunk
class couple (*commandline, buffer=False, noerror=False, timeout=0.0)
-
This unit is implemented in
refinery.units.misc.couple
and has the following commandline Interface:usage: couple [-h] [-L] [-Q] [-0] [-v] [-b] [-e] [-t T] ... Turns any command into a refinery unit. Data is processed by feeding it to the standard input of a process spawned from the given command line, and then reading the standard output of that process as the result of the operation. The main purpose of this unit is to allow using the syntax from frame with other command line tools. By default, the couple unit streams the output from the executed command as individual outputs, but the buffer option can be set to buffer all output of a single execution. The format string expression {} or {0} can be used as one of the arguments passed to the external command to represent the incoming data. In this case, the data will not be sent to the standard input device of the new process. positional arguments: (all remaining) All remaining command line tokens form an arbitrary command line to be executed. Use format string syntax to insert meta variables and incoming data chunks. optional arguments: -b, --buffer Buffer the command output for one execution rather than streaming it. -e, --noerror do not merge stdin and stderr; stderr will only be output if -v is also specified. -t, --timeout T Set an execution timeout as a floating point number in seconds, there is none by default. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class couple(Unit):
    """
    Turns any command into a refinery unit. Data is processed by feeding it to the standard input
    of a process spawned from the given command line, and then reading the standard output of that
    process as the result of the operation. The main purpose of this unit is to allow using the
    syntax from `refinery.lib.frame` with other command line tools. By default, the
    `refinery.couple` unit streams the output from the executed command as individual outputs,
    but the `buffer` option can be set to buffer all output of a single execution. The format
    string expression `{}` or `{0}` can be used as one of the arguments passed to the external
    command to represent the incoming data. In this case, the data will not be sent to the
    standard input device of the new process.
    """

    # Number of seconds to wait when joining receiver threads or polling for process termination.
    _JOIN_TIME = 0.1

    def __init__(
        self, *commandline : Arg(nargs='...', type=str, metavar='(all remaining)', help=(
            'All remaining command line tokens form an arbitrary command line to be executed. Use format string syntax '
            'to insert meta variables and incoming data chunks.')),
        buffer: Arg.Switch('-b', help='Buffer the command output for one execution rather than streaming it.') = False,
        noerror: Arg('-e', help='do not merge stdin and stderr; stderr will only be output if -v is also specified.') = False,
        timeout: Arg('-t', metavar='T',
            help='Set an execution timeout as a floating point number in seconds, there is none by default.') = 0.0
    ):
        if not commandline:
            raise ValueError('you need to provide a command line.')
        super().__init__(commandline=commandline, noerror=noerror, buffer=buffer, timeout=timeout)

    def process(self, data):
        def shlexjoin():
            import shlex
            return ' '.join(shlex.quote(cmd) for cmd in commandline)

        meta = metavars(data)
        meta.ghost = True
        # Collects the positional format fields that were consumed while formatting.
        used = set()
        commandline = [
            meta.format(cmd, self.codec, [data], None, False, used=used)
            for cmd in self.args.commandline
        ]

        if 0 in used:
            self.log_info('input used as command-line argument; sending no input to process stdin')
            data = None

        # NOTE(review): the callable is presumably only evaluated when debug logging is enabled.
        self.log_debug(shlexjoin)

        posix = 'posix' in sys.builtin_module_names
        process = Popen(commandline,
            stdin=PIPE, stdout=PIPE, stderr=PIPE, shell=False, close_fds=posix)

        if self.args.buffer and not self.args.timeout:
            # Simple case: no streaming and no timeout, so one blocking exchange is enough.
            out, err = process.communicate(data)
            for line in err.splitlines():
                self.log_info(line)
            yield out
            return

        import io
        from threading import Thread, Event
        from queue import Queue, Empty
        # The timeout is documented in seconds of execution time, so elapsed wall-clock time
        # is measured. A CPU-time clock would not count time spent sleeping or in the child.
        from time import monotonic, sleep

        start = 0
        result = None

        qerr = Queue()
        qout = Queue()
        done = Event()

        def adapter(stream, queue: Queue, event: Event):
            # Forward everything read from the stream into the queue until the stream is
            # exhausted or the event signals that reading should stop.
            while not event.is_set():
                out = stream.read1()
                if out:
                    queue.put(out)
                else:
                    break
            stream.close()

        recvout = Thread(target=adapter, args=(process.stdout, qout, done), daemon=True)
        recverr = Thread(target=adapter, args=(process.stderr, qerr, done), daemon=True)

        recvout.start()
        recverr.start()

        if data:
            process.stdin.write(data)
        process.stdin.close()
        start = monotonic()

        if self.args.buffer or self.args.timeout:
            # Output has to be collected when it is buffered, or when a timeout may require
            # returning a partial result.
            result = io.BytesIO()

        def queue_read(q: Queue):
            try:
                return q.get_nowait()
            except Empty:
                return None

        errbuf = io.BytesIO()

        while True:
            out = queue_read(qout)
            err = None

            if self.args.noerror:
                err = queue_read(qerr)
            else:
                # Merged mode: stderr data is treated as regular output.
                out = out or queue_read(qerr)

            # NOTE(review): log_info() without arguments presumably reports whether info-level
            # logging is enabled; confirm against the Unit base class.
            if err and self.log_info():
                errbuf.write(err)
                errbuf.seek(0)
                lines = errbuf.readlines()
                errbuf.seek(0)
                errbuf.truncate()
                if lines:
                    # Keep an unterminated trailing line in the buffer until more data arrives.
                    if not (done.is_set() or lines[~0].endswith(B'\n')):
                        errbuf.write(lines.pop())
                    for line in lines:
                        msg = line.rstrip(B'\n')
                        if msg:
                            self.log_info(msg)
            if out:
                if self.args.buffer or self.args.timeout:
                    result.write(out)
                if not self.args.buffer:
                    yield out

            if done.is_set():
                if recverr.is_alive():
                    self.log_warn('stderr receiver thread zombied')
                if recvout.is_alive():
                    self.log_warn('stdout receiver thread zombied')
                break
            elif not err and not out and process.poll() is not None:
                # The process has exited and both queues were empty on this pass.
                recverr.join(self._JOIN_TIME)
                recvout.join(self._JOIN_TIME)
                done.set()
            elif self.args.timeout:
                if monotonic() - start > self.args.timeout:
                    self.log_info('terminating process after timeout expired')
                    done.set()
                    process.terminate()
                    for _ in range(4):
                        if process.poll() is not None:
                            break
                        sleep(self._JOIN_TIME)
                    else:
                        self.log_warn('process termination may have failed')
                    recverr.join(self._JOIN_TIME)
                    recvout.join(self._JOIN_TIME)
                    if not len(result.getbuffer()):
                        result = RuntimeError('timeout reached, process had no output')
                    else:
                        result = RefineryPartialResult(
                            'timeout reached, returning all collected output',
                            partial=result.getvalue())

        if isinstance(result, Exception):
            raise result
        elif self.args.buffer:
            yield result.getvalue()
class cp1252
-
This unit is implemented in
refinery.units.encoding.cp1252
and has the following commandline Interface:usage: cp1252 [-h] [-L] [-Q] [-0] [-v] [-R] Encodes and decodes Windows CP 1252 (aka Latin1) encoded string data. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class cp1252(Unit):
    """
    Encodes and decodes Windows CP 1252 (aka Latin1) encoded string data.
    """

    def process(self, data):
        # Decode using the unit's own codec, then re-encode as CP 1252.
        text = data.decode(self.codec)
        return text.encode('cp1252')

    def reverse(self, data):
        # Decode CP 1252 input, then re-encode using the unit's own codec.
        text = data.decode('cp1252')
        return text.encode(self.codec)
class crc32 (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.checksums
and has the following commandline Interface:usage: crc32 [-h] [-L] [-Q] [-0] [-v] [-t] Returns the CRC32 Hash of the input data. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class crc32(HashUnit):
    """
    Returns the CRC32 Hash of the input data.
    """

    def _algorithm(self, data: bytes) -> bytes:
        # The checksum is an unsigned 32 bit value; it is emitted as four big endian bytes.
        checksum = zlib.crc32(data)
        return checksum.to_bytes(4, 'big')
class csb (format, utf16=True, ascii=True)
-
This unit is implemented in
refinery.units.pattern.carve
and has the following commandline Interface:usage: csb [-h] [-L] [-Q] [-0] [-v] [-a | -u] format Short for carve single buffer; carves the single largest buffer of a given format from the input data and returns it. positional arguments: format Specify one of the following formats: integer, float, number, string, multiline-string, cmdstr, ps1str, vbastr, vbaint, printable, urlquote, urlquote-coarse, urlquote-narrow, intarray, numarray, word, letters, wshenc, alphanumeric, b32, b64, b85, b92, b64any, b64url, hex, uppercase-hex, spaced-hex, spaced-b64, spaced-b85, utf8, hexdump, hexarray, uuencode optional arguments: -a, --no-utf16 Search for ASCII encoded patterns only. -u, --no-ascii Search for UTF16 encoded patterns only. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class csb(carve):
    """
    Short for carve single buffer; carves the single largest buffer of a given format from the
    input data and returns it.
    """

    def __init__(self, format, utf16=True, ascii=True):
        # Fixed configuration of the parent unit: one result only, left undecoded.
        options = dict(decode=False, single=True, utf16=utf16, ascii=ascii)
        super().__init__(format, **options)
class csd (format, utf16=True, ascii=True)
-
This unit is implemented in
refinery.units.pattern.carve
and has the following commandline Interface:usage: csd [-h] [-L] [-Q] [-0] [-v] [-a | -u] format Short for carve & decode; carves the single largest buffer of a given format from the input and decodes it with the appropriate decoder. positional arguments: format Specify one of the following formats: integer, float, number, string, multiline-string, cmdstr, ps1str, vbastr, vbaint, printable, urlquote, urlquote-coarse, urlquote-narrow, intarray, numarray, word, letters, wshenc, alphanumeric, b32, b64, b85, b92, b64any, b64url, hex, uppercase-hex, spaced-hex, spaced-b64, spaced-b85, utf8, hexdump, hexarray, uuencode optional arguments: -a, --no-utf16 Search for ASCII encoded patterns only. -u, --no-ascii Search for UTF16 encoded patterns only. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class csd(carve):
    """
    Short for carve & decode; carves the single largest buffer of a given format from the input
    and decodes it with the appropriate decoder.
    """

    def __init__(self, format, utf16=True, ascii=True):
        # Fixed configuration of the parent unit: one result only, with decoding enabled.
        options = dict(decode=True, single=True, utf16=utf16, ascii=ascii)
        super().__init__(format, **options)
class csv (quote=b'"', delim=b',')
-
This unit is implemented in
refinery.units.formats.csv
and has the following commandline Interface:usage: csv [-h] [-L] [-Q] [-0] [-v] [-R] [-q QUOTE] [-d DELIM] Extracts the rows of a CSV document with header and converts them into JSON chunks. optional arguments: -q, --quote QUOTE Specify the quote character, the default is a double quote. -d, --delim DELIM Specify the delimiter, the default is a single comma. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class csv(Unit):
    """
    Extracts the rows of a CSV document with header and converts them into JSON chunks.
    """

    def __init__(
        self,
        quote: Unit.Arg('-q', help='Specify the quote character, the default is a double quote.') = B'"',
        delim: Unit.Arg('-d', help='Specify the delimiter, the default is a single comma.') = B','
    ):
        super().__init__(quote=quote, delim=delim)

    def json_to_csv(self, table: list):
        """
        Convert a decoded JSON list into CSV data. A list that consists only of lists of strings
        is written row by row without a header. Otherwise, the entries are treated as objects:
        the union of their string keys forms the header and missing values become empty fields.
        Raises a ValueError when the input is not a list.
        """
        quote = self.args.quote.decode(self.codec)
        delim = self.args.delim.decode(self.codec)

        if not isinstance(table, list):
            raise ValueError('Input must be a JSON list.')

        out = MemoryFile()

        with io.TextIOWrapper(out, self.codec, newline='') as stream:
            writer = _csv.writer(stream, quotechar=quote, delimiter=delim, skipinitialspace=True)
            for row in table:
                if not isinstance(row, list):
                    break
                if not all(isinstance(item, str) for item in row):
                    break
                writer.writerow(row)
            else:
                # The text wrapper buffers its output; without flushing, the underlying
                # buffer may not yet contain the rows that were just written.
                stream.flush()
                return out.getvalue()

        keys = {}

        # A dictionary is used here over a set because dictionaries remember insertion order.
        # When feeding the unit a sequence of JSON objects, the user would likely expect the
        # column order in the resulting CSV to derive from the entry order in the JSON data.
        for row in table:
            for key in row:
                if not isinstance(key, str):
                    continue
                keys[key] = None

        keys = list(keys)
        out = MemoryFile()

        with io.TextIOWrapper(out, self.codec, newline='') as stream:
            writer = _csv.writer(stream, quotechar=quote, delimiter=delim, skipinitialspace=True)
            writer.writerow(keys)
            for row in table:
                writer.writerow([str(row.get(key, '')) for key in keys])
            # Flush before reading the buffer, for the same reason as above.
            stream.flush()
            return out.getvalue()

    def reverse(self, data: bytearray):
        try:
            table: List[Dict[str, Any]] = json.loads(data)
        except Exception:
            # Best-effort fallback: treat the input as one JSON document per line.
            table: List[Dict[str, Any]] = [json.loads(line) for line in data.splitlines()]
        return self.json_to_csv(table)

    def process(self, data):
        quote = self.args.quote.decode(self.codec)
        delim = self.args.delim.decode(self.codec)

        def convert(field: str):
            # Digit-only fields without a leading zero become integers; fields recognized by
            # isodate are normalized to an ISO timestamp; everything else stays a string.
            if field.isdigit() and not field.startswith('0'):
                return int(field)
            date = isodate(field)
            if date is not None:
                return date.isoformat(' ', 'seconds')
            return field

        with io.TextIOWrapper(MemoryFile(data), self.codec) as stream:
            rows = _csv.reader(stream, quotechar=quote, delimiter=delim, skipinitialspace=True)
            # A bare next() on an exhausted reader would raise StopIteration inside this
            # generator, which Python turns into a RuntimeError. Empty input has no header
            # and therefore simply produces no output.
            keys = next(rows, None)
            if keys is None:
                return
            for row in rows:
                out = {key: convert(value) for key, value in zip(keys, row)}
                yield json.dumps(out, indent=4).encode(self.codec)
Methods
def json_to_csv(self, table)
-
Expand source code Browse git
def json_to_csv(self, table: list):
    """
    Convert a decoded JSON list into CSV data. A list that consists only of lists of strings
    is written row by row without a header. Otherwise, the entries are treated as objects:
    the union of their string keys forms the header and missing values become empty fields.
    Raises a ValueError when the input is not a list.
    """
    quote = self.args.quote.decode(self.codec)
    delim = self.args.delim.decode(self.codec)

    if not isinstance(table, list):
        raise ValueError('Input must be a JSON list.')

    out = MemoryFile()

    with io.TextIOWrapper(out, self.codec, newline='') as stream:
        writer = _csv.writer(stream, quotechar=quote, delimiter=delim, skipinitialspace=True)
        for row in table:
            if not isinstance(row, list):
                break
            if not all(isinstance(item, str) for item in row):
                break
            writer.writerow(row)
        else:
            # The text wrapper buffers its output; without flushing, the underlying
            # buffer may not yet contain the rows that were just written.
            stream.flush()
            return out.getvalue()

    keys = {}

    # A dictionary is used here over a set because dictionaries remember insertion order.
    # When feeding the unit a sequence of JSON objects, the user would likely expect the
    # column order in the resulting CSV to derive from the entry order in the JSON data.
    for row in table:
        for key in row:
            if not isinstance(key, str):
                continue
            keys[key] = None

    keys = list(keys)
    out = MemoryFile()

    with io.TextIOWrapper(out, self.codec, newline='') as stream:
        writer = _csv.writer(stream, quotechar=quote, delimiter=delim, skipinitialspace=True)
        writer.writerow(keys)
        for row in table:
            writer.writerow([str(row.get(key, '')) for key in keys])
        # Flush before reading the buffer, for the same reason as above.
        stream.flush()
        return out.getvalue()
class cswap
-
This unit is implemented in
refinery.units.strings.cswap
and has the following commandline Interface:usage: cswap [-h] [-L] [-Q] [-0] [-v] Swap the case of the input string; all lowercase letters are turned into their uppercase variant and vice-versa. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class cswap(Unit):
    """
    Swap the case of the input string; all lowercase letters are turned into their uppercase
    variant and vice-versa.
    """

    def process(self, data: bytearray):
        # ASCII upper- and lowercase letters differ only in the bit 0x20, so toggling that bit
        # swaps the case. All other byte values are left alone. The buffer is changed in place.
        for index, byte in enumerate(data):
            if 0x41 <= byte <= 0x5A or 0x61 <= byte <= 0x7A:
                data[index] = byte ^ 0x20
        return data
class cupper
-
This unit is implemented in
refinery.units.strings.cupper
and has the following commandline Interface:usage: cupper [-h] [-L] [-Q] [-0] [-v] Stands for "Convert to UPPER case"; The unit simply converts all latin alphabet characters in the input to uppercase. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class cupper(Unit):
    """
    Stands for "Convert to UPPER case"; The unit simply converts all latin alphabet characters
    in the input to uppercase.
    """
    def process(self, data):
        # Delegates to the upper() method of the input buffer.
        uppercased = data.upper()
        return uppercased
class datefix (format='%Y-%m-%d %H:%M:%S', dos=False)
-
This unit is implemented in
refinery.units.misc.datefix
and has the following commandline Interface:usage: datefix [-h] [-L] [-Q] [-0] [-v] [-d] [format] Parses all kinds of date formats and unifies them into the same format. positional arguments: format Specify the output format as a strftime-like string, using ISO by default. optional arguments: -d, --dos Parse timestamps in DOS rather than Unix format. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class datefix(Unit):
    """
    Parses all kinds of date formats and unifies them into the same format.
    """

    # Formats that are attempted in order by process() after a timezone suffix, if any,
    # has been stripped from the input.
    _FORMATS = [
        '%B %dth %Y %H:%M:%S (UTC)',  # November 27th 2019 17:37:02 (UTC)
        '%B %dnd %Y %H:%M:%S (UTC)',  # November 22nd 2019 17:37:02 (UTC)
        '%B %drd %Y %H:%M:%S (UTC)',  # November 23rd 2019 17:37:02 (UTC)
        '%B %dst %Y %H:%M:%S (UTC)',  # November 21st 2019 17:37:02 (UTC)
        '%Y-%m-%dT%H:%M:%S',          # 2010-03-15T06:27:50
        '%Y-%m-%d %H:%M:%S',          # iso (2010-03-15 06:27:50.000000)
        '%Y-%m-%d %H:%M:%SZ%f',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%SZ%f',
        '%a %b %d %Y %H:%M:%S',       # Thu Apr 24 2014 12:32:21
        '%m/%d/%Y %H:%M:%S',
        '%m/%d/%Y',
    ]

    # Each expression captures the sign, the hours, and the minutes of a trailing UTC offset.
    _TIMEZONE_REGEXES = [re_compile(p) for p in [
        R'([+-])(\d{2})(\d{2})$',           # Thu Apr 24 2014 12:32:21 GMT-0700
        R'([+-])(\d{2}):(\d{2})$',          # 2017:09:11 23:47:22+02:00
        R'GMT([+-])(\d{2})(\d{2}) \(.+\)$'  # Thu Apr 24 2014 12:32:21 GMT-0700 (PDT)
    ]]

    def __init__(
        self,
        format: Arg(help='Specify the output format as a strftime-like string, using ISO by default.') = '%Y-%m-%d %H:%M:%S',
        dos: Arg('-d', help='Parse timestamps in DOS rather than Unix format.') = False
    ):
        super().__init__(format=format, dos=dos)

    @staticmethod
    def dostime(stamp: int) -> datetime:
        """
        Parses a given DOS timestamp into a datetime object.
        """
        # The upper 16 bits hold the date, the lower 16 bits hold the time.
        d, t = stamp >> 16, stamp & 0xFFFF
        # Seconds are stored with a resolution of two seconds in a 5-bit field.
        s = (t & 0x1F) << 1
        return datetime(
            year   = ((d & 0xFE00) >> 0x9) + 1980,  # noqa
            month  = ((d & 0x01E0) >> 0x5),         # noqa
            day    = ((d & 0x001F) >> 0x0),         # noqa
            hour   = ((t & 0xF800) >> 0xB),         # noqa
            minute = ((t & 0x07E0) >> 0x5),         # noqa
            # The 5-bit field can encode 60 and 62 seconds, which datetime rejects.
            second = min(s, 59),                    # noqa
        )

    def _format(self, dt: datetime) -> str:
        """
        Renders the given datetime object using the configured output format.
        """
        return dt.strftime(self.args.format)

    def _extract_timezone(self, data):
        """
        Splits a trailing UTC offset from the input. Returns the remaining string together
        with the offset as a timedelta, or the unchanged input and None if no offset is found.
        """
        for r in self._TIMEZONE_REGEXES:
            m = r.search(data)
            if not m:
                continue
            pm = m[1]
            td = timedelta(
                hours=int(m[2]), minutes=int(m[3]))
            if pm == '-':
                td = -td
            return data[:-len(m[0])].strip(), td
        return data, None

    @linewise
    def process(self, data: str) -> str:
        """
        Normalizes a single date string. All-numeric input is interpreted as a compact
        timestamp (14 digits), a timestamp in milliseconds (13 digits), or a timestamp in
        seconds; everything else is matched against the known formats. Input that cannot
        be parsed is returned after the normalization steps below have been applied.
        """
        data = data.strip()

        # replace colons (i.e. for exiftool dates: 2017:01:01)
        if len(data) > 10 and data[4] == ':' and data[7] == ':':
            data = F'{data[0:4]}-{data[5:7]}-{data[8:]}'

        # strips Z at end (i.e. 20171022055144Z)
        if data.endswith('Z'):
            data = data[:-1]

        if data.startswith('0x'):
            # Not a valid hexadecimal number: keep the string and try the date formats.
            try:
                data = str(int(data, 16))
            except ValueError:
                pass

        # parses timestamps and dates without much format
        if data.isdigit():
            time_stamp = int(data)
            if len(data) > 14:
                raise Exception('cannot parse all-numeric string as date: %s' % data)
            elif len(data) == 14:
                # i.e. 20111020193727
                return self._format(datetime.strptime(data, '%Y%m%d%H%M%S'))
            elif len(data) == 13:
                # i.e. 1458016535000
                time_stamp //= 1000
                data = data[:-3]
            if self.args.dos:
                return self._format(self.dostime(time_stamp))
            else:
                return self._format(date_from_timestamp(time_stamp))

        data, time_delta = self._extract_timezone(data)

        for f in self._FORMATS:
            try:
                dt = datetime.strptime(data, f)
            except ValueError:
                continue
            # Subtracting the UTC offset converts the local time to UTC.
            return self._format(dt if time_delta is None else dt - time_delta)

        return data
Static methods
def dostime(stamp)
-
Parses a given DOS timestamp into a datetime object.
Expand source code Browse git
@staticmethod def dostime(stamp: int) -> datetime: """ Parses a given DOS timestamp into a datetime object. """ d, t = stamp >> 16, stamp & 0xFFFF s = (t & 0x1F) << 1 return datetime( year = ((d & 0xFE00) >> 0x9) + 1980, # noqa month = ((d & 0x01E0) >> 0x5), # noqa day = ((d & 0x001F) >> 0x0), # noqa hour = ((t & 0xF800) >> 0xB), # noqa minute = ((t & 0x07E0) >> 0x5), # noqa second = 59 if s == 60 else s, # noqa )
class decompress (prepend=True, tolerance=12, max_ratio=1.0, min_ratio=0.0001, strict_limits=False)
-
This unit is implemented in
refinery.units.compression.decompress
and has the following commandline Interface:usage: decompress [-h] [-L] [-Q] [-0] [-v] [-P] [-t N] [-m R] [-n R] [-l] Attempts all available decompression units against the input and returns the output of the first successful one. If none succeeds, the data is returned unaltered. The process is heavily biased against LZNT1 decompression due to a large tendency for LZNT1 false positives. optional arguments: -P, --no-prepend By default, if decompression fails, the unit attempts to prefix the data with all possible values of a single byte and decompress the result. This behavior can be disabled with this flag. -t, --tolerance N Maximum number of bytes to strip from the beginning of the data; The default value is 12. -m, --max-ratio R To determine whether a decompression algorithm was successful, the ratio of compressed size to decompressed size may at most be as large as this number, a floating point value R; default value is 1.0. -n, --min-ratio R Require that compression ratios must be at least as large as R. This is a "too good to be true" heuristic against algorithms like lznt1 that can produce false positives. The default is 0.0001. -l, --strict-limits For recognized formats, i.e. when a magic signature is present, the above limits are disabled by default. Activate this flag to enforce them in every case. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class decompress(Unit):
    """
    Attempts all available decompression units against the input and returns
    the output of the first successful one. If none succeeds, the data is
    returned unaltered. The process is heavily biased against LZNT1 decompression
    due to a large tendency for LZNT1 false positives.
    """
    def __init__(
        self,
        prepend: Arg.Switch('-P', '--no-prepend', off=True, help=(
            'By default, if decompression fails, the unit attempts to prefix '
            'the data with all possible values of a single byte and decompress '
            'the result. This behavior can be disabled with this flag.')
        ) = True,
        tolerance: Arg.Number('-t', help=(
            'Maximum number of bytes to strip from the beginning of the data; '
            'The default value is 12.')
        ) = 12,
        max_ratio: Arg('-m', metavar='R', help=(
            'To determine whether a decompression algorithm was successful, the '
            'ratio of compressed size to decompressed size may at most be as large '
            'as this number, a floating point value R; default value is {default}.')
        ) = 1.0,
        min_ratio: Arg('-n', metavar='R', help=(
            'Require that compression ratios must be at least as large as R. This '
            'is a "too good to be true" heuristic against algorithms like lznt1 '
            'that can produce false positives. The default is {default}.')
        ) = 0.0001,
        strict_limits: Arg('-l', action='store_true', help=(
            'For recognized formats, i.e. when a magic signature is present, the '
            'above limits are disabled by default. Activate this flag to enforce '
            'them in every case.')
        ) = False
    ):
        # Note that a ratio of exactly zero is rejected as well, even though the
        # message only speaks of nonnegative values.
        if min_ratio <= 0:
            raise ValueError('The compression factor must be nonnegative.')
        super().__init__(
            tolerance=tolerance,
            prepend=prepend,
            min_ratio=min_ratio,
            max_ratio=max_ratio,
            strict_limits=strict_limits,
        )
        # One assembled instance per supported algorithm; process() tries them in this order.
        self.engines: List[Unit] = [
            engine.assemble() for engine in [
                zstd, szdd, bz2, zl, lzf, lzma, lzw, jcalg, lzo, aplib, qlz, brotli, blz, lzjb, lz4, lznt1, nrv2e, nrv2d, nrv2b]
        ]
        for engine in self.engines:
            engine.log_detach()

    def process(self, data):
        """
        Runs every engine against the input, optionally after stripping up to `tolerance`
        leading bytes or after prepending a single byte, and keeps the best candidate for
        each rating. The candidate with the highest rating that produced output is returned
        with the meta variable `method` set to the engine name; a ValueError is raised when
        no such candidate exists.
        """
        data = memoryview(data)

        class Decompression(NamedTuple):
            # Record of one decompression attempt. It is defined locally because the
            # ratio property refers to the input buffer of the enclosing call.
            engine: Unit
            rating: _R
            result: Optional[ByteString] = None
            cutoff: int = 0
            prefix: Optional[int] = None

            def __str__(self):
                status = self.rating.summary
                engine = self.engine.name
                prefix = self.prefix
                if prefix is not None:
                    prefix = F'0x{prefix:02X}'
                return F'prefix={prefix}, cutoff=0x{self.cutoff:02X}, [{status}] engine={engine}'

            def __len__(self):
                # The length is the size of the decompressed output, not the number of
                # tuple fields. As no __bool__ is defined, this also determines the truth
                # value of an attempt, and it fails when the result is None.
                return len(self.result)

            @property
            def ratio(self):
                # Size of the complete input divided by the size of the output; attempts
                # without any output are assigned an infinite ratio.
                if not self.result:
                    return INF
                return len(data) / len(self)

            @property
            def unmodified(self):
                # True when the input was passed to the engine without stripping or prefixing.
                return self.prefix is None and self.cutoff == 0

            @property
            def method(self):
                return self.engine.name

        if self.args.prepend:
            # Scratch buffer with one spare leading byte that decompress() overwrites with
            # each candidate prefix.
            buffer = bytearray(1 + len(data))
            buffer[1:] = data

        # The best attempt seen so far for every rating value.
        best_by_rating: Dict[_R, Decompression] = {}

        def best_current_rating():
            return max(best_by_rating, default=_R.InvalidData)

        def decompress(engine: Unit, cutoff: int = 0, prefix: Optional[int] = None):
            # Runs one engine on the input, either with `cutoff` leading bytes removed or,
            # if a prefix is given, on the complete input with that byte prepended.
            ingest = data[cutoff:]
            rating = _R.ValidData
            if prefix is not None:
                buffer[0] = prefix
                ingest = buffer
            # The comparisons below distinguish True and False from any other return value.
            is_handled = engine.handles(ingest)
            if is_handled is True:
                rating |= _R.KnownFormat
            if is_handled is False:
                return Decompression(engine, _R.InvalidData, None, cutoff, prefix)
            try:
                result = next(engine.act(ingest))
            except RefineryPartialResult as pr:
                rating |= _R.HadOutput
                result = pr.partial
            except Exception:
                result = None
            else:
                rating |= _R.Successful
            return Decompression(engine, rating, result, cutoff, prefix)

        def update(new: Decompression, discard_if_too_good=False):
            # Decides whether the new attempt replaces the best attempt with the same rating.
            ratio = new.ratio
            # The ratio limits apply unless the format was recognized and strict limits are off.
            if self.args.strict_limits or not new.rating & _R.KnownFormat:
                if ratio > self.args.max_ratio:
                    return
                if ratio < self.args.min_ratio:
                    return
            best = best_by_rating.get(new.rating, None)
            prefix = new.prefix
            if prefix is not None:
                prefix = F'0x{prefix:02X}'
            # An attempt on the unmodified input only has to match a previous best that
            # required modification; any other attempt has to be smaller by 5 percent.
            if new.unmodified and best and not best.unmodified:
                threshold = 1
            else:
                threshold = 0.95
            # q is the size of the previous best output relative to the new output.
            if not best or len(new) < len(best):
                q = 0
            else:
                q = len(best) / len(new)
            ratio *= 100
            brief = new.rating.brief
            if q < threshold:
                if best and discard_if_too_good:
                    if q < 0.5:
                        return
                    # NOTE(review): Decompression as defined above has no attribute named
                    # "failed"; confirm whether this line can be reached and what it should test.
                    if new.failed:
                        return
                self.log_info(lambda: F'[switch] [{brief}] [q={q:07.4f}] compression ratio {ratio:07.4f}% with: {new!s}')
                best_by_rating[new.rating] = new
            else:
                self.log_debug(lambda: F'[reject] [{brief}] [q={q:07.4f}] compression ratio {ratio:07.4f}% with: {new!s}')

        for engine in self.engines:
            self.log_debug(F'attempting engine: {engine.name}')
            # These engines are handled with extra caution: stripping leading bytes is
            # skipped for them once any engine has succeeded, and update() is told to
            # apply its additional rejection rules.
            careful = isinstance(engine, (lznt1, lzf, lzjb))
            for t in range(self.args.tolerance + 1):
                if best_current_rating() >= _R.Successful and careful and t > 0:
                    break
                update(decompress(engine, t), careful)
            # Prefixing with every possible byte value is only attempted while nothing succeeded.
            if self.args.prepend and best_current_rating() < _R.Successful:
                for p in range(0x100):
                    update(decompress(engine, 0, p), careful)

        # Walk the ratings from best to worst and return the first attempt that has output.
        for r in sorted(best_by_rating, reverse=True):
            if dc := best_by_rating[r]:
                if not dc.rating & _R.HadOutput:
                    continue
                self.log_info(F'settling on {dc.method} decompression.')
                if dc.rating & _R.KnownFormat:
                    self.log_info('supporting evidence: found a known magic signature')
                if dc.rating & _R.HadNoErrors:
                    self.log_info('supporting evidence: engine produced output without errors')
                elif dc.rating & _R.HadOutput:
                    self.log_info('supporting evidence: there were errors, but the engine produced output')
                if not dc.rating & _R.Successful:
                    self.log_info('the only decompression with result returned only a partial result.')
                return self.labelled(dc.result, method=dc.method)

        # NOTE(review): the class documentation states that the data is returned unaltered
        # when no engine succeeds, but this code path raises instead; confirm which is intended.
        raise ValueError('no compression engine worked')
class dedup (key=None, count=False)
-
This unit is implemented in
refinery.units.meta.dedup
and has the following commandline Interface:usage: dedup [-h] [-L] [-Q] [-0] [-v] [-c] [key] Deduplicates a sequence of multiple inputs. The deduplication is limited to the current frame. positional arguments: key An optional meta variable expression to deduplicate. optional arguments: -c, --count Store the count of each deduplicated chunk. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class dedup(Unit):
    """
    Deduplicates a sequence of multiple inputs. The deduplication is limited to the current
    `refinery.lib.frame`.
    """
    def __init__(
        self,
        key: Arg('key', type=str, help='An optional meta variable expression to deduplicate.') = None,
        count: Arg.Switch('-c', help='Store the count of each deduplicated chunk.') = False
    ):
        super().__init__(key=key, count=count)

    def filter(self, chunks):
        expression = self.args.key

        # A chunk is identified by the value of the key expression if one was given and by
        # the MD5 digest of its contents otherwise; buffer-like values are hashed as well.
        if expression is None:
            def identify(chunk):
                return md5(chunk).digest()
        else:
            def identify(chunk):
                value = PythonExpression.Evaluate(expression, metavars(chunk))
                return md5(value).digest() if isbuffer(value) else value

        if self.args.count:
            # Counting mode: visible chunks are held back until the input is exhausted,
            # then the first chunk of each kind is emitted with the number of occurrences.
            occurrences = {}
            first_seen = {}
            for chunk in chunks:
                if not chunk.visible:
                    yield chunk
                    continue
                uid = identify(chunk)
                occurrences[uid] = occurrences.get(uid, 0) + 1
                first_seen.setdefault(uid, chunk)
            for uid, chunk in first_seen.items():
                yield self.labelled(chunk, count=occurrences[uid])
        else:
            # Streaming mode: the first chunk of each kind is passed on right away.
            seen = set()
            for chunk in chunks:
                if not chunk.visible:
                    yield chunk
                    continue
                uid = identify(chunk)
                if uid in seen:
                    continue
                seen.add(uid)
                yield chunk
class defang (url_only=False, url_protocol=False, dot_only=False, quote_md=False)
-
This unit is implemented in
refinery.units.pattern.defang
and has the following commandline Interface:usage: defang [-h] [-L] [-Q] [-0] [-v] [-R] [-u] [-p] [-d] [-q] Defangs all URL, domain and IPv4 address indicators in the input data by replacing the last dot in the expression by [.]. For example, 127.0.0.1 will be replaced by 127.0.0[.]1. For URL indicators, the colon after the protocol scheme is also wrapped in brackets. optional arguments: -u, --url-only Only defang URLs, do not look for domains or IPs. -p, --url-protocol Escape the protocol in URLs. -d, --dot-only Do not escape the protocol colon in URLs. -q, --quote-md Wrap all indicators in backticks for markdown code. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class defang(Unit):
    """
    Defangs all URL, domain and IPv4 address indicators in the input data by replacing the
    last dot in the expression by `[.]`. For example, `127.0.0.1` will be replaced by
    `127.0.0[.]1`. For URL indicators, the colon after the protocol scheme is also wrapped
    in brackets.
    """

    # Host names that are never defanged.
    _WHITELIST = [
        B'wscript.shell',
    ]

    # Replacement schemes that are used when the protocol is to be escaped.
    _PROTOCOL_ESCAPES = {
        B'http': B'hxxp',
        B'https': B'hxxps',
        B'ftp': B'fxp',
        B'ftps': B'fxps',
    }

    def __init__(
        self,
        url_only: Arg.Switch('-u', help='Only defang URLs, do not look for domains or IPs.') = False,
        url_protocol: Arg.Switch('-p', help='Escape the protocol in URLs.') = False,
        dot_only: Arg.Switch('-d', help='Do not escape the protocol colon in URLs.') = False,
        quote_md: Arg.Switch('-q', help='Wrap all indicators in backticks for markdown code.') = False
    ):
        # All local names are forwarded via vars(), so no other locals may be defined here.
        self.superinit(super(), **vars())

    def _quote(self, word):
        """
        Wraps the given indicator in backticks if markdown quoting was requested.
        """
        return word if not self.args.quote_md else B'`%s`' % word

    def reverse(self, data: bytearray):
        """
        Restores the dots, the protocol separator, and the escaped protocol names.
        """
        def refang(hostname):
            return hostname[0].replace(B'[.]', B'.')
        data = defanged.hostname.sub(refang, data)
        data = data.replace(B'[:]//', B'://')
        data = data.replace(B'[://]', B'://')
        data = re.sub(B'h.{3}?(s?)://', B'http\\1://', data)
        data = re.sub(B'fxp(s?)://', B'ftp\\1://', data)
        return data

    def process(self, data):
        """
        Defangs all URLs in the input and, unless only URLs were requested, all host names
        in the text between the URLs.
        """
        def replace_hostname(hostname: bytes, match=True):
            # When used as a substitution callback, the argument is a match object.
            if match:
                return self._quote(replace_hostname(hostname[0], False))
            self.log_info('replace:', hostname)
            host = hostname
            user, atsgn, host = host.rpartition(B'@')
            host, colon, port = host.rpartition(B':')
            host = host.lower()
            if not colon:
                # Without a port, rpartition leaves the entire host in its last component.
                # NOTE(review): this value has not been converted to lowercase, which makes
                # the whitelist lookup below case-sensitive in this case; confirm intent.
                host = port
                port = B''
            if host in self._WHITELIST:
                return hostname
            host = re.split(R'(?:\[\.\]|\.)', host.decode('latin1'))
            if len(host) == 1:
                return hostname
            # Assemble the name from right to left: the last dot is always defanged, and
            # so is every dot that directly precedes a label which is a known TLD.
            components = iter(reversed(host))
            defanged_parts = [next(components)]
            separator = '[.]'
            for part in components:
                defanged_parts.append(separator)
                defanged_parts.append(part)
                separator = '[.]' if part in tlds else '.'
            defanged_host = ''.join(reversed(defanged_parts)).encode('latin1')
            return user + atsgn + defanged_host + colon + port

        def replace_url(url: bytes):
            if not url:
                return url
            self.log_info('replace:', url)
            url = url.replace(B'[:]//', B'://', 1)
            url = url.replace(B'[.]', B'.')
            # The variable scheme records how much of the scheme is missing from the URL:
            # 0 if it starts with "://", 1 if it starts with "//", and 2 if it is complete.
            # A placeholder scheme is added for parsing in the first two cases.
            prefix = B'tcp'
            if url.startswith(B'://'):
                scheme = 0
            elif url.startswith(B'//'):
                scheme = 1
                prefix = prefix + B':'
            else:
                scheme = 2
                prefix = B''
            parsed = urlparse(prefix + url)
            operations = {
                name: self.process(getattr(parsed, name))
                for name in ('path', 'params', 'query', 'fragment')
            }
            if self.args.url_protocol and parsed.scheme:
                # Protocols without a known escape keep their scheme. The fallback must be
                # the parsed scheme rather than the integer marker, which cannot be used as
                # a URL component.
                operations.update(scheme=self._PROTOCOL_ESCAPES.get(parsed.scheme.lower(), parsed.scheme))
            if scheme < 2:
                # Remove the placeholder scheme again.
                operations.update(scheme=B'')
            operations.update(netloc=replace_hostname(parsed.netloc, False))
            url = urlunparse(parsed._replace(**operations))
            if scheme == 0:
                url = B':' + url
            if not self.args.dot_only:
                url = url.replace(B'://', B'[:]//')
            return self._quote(url)

        # Splitting by the URL pattern yields the text between URLs at every index that is
        # a multiple of step, and the complete URL match directly after each such text.
        urlsplit = defanged.url.split(data)
        step = defanged.url.value.groups + 1
        urlsplit[1::step] = [replace_url(t) for t in itertools.islice(iter(urlsplit), 1, None, step)]

        if not self.args.url_only:
            urlsplit[0::step] = [
                indicators.hostname.sub(replace_hostname, t)
                for t in itertools.islice(iter(urlsplit), 0, None, step)
            ]

        def fuse(urlsplit):
            # Interleave the text parts and the URLs, skipping the other capture groups.
            txt = itertools.islice(iter(urlsplit), 0, None, step)
            url = itertools.islice(iter(urlsplit), 1, None, step)
            while True:
                try:
                    yield next(txt)
                    yield next(url)
                except StopIteration:
                    break

        return B''.join(fuse(urlsplit))
class deob_js_arrays
-
This unit is implemented in
refinery.units.obfuscation.js.arrays
and has the following commandline Interface:usage: deob-js-arrays [-h] [-L] [-Q] [-0] [-v] JavaScript deobfuscator to turn ["Z", "t", "s", "e"][0] into "Z". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_js_arrays(Deobfuscator):
    """
    JavaScript deobfuscator to turn `["Z", "t", "s", "e"][0]` into `"Z"`.
    """
    def deobfuscate(self, data):
        # An array literal of integers and strings, directly followed by an integer index;
        # up to five whitespace characters are permitted between any two tokens.
        spacing = R'\s{{0,5}}'
        template = spacing.join([
            '\\[',
            '((?:{i}|{s})',
            '(?:,',
            '(?:{i}|{s})',
            ')*)',
            '\\]',
            '\\[',
            '({i})',
            '\\]'
        ])
        pattern = template.format(i=formats.integer, s=formats.string)

        def select(hit):
            # The expression is left untouched when the index cannot be resolved.
            try:
                position = int(hit[2])
                chosen = hit[1].split(',')[position].strip()
                self.log_debug(lambda: F'{chosen} = {hit[0]}')
            except (TypeError, IndexError):
                return hit[0]
            return chosen

        return re.sub(pattern, select, data)
class deob_js_getattr
-
This unit is implemented in
refinery.units.obfuscation.js.getattr
and has the following commandline Interface:usage: deob-js-getattr [-h] [-L] [-Q] [-0] [-v] JavaScript deobfuscator to turn WScript["CreateObject"] into WScript.CreateObject. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_js_getattr(Deobfuscator):
    """
    JavaScript deobfuscator to turn `WScript["CreateObject"]` into `WScript.CreateObject`.
    """
    def deobfuscate(self, data):
        pattern = FR'(\w+)\[({formats.string})\]'

        def dottify(hit):
            # Strip the quotes; only names that are valid identifiers can use dot notation.
            attribute = hit[2][1:-1]
            if not attribute.isidentifier():
                return hit[0]
            return F'{hit[1]}.{attribute}'

        return re.sub(pattern, dottify, data)
class deob_js_tuples
-
This unit is implemented in
refinery.units.obfuscation.js.tuples
and has the following commandline Interface:usage: deob-js-tuples [-h] [-L] [-Q] [-0] [-v] JavaScript deobfuscator to turn ("Z", "t", "s", "e") into "e". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_js_tuples(Deobfuscator):
    """
    JavaScript deobfuscator to turn `("Z", "t", "s", "e")` into `"e"`.
    """
    def deobfuscate(self, data):
        # A bracketed, comma-separated sequence of integers and strings; up to five
        # whitespace characters are permitted between any two tokens.
        spacing = R'\s{{0,5}}'
        template = spacing.join([
            '\\(',
            '((?:{i}|{s})',
            '(?:,',
            '(?:{i}|{s})',
            ')*)',
            '\\)'
        ])
        pattern = template.format(i=formats.integer, s=formats.string)

        def select(hit):
            # The last entry of the sequence replaces the entire expression.
            try:
                chosen = hit[1].split(',')[-1].strip()
                self.log_debug(lambda: F'{chosen} = {hit[0]}')
            except (TypeError, IndexError):
                return hit[0]
            return chosen

        return re.sub(pattern, select, data)
class deob_ps1 (timeout=100)
-
This unit is implemented in
refinery.units.obfuscation.ps1.all
and has the following commandline Interface:usage: deob-ps1 [-h] [-L] [-Q] [-0] [-v] [-t TIMEOUT] optional arguments: -t, --timeout TIMEOUT Maximum number of iterations; the default is 100. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_ps1(IterativeDeobfuscator):

    # The PowerShell deobfuscators that are applied, in this order, by deobfuscate().
    _SUBUNITS: List[Type[Deobfuscator]] = [
        deob_ps1_escape,
        deob_ps1_cases,
        deob_ps1_brackets,
        deob_ps1_format,
        deob_ps1_typecast,
        deob_ps1_stringreplace,
        deob_ps1_b64convert,
        deob_ps1_encodings,
        deob_ps1_concat,
        deob_ps1_invoke,
        deob_ps1_uncurly
    ]

    def deobfuscate(self, data):
        # Instantiate every subunit and let it log at the same level as this unit.
        stages = [factory() for factory in self._SUBUNITS]
        for stage in stages:
            stage.log_level = self.log_level
        for stage in stages:
            self.log_debug(lambda: F'invoking {stage.name}')
            before = hash(data)
            data = stage.deobfuscate(data)
            if before == hash(data):
                continue
            # The info message is only written when the debug call returns a false value.
            if not self.log_debug('data has changed.'):
                self.log_info(F'used {stage.name}')
        # Collapse every run of line breaks into a single line feed.
        return re.sub(R'[\r\n]+', '\n', data)
class deob_ps1_b64convert
-
This unit is implemented in
refinery.units.obfuscation.ps1.b64convert
and has the following commandline Interface:usage: deob-ps1-b64convert [-h] [-L] [-Q] [-0] [-v] generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_ps1_b64convert(Deobfuscator):

    # Matches [System.Convert]::FromBase64String applied to a single string literal, which
    # is captured as group 1. Whitespace is permitted between the tokens.
    _SENTINEL = re.compile('\\s*'.join(
        (re.escape('[System.Convert]::FromBase64String'), '\\(', '({s})', '\\)')
    ).format(s=formats.ps1str), flags=re.IGNORECASE)

    def deobfuscate(self, data):
        # Replaces each conversion of a constant base64 string by a PowerShell array of
        # hexadecimal byte literals. Expressions are left untouched if they are located
        # inside a string literal or if their argument cannot be decoded.
        strlit = Ps1StringLiterals(data)

        def replacer(match: re.Match[str]):
            # The container is reported as an index which can be zero; hence, an explicit
            # comparison with None is required to recognize the first string literal.
            if strlit.get_container(match.start()) is not None:
                return match[0]
            try:
                string, = string_unquote(match[1])
            except ValueError:
                # The literal does not consist of exactly one part.
                return match[0]
            try:
                decoded = base64.b64decode(string)
            except Exception:
                return match[0]
            return '@({})'.format(','.join(F'0x{b:02X}' for b in decoded))

        return self._SENTINEL.sub(replacer, data)
class deob_ps1_brackets
-
This unit is implemented in
refinery.units.obfuscation.ps1.brackets
and has the following commandline Interface:usage: deob-ps1-brackets [-h] [-L] [-Q] [-0] [-v] PowerShell deobfuscation that removes superfluous brackets around constant literals, i.e. ("{0}{2}{1}") is transformed to "{0}{2}{1}". Currently, only integer and string constants are supported. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_ps1_brackets(Deobfuscator):
    """
    PowerShell deobfuscation that removes superfluous brackets around constant
    literals, i.e. `("{0}{2}{1}")` is transformed to `"{0}{2}{1}"`. Currently,
    only integer and string constants are supported.
    """
    # Group 1 is an optional argument name in front of the bracket, group 2 is the integer
    # or string literal after the opening bracket, and group 3 is the first character that
    # follows the literal and is not whitespace.
    _SENTINEL = re.compile(
        RF'''(?<![\w"']{{2}})'''  # this may be a function call
        RF'''(\-\w+)?'''  # not a function call but an argument
        RF'''\(\s*({formats.integer}|{formats.ps1str})\s*(\S)''',
        flags=re.IGNORECASE
    )

    def deobfuscate(self, data):
        # Substitutes repeatedly until a pass over the data removes no more brackets.
        strlit = Ps1StringLiterals(data)
        repeat = True

        @strlit.outside
        def replacement(match):
            nonlocal repeat
            # Only a literal that is directly followed by the closing bracket is unwrapped.
            # NOTE(review): nothing is returned otherwise; presumably the wrapper created
            # by strlit.outside keeps the matched text in that case. Confirm.
            if match[3] == ')':
                repeat = True
                return (match[1] or '') + match[2]

        while repeat:
            repeat = False
            data = self._SENTINEL.sub(replacement, data)

        return data
class deob_ps1_cases
-
This unit is implemented in
refinery.units.obfuscation.ps1.cases
and has the following commandline Interface:usage: deob-ps1-cases [-h] [-L] [-Q] [-0] [-v] generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_ps1_cases(Deobfuscator):

    # Canonical spelling of the names whose letter case is normalized by this unit.
    _NAMES = [
        '-BXor',
        '-Exec Bypass',
        '-NoLogo',
        '-NonInter',
        '-Replace',
        '-Windows Hidden',
        '.Invoke',
        'Assembly',
        'Byte',
        'Char',
        'ChildItem',
        'CreateThread',
        'Get-Variable',
        'GetType',
        'IntPtr',
        'Invoke-Expression',
        'Invoke',
        'Length',
        'Net.WebClient',
        'PowerShell',
        'PSVersionTable',
        'Set-Item',
        'Set-Variable',
        'Start-Sleep',
        'ToString',
        'Type',
        'Value',
        'Void',
    ]

    @outside(formats.ps1str)
    def deobfuscate(self, data):
        # Every occurrence of a known name, in any letter case and delimited by word
        # boundaries, is replaced by its canonical spelling.
        for canonical in self._NAMES:
            pattern = RF'\b{re.escape(canonical)}\b'
            data = re.sub(pattern, canonical, data, flags=re.IGNORECASE)
        return data
class deob_ps1_concat (timeout=100)
-
This unit is implemented in
refinery.units.obfuscation.ps1.concat
and has the following commandline Interface:usage: deob-ps1-concat [-h] [-L] [-Q] [-0] [-v] [-t TIMEOUT] optional arguments: -t, --timeout TIMEOUT Maximum number of iterations; the default is 100. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_ps1_concat(IterativeDeobfuscator):

    # A closing quote and an opening quote which are separated only by one of the operators
    # "+" or "&" and optional whitespace.
    _SENTINEL = re.compile(R'''['"]\s*[+&]\s*['"]''')

    def deobfuscate(self, data):
        # Merges neighbouring string literals that are joined by an operator into a single
        # literal and returns the modified script.

        def concat(data):
            # Generates the output piece by piece: each merge emits everything up to and
            # including the merged literal, and the search continues in the remainder.
            strlit = Ps1StringLiterals(data)
            repeat = True
            while repeat:
                for match in self._SENTINEL.finditer(data):
                    a, b = match.span()
                    # Both ends of the match have to be located inside string literals,
                    # and these have to be consecutive entries of the literal ranges.
                    a = strlit.get_container(a)
                    if a is None:
                        continue
                    b = strlit.get_container(b)
                    if b is None or b != a + 1:
                        continue
                    a = strlit.ranges[a]
                    b = strlit.ranges[b]
                    stra = data[slice(*a)]
                    strb = data[slice(*b)]
                    # The first part of the second literal is appended to the last part of
                    # the first one; all remaining parts follow unchanged.
                    parts = list(string_unquote(stra))
                    it = iter(string_unquote(strb))
                    parts[~0] += next(it)
                    parts.extend(it)
                    yield data[:a[0]] + string_quote(parts)
                    data = data[b[1]:]
                    strlit.update(data)
                    # The data has changed, so the search is restarted on the remainder.
                    break
                else:
                    repeat = False
            yield data

        return ''.join(concat(data))
class deob_ps1_encodings
-
This unit is implemented in
refinery.units.obfuscation.ps1.encodings
and has the following commandline Interface:usage: deob-ps1-encodings [-h] [-L] [-Q] [-0] [-v] generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_ps1_encodings(Deobfuscator):

    # Matches [System.Text.Encoding]::NAME.GetString(@(...)) where the argument is an array
    # of integers. Group 1 is the name of the encoding and group 2 is the array contents.
    _SENTINEL = re.compile('\\s*'.join(
        (re.escape('[System.Text.Encoding]::') + '(\\w+)\\.GetString', '\\(', '@\\(', '({a})', '\\)', '\\)')
    ).format(a=formats.intarray), flags=re.IGNORECASE)

    def deobfuscate(self, data):
        # Replaces each GetString call on a constant byte array by the decoded string
        # literal. Expressions are left untouched if they are located inside a string
        # literal or if the array cannot be parsed or decoded.
        strlit = Ps1StringLiterals(data)

        def replacer(match: re.Match[str]):
            # The container is reported as an index which can be zero; hence, an explicit
            # comparison with None is required to recognize the first string literal.
            if strlit.get_container(match.start()) is not None:
                return match[0]
            try:
                buffer = bytearray(int(x.strip(), 0) for x in match[2].split(','))
            except Exception:
                return match[0]
            # The pattern ignores letter case, and so does the translation of the .NET
            # property names to codec names. Other names are passed on unchanged.
            encoding = {
                'ascii': 'ascii',
                'bigendianunicode': 'utf-16be',
                'default': 'latin1',
                'unicode': 'utf-16le',
            }.get(match[1].lower(), match[1])
            try:
                codecs.lookup(encoding)
            except LookupError:
                # Unknown encodings are treated as UTF-8.
                encoding = 'utf8'
            try:
                string = buffer.decode(encoding)
            except Exception:
                return match[0]
            return string_quote(string)

        return self._SENTINEL.sub(replacer, data)
class deob_ps1_escape
-
This unit is implemented in
refinery.units.obfuscation.ps1.escape
and has the following commandline Interface:usage: deob-ps1-escape [-h] [-L] [-Q] [-0] [-v] generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_ps1_escape(Deobfuscator):

    def deobfuscate(self, data):
        # Removes the backtick in front of every character other than the ones excluded by
        # the pattern below; the callback is wrapped by the string literal helper.
        literals = Ps1StringLiterals(data)

        @literals.outside
        def drop_backtick(hit):
            return hit[1]

        pattern = R'''`([^0abfnrtv`#'"\$])'''
        return re.sub(pattern, drop_backtick, data)
class deob_ps1_format
-
This unit is implemented in
refinery.units.obfuscation.ps1.format
and has the following commandline Interface:usage: deob-ps1-format [-h] [-L] [-Q] [-0] [-v] PowerShell deobfuscation for the following "format string"-based technique: - "{0}{2}{1}"-f 'signa','ures','t' - "{0}na{2}{1}"-f 'sig','ures','t' generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_ps1_format(Deobfuscator):
    """
    PowerShell deobfuscation for the following "format string"-based technique:

    - `"{0}{2}{1}"-f 'signa','ures','t'`
    - `"{0}na{2}{1}"-f 'sig','ures','t'`
    """

    def deobfuscate(self, data):
        # Each pass resolves the first string literal that is followed by a format
        # operator and a list of string literals; the search is then restarted because
        # the data has changed.
        repeat = True

        while repeat:

            repeat = False

            for string in re.finditer(str(formats.ps1str), data):
                # The arguments have to begin directly after the string literal.
                argmatch = re.search(R'^\s*-[fF]\s*((?:{s},\s*)*{s})'.format(s=formats.ps1str), data[string.end():])
                if not argmatch:
                    continue

                def dbgmsg():
                    # Long literals are abbreviated in the log message.
                    sample = string[0]
                    if len(sample) > 33:
                        sample = F"{sample[1:30]}...{sample[0]}"
                    return F'found match at {string.start()}: {sample}'

                self.log_debug(dbgmsg)

                # Splitting by a capture group places the argument literals at the odd
                # indices; each of them is unquoted into a list of parts.
                args = re.split(F'({formats.ps1str})', argmatch[1])
                args = [list(string_unquote(a.strip())) for a in args[1::2]]

                def formatter(string):
                    buffer = []
                    # Splitting by a capture group places the literal text at the even
                    # indices and the format sequences at the odd indices.
                    for k, part in enumerate(re.split(R'(\{\d+\})', string)):
                        if k % 2 == 0:
                            if part:
                                buffer.append(part)
                            continue
                        try:
                            index = int(part[1:-1])
                            arg = args[index]
                        except IndexError as IE:
                            raise IndexError(F'only found {len(args)} arguments and format sequence {index}, aborting.') from IE

                        it = iter(arg)
                        buffer.append(next(it))

                        # The first part of an argument is attached to the preceding text
                        # and its last part starts the text that follows; any parts in
                        # between are emitted separately.
                        if len(arg) > 1:
                            yield ''.join(buffer)
                            buffer = []
                            for last, part in lookahead(it):
                                if last:
                                    buffer.append(part)
                                    break
                                yield part

                    yield ''.join(buffer)

                try:
                    result = string_apply(string[0], formatter)
                except IndexError:
                    # The format string refers to an argument that does not exist.
                    continue

                # The end of the argument match is relative to the end of the literal.
                data = data[:string.start()] + result + data[argmatch.end() + string.end():]
                repeat = True
                break

        return data
class deob_ps1_invoke
-
This unit is implemented in
refinery.units.obfuscation.ps1.invoke
and has the following commandline Interface:usage: deob-ps1-invoke [-h] [-L] [-Q] [-0] [-v] generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_ps1_invoke(Deobfuscator):

    def deobfuscate(self, data):
        # Applies three substitutions in sequence: quoted member names lose their quotes,
        # quoted commands behind a sourcing operator are replaced by the bare command name,
        # and calls via the Invoke method are turned into direct calls.
        literals = Ps1StringLiterals(data)

        @literals.outside
        def unquote_member(hit):
            return hit[1] + hit[3]

        quoted_member = (
            R'''(\.|::)'''                  # a dot or a namespace delimiter
            R'''(['"])(\w{1,200})\2'''      # the member name, given as a quoted string
            R'''(?=[\s\(\.\,\;\+\-])'''     # required lookahead after the closing quote
        )
        data = re.sub(quoted_member, unquote_member, data)

        @literals.outside
        def direct_call(hit):
            return hit[1] + '('

        sourced_command = '\\s{0,5}'.join([
            '[.&]', '(\\(',                 # the sourcing operator and an optional bracket
            '(?:gcm|get-command)', ')?',    # which would belong to a get-command call
            '([\'"])([-a-z]{1,100})\\2'     # the quoted command name
            '(?(1)\\s{0,5}\\)|)',           # the closing bracket of the get-command call
        ])
        data = re.sub(sourced_command, '\\3', data, flags=re.IGNORECASE)

        invocation = R'''(\w{1,200})\.Invoke\s*\('''
        data = re.sub(invocation, direct_call, data, flags=re.IGNORECASE)
        return data
class deob_ps1_secstr (*a)
-
This unit is implemented in
refinery.units.obfuscation.ps1.securestring
and has the following commandline Interface:usage: deob-ps1-secstr [-h] [-L] [-Q] [-0] [-v] [a [a ...]] positional arguments: a generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_ps1_secstr(Deobfuscator):
    # Replaces `'<base64>' | ConvertTo-SecureString -Key <key>` constructs by the text obtained
    # from running the embedded blob through the secstr unit with the embedded key.

    def __init__(self, *a, **kw):
        super().__init__(*a, **kw)
        self._pack = pack()
        self._secstr = secstr()
        # groups: 1 quote, 2 base64 blob, 3 optional quote around the cmdlet name,
        # 4 optional opening bracket of the key, 5 key (integer array or range a..b),
        # 6 run of closing brackets following the key
        self._pattern = re.compile(
            R'\s{{0,20}}'.join([
                R'''(['"])({b})\1''',
                R'\|',
                R'\.?',
                R'&?',
                R'''(['"]?)ConvertTo-SecureString\3''',
                R'-ke?y?',
                R'''(\(?)({a}|{i}\s{{0,20}}\.\.\s{{0,20}}{i})''',
                R'((?:\)\s{{0,20}}){{0,10}})?'
            ]).format(
                b=formats.b64,
                a=formats.intarray,
                i=formats.integer
            ), flags=re.IGNORECASE | re.DOTALL
        )

    def _decrypt_block(self, data, match):
        """
        Decode the secure string described by `match` and return a pair `(start, result)`:
        `result` is the replacement text and `start` is the offset in `data` from which the
        replacement should begin. `start` lies before `match.start()` when the match consumed
        closing brackets whose opening counterparts precede it.
        """
        if '..' in match[5]:
            # range syntax a..b; a descending range produces a descending key
            a, b = [int(x.strip(), 0) for x in match[5].split('..')]
            key = range(min(a, b), max(a, b) + 1)
            if a > b:
                key = reversed(key)
            self._secstr.args.key = bytes(bytearray(key))
        else:
            self._secstr.args.key = self._pack(match[5].encode(self.codec))
        decoded = self._secstr(match[2].encode(self.codec))
        decoded = decoded.decode(self.codec)
        result = F'\n\n{decoded}\n\n'
        brackets = match[6].count(')')
        start = match.start()
        if match[4]:
            # one closing bracket belongs to the bracket that opened the key
            brackets -= 1
        if brackets <= 0:
            if brackets < 0:
                result += ')'
            return start, result
        origin = start
        surplus = brackets
        while brackets:
            if start <= 0:
                # The opening brackets cannot be found before the match. Scanning further
                # would index from the end of the data and either raise IndexError or yield
                # a bogus offset, so only the match itself is replaced and the closing
                # brackets that it consumed are preserved.
                return origin, result + ')' * surplus
            start -= 1
            if data[start] == '(':
                brackets -= 1
            if data[start] == ')':
                brackets += 1
        return start, result

    def deobfuscate(self, data):
        """
        Repeatedly replace the first remaining secure string construct until none is left.
        """
        while True:
            match = self._pattern.search(data)
            if not match:
                break
            start, result = self._decrypt_block(data, match)
            data = data[:start] + result + data[match.end():]
        return data
class deob_ps1_stringreplace
-
This unit is implemented in
refinery.units.obfuscation.ps1.stringreplace
and has the following commandline Interface:usage: deob-ps1-stringreplace [-h] [-L] [-Q] [-0] [-v] generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_ps1_stringreplace(Deobfuscator):
    # Evaluates PowerShell string replacements where haystack, needle and insert are all
    # string literals, i.e. 'text'.Replace('a','b') and 'text' -replace 'a','b' with its
    # -creplace and -ireplace variants.

    _SENTINEL = re.compile((
        R'(?i)[\'"]\s*'                   # end of haystack string
        R'(-c|-i|-|\.)replace'            # the replace call
        R'([\(\s]*)({s})([\)\s]*),'       # needle (with brackets)
        R'([\(\s]*)({s})([\)\s]*)'        # insert (with brackets)
    ).format(s=formats.ps1str), flags=re.IGNORECASE)

    def deobfuscate(self, data):
        """
        Perform one replacement at a time and rescan afterwards; the string literal index is
        patched in place after each substitution so that it stays aligned with `data`.
        """
        repeat = True
        strlit = Ps1StringLiterals(data)

        while repeat:
            repeat = False

            for match in self._SENTINEL.finditer(data):
                k = strlit.get_container(match.start())
                if k is None:
                    continue
                offset, end = strlit.ranges[k]
                # the match has to begin on the closing quote of the haystack literal
                if match.start() != end - 1:
                    continue
                string = data[offset:end]
                pf, bl1, needle, bl2, br1, insert, br2 = match.groups()
                end = match.end()
                # The .Replace method and the -creplace operator are case-sensitive while
                # -replace and -ireplace are not. The prefix is matched case-insensitively
                # and always begins with '-' for the operators, so the whole prefix has to
                # be compared rather than only its first character.
                case = '' if pf.lower() in ('.', '-c') else '(?i)'
                bl = bl1.count('(') - bl2.count(')')
                br = br2.count(')') - br1.count('(')
                if pf[0] == '.':
                    # the method call syntax contributes one pair of brackets by itself
                    bl -= 1
                    br -= 1
                if bl != 0 or br < 0:
                    continue
                needle = list(string_unquote(needle))
                if len(needle) > 1:
                    continue
                needle = needle[0]
                head, *body = string_unquote(insert)
                self.log_info('replacing', needle, 'by', insert)
                if not body:
                    def perform_replacement(string):
                        return re.sub(F'{case}{re.escape(needle)}', lambda _: head, string)
                else:
                    # the insert unquotes to several parts: head, body parts, and tail
                    *body, tail = body
                    def perform_replacement(string):  # noqa
                        parts = re.split(F'{case}{re.escape(needle)}', string)
                        if len(parts) == 1:
                            yield string
                            return
                        it = iter(parts)
                        yield next(it) + head
                        yield from body
                        for last, part in lookahead(it):
                            if last:
                                yield tail + part
                            else:
                                yield tail + part + head
                                yield from body
                # surplus closing brackets consumed by the match are restored
                replaced = string_apply(string, perform_replacement) + (br * ')')
                strlit.ranges[k] = offset, offset + len(replaced) - br
                # needle and insert literals no longer exist in the data
                strlit.ranges[k + 1: k + 3] = []
                strlit.shift(len(replaced) + offset - end, k + 1)
                data = data[:offset] + replaced + data[end:]
                repeat = True
                break

        return data
class deob_ps1_typecast
-
This unit is implemented in
refinery.units.obfuscation.ps1.typecast
and has the following commandline Interface:usage: deob-ps1-typecast [-h] [-L] [-Q] [-0] [-v] Replaces sequences like [Char]120 to their string representation, in this case the string "x". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_ps1_typecast(Deobfuscator):
    """
    Replaces sequences like [Char]120 to their string representation, in this case the string "x".
    """

    def deobfuscate(self, data):
        """
        Three passes: drop [string] and [char[]] casts in front of string literals, convert
        [char]N to a one-character string literal, and convert [char[]](a,b,...) to a quoted
        string when all values decode to printable ASCII.
        """
        strlit = Ps1StringLiterals(data)

        @strlit.outside
        def strip_typecast(m): return m[1]

        data = re.sub(
            FR'\[(?:string|char\[\])\]\s*({formats.ps1str!s})',
            strip_typecast,
            data,
            flags=re.IGNORECASE
        )

        @strlit.outside
        def char_literal(match):
            try:
                c = chr(int(match[1].lower(), 0))
            except (ValueError, OverflowError):
                # not a valid code point; leave the expression as it is
                return match[0]
            if c == "'":
                return '''"'"'''
            return F"'{c}'"

        data = re.sub(
            R'\[char\]\s*0*(0x[0-9a-f]+|\d+)',
            char_literal,
            data,
            flags=re.IGNORECASE
        )

        def char_array(match):
            try:
                # The integer conversion is part of the guarded section: values above 255
                # and decimal numbers with leading zeros raise ValueError here.
                result = bytes(int(x, 0) for x in match[1].split(','))
                result = result.decode('ascii')
                if not all(x in string.printable or x.isspace() for x in result):
                    raise ValueError
            except (ValueError, OverflowError):
                return match[0]
            else:
                return string_quote(result)

        data = re.sub(
            R'\s*'.join([
                R'\[char\[\]\]',
                R'\((',
                R'(?:\s*(?:0x[0-9a-f]+|\d+)\s*,)+',
                R'(?:0x[0-9a-f]+|\d+)',
                R')\)'
            ]),
            char_array,
            data,
            flags=re.IGNORECASE
        )

        return data
class deob_ps1_uncurly
-
This unit is implemented in
refinery.units.obfuscation.ps1.uncurly
and has the following commandline Interface:usage: deob-ps1-uncurly [-h] [-L] [-Q] [-0] [-v] PowerShell deobfuscation that removes superfluous curly braces around variable names that do not require it, i.e. ${variable} is transformed to just $variable. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_ps1_uncurly(Deobfuscator):
    """
    PowerShell deobfuscation that removes superfluous curly braces around variable names that do
    not require it, i.e. `${variable}` is transformed to just `$variable`.
    """

    _SENTINEL = re.compile(R'\$\{(\w+)\}')

    def deobfuscate(self, data):
        """
        Substitute every match of the sentinel pattern through a callback that is wrapped
        with the `outside` decorator of the string literal index.
        """
        literals = Ps1StringLiterals(data)

        @literals.outside
        def unwrap(match):
            name = match[1]
            return '$' + name

        return self._SENTINEL.sub(unwrap, data)
class deob_vba (timeout=100)
-
This unit is implemented in
refinery.units.obfuscation.vba.all
and has the following commandline Interface:usage: deob-vba [-h] [-L] [-Q] [-0] [-v] [-t TIMEOUT] optional arguments: -t, --timeout TIMEOUT Maximum number of iterations; the default is 100. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_vba(IterativeDeobfuscator):
    # Runs all VBA deobfuscation sub-units once, in the order given by _SUBUNITS.

    _SUBUNITS: List[Type[Deobfuscator]] = [
        deob_vba_comments,
        deob_vba_brackets,
        deob_vba_char_function,
        deob_vba_concat,
        deob_vba_arithmetic,
        deob_vba_constants,
        deob_vba_dummy_variables,
        deob_vba_stringreplace,
        deob_vba_stringreverse,
    ]

    def deobfuscate(self, data):
        """
        Instantiate every sub-unit, give it this unit's log level, and pass the data through
        each of them. Runs of line breaks in the result are collapsed to a single newline.
        """
        pipeline = [factory() for factory in self._SUBUNITS]
        for stage in pipeline:
            stage.log_level = self.log_level
        for unit in pipeline:
            self.log_debug(lambda: F'invoking {unit.name}')
            before = hash(data)
            data = unit.deobfuscate(data)
            if before == hash(data):
                continue
            # the info message is only emitted when log_debug returns a false value
            if not self.log_debug('data has changed.'):
                self.log_info(F'used {unit.name}')
        collapsed = re.sub(R'[\r\n]+', '\n', data)
        return collapsed
class deob_vba_arithmetic
-
This unit is implemented in
refinery.units.obfuscation.vba.arithmetic
and has the following commandline Interface:usage: deob-vba-arithmetic [-h] [-L] [-Q] [-0] [-v] generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_vba_arithmetic(Deobfuscator):
    # Evaluates constant arithmetic expressions that occur outside of string literals.

    def deobfuscate(self, data):
        """
        Find runs of integers, floats, operators and brackets and replace each one by the
        result of `_cautious_vba_eval` when it can be evaluated. Parts that cannot be
        evaluated are emitted unchanged.
        """
        strings = StringLiterals(formats.vbastr, data)

        def vba_int_eval(match: re.Match[str]) -> str:
            # converts the VBA literals &H.., &B.. and &O.. to decimal notation
            s = match[0].lower()
            if not s.startswith('&'):
                return s
            t, s = s[1], s[2:].rstrip('&')
            if t == 'h':
                return str(int(s, 16))
            if t == 'b':
                return str(int(s, 2))
            if t == 'o':
                return str(int(s, 8))
            # unknown radix prefix: keep the literal, a None result would make re.sub fail
            return match[0]

        @strings.outside
        def evaluate(match: re.Match[str]):
            expression = match[0]
            expression = expression.strip()
            if not any(c.isdigit() for c in expression):
                return expression
            expression = re.sub(str(formats.vbaint), vba_int_eval, expression)
            brackets = 0
            positions = []
            ok = True
            # head: text up to the last unclosed opening bracket
            # rest: text after a leading bracketed group
            # tail: text from an unmatched closing bracket onwards
            head = tail = rest = ''
            for end, character in enumerate(expression):
                if character == '(':
                    brackets += 1
                    positions.append(end)
                    continue
                if character == ')':
                    brackets -= 1
                    if brackets < 0:
                        expression, tail = expression[:end], expression[end:]
                        break
                    else:
                        positions.pop()
                    if brackets == 0 and expression[0] == '(':
                        expression, rest = expression[:end + 1], expression[end + 1:]
                        break
            if expression.isdigit():
                return match[0]
            if brackets > 0:
                pos = positions[~0] + 1
                head = expression[:pos]
                expression = expression[pos:]
            try:
                # first attempt: the bracketed group together with what follows it
                result = str(_cautious_vba_eval(expression + rest))
            except Exception:
                ok = False
            else:
                rest = ''
            if not ok and rest:
                try:
                    # second attempt: only the bracketed group
                    result = str(_cautious_vba_eval(expression))
                except Exception:
                    # Nothing could be evaluated. The remainder is merged back into the
                    # expression and cleared, otherwise it would be emitted a second time
                    # by the return statement below.
                    expression += rest
                    rest = ''
                else:
                    ok = True
            if not ok:
                result = expression
                self.log_info(F'error trying to parse arithmetic expression at offset {match.start()}: ({expression})')
            else:
                if expression.startswith('(') and expression.endswith(')'):
                    result = F'({result})'
            if tail:
                tail = self.deobfuscate(tail)
            return F'{head}{result}{rest}{tail}'

        pattern = re.compile(R'(?:{i}|{f}|[-+(])(?:[^\S\r\n]{{0,20}}(?:{i}|{f}|[-%|&~<>()+/*^]))+'.format(
            i=str(formats.vbaint), f=str(formats.float)))
        return pattern.sub(evaluate, data)
class deob_vba_brackets
-
This unit is implemented in
refinery.units.obfuscation.vba.brackets
and has the following commandline Interface:usage: deob-vba-brackets [-h] [-L] [-Q] [-0] [-v] generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_vba_brackets(Deobfuscator):
    # Strips brackets that enclose nothing but a single integer, string or float literal.

    _SENTINEL = re.compile(
        RF'''(?<![\w"']{{2}})'''  # this may be a function call
        RF'''\(\s*({formats.vbaint}|{formats.vbastr}|{formats.float})\s*(\S)''',
        flags=re.IGNORECASE
    )

    def deobfuscate(self, data):
        """
        Apply the sentinel substitution until a pass no longer unwraps anything.
        """
        literals = StringLiterals(formats.vbastr, data)
        changed = True

        @literals.outside
        def unwrap(match):
            nonlocal changed
            if match[2] != ')':
                # the literal is not directly followed by the closing bracket
                return None
            changed = True
            return match[1]

        while changed:
            changed = False
            data = self._SENTINEL.sub(unwrap, data)

        return data
class deob_vba_char_function
-
This unit is implemented in
refinery.units.obfuscation.vba.char
and has the following commandline Interface:usage: deob-vba-char-function [-h] [-L] [-Q] [-0] [-v] generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_vba_char_function(Deobfuscator):
    # Replaces Chr(N) and ChrW(N) calls with a decimal argument by a string literal.

    def deobfuscate(self, data):
        """
        Calls whose argument is not a valid code point, or whose character has no
        single-character `repr`, are left unchanged.
        """
        strings = StringLiterals(formats.vbastr, data)

        @strings.outside
        def evaluate_char_function(match: re.Match[str]):
            try:
                c = chr(int(match[1]))
            except (ValueError, OverflowError):
                # chr raises ValueError above 0x10FFFF and OverflowError for integers that
                # do not fit into a C int; both mean the call cannot be resolved
                return match[0]
            if c == '"':
                # a double quote is escaped by doubling it
                return '""""'
            if c == '\\':
                return '"\\"'
            c = repr(c)[1:-1]
            if len(c) > 1:
                # repr needed an escape sequence, e.g. for control characters
                return match[0]
            return '"{}"'.format(c)

        return re.sub(R'(?i)\bchrw?\s*\(\s*(\d+)\s*\)', evaluate_char_function, data)
class deob_vba_chr_literals
-
This unit is implemented in
refinery.units.obfuscation.vba.vba
and has the following commandline Interface:usage: deob-vba-chr-literals [-h] [-L] [-Q] [-0] [-v] generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_vba_chr_literals(Unit):
    # Replaces Chr(N) calls by string literals and merges adjacent literals joined by `&`.

    def process(self, data):
        """
        Operates on binary data. Character codes may be decimal, with or without leading
        zeros, or use a 0x prefix. Calls with an invalid code point are left unchanged.
        """
        def _chr(m):
            text = bytes(m[1]).lower()
            # an explicit base avoids the ValueError that base 0 raises for leading zeros
            base = 16 if text.startswith(b'0x') else 10
            try:
                code = int(text, base)
                char = chr(code)
            except (ValueError, OverflowError):
                return m[0]
            if code == 34:
                # a double quote is escaped by doubling it
                return B'""""'
            return B'"%s"' % char.encode('unicode_escape')

        # the previous pattern \d+x?\d+ required at least two digits and missed e.g. Chr(9)
        data = re.sub(BR'Chr\((0x[0-9a-f]+|\d+)\)', _chr, data, flags=re.IGNORECASE)
        data = re.sub(BR'"\s*\&\s*"', B'', data)
        return data
class deob_vba_comments
-
This unit is implemented in
refinery.units.obfuscation.vba.comments
and has the following commandline Interface:usage: deob-vba-comments [-h] [-L] [-Q] [-0] [-v] generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_vba_comments(Deobfuscator):
    # Deletes entire lines that begin, after at most 20 whitespace characters, with an
    # apostrophe or with one of the keywords `rem` and `dim`.

    def deobfuscate(self, data):
        """
        Return `data` with all matching lines removed, including their line terminator.
        """
        removable = re.compile(R"(?im)^\s{0,20}(?:'|rem\b|dim\b).*(?:\Z|$\n\r?)")
        return removable.sub('', data)
class deob_vba_concat (timeout=100)
-
This unit is implemented in
refinery.units.obfuscation.vba.concat
and has the following commandline Interface:usage: deob-vba-concat [-h] [-L] [-Q] [-0] [-v] [-t TIMEOUT] optional arguments: -t, --timeout TIMEOUT Maximum number of iterations; the default is 100. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_vba_concat(IterativeDeobfuscator):
    # Merges two adjacent VBA string literals that are joined by `&` or by one or more `+`.

    _SENTINEL = re.compile(R'''"\s*(\++|&)\s*"''')

    def deobfuscate(self, data):
        """
        Return `data` with concatenations of adjacent string literals merged into one literal.
        """

        def concat(data):
            # Generator that yields the output piecewise. After each merge the already
            # emitted prefix is cut off and the string literal index is recomputed for the
            # remaining data, which begins at the end of the second literal.
            strlit = StringLiterals(formats.vbastr, data)
            repeat = True
            while repeat:
                for match in self._SENTINEL.finditer(data):
                    a, b = match.span()
                    # both quotes of the match have to belong to consecutive literals
                    a = strlit.get_container(a)
                    if a is None:
                        continue
                    b = strlit.get_container(b)
                    if b is None or b != a + 1:
                        continue
                    _, a = strlit.ranges[a]
                    b, c = strlit.ranges[b]
                    # drop the last character of the first literal and the first character
                    # of the second one, joining their contents
                    yield data[:a - 1] + data[b + 1:c]
                    data = data[c:]
                    strlit.update(data)
                    break
                else:
                    # no further merge candidate was found
                    repeat = False
            yield data

        return ''.join(concat(data))
class deob_vba_constants
-
This unit is implemented in
refinery.units.obfuscation.vba.constants
and has the following commandline Interface:usage: deob-vba-constants [-h] [-L] [-Q] [-0] [-v] generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_vba_constants(Deobfuscator):
    # Inlines names that are only ever assigned one single integer or string literal.

    def deobfuscate(self, data):
        """
        Collect assignments of the form `name = literal` (optionally prefixed with `const`),
        discard names that are assigned different values or that name a sub or function,
        remove the assignment lines of the remaining names, and substitute their value for
        every occurrence of the name.
        """
        codelines = data.splitlines(keepends=True)
        constants = {}
        # maps each constant to ALL lines that assign it; a repeated identical assignment
        # would otherwise stay in the output and be rewritten to `value = value`
        constline = {}
        variables = set()
        definition = re.compile(R'(?im)^\s*(?:sub|function)\s*(\w+)')
        assignment = re.compile(
            R'(?im)^(?:\s*const)?\s*(\w+)\s*=\s*({i}|{s})\s*(?:\'|rem|$)'.format(
                s=formats.ps1str,
                i=formats.integer
            ))
        for k, line in enumerate(codelines):
            match = definition.match(line)
            if match:
                variables.add(match[1])
                continue
            match = assignment.match(line)
            if match is None:
                continue
            name, value = match[1], match[2]
            if name in variables:
                continue
            if value != constants.get(name, value):
                # assigned a second, different value: not a constant
                self.log_debug(F'del {name}')
                del constants[name]
                del constline[name]
                variables.add(name)
            else:
                self.log_debug(F'add {name} = {value}')
                constants[name] = value
                constline.setdefault(name, []).append(k)
        removed = {k for rows in constline.values() for k in rows}
        codelines = [line for k, line in enumerate(codelines) if k not in removed]
        data = ''.join(codelines)
        for name, value in constants.items():
            # the default argument binds the current value; a lambda replacement prevents
            # backslashes in the value from being interpreted as escapes
            data = re.sub(RF'\b{re.escape(name)!s}\b', lambda _, value=value: value, data)
        return data
class deob_vba_dummy_variables
-
This unit is implemented in
refinery.units.obfuscation.vba.dummies
and has the following commandline Interface:usage: deob-vba-dummy-variables [-h] [-L] [-Q] [-0] [-v] generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_vba_dummy_variables(Deobfuscator):
    # Removes assignment lines for names that no line of the input appears to use.

    def deobfuscate(self, data):
        """
        Collect candidate assignments, discard every candidate name for which some line passes
        the `might_be_used_in` heuristic, and return the input without the assignment lines of
        the remaining names. Lines are joined with a single newline in the output.
        """
        lines = data.splitlines(keepends=False)
        # maps a candidate name to the indices of the lines that assign it
        names = collections.defaultdict(list)

        def might_be_used_in(name, line):
            # avoid finding the name within a string literal
            line = '""'.join(re.split(str(formats.ps1str), line))
            line = re.split(RF'\b{name}\b', line)
            try:
                # unpacking succeeds only if the name occurs exactly once in the line
                L, R = line
            except ValueError:
                # NOTE(review): a line that mentions the name two or more times also ends up
                # here and is treated as not using the name - confirm that this is intended
                return False
            L = L.strip().lower()
            if L.startswith("'") or L.startswith('rem'):
                # the line is a comment
                return False
            R = R.strip().lower()
            if R.startswith('=') and 'if' not in L:
                # the name is the target of an assignment
                return False
            if L.startswith('dim'):
                # the name is only declared
                return False
            return True

        pattern = re.compile(
            R'(?i)^\s{0,8}(?:const\s{1,8})?(\w+)\s{1,8}=\s{1,8}.*$'
        )

        for k, line in enumerate(lines):
            try:
                # subscripting None raises TypeError when the line is not an assignment
                name = pattern.match(line)[1]
            except (AttributeError, TypeError):
                continue
            if re.search(r'\w+\(', line):
                # might be a function call
                continue
            names[name].append(k)

        for line in lines:
            # the dictionary must not change during iteration, hence the scan is restarted
            # after each deletion
            while True:
                for name in names:
                    if might_be_used_in(name, line):
                        del names[name]
                        break
                else:
                    break

        return '\n'.join(line for k, line in enumerate(lines) if not any(
            k in rows for rows in names.values()))
class deob_vba_stringreplace
-
This unit is implemented in
refinery.units.obfuscation.vba.stringreplace
and has the following commandline Interface:usage: deob-vba-stringreplace [-h] [-L] [-Q] [-0] [-v] generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_vba_stringreplace(Deobfuscator):
    # Evaluates Replace(haystack, needle, insert) calls whose arguments are string literals.

    _SENTINEL = re.compile((
        R'(?i)\bReplace\s*\('  # the replace call
        R'\s*({s}),'           # haystack (with brackets)
        R'\s*({s}),'           # needle (with brackets)
        R'\s*({s})\s*\)'       # insert (with brackets)
    ).format(s=formats.vbastr), flags=re.IGNORECASE)

    def deobfuscate(self, data):
        """
        Substitute each matching call by the quoted result of the replacement.
        """
        literals = StringLiterals(formats.vbastr, data)

        @literals.outside
        def evaluate(match: re.Match[str]):
            haystack = string_unquote(match[1])
            needle = string_unquote(match[2])
            insert = string_unquote(match[3])
            replaced = haystack.replace(needle, insert)
            return string_quote(replaced)

        return self._SENTINEL.sub(evaluate, data)
class deob_vba_stringreverse
-
This unit is implemented in
refinery.units.obfuscation.vba.stringreverse
and has the following commandline Interface:usage: deob-vba-stringreverse [-h] [-L] [-Q] [-0] [-v] generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deob_vba_stringreverse(Deobfuscator):
    # Evaluates StrReverse(...) calls whose argument is a string literal.

    _SENTINEL = re.compile((
        R'(?i)\bStrReverse\s*\('  # the reverse call
        R'\s*({s})\s*\)'          # string
    ).format(s=formats.vbastr), flags=re.IGNORECASE)

    def deobfuscate(self, data):
        """
        Substitute each matching call by the quoted, reversed content of its argument.
        """
        literals = StringLiterals(formats.vbastr, data)

        @literals.outside
        def evaluate(match: re.Match[str]):
            content = string_unquote(match[1])
            flipped = ''.join(reversed(content))
            return string_quote(flipped)

        return self._SENTINEL.sub(evaluate, data)
class des (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)
-
This unit is implemented in
refinery.units.crypto.cipher.des
and has the following commandline Interface:usage: des [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-M N] key DES encryption and decryption. positional arguments: key The encryption key. optional arguments: -i, --iv IV Specifies the initialization vector. If none is specified, then a block of zero bytes is used. -p, --padding P Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does nothing. By default, all other algorithms are attempted. In most cases, the data was not correctly decrypted if none of these work. -m, --mode M Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, EAX, ECB, OFB. By default, the CBC mode is used when an IV is provided, and ECB otherwise. -r, --raw Set the padding to raw; ignored when a padding is specified. -e, --little-endian Only for CTR: Use a little endian counter instead of the default big endian. -S, --segment-size N Only for CFB: Number of bits into which data is segmented. It must be a multiple of 8. The default of 0 means that the block size will be used as the segment size. -M, --mac-len N Only for EAX, GCM, OCB, and CCM: Length of the authentication tag, in bytes. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class des(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(DES)):
    """
    DES encryption and decryption.
    """
    # The class body is intentionally empty: the cipher is supplied through the `cipher`
    # class keyword argument.
class des3 (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)
-
This unit is implemented in
refinery.units.crypto.cipher.des3
and has the following commandline Interface:usage: des3 [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-M N] key 3-DES encryption and decryption. positional arguments: key The encryption key. optional arguments: -i, --iv IV Specifies the initialization vector. If none is specified, then a block of zero bytes is used. -p, --padding P Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does nothing. By default, all other algorithms are attempted. In most cases, the data was not correctly decrypted if none of these work. -m, --mode M Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, EAX, ECB, OFB. By default, the CBC mode is used when an IV is provided, and ECB otherwise. -r, --raw Set the padding to raw; ignored when a padding is specified. -e, --little-endian Only for CTR: Use a little endian counter instead of the default big endian. -S, --segment-size N Only for CFB: Number of bits into which data is segmented. It must be a multiple of 8. The default of 0 means that the block size will be used as the segment size. -M, --mac-len N Only for EAX, GCM, OCB, and CCM: Length of the authentication tag, in bytes. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class des3(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(DES3)):
    """
    3-DES encryption and decryption.
    """
    # The class body is intentionally empty: the cipher is supplied through the `cipher`
    # class keyword argument.
class deskd (size=8)
-
This unit is implemented in
refinery.units.crypto.keyderive.deskd
and has the following commandline Interface:usage: deskd [-h] [-L] [-Q] [-0] [-v] [size] Stands for "DES Key Derivation". It implements the same functionality as DES_string_to_key in OpenSSL. It converts a string to an 8 byte DES key with odd byte parity, per FIPS specification. This is not a modern key derivation function. positional arguments: size The number of bytes to generate, default is the maximum of 8. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class deskd(KeyDerivation):
    """
    Stands for "DES Key Derivation". It implements the same functionality as `DES_string_to_key`
    in OpenSSL. It converts a string to an 8 byte DES key with odd byte parity, per FIPS
    specification. This is not a modern key derivation function.
    """
    def __init__(self, size: Arg(help='The number of bytes to generate, default is the maximum of 8.') = 8):
        super().__init__(size=size, salt=None)

    def process(self, password):
        """
        Derive the key from `password` and return its first `size` bytes. When more than 8
        bytes are requested, a `RefineryPartialResult` carrying the 8-byte key is raised.
        """
        from Cryptodome.Cipher import DES
        from Cryptodome.Util.strxor import strxor
        key = bytearray(8)
        # fold the password into the key, alternating direction every 8 bytes
        for i, j in enumerate(password):
            if ((i % 16) < 8):
                # forward pass: the byte is shifted left by one bit
                key[i % 8] ^= (j << 1) & 0xFF
            else:
                # backward pass: the three swaps (nibbles, bit pairs, adjacent bits) reverse
                # the bit order of the byte, and the key is filled from its end
                j = (((j << 4) & 0xf0) | ((j >> 4) & 0x0f))
                j = (((j << 2) & 0xcc) | ((j >> 2) & 0x33))
                j = (((j << 1) & 0xaa) | ((j >> 1) & 0x55))
                key[7 - (i % 8)] ^= j
        des_set_odd_parity(key)
        if password:
            n = len(password)
            # zero-pad the password to the next multiple of 8 bytes
            password = password.ljust(n + 7 - ((n - 1) % 8), b'\0')
            des = DES.new(key, DES.MODE_ECB)
            # chain the blocks: each block is XORed with the previous result and encrypted
            # under the folded key, which also serves as the initial chaining value
            for k in range(0, n, 8):
                key[:] = des.encrypt(strxor(password[k:k + 8], key))
            des_set_odd_parity(key)
        if self.args.size > 8:
            raise RefineryPartialResult('can provide at most 8 bytes.', partial=key)
        return key[:self.args.size]
class dexstr
-
This unit is implemented in
refinery.units.formats.dexstr
and has the following commandline Interface:usage: dexstr [-h] [-L] [-Q] [-0] [-v] Extract strings from DEX (Dalvik Executable) files. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class dexstr(Unit):
    """
    Extract strings from DEX (Dalvik Executable) files.
    """
    def process(self, data):
        """
        Parse `data` as a DEX file and emit each of its strings, encoded with the unit codec.
        """
        for entry in DexFile(data).read_strings():
            encoded = entry.encode(self.codec)
            yield encoded
class dnarrays
-
This unit is implemented in
refinery.units.formats.pe.dotnet.dnarrays
and has the following commandline Interface:usage: dnarrays [-h] [-L] [-Q] [-0] [-v] Extracts arrays of strings or integers that are encoded in the .NET binary as IL opcodes. The data is exported as JSON. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class dnarrays(Unit):
    """
    Extracts arrays of strings or integers that are encoded in the .NET binary as IL opcodes. The
    data is exported as JSON.
    """

    @staticmethod
    def _read_int(reader: StructReader):
        """
        Read one integer load instruction: opcodes 0x16 to 0x1E encode the values 0 to 8,
        0x1F is followed by a single byte and 0x20 by a 32-bit value. Any other opcode
        raises ValueError.
        """
        value = reader.read_byte() - 0x16
        if value < 0:
            raise ValueError
        elif value <= 8:
            return value
        elif value == 9:
            return reader.read_byte()
        elif value == 10:
            return reader.u32()
        else:
            raise ValueError

    @staticmethod
    def _read_str(reader: StructReader, header: DotNetHeader):
        """
        Read one string load instruction (opcode 0x72, 24-bit token, table byte 0x70) and
        return the referenced entry of the user string stream.
        """
        if reader.read_byte() != 0x72:
            raise ValueError
        token: int = reader.read_integer(24)
        value: str = header.meta.Streams.US[token]
        if reader.read_byte() != 0x70:
            raise ValueError
        return value

    _STACK_ARRAY_PATTERN_STR = re.compile(
        BR'''(?x)
        (?: [\x16-\x1E]|\x1F.|\x20.{4} )       # load array length
        (?: \x8D...\x01 )                      # newarr System.String
        (?:
            (?: \x25 )                         # dup
            (?: [\x16-\x1E]|\x1F.|\x20.{4} )   # load integer index
            (?: \x72...\x70 )                  # load the string
            (?: \xA2 )                         # stelem.ref
        ){4,}
        ''', flags=re.DOTALL)

    def _str_arrays(self, data: ByteStr, header: DotNetHeader, tables: NetMetaDataTables):
        """
        Yield pairs `(offset, strings)` for every string array initialization in `data` whose
        element type resolves to `String` and whose indices count up from zero without gaps.
        """
        for match in self._STACK_ARRAY_PATTERN_STR.finditer(data):
            reader = StructReader(match[0])
            result: list[str] = []
            size = self._read_int(reader)
            if reader.read_byte() != 0x8D:
                raise RuntimeError
            stt = reader.read_integer(24)
            if reader.read_byte() != 0x01:
                raise RuntimeError
            if stt < 1 or tables.TypeRef[stt - 1].TypeName != 'String':
                continue
            self.log_info(F'str array pattern at 0x{match.start():X}, size {size}')
            for k in range(size):
                if reader.read_byte() != 0x25:
                    raise RuntimeError
                if self._read_int(reader) != k:
                    # indices are not consecutive; discard this candidate
                    break
                result.append(self._read_str(reader, header))
                if reader.read_byte() != 0xA2:
                    raise RuntimeError
            else:
                yield match.start(), result

    _STACK_ARRAY_PATTERN_INT = re.compile(
        BR'''(?x)
        ( \x12.|\xFE\x0D.. )                   # load array variable
        (?: [\x16-\x1E]|\x1F.|\x20.{4} )       # push integer value
        (?: \x52 )                             # store value into array
        (?:
            (?: \1 )                           # load same array variable
            (?: [\x16-\x1E]|\x1F.|\x20.{4} )   # load integer index
            (?: \x58 )                         # add; compute offset
            (?: [\x16-\x1E]|\x1F.|\x20.{4} )   # push integer value
            (?: \x52 )                         # store value into array
        ){4,}
        ''', flags=re.DOTALL)

    def _int_arrays(self, data: ByteStr, header: DotNetHeader, tables: NetMetaDataTables):
        """
        Yield pairs `(offset, integers)` for every integer array that is filled element by
        element through the same array variable, with indices counting up from one.
        """
        for match in self._STACK_ARRAY_PATTERN_INT.finditer(data):
            self.log_info(F'int array pattern at 0x{match.start():X}')
            reader = StructReader(match[0])
            result: list[int] = []
            opc, = reader.peek(1)
            # size of the instruction that loads the array variable
            skip = {0x12: 2, 0xFE: 4}[opc]
            reader.seekrel(skip)
            for index in itertools.count(1):
                result.append(self._read_int(reader))
                # The opcode bytes are consumed with explicit checks, matching _str_arrays.
                # Reading inside an assert statement would be skipped entirely when Python
                # runs with -O, which would leave the reader at the wrong position.
                if reader.read_byte() != 0x52:
                    raise RuntimeError
                if reader.eof:
                    yield match.start(), result
                    break
                reader.seekrel(skip)
                if self._read_int(reader) != index:
                    self.log_info('index inconsistency; aborting')
                    break
                if reader.read_byte() != 0x58:
                    raise RuntimeError

    def process(self, data):
        """
        Collect all integer and string arrays, group them by the name of the method selected
        for their file offset, and return the result as JSON with keys v1, v2, ... per method.
        """
        @functools.lru_cache(maxsize=None)
        def method(offset: int):
            rva = header.pe.get_rva_from_offset(offset)
            # prefer methods that start at or before the address, then the closest one
            method = min(tables.MethodDef, key=lambda m: (m.RVA > rva, rva - m.RVA))
            return method.Name

        header = DotNetHeader(data)
        tables = header.meta.Streams.Tables
        arrays = dict(itertools.chain(
            self._int_arrays(data, header, tables),
            self._str_arrays(data, header, tables),
        ))
        result = collections.defaultdict(list)
        for offset in sorted(arrays):
            result[method(offset)].append(arrays[offset])
        result = {m: {F'v{k}': v for k, v in enumerate(t, 1)} for m, t in result.items()}
        return json.dumps(result, indent=4).encode(self.codec)
class dnblob
-
This unit is implemented in
refinery.units.formats.pe.dotnet.dnblob
and has the following commandline Interface:usage: dnblob [-h] [-L] [-Q] [-0] [-v] Extracts all blobs defined in the #Blob stream of .NET executables. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class dnblob(Unit):
    """
    Extracts all blobs defined in the `#Blob` stream of .NET executables.
    """
    def process(self, data):
        """
        Parse the .NET header without resources and emit every value of its blob stream.
        """
        parsed = DotNetHeader(data, parse_resources=False)
        yield from parsed.meta.Streams.Blob.values()
class dncfx
-
This unit is implemented in
refinery.units.formats.pe.dotnet.dncfx
and has the following commandline Interface:usage: dncfx [-h] [-L] [-Q] [-0] [-v] Extracts the encrypted strings from ConfuserX protected .NET executables. Each decrypted string is returned as a single output. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class dncfx(Unit):
    """
    Extracts the encrypted strings from ConfuserX protected .NET executables.
    Each decrypted string is returned as a single output.
    """
    # Opcode sequence of an inline array initialization. The %s placeholder is filled with the
    # (regex-escaped) low three bytes of the field index whose data the ldtoken instruction loads.
    _PATTERN_ARRAY_INIT = (
        BR'(\x1F.|\x20....)'      # load size of a chunk
        BR'\x8D.\x00\x00\x01'     # create a UInt32 array
        BR'\x25'                  # dup
        BR'\xD0%s\x04'            # ldtoken: RVA of array data
        BR'\x28.\x00\x00.'        # call to InitializeArray
    )

    def process(self, data):
        # For every RVA entry of the .NET metadata, search for code that initializes an array
        # from it, then brute-force nearby 32-bit integer constants as decryption seeds until
        # the decrypted buffer can be decompressed.
        header = DotNetHeader(data, parse_resources=False)
        decompressor = lzma()

        class IntegerAssignment:
            # Records the file offset and the little-endian value of one `\x20....` match.
            def __init__(self, match):
                self.offset = match.start()
                self.value, = struct.unpack('<I', match[1])

        def get_size(match):
            # Group 1 is either `\x1F` followed by a 1-byte operand or `\x20` followed by a
            # 4-byte operand; decode the trailing operand bytes accordingly.
            ins = match[1]
            fmt = '<B' if ins[0] == 0x1F else '<I'
            result, = struct.unpack(fmt, ins[-struct.calcsize(fmt):])
            return result

        # Every 5-byte sequence starting with 0x20 anywhere in the input is a seed candidate.
        potential_seeds = [
            IntegerAssignment(m)
            for m in re.finditer(br'\x20(....)', data, re.DOTALL)
        ]

        for entry in header.meta.RVAs:
            offset = header.pe.get_offset_from_rva(entry.RVA)
            index = struct.pack('<I', entry.Field.Index)
            strings_found = 0
            for match in re.finditer(self._PATTERN_ARRAY_INIT % re.escape(index[:3]), data, flags=re.DOTALL):
                ms = match.start()

                def sortkey(t):
                    # Candidates are ranked by distance to the start of the matched pattern.
                    # NOTE(review): the condition penalizes candidates located BEFORE the match
                    # start, while the original comment below speaks of assignments after the
                    # initialization - confirm which of the two is intended.
                    weight = abs(t.offset - ms)
                    if t.offset < ms:
                        # this weights assignments after the array initialization down, but still
                        # prefers them over assignments that are further away than 2kb
                        weight += 2000
                    return weight

                size = get_size(match)

                # Only sizes that are a multiple of 0x10 and at most 10000 are considered.
                if size % 0x10 or size > 10000:
                    continue

                self.log_debug(F'found RVA {entry.Field.Index} initialized with length {size}.')
                potential_seeds.sort(key=sortkey)

                for seed in potential_seeds[1:400]:
                    # the first potential_seed will always be the assignment of the size variable
                    ciphertext = data[offset:offset + size * 4]
                    # The key stream is the packed generator output followed by the ciphertext
                    # itself minus its final 0x40 bytes.
                    key = self._xs64star(seed.value)
                    key = chunks.pack(key, 4) + ciphertext[:-0x40]
                    decrypted = strxor(key, ciphertext)
                    try:
                        decompressed = decompressor(decrypted)
                    except Exception as e:
                        self.log_debug(
                            F'decompression failed for seed {seed.value:08X} at offset {seed.offset:08X}: {e}')
                        continue
                    else:
                        self.log_info(
                            F'decompression worked for seed {seed.value:08X} at offset {seed.offset:08X}.')
                    # Results shorter than 0x100 bytes are discarded.
                    if len(decompressed) < 0x100:
                        continue
                    for string in self._extract_strings(decompressed):
                        strings_found += 1
                        yield string
                    # Stop trying further seeds once more than 10 strings were produced.
                    if strings_found > 10:
                        break

    def _xs64star(self, state):
        # Generator producing 16 values; each round applies three shift-and-xor steps to the
        # state and yields its low 32 bits.
        for i in range(16):
            state ^= (state >> 12) & 0xFFFFFFFF
            state ^= (state << 25) & 0xFFFFFFFF
            state ^= (state >> 27) & 0xFFFFFFFF
            yield state & 0xFFFFFFFF

    def _extract_strings(self, blob):
        # Reads consecutive records consisting of a UInt32 size and a UTF8 string primitive of
        # that size (read with align=4) until the blob is exhausted or the parser hits EOF.
        # Empty strings are skipped; all others are yielded encoded with the unit's codec.
        reader = StreamReader(blob)
        while reader.tell() < len(blob):
            try:
                size = reader.expect(UInt32)
                string = reader.expect(StringPrimitive, size=size, codec='UTF8', align=4)
            except ParserEOF:
                return
            if string:
                yield string.encode(self.codec)
class dnds (dereference=True, encode=None, digest=None)
-
This unit is implemented in
refinery.units.formats.pe.dotnet.dnds
and has the following commandline Interface:usage: dnds [-h] [-L] [-Q] [-0] [-v] [-r] [-e UNIT | -d HASH] Stands for "DotNet DeSerialize": Expects data that has been serialized using the .NET class "BinaryFormatter". The output is a representation of the deserialized data in JSON format. optional arguments: -r, --keep-references Do not resolve Object references in serialized data. -e, --encode UNIT Select an encoder unit used to represent binary data in the JSON output. Available are: hex, esc, url, b64. -d, --digest HASH Select a hashing algorithm to digest binary data; instead of the data, only the hash will be displayed. The available algorithms are: md5, crc32, sha1, sha256, sha512. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class dnds(JSONEncoderUnit):
    """
    Stands for "DotNet DeSerialize": Expects data that has been serialized using the .NET class "BinaryFormatter".
    The output is a representation of the deserialized data in JSON format.
    """

    def __init__(
        self,
        dereference: Arg.Switch('-r', '--keep-references', off=True,
            help='Do not resolve Object references in serialized data.') = True,
        encode=None,
        digest=None
    ):
        super().__init__(encode=encode, digest=digest, dereference=dereference)

    def process(self, data):
        self.log_debug('initializing parser, will fail on malformed stream')
        # Parser errors are only ignored when the argument-less log_debug() call is falsy.
        parser = BinaryFormatterParser(
            data,
            keep_meta=True,
            dereference=self.args.dereference,
            ignore_errors=not self.log_debug(),
        )
        # One JSON object per record: its repr as the type and the record itself as the data.
        records = [dict(Type=repr(item), Data=item) for item in parser]
        return self.to_json(records)
class dnfields (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
This unit is implemented in
refinery.units.formats.pe.dotnet.dnfields
and has the following commandline Interface:usage: dnfields [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]] This unit can extract data from constant field variables in classes of .NET executables. Since the .NET header stores only the offset and not the size of constant fields, heuristics are used to search for opcode sequences that load the data and additional heuristics are used to guess the size of the data type. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class dnfields(PathExtractorUnit):
    """
    This unit can extract data from constant field variables in classes of .NET executables. Since the .NET header
    stores only the offset and not the size of constant fields, heuristics are used to search for opcode sequences
    that load the data and additional heuristics are used to guess the size of the data type.
    """
    # Maps (case-insensitive) regular expressions for element type names to the element size in bytes.
    _SIZEMAP = {
        '^s?byte$'       : 1,
        '^s?char$'       : 2,
        '^[us]?int.?16$' : 2,
        '^[us]?int.?32$' : 4,
        '^[us]?int.?64$' : 8,
    }

    def _guess_field_info(self, tables, data, t) -> FieldInfo:
        """
        Search the raw input for an array initialization that loads the field with index `t` and
        derive element type, element count, element size, and (when a stsfld follows) the name of
        the variable the array is stored to. Returns `None` implicitly when nothing matches.
        """
        pattern = (
            BR'(\x20....|\x1F.)'                # ldc.i4  count
            BR'\x8D(...)([\x01\x02])'           # newarr  col|row
            BR'\x25'                            # dup
            BR'\xD0\x%02x\x%02x\x%02x\x04'      # ldtoken t
            BR'(?:.{0,12}?'                     # ...
            BR'\x80(...)\x04)?' % (             # stsfld variable
                (t >> 0x00) & 0xFF,
                (t >> 0x08) & 0xFF,
                (t >> 0x10) & 0xFF
            )
        )
        for match in re.finditer(pattern, data, flags=re.DOTALL):
            count, j, r, name = match.groups()
            # Pad the count operand to 4 bytes and the 3-byte row index with a zero byte so that
            # both can be unpacked as little-endian 32-bit integers; r is the table number.
            count, j, r = struct.unpack('<LLB', B'%s%s\0%s' % (count[1:].ljust(4, B'\0'), j, r))
            if name:
                try:
                    name = struct.unpack('<L', B'%s\0' % name)
                    name = name[0]
                    name = tables[4][name - 1].Name
                except Exception as E:
                    self.log_info(F'attempt to parse field name failed: {E!s}')
                    name = None
            element = tables[r][j - 1]
            for type_pattern, size in self._SIZEMAP.items():
                if re.match(type_pattern, element.TypeName, flags=re.IGNORECASE):
                    return FieldInfo(element.TypeName, count, size, name)

    def unpack(self, data):
        """
        Yield one result per field that has an RVA entry and whose type could be guessed, followed
        by one result per remaining field that is assigned exactly one string constant.
        """
        header = DotNetHeader(data, parse_resources=False)
        tables = header.meta.Streams.Tables
        fields = tables.FieldRVA
        if not fields:
            return
        # Widths used to zero-pad field counters and RVAs in log messages and generated names.
        iwidth = len(str(len(fields)))
        rwidth = max(len(F'{field.RVA:X}') for field in fields)
        rwidth = max(rwidth, 4)
        remaining_field_indices = set(range(len(tables.Field)))

        for k, rv in enumerate(fields):
            _index = rv.Field.Index
            field = tables.Field[_index - 1]
            remaining_field_indices.discard(_index - 1)
            fname = field.Name
            ftype = None
            if len(field.Signature) == 2:
                # Crude signature parser for non-array case. Reference:
                # https://www.codeproject.com/Articles/42649/NET-File-Format-Signatures-Under-the-Hood-Part-1
                # https://www.codeproject.com/Articles/42655/NET-file-format-Signatures-under-the-hood-Part-2
                # Char occupies two bytes, in agreement with the entry for char in _SIZEMAP.
                guess = {
                    0x03: FieldInfo('Char',   1, 2, None),  # noqa
                    0x04: FieldInfo('SByte',  1, 1, None),  # noqa
                    0x05: FieldInfo('Byte',   1, 1, None),  # noqa
                    0x06: FieldInfo('Int16',  1, 2, None),  # noqa
                    0x07: FieldInfo('UInt16', 1, 2, None),  # noqa
                    0x08: FieldInfo('Int32',  1, 4, None),  # noqa
                    0x09: FieldInfo('UInt32', 1, 4, None),  # noqa
                    0x0A: FieldInfo('Int64',  1, 8, None),  # noqa
                    0x0B: FieldInfo('UInt64', 1, 8, None),  # noqa
                    0x0C: FieldInfo('Single', 1, 4, None),  # noqa
                    0x0D: FieldInfo('Double', 1, 8, None),  # noqa
                }.get(field.Signature[1], None)
            else:
                guess = self._guess_field_info(tables, data, _index)
            if guess is None:
                self.log_debug(lambda: F'field {k:0{iwidth}d} with signature {field.Signature.hex()}: unable to guess type information')
                continue
            totalsize = guess.count * guess.size
            if guess.name is not None:
                fname = guess.name
            if not fname.isprintable():
                # Fall back to a name derived from the RVA when the field name is not printable.
                fname = F'F{rv.RVA:0{rwidth}X}'
            ext = ftype = guess.type.lower()
            if guess.count > 1:
                ftype += F'[{guess.count}]'
            self.log_info(
                F'field {k:0{iwidth}d}; token 0x{_index:06X}; RVA 0x{rv.RVA:04X}; count {guess.count}; type {guess.type}; name {fname}')
            offset = header.pe.get_offset_from_rva(rv.RVA)
            # Offset and size are bound as lambda defaults so each result slices its own range.
            yield UnpackResult(
                F'{fname}.{ext}',
                lambda t=offset, s=totalsize: data[t:t + s],
                name=fname,
                type=ftype,
            )

        for _index in remaining_field_indices:
            field = tables.Field[_index]
            index = _index + 1
            name = field.Name
            if field.Flags.HasFieldRVA:
                self.log_warn(F'field {name} has RVA flag set, but no RVA was found')
            token = index.to_bytes(3, 'little')
            values = set()
            for match in re.finditer((
                BR'\x72(?P<token>...)\x70'          # ldstr
                BR'(?:\x6F(?P<function>...)\x0A)?'  # call GetBytes
                BR'\x80%s\x04'                      # stsfld
            ) % re.escape(token), data, re.DOTALL):
                md = match.groupdict()
                fn_token = md.get('function')
                fn_index = fn_token and int.from_bytes(fn_token, 'little') or None
                if fn_index is not None:
                    # The index taken from the token is 1-based like all other row indices used
                    # in this class, so it is shifted by one for the table lookup.
                    fn_name = tables.MemberRef[fn_index - 1].Name
                    if fn_name != 'GetBytes':
                        self.log_info(F'skipping string assignment passing through call to {fn_name}')
                        continue
                us_offset = int.from_bytes(md['token'], 'little')
                values.add(header.meta.Streams.US[us_offset].encode(self.codec))
            if not values:
                continue
            # A field is only reported when all assignments found agree on a single value.
            if len(values) == 1:
                yield UnpackResult(
                    F'{name}.str',
                    next(iter(values)),
                    name=name,
                    type='string'
                )
class dnhdr (resources=False, encode=None, digest=None)
-
This unit is implemented in
refinery.units.formats.pe.dotnet.dnhdr
and has the following commandline Interface:usage: dnhdr [-h] [-L] [-Q] [-0] [-v] [-r] [-e UNIT | -d HASH] Expects data that has been formatted with the BinaryFormatter class. The output is a representation of the deserialized data in JSON format. optional arguments: -r, --resources Also parse .NET resources. -e, --encode UNIT Select an encoder unit used to represent binary data in the JSON output. Available are: hex, esc, url, b64. -d, --digest HASH Select a hashing algorithm to digest binary data; instead of the data, only the hash will be displayed. The available algorithms are: md5, crc32, sha1, sha256, sha512. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class dnhdr(JSONEncoderUnit):
    """
    Parses the .NET header of the input and outputs its `Head` and `Meta` sections as JSON. When resource parsing
    is requested, the parsed .NET resources are included under the key `RSRC`.
    """

    def __init__(
        self,
        resources: Arg.Switch('-r', '--resources', help='Also parse .NET resources.') = False,
        encode=None,
        digest=None
    ):
        super().__init__(encode=encode, digest=digest, resources=resources)

    def process(self, data):
        header = DotNetHeader(data, parse_resources=self.args.resources)
        # The parsed header and the output dictionary are kept in separate variables: the
        # resources have to be read from the header object, not from the dictionary.
        result = {
            'Head': header.head,
            'Meta': header.meta,
        }
        if self.args.resources:
            result['RSRC'] = header.resources
        return self.to_json(result)
class dnmr (*paths, list=False, join_path=False, drop_path=False, exact=False, fuzzy=0, regex=False, path=b'name', raw=False)
-
This unit is implemented in
refinery.units.formats.pe.dotnet.dnmr
and has the following commandline Interface:usage: dnmr [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-e | -z] [-r] [-P NAME] [-w] [path [path ...]] Extracts subfiles from .NET managed resources. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -e, --exact Path patterns never match on substrings. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "name". -w, --raw Do not deserialize the managed resource entry data. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class dnmr(PathExtractorUnit):
    """
    Extracts subfiles from .NET managed resources.
    """

    def __init__(
        self, *paths, list=False, join_path=False, drop_path=False, exact=False, fuzzy=0, regex=False, path=b'name',
        raw: Arg.Switch('-w', help='Do not deserialize the managed resource entry data.') = False
    ):
        super().__init__(
            *paths,
            list=list,
            join_path=join_path,
            drop_path=drop_path,
            path=path,
            raw=raw,
            fuzzy=fuzzy,
            exact=exact,
            regex=regex,
        )

    def unpack(self, data):
        try:
            resources = NetStructuredResources(data)
        except NoManagedResource:
            resources = None
        if not resources:
            raise RefineryPartialResult('no managed resources found', partial=data)
        for item in resources:
            if item.Error:
                self.log_warn(F'entry {item.Name} carried error message: {item.Error}')
            # Start from the raw entry data; unless raw output was requested, prefer the
            # deserialized value when it is a string or a buffer.
            payload = item.Data
            if not self.args.raw:
                value = item.Value
                if isinstance(value, str):
                    payload = value.encode('utf-16le')
                elif isbuffer(value):
                    payload = value
            yield UnpackResult(item.Name, payload)
class dnrc (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
This unit is implemented in
refinery.units.formats.pe.dotnet.dnrc
and has the following commandline Interface:usage: dnrc [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]] Extracts all .NET resources whose name matches any of the given patterns and outputs them. Use the dnmr unit to extract subfiles from managed .NET resources. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class dnrc(PathExtractorUnit):
    """
    Extracts all .NET resources whose name matches any of the given patterns and outputs them. Use the
    `refinery.units.formats.pe.dotnet.dnmr` unit to extract subfiles from managed .NET resources.
    """

    def unpack(self, data):
        header = DotNetHeader(data)
        if header.resources:
            for item in header.resources:
                yield UnpackResult(item.Name, item.Data)
            return
        # Without resources, listing mode ends silently; extraction is reported as an error.
        if not self.args.list:
            raise ValueError('This file contains no resources.')
class dnsdomain (min=1, max=None, len=None, stripspace=False, duplicates=False, longest=False, take=None)
-
This unit is implemented in
refinery.units.pattern.dnsdomain
and has the following commandline Interface:usage: dnsdomain [-h] [-L] [-Q] [-0] [-v] [-n N] [-m N] [-e N] [-x] [-r] [-l] [-t N] Extracts domain names in the format as they appear in DNS requests. This can be used as a quick and dirty way to extract domains from PCAP files, for example. optional arguments: -n, --min N Matches must have length at least N. -m, --max N Matches must have length at most N. -e, --len N Matches must be of length N. -x, --stripspace Strip all whitespace from input data. -r, --duplicates Yield every (transformed) Match, even when it was found before. -l, --longest Sort results by length. -t, --take N Return only the first N occurrences in order of appearance. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class dnsdomain(PatternExtractorBase):
    """
    Extracts domain names in the format as they appear in DNS requests. This can be used as a quick and dirty way to
    extract domains from PCAP files, for example.
    """
    # Bytes that are accepted inside a label.
    _DOMAIN_CHARACTERS = (
        B'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        B'abcdefghijklmnopqrstuvwxyz'
        B'0123456789-_'
    )

    _DOMAIN_PATTERN = BR'(?:%s){1,20}(?:%s)\b' % (_lps(0xFF), _lps(25))

    def process(self, data):

        def transform(match):
            """
            Convert a sequence of length-prefixed labels into dotted notation. Returns `None` when
            a label runs past the end of the match or contains a byte outside the permitted set.
            """
            name = bytearray(match[0])
            total = len(name)
            pos = 0
            while pos < total:
                length = name[pos]
                # A label occupies its length byte plus `length` bytes of content, so it ends
                # (exclusively) at pos + 1 + length.
                end = pos + 1 + length
                if end > total:
                    return None
                # Validate every byte of the label, including the last one.
                if any(x not in self._DOMAIN_CHARACTERS for x in name[pos + 1:end]):
                    return None
                # Each length byte becomes a dot separator.
                name[pos] = 0x2E
                pos = end
            # Drop the dot that replaced the first length byte.
            return name[1:]

        yield from self.matches_filtered(memoryview(data), self._DOMAIN_PATTERN, transform)
class dnsfx (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
This unit is implemented in
refinery.units.formats.pe.dotnet.dnsfx
and has the following commandline Interface:usage: dnsfx [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]] Extracts files from .NET single file applications. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class dnsfx(PathExtractorUnit):
    """
    Extracts files from .NET single file applications.
    """
    _SIGNATURE = bytes([
        # 32 bytes represent the bundle signature: SHA-256 for '.net core bundle'
        0x8b, 0x12, 0x02, 0xb9, 0x6a, 0x61, 0x20, 0x38,
        0x72, 0x7b, 0x93, 0x02, 0x14, 0xd7, 0xa0, 0x32,
        0x13, 0xf5, 0xb9, 0xe6, 0xef, 0xae, 0x33, 0x18,
        0xee, 0x3b, 0x2d, 0xce, 0x24, 0xb3, 0x6a, 0xae
    ])

    def unpack(self, data):
        """
        Parse the bundle manifest and yield one result per file entry. Parsing stops with a
        warning when the manifest is truncated.
        """
        reader = StreamReader(data)
        reader.seek(self._find_bundle_manifest_offset(data))

        major_version = reader.expect(UInt32)
        minor_version = reader.expect(UInt32)
        self.log_info(F'version {major_version}.{minor_version}')

        count = reader.expect(UInt32)
        bhash = reader.expect(StringPrimitive)
        self.log_info(F'bundle {bhash} contains {count} files')

        if major_version >= 2:
            # These header fields are read to advance the reader; their values are not used.
            reader.expect(UInt64)  # depsOffset
            reader.expect(UInt64)  # depsSize
            reader.expect(UInt64)  # runtimeConfigOffset
            reader.expect(UInt64)  # runtimeConfigSize
            reader.expect(UInt64)  # flags

        for _ in range(count):
            try:
                offset = reader.expect(UInt64)
                size = reader.expect(UInt64)
                compressed_size = 0
                # The compressed size field is only read for major version 6 and above.
                if major_version >= 6:
                    compressed_size = reader.expect(UInt64)
                item_type = reader.expect(Byte)
                path = reader.expect(StringPrimitive)

                def _logmsg():
                    _log = F'read item at offset 0x{offset:08X}, type 0x{item_type:02X}, size {SizeInt(size)!r}'
                    if compressed_size:
                        return F'{_log}, compressed to size {SizeInt(compressed_size)!r}'
                    return F'{_log}, uncompressed'

                self.log_debug(_logmsg)

                # Read the item body inside a checkpoint so that the reader continues with the
                # next manifest entry afterwards.
                with reader.checkpoint():
                    reader.seek(offset)
                    if compressed_size:
                        item_data = reader.read(compressed_size) | zl | bytearray
                    else:
                        item_data = reader.read(size)

                yield UnpackResult(path, item_data)
            except ParserEOF:
                self.log_warn('unexpected EOF while parsing bundle, terminating')
                break

    def _find_bundle_manifest_offset(self, data: bytearray) -> int:
        """
        Return the manifest offset, stored as a little-endian integer in the 8 bytes that
        immediately precede the bundle signature.

        Raises:
            ValueError: if the signature is missing or is not preceded by 8 bytes.
        """
        bundle_sig_offset = data.find(self._SIGNATURE, 0)
        # Offsets below 8 cover both "not found" (-1) and a signature too close to the start of
        # the data; the latter would otherwise produce a negative slice start.
        if bundle_sig_offset < 8:
            raise ValueError('Cannot find valid Bundle Manifest offset. Is this a .NET Bundle?')
        return int.from_bytes(data[bundle_sig_offset - 8:bundle_sig_offset], 'little')

    @classmethod
    def handles(cls, data: bytearray):
        """
        Return whether the bundle signature occurs anywhere in the data.
        """
        return cls._SIGNATURE in data
class dnstr (user=True, meta=True)
-
This unit is implemented in
refinery.units.formats.pe.dotnet.dnstr
and has the following commandline Interface:usage: dnstr [-h] [-L] [-Q] [-0] [-v] [-m | -u] Extracts all strings defined in the #Strings and #US streams of .NET executables. optional arguments: -m, --meta Only extract from #Strings. -u, --user Only extract from #US. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class dnstr(Unit):
    """
    Extracts all strings defined in the `#Strings` and `#US` streams of .NET executables.
    """

    def __init__(
        self,
        user: Arg.Switch('-m', '--meta', off=True, group='HEAP', help='Only extract from #Strings.') = True,
        meta: Arg.Switch('-u', '--user', off=True, group='HEAP', help='Only extract from #US.') = True,
    ):
        # Each switch turns the OTHER heap off, so at least one of them has to remain enabled.
        if not (meta or user):
            raise ValueError('Either ascii or utf16 strings must be enabled.')
        super().__init__(meta=meta, user=user)

    def process(self, data):
        streams = DotNetHeader(data, parse_resources=False).meta.Streams
        # The #Strings heap is emitted first, followed by the #US heap.
        if self.args.meta:
            for entry in streams.Strings.values():
                yield entry.encode(self.codec)
        if self.args.user:
            for entry in streams.US.values():
                yield entry.encode(self.codec)
class doctxt
-
This unit is implemented in
refinery.units.formats.office.doctxt
and has the following commandline Interface:usage: doctxt [-h] [-L] [-Q] [-0] [-v] Extracts the text body from Word documents. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class doctxt(Unit):
    """
    Extracts the text body from Word documents.
    """

    @Unit.Requires('olefile', 'formats', 'office', 'extended')
    def _olefile():
        # Imported lazily inside the function rather than at module level.
        import olefile
        return olefile

    def process(self, data: bytearray):
        # Extractors are tried in order; the first one that does not raise provides the output.
        extractors: Dict[str, Callable[[bytearray], str]] = OrderedDict(
            doc=self._extract_ole,
            docx=self._extract_docx,
            odt=self._extract_odt,
        )
        if data.startswith(B'PK'):
            # Zip signature: try the legacy doc extractor last.
            self.log_debug('document contains zip file signature, likely a odt or docx file')
            extractors.move_to_end('doc')
            if 'opendocument' in str(data | xtzip('mimetype')):
                # The archive's mimetype entry mentions opendocument: try odt first.
                self.log_debug('odt signature detected')
                extractors.move_to_end('odt', last=False)
        for filetype, extractor in extractors.items():
            self.log_debug(F'trying to extract as {filetype}')
            try:
                result = extractor(data)
            except ImportError:
                # Import errors are never swallowed; they propagate to the caller.
                raise
            except Exception as error:
                self.log_info(F'failed extractring as {filetype}: {error!s}')
            else:
                return result.encode(self.codec)
        raise ValueError('All extractors failed, the input data is not recognized as any known document format.')

    def _extract_docx(self, data: Chunk) -> str:
        # Concatenates the text nodes of every paragraph in word/document.xml; paragraphs are
        # separated by a line break.
        NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
        PARAGRAPH = F'{NAMESPACE}p'
        TEXT = F'{NAMESPACE}t'
        chunk = data | xtzip('word/document.xml') | bytearray
        if not chunk:
            raise ValueError('No document.xml file found.')
        root: Element = XML(chunk)
        with StringIO() as output:
            for index, paragraph in enumerate(root.iter(PARAGRAPH)):
                if index > 0:
                    output.write('\n')
                for node in paragraph.iter(TEXT):
                    if node.text:
                        output.write(node.text)
            return output.getvalue()

    def _extract_odt(self, data: bytes):
        # Extracts the text of the office:text element inside office:body of content.xml.
        def _extract_text(node: Element):
            # Recursively collects the text of all child elements of the given node.
            NAMESPACE = '{urn:oasis:names:tc:opendocument:xmlns:text:1.0}'
            PARAGRAPH = F'{NAMESPACE}p'
            SPAN = F'{NAMESPACE}span'
            SPACE = F'{NAMESPACE}s'
            with StringIO() as res:
                for element in node:
                    tag = element.tag
                    text = element.text or ''
                    tail = element.tail or ''
                    if tag in [PARAGRAPH, SPAN]:
                        res.write(text)
                    elif tag == SPACE:
                        res.write(' ')
                    else:
                        self.log_debug(F'unknown tag: {tag}')
                    # Children are processed before the tail text of the element is written.
                    res.write(_extract_text(element))
                    res.write(tail)
                    if tag == PARAGRAPH:
                        res.write('\n')
                return res.getvalue()

        NAMESPACE = '{urn:oasis:names:tc:opendocument:xmlns:office:1.0}'
        BODY = F'{NAMESPACE}body'
        TEXT = F'{NAMESPACE}text'
        for part in xtzip().unpack(data):
            if part.path != 'content.xml':
                continue
            xml_content: bytes = part.get_data()
            root: Element = XML(xml_content)
            body: Element = root.find(BODY)
            text: Element = body.find(TEXT)
            return _extract_text(text)
        else:
            # Reached only when the loop ends without returning, i.e. no content.xml was found.
            raise ValueError('found no text')

    def _extract_ole(self, data: bytearray) -> str:
        # Reads the WordDocument stream, locates the piece table inside the table stream, and
        # reassembles the text from it.
        stream = MemoryFile(data)
        with self._olefile.OleFileIO(stream) as ole:
            doc = ole.openstream('WordDocument').read()
            with StructReader(doc) as reader:
                # Bit 1 of byte 11 selects between the streams named 0Table and 1Table.
                table_name = F'{(doc[11] >> 1) & 1}Table'
                # Offset and length of the data to read from the table stream are stored as two
                # consecutive 32-bit values at position 0x1A2.
                reader.seek(0x1A2)
                offset = reader.u32()
                length = reader.u32()
            with StructReader(ole.openstream(table_name).read()) as reader:
                reader.seek(offset)
                table = reader.read(length)
            piece_table = self._load_piece_table(table)
            return self._get_text(doc, piece_table)

    def _load_piece_table(self, table: bytes) -> bytes:
        # Walks over typed entries: type 1 entries are skipped using their 1-byte length, the
        # first type 2 entry is returned using its 32-bit length. Any other type is an error.
        # Returns None implicitly when the data ends without a type 2 entry.
        with StructReader(table) as reader:
            while not reader.eof:
                entry_type = reader.read_byte()
                if entry_type == 1:
                    reader.seekrel(reader.read_byte())
                    continue
                if entry_type == 2:
                    length = reader.u32()
                    return reader.read(length)
                raise NotImplementedError(F'Unsupported table entry type value 0x{entry_type:X}.')

    def _get_text(self, doc: bytes, piece_table: bytes) -> str:
        # The table is read as piece_count 32-bit character positions followed by one 8-byte
        # descriptor for each of the piece_count - 1 pieces.
        piece_count: int = 1 + (len(piece_table) - 4) // 12
        with StringIO() as text:
            with StructReader(piece_table) as reader:
                character_positions = [reader.u32() for _ in range(piece_count)]
                for i in range(piece_count - 1):
                    cp_start = character_positions[i]
                    cp_end = character_positions[i + 1]
                    # Skip two bytes, read a 32-bit value, skip two more bytes.
                    fc_value = reader.read_struct('xxLxx', unwrap=True)
                    # Bit 30 marks a cp1252 piece; it is cleared to obtain the offset value.
                    is_ansi = bool((fc_value >> 30) & 1)
                    fc = fc_value & 0xBFFFFFFF
                    cb = cp_end - cp_start
                    if is_ansi:
                        # One byte per character; the stored offset is halved.
                        encoding = 'cp1252'
                        fc = fc // 2
                    else:
                        # Two bytes per character.
                        encoding = 'utf16'
                        cb *= 2
                    raw = doc[fc : fc + cb]
                    text.write(raw.decode(encoding).replace('\r', '\n'))
            return text.getvalue()
class drp (consecutive=False, align=False, min=1, max=∞, len=None, all=False, threshold=20, weight=0, buffer=1024, chug=False)
-
This unit is implemented in
refinery.units.misc.drp
and has the following commandline Interface:usage: drp [-h] [-L] [-Q] [-0] [-v] [-c] [-d] [-n N] [-N N] [-l N] [-a] [-t N] [-w N] [-b N | -g] Detect Repeating Patterns - detects the most prevalent repeating byte pattern in a chunk of data. The unit computes a suffix tree which may require a lot of memory for large buffers. optional arguments: -c, --consecutive Assume that the repeating pattern is consecutive when observable. -d, --align Assume that the pattern occurs at offsets that are multiples of its length. -n, --min N Minimum size of the pattern to search for. Default is 1. -N, --max N Maximum size of the pattern to search for. Default is ∞. -l, --len N Set the exact size of the pattern. This is equivalent to --min=N --max=N. -a, --all Produce one output for each repeating pattern that was detected. -t, --threshold N Patterns must match this performance threshold in percent, lest they be discarded. -w, --weight N Specifies how much longer patterns are favored over small ones. Default is 0. -b, --buffer N Maximum number of bytes to inspect at once. The default is 1024. -g, --chug Compute the prefix tree for the entire buffer instead of chunking it. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class drp(Unit):
    """
    Detect Repeating Patterns - detects the most prevalent repeating byte pattern in a chunk of data.
    The unit computes a suffix tree which may require a lot of memory for large buffers.
    """
    def __init__(
        self,
        consecutive: Arg.Switch('-c', help='Assume that the repeating pattern is consecutive when observable.') = False,
        align: Arg.Switch('-d', help='Assume that the pattern occurs at offsets that are multiples of its length.') = False,
        min: Arg.Number('-n', help='Minimum size of the pattern to search for. Default is {default}.') = 1,
        max: Arg.Number('-N', help='Maximum size of the pattern to search for. Default is {default}.') = INF,
        len: Arg.Number('-l', help='Set the exact size of the pattern. This is equivalent to --min=N --max=N.') = None,
        all: Arg.Switch('-a', help='Produce one output for each repeating pattern that was detected.') = False,
        threshold: Arg.Number('-t', help='Patterns must match this performance threshold in percent, lest they be discarded.') = 20,
        weight: Arg.Number('-w', help='Specifies how much longer patterns are favored over small ones. Default is {default}.') = 0,
        buffer: Arg.Number('-b', group='BFR', help='Maximum number of bytes to inspect at once. The default is {default}.') = 1024,
        chug : Arg.Switch('-g', group='BFR', help='Compute the prefix tree for the entire buffer instead of chunking it.') = False
    ):
        # An explicit --len overrides both bounds; note that the parameters min, max, and len
        # shadow the builtins of the same name inside this constructor only.
        if len is not None:
            min = max = len
        super().__init__(
            min=min,
            max=max,
            all=all,
            consecutive=consecutive,
            align=align,
            weight=weight,
            buffer=buffer,
            chug=chug,
            threshold=threshold
        )

    def _get_patterns(self, data):
        """
        Build a suffix tree for `data` and return the set of candidate patterns found by walking
        it. Each candidate is a `bytes` object whose length lies between the configured minimum
        and maximum size (both inclusive). `data` is sliced and converted with `tobytes()`, so it
        is expected to be a memoryview.
        """
        # NOTE(review): stackdepth presumably adjusts the recursion limit while the tree is
        # being constructed - confirm against its definition.
        with stackdepth(len(data)):
            tree = SuffixTree(data)
        min_size = self.args.min
        max_size = self.args.max
        patterns = set()
        cursor = 0
        while cursor < len(data):
            node = tree.root
            rest = data[cursor:]
            remaining = len(rest)
            length = 0
            offset = None
            # Descend from the root along the bytes of the remaining data.
            while node.children and length < remaining:
                for child in node.children.values():
                    if tree.data[child.start] == rest[length]:
                        node = child
                        break
                # Stop descending as soon as the selected node does not start before the cursor.
                if node.start >= cursor:
                    break
                offset = node.start - length
                length = node.end + 1 - offset
            if offset is None:
                # No step of the descent was accepted: advance by a single byte.
                cursor += 1
                continue
            length = min(remaining, length)
            if max_size >= length >= min_size:
                pattern = rest[:length].tobytes()
                patterns.add(pattern)
            cursor += length
        # The tree is no longer needed; drop the reference before returning.
        del tree
        return patterns

    @staticmethod
    def _consecutive_count(data, pattern):
        """
        For every start offset `k` below the pattern length, count the windows of pattern length
        at positions `k`, `k + length`, `k + 2 * length`, ... that are equal to `pattern`; return
        the largest of these counts. Patterns of length 1 are simply counted with `data.count`.
        """
        length = len(pattern)
        if length == 1:
            return data.count(pattern)
        view = memoryview(data)
        return max(sum(1 for i in range(k, len(view), length) if view[i:i + length] == pattern)
            for k in range(len(pattern)))

    @staticmethod
    def _truncate_pattern(pattern):
        """
        Remove a trailing part of `pattern` that repeats its own beginning. The scan keeps a
        single running match length and resets it to zero on any mismatch.
        """
        offset = 0
        for byte in pattern[1:]:
            if byte == pattern[offset]:
                offset += 1
            else:
                offset = 0
        if offset > 0:
            pattern = pattern[:-offset]
        return pattern

    def process(self, data: bytearray):
        # Inputs of at most one byte are passed through unchanged.
        if len(data) <= 1:
            yield data
            return

        memview = memoryview(data)
        # The exponent applied to the pattern length when scoring; it equals 1 by default.
        weight = 1 + (self.args.weight / 10)

        if self.args.chug:
            patterns = self._get_patterns(memview)
        else:
            # Collect candidates from consecutive windows of at most --buffer bytes each.
            patterns = set()
            chunksize = self.args.buffer
            for k in range(0, len(memview), chunksize):
                patterns |= self._get_patterns(memview[k:k + chunksize])
        if not patterns:
            raise RefineryPartialResult('no repeating sequences found', data)

        self.log_debug('removing duplicate pattern detections')
        # A candidate that is a whole-number repetition of another candidate is discarded.
        duplicates = set()
        maxlen = max(len(p) for p in patterns)
        for pattern in sorted(patterns, key=len):
            for k in range(2, maxlen // len(pattern) + 1):
                repeated = pattern * k
                if repeated in patterns:
                    duplicates.add(repeated)
        patterns -= duplicates

        self.log_debug(F'counting coverage of {len(patterns)} patterns')
        pattern_count = {p: data.count(p) for p in patterns}
        pattern_performance = dict(pattern_count)

        # The first pass scores by plain occurrence count. The second pass is only reached
        # when --consecutive is set; it truncates the surviving patterns and scores them by
        # their aligned occurrence count instead.
        for consecutive in (False, True):
            if consecutive:
                self.log_debug(F're-counting coverage of {len(patterns)} patterns')
                patterns = {self._truncate_pattern(p) for p in patterns}
                pattern_performance = {p: self._consecutive_count(data, p) for p in patterns}

            self.log_debug('evaluating pattern performance')
            # Only values are replaced while iterating; the key set is not modified.
            for pattern, count in pattern_performance.items():
                pattern_performance[pattern] = count * (len(pattern) ** weight)
            # Normalize so that the best pattern has a performance of exactly 1.0.
            best_performance = max(pattern_performance.values())
            for pattern, performance in pattern_performance.items():
                pattern_performance[pattern] = performance / best_performance

            self.log_debug('removing patterns below performance threshold')
            threshold = self.args.threshold
            patterns = {p for p in patterns if pattern_performance[p] * 100 >= threshold}
            pattern_count = {p: data.count(p) for p in patterns}

            if not self.args.consecutive:
                break

        if self.args.all:
            # Emit every surviving pattern, best first, labelled with its occurrence count.
            for pattern in sorted(patterns, key=pattern_performance.get, reverse=True):
                yield self.labelled(pattern, count=pattern_count[pattern])
            return

        best_patterns = [p for p in patterns if pattern_performance[p] == 1.0]

        if len(best_patterns) > 1:
            self.log_warn('could not determine unique best repeating pattern, returning the first of these:')
            for k, pattern in enumerate(best_patterns):
                self.log_warn(F'{k:02d}.: {pattern.hex()}')

        result = best_patterns[0]

        if self.args.align:
            def rotated(pattern):
                for k in range(len(pattern)):
                    yield pattern[k:] + pattern[:k]
            # Among all rotations of the result that occur in the data, select the one whose
            # first occurrence has the smallest offset modulo the pattern length.
            rotations = {k % len(result): r for k, r in (
                (data.find(r), r) for r in rotated(result)) if k >= 0}
            result = rotations[min(rotations)]

        yield result
class dsjava
-
This unit is implemented in
refinery.units.formats.java.deserialize
and has the following commandline Interface:usage: dsjava [-h] [-L] [-Q] [-0] [-v] Deserialize Java serialized data and re-serialize as JSON. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class dsjava(Unit):
    """
    Deserialize Java serialized data and re-serialize as JSON.
    """

    @Unit.Requires('javaobj-py3>=0.4.0.1', 'formats')
    def _javaobj():
        # The optional dependency is imported only when this accessor is evaluated.
        import javaobj.v2 as javaobj_v2
        return javaobj_v2

    def process(self, data):
        """
        Parse `data` with javaobj.v2 and return the JSON text produced by `JavaEncoder`,
        encoded with the codec of this unit.
        """
        # All three steps stay inside the encoder context, as in the original layout.
        with JavaEncoder as encoder:
            parsed = self._javaobj.loads(data)
            document = encoder.dumps(parsed)
            return document.encode(self.codec)
class dsphp
-
This unit is implemented in
refinery.units.formats.deserialize_php
and has the following commandline Interface:usage: dsphp [-h] [-L] [-Q] [-0] [-v] [-R] Deserialize PHP serialized data and re-serialize as JSON. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class dsphp(Unit):
    """
    Deserialize PHP serialized data and re-serialize as JSON.
    """

    @Unit.Requires('phpserialize', 'formats')
    def _php():
        # The optional dependency is imported only when this accessor is evaluated.
        import phpserialize
        return phpserialize

    def reverse(self, data):
        """
        Parse `data` as JSON and return its PHP serialization.
        """
        document = json.loads(data)
        return self._php.dumps(document)

    def process(self, data):
        """
        Deserialize PHP data and return it as JSON text with an indentation of 4, encoded with
        the codec of this unit.
        """
        php = self._php
        phpobject = php.phpobject

        class _PHPEncoder(json.JSONEncoder):
            # Fallback conversions for values that the stock encoder rejects.
            def default(self, obj):
                try:
                    return super().default(obj)
                except TypeError:
                    pass
                if isinstance(obj, (bytes, bytearray)):
                    return obj.decode('utf8')
                if isinstance(obj, phpobject):
                    return obj._asdict()
                # Any other type yields None, which is serialized as a JSON null.
                return None

        deserialized = php.loads(data, object_hook=phpobject, decode_strings=True)
        document = json.dumps(deserialized, indent=4, cls=_PHPEncoder)
        return document.encode(self.codec)
class dump (*files, tee=False, stream=False, plain=False, force=False)
-
This unit is implemented in
refinery.units.sinks.dump
and has the following commandline Interface:usage: dump [-h] [-L] [-Q] [-0] [-v] [-t] [-s] [-p] [-f] [file [file ...]] Dump incoming data to files on disk. It is possible to specify filenames with format fields. Any metadata field on an incoming chunk is available. Additionally, any field that can be populated by the cm unit is also available. These include the following: {ext} : Automatically guessed file extension {crc32} : CRC32 checksum of the data {index} : Index of the data in the input stream, starting at 0 {size} : Size of the data in bytes {md5} : MD5 hash of the data {sha1} : SHA1 hash of the data {sha256} : SHA-256 hash of the data {path} : Associated path; defaults to {sha256} if none is given. When not using formatted file names, the unit ingests as many incoming inputs as filenames were specified on the command line. Unless connected to a terminal, the remaining inputs will be forwarded on STDOUT. The -t or --tee switch can be used to forward all inputs, under all circumstances, regardless of whether or not they have been processed. If no file is specified, all ingested inputs are concatenated and written to the clipboard. This will only succeed when the data can successfully be encoded. positional arguments: file Optionally formatted filename. optional arguments: -t, --tee Forward all inputs to STDOUT. -s, --stream Dump all incoming data to the same file. -p, --plain Never apply any formatting to file names. -f, --force Remove files if necessary to create dump path. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class dump(Unit):
    """
    Dump incoming data to files on disk. It is possible to specify filenames with format fields. Any
    metadata field on an incoming chunk is available. Additionally, any field that can be populated
    by the `refinery.cm` unit is also available. These include the following:

        {ext}    : Automatically guessed file extension
        {crc32}  : CRC32 checksum of the data
        {index}  : Index of the data in the input stream, starting at 0
        {size}   : Size of the data in bytes
        {md5}    : MD5 hash of the data
        {sha1}   : SHA1 hash of the data
        {sha256} : SHA-256 hash of the data
        {path}   : Associated path; defaults to {sha256} if none is given.

    When not using formatted file names, the unit ingests as many incoming inputs as filenames were
    specified on the command line. Unless connected to a terminal, the remaining inputs will be
    forwarded on STDOUT. The `-t` or `--tee` switch can be used to forward all inputs, under all
    circumstances, regardless of whether or not they have been processed.

    If no file is specified, all ingested inputs are concatenated and written to the clipboard. This
    will only succeed when the data can successfully be encoded.
    """

    def __init__(
        self, *files: Arg(metavar='file', type=str, help='Optionally formatted filename.'),
        tee    : Arg.Switch('-t', help='Forward all inputs to STDOUT.') = False,
        stream : Arg.Switch('-s', help='Dump all incoming data to the same file.') = False,
        plain  : Arg.Switch('-p', help='Never apply any formatting to file names.') = False,
        force  : Arg.Switch('-f', help='Remove files if necessary to create dump path.') = False,
    ):
        if stream and len(files) != 1:
            raise ValueError('Can only use exactly one file in stream mode.')
        super().__init__(files=files, tee=tee, stream=stream, force=force)
        # The currently open output stream, or None when no output is open.
        self.stream = None
        # Whether any file name contains a format field (and --plain was not given).
        self._formatted = not plain and any(self._has_format(f) for f in files)
        self._reset()

    @staticmethod
    def _has_format(filename):
        """
        Return whether `filename` is a string that contains at least one format field whose
        name has an alphanumeric character.
        """
        if not isinstance(filename, str):
            return False
        formatter = Formatter()
        return any(
            any(t.isalnum() for t in fields)
            for _, fields, *__ in formatter.parse(filename) if fields
        )

    def _reset(self):
        """
        Restart the file name iterator and close any open stream.
        """
        self.exhausted = False
        # Formatted names are reused indefinitely, plain names are consumed exactly once.
        self.paths = cycle(self.args.files) if self._formatted else iter(self.args.files)
        self._close()

    @property
    def _clipcopy(self):
        # Without any file argument, the output goes to the clipboard.
        return not self.args.files

    def _components(self, path):
        """
        Split `path` into the list of its components, ordered from the root to the leaf.
        """
        def _reversed_components(path):
            while True:
                path, component = os.path.split(path)
                if not component:
                    break
                yield component
            yield path
        components = list(_reversed_components(path))
        components.reverse()
        return components

    def _open(self, path, unc=False):
        """
        Create the parent directory of `path` and open the file for binary writing; the file is
        opened in append mode when streaming. Objects that have a `close` attribute are treated
        as already opened streams and returned unchanged. If an existing file blocks creation
        of the directory, it is removed when --force was given; otherwise, this raises a
        `RefineryCriticalException`. Other operating system errors while creating the directory
        are logged, and a handle to the null device is returned instead.
        """
        if hasattr(path, 'close'):
            return path
        path = os.path.abspath(path)
        base = os.path.dirname(path)
        if not unc:
            self.log_info('opening:', path)
        try:
            os.makedirs(base, exist_ok=True)
        except FileExistsError:
            self.log_info('existed:', path)
            # Walk down the path to find the component that is a file rather than a directory.
            part, components = '', self._components(path)
            while components:
                component, *components = components
                part = os.path.join(part, component)
                if os.path.exists(part) and os.path.isfile(part):
                    if self.args.force:
                        os.unlink(part)
                        return self._open(path, unc)
                    break
            raise RefineryCriticalException(F'Unable to dump to {path} because {part} is a file.')
        except FileNotFoundError:
            if unc or os.name != 'nt':
                raise
            # On Windows, retry exactly once using the \\?\ path prefix.
            path = F'\\\\?\\{path}'
            return self._open(path, unc=True)
        except OSError as e:
            if not self.log_info():
                self.log_warn('opening:', path)
            self.log_warn('errored:', e.args[1])
            return open(os.devnull, 'wb')
        else:
            mode = 'ab' if self.args.stream else 'wb'
            return open(path, mode)

    def _close(self, final=False):
        """
        Flush and close the current stream. In stream mode, the stream is only flushed unless
        `final` is set. When the unit dumps to the clipboard, the collected data is copied to
        the clipboard before the stream is closed.
        """
        if not self.stream:
            return
        self.stream.flush()
        if self.args.stream and not final:
            return
        if self._clipcopy:
            if os.name == 'nt':
                from refinery.lib.winclip import ClipBoard, CF
                try:
                    img = self._image.open(self.stream)
                    with io.BytesIO() as out:
                        img.save(out, 'BMP')
                        # The payload has to be extracted while the buffer is still open: the
                        # buffer is closed on leaving this block, and reading from it after
                        # that point raises a ValueError. The first 14 bytes are skipped; this
                        # is the size of the BMP file header, which is presumably not part of
                        # the data expected for the CF.DIB clipboard format.
                        dib = out.getvalue()[14:]
                except Exception:
                    # Not an image, or the conversion failed: copy the raw data as text.
                    with ClipBoard(CF.TEXT) as cpb:
                        cpb.copy(self.stream.getvalue())
                else:
                    with ClipBoard(CF.DIB) as cpb:
                        cpb.copy(dib)
            else:
                data = self.stream.getvalue()
                data = data.decode(self.codec, errors='backslashreplace')
                self._pyperclip.copy(data)
        self.stream.close()
        self.stream = None

    @Unit.Requires('pyperclip')
    def _pyperclip():
        import pyperclip
        return pyperclip

    @Unit.Requires('Pillow', 'formats')
    def _image():
        from PIL import Image
        return Image

    def process(self, data: bytes):
        """
        Write `data` to the current output. The input is forwarded when --tee is set, or when
        the file names are exhausted and the unit is not connected to a terminal.
        """
        forward_input_data = self.args.tee
        if self._clipcopy:
            self.stream.write(data)
        elif not self.exhausted:
            if not self.stream:
                # This should happen only when the unit is called from Python code
                # rather than via the command line.
                try:
                    path = next(self.paths)
                except StopIteration:
                    raise RefineryCriticalException('the list of filenames was exhausted.')
                else:
                    with self._open(path) as stream:
                        stream.write(data)
            else:
                self.stream.write(data)
                self.log_debug(F'wrote 0x{len(data):08X} bytes')
                self._close()
        else:
            forward_input_data = forward_input_data or not self.isatty
            if not forward_input_data:
                size = metavars(data).size
                self.log_warn(F'discarding unprocessed chunk of size {size!s}.')
        if forward_input_data:
            yield data

    def filter(self, chunks):
        """
        Open the output stream for each visible chunk before the chunk is passed on, and close
        the last stream after all chunks have been consumed.
        """
        if self.exhausted:
            self._reset()

        nostream = not self.args.stream
        clipcopy = self._clipcopy

        if clipcopy:
            # Clipboard output is first collected in memory.
            self.stream = io.BytesIO()

        for index, chunk in enumerate(chunks, 0):
            if not chunk.visible:
                continue
            if not clipcopy and not self.exhausted and (nostream or not self.stream):
                try:
                    path = next(self.paths)
                except StopIteration:
                    self.exhausted = True
                else:
                    if self._has_format(path):
                        meta = metavars(chunk)
                        meta.ghost = True
                        meta.update_index(index)
                        path = meta.format_str(path, self.codec, [chunk])
                    self.stream = self._open(path)
            yield chunk

        self._close(final=True)
        self.exhausted = True
class eat (name)
-
This unit is implemented in
refinery.units.meta.eat
and has the following commandline Interface:usage: eat [-h] [-L] [-Q] [-0] [-v] name Consume a meta variable and replace the contents of the current chunk with it. If the variable contains a string, it is encoded with the default codec. If the variable cannot be converted to a byte string, the data is lost and an empty chunk is returned. positional arguments: name The name of the variable to be used. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class eat(Unit):
    """
    Consume a meta variable and replace the contents of the current chunk with it. If the variable
    contains a string, it is encoded with the default codec. If the variable cannot be converted to
    a byte string, the data is lost and an empty chunk is returned.
    """

    def __init__(
        self,
        name: Arg(help='The name of the variable to be used.', type=str),
    ):
        super().__init__(name=check_variable_name(name))

    def process(self, data: Chunk):
        """
        Remove the configured variable from the metadata of `data` and return its value as a
        byte string. Returns None when the value cannot be converted.
        """
        name = self.args.name
        value = metavars(data).pop(name)
        # The type name is recorded before any conversion for use in the warning below.
        typename = value.__class__.__name__

        if isinstance(value, int):
            self.log_info(F'variable {name} is an integer, converting to string.')
            value = str(value).encode(self.codec)

        if isinstance(value, str):
            self.log_info(F'variable {name} is a string, encoding as {self.codec}')
            return value.encode(self.codec)

        if isbuffer(value):
            return value

        # Last resort: attempt to construct a bytearray from the value.
        try:
            return bytearray(value)
        except Exception:
            self.log_warn(
                F'variable {name} is of type "{typename}", unable to convert to byte string - data is lost')
            return None
class ef (*filenames, list=False, meta=False, size=None, read=0, wild=False, tame=False, symlinks=False, linewise=False)
-
This unit is implemented in
refinery.units.meta.ef
and has the following commandline Interface:usage: ef [-h] [-L] [-Q] [-0] [-v] [-l] [-m] [-s start:end:step] [-r N] [-w | -t] [-y] [-i] FILEMASK [FILEMASK ...] Short for "emit file". The unit reads files from disk and outputs them individually. Has the ability to read large files in chunks. positional arguments: FILEMASK A list of file masks. Each matching file will be read from disk and emitted. The file masks can include format string expressions which will be substituted from the current meta variables. The masks can use wild-card expressions, but this feature is disabled by default on Posix platforms, where it has to be enabled explicitly using the -w switch. On Windows, the feature is enabled by default and can be disabled using the -t switch. optional arguments: -l, --list Only lists files with metadata. -m, --meta Adds the atime, mtime, ctime, and size metadata variables. -s, --size start:end:step If specified, only files are read whose size is in the given range. -r, --read N If specified, files will be read in chunks of size N and each chunk is emitted as one element in the output list. -w, --wild Force use of wildcard patterns in file masks. -t, --tame Disable wildcard patterns in file masks. -y, --symlinks Follow symbolic links and junctions, these are ignored by default. -i, --linewise Read the file linewise. By default, one line is read at a time. In line mode, the --read argument can be used to read the given number of lines in each chunk. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class ef(Unit):
    """
    Short for "emit file". The unit reads files from disk and outputs them individually. Has the ability to
    read large files in chunks.
    """

    def __init__(self,
        *filenames: Arg(metavar='FILEMASK', nargs='+', type=str, help=(
            'A list of file masks. Each matching file will be read from disk and '
            'emitted. The file masks can include format string expressions which '
            'will be substituted from the current meta variables. The masks can '
            'use wild-card expressions, but this feature is disabled by default on '
            'Posix platforms, where it has to be enabled explicitly using the -w '
            'switch. On Windows, the feature is enabled by default and can be '
            'disabled using the -t switch.'
        )),
        list: Arg.Switch('-l', help='Only lists files with metadata.') = False,
        meta: Arg.Switch('-m', help=(
            'Adds the atime, mtime, ctime, and size metadata variables.'
        )) = False,
        size: Arg.Bounds('-s', range=True, help=(
            'If specified, only files are read whose size is in the given range.')) = None,
        read: Arg.Number('-r', help=(
            'If specified, files will be read in chunks of size N and each '
            'chunk is emitted as one element in the output list.'
        )) = 0,
        wild: Arg.Switch('-w', group='W', help='Force use of wildcard patterns in file masks.') = False,
        tame: Arg.Switch('-t', group='W', help='Disable wildcard patterns in file masks.') = False,
        symlinks: Arg.Switch('-y', help='Follow symbolic links and junctions, these are ignored by default.') = False,
        linewise: Arg.Switch('-i', help=(
            'Read the file linewise. By default, one line is read at a time. '
            'In line mode, the --read argument can be used to read the given '
            'number of lines in each chunk.'
        )) = False
    ):
        if wild and tame:
            raise ValueError('Cannot be both wild and tame!')
        super().__init__(
            size=size,
            read=read,
            list=list,
            meta=meta,
            wild=wild,
            tame=tame,
            symlinks=symlinks,
            linewise=linewise,
            filenames=filenames
        )

    def _read_chunks(self, fd):
        """
        Generate the contents of `fd` in pieces of at most --read bytes.
        """
        while True:
            buffer = fd.read(self.args.read)
            if not buffer:
                break
            yield buffer

    def _read_lines(self, fd):
        """
        Generate the contents of `fd` in groups of --read lines; a value of zero is treated
        like one and yields single lines.
        """
        count = self.args.read or 1
        if count == 1:
            while True:
                buffer = fd.readline()
                if not buffer:
                    break
                yield buffer
            return
        with MemoryFile() as out:
            while True:
                for _ in range(count):
                    buffer = fd.readline()
                    if not buffer:
                        break
                    out.write(buffer)
                # Nothing was written in this round: the input is exhausted.
                if not out.tell():
                    break
                yield out.getvalue()
                # Reuse the same buffer for the next group of lines.
                out.seek(0)
                out.truncate()

    def _absolute_path(self, path_string: str):
        path = Path(path_string).absolute()
        if os.name == 'nt' and not path.parts[0].startswith('\\\\?\\'):
            # The pathlib glob method will simply fail mid-traversal if it attempts to descend into
            # a folder or to a file whose path exceeds MAX_PATH on Windows. As a workaround, we use
            # UNC paths throughout and truncate to relative paths after enumeration.
            path = Path(F'\\\\?\\{path!s}')
        return path

    def _glob(self, pattern: str) -> Iterable[Path]:
        """
        Generate the absolute paths that match `pattern`. A pattern without wildcard characters
        yields exactly one path. While globbing, `os.scandir` is temporarily replaced by a
        wrapper that handles symbolic links and swallows errors; it is restored afterwards.
        """
        if pattern.endswith('**'):
            pattern += '/*'
        wildcard = re.search(R'[\[\?\*]', pattern)
        if wildcard is None:
            yield self._absolute_path(pattern)
            return
        # Split the mask into the part before the first wildcard and the remaining pattern.
        k = wildcard.start()
        base, pattern = pattern[:k], pattern[k:]
        path = self._absolute_path(base or '.')
        last = path.parts[-1]

        if base.endswith(last):
            # /base/something.*
            pattern = F'{last}{pattern}'
            path = path.parent

        scandir = os.scandir

        class EmptyIterator:
            # Stand-in for the result of os.scandir that contains no entries.
            def __enter__(self):
                return self

            def __exit__(self, *_, **__):
                pass

            def __next__(self):
                raise StopIteration

            def __iter__(self):
                return self

        if sys.version_info >= (3, 12):
            def islink(path):
                return os.path.islink(path) or os.path.isjunction(path)
        else:
            def islink(path):
                try:
                    return bool(os.readlink(path))
                except OSError:
                    return False

        # Resolved link targets that were already visited, used to detect loops.
        paths_scanned = set()

        def _patched_scandir(path):
            if islink(path):
                if not self.args.symlinks:
                    return EmptyIterator()
                try:
                    rp = os.path.realpath(path, strict=True)
                except OSError:
                    return EmptyIterator()
                if rp in paths_scanned:
                    self.log_warn(F'file system loop at: {path!s}')
                    return EmptyIterator()
                paths_scanned.add(rp)
                path = rp
            try:
                return scandir(path)
            except Exception as e:
                # Errors are not reported for paths that contain an ignored component.
                ignore = _ERROR_IGNORES.get(os.name, set())
                if not any(p.lower() in ignore for p in Path(path).parts):
                    self.log_warn(F'error calling scandir, {exception_to_string(e)}: {path}')
                return EmptyIterator()

        try:
            os.scandir = _patched_scandir
            for match in path.glob(pattern):
                yield match
        finally:
            os.scandir = scandir

    def process(self, data):
        """
        Expand each file mask and emit the matching files, or only their paths when --list is
        set. Errors for individual paths are logged and the path is skipped.
        """
        meta = metavars(data)
        size = self.args.size
        size = size and range(size.start, size.stop, size.step)
        meta.ghost = True
        wild = (os.name == 'nt' or self.args.wild) and not self.args.tame
        root = self._absolute_path('.')
        paths = self._glob if wild else lambda mask: [self._absolute_path(mask)]
        do_meta = self.args.meta
        do_stat = size or do_meta

        class SkipErrors:
            # Context manager that suppresses exceptions while processing one path and logs
            # at most one message for each error category and path.
            unit = self

            def __init__(self):
                self._history: Set[type] = set()
                # The entries are tested in insertion order and the first matching type is
                # used. Subclasses must therefore precede their base classes: if OSError
                # came before FileNotFoundError, the more specific message for the latter
                # could never be selected.
                self._message: Dict[type, Optional[str]] = {
                    ValueError: (
                        None
                    ),
                    PermissionError: (
                        'access error while scanning: {}'
                    ),
                    FileNotFoundError: (
                        'file unexpectedly not found: {}'
                    ),
                    OSError: (
                        'system error while scanning: {}'
                    ),
                    Exception: (
                        'unknown error while reading: {}'
                    ),
                }
                self.path = None

            def reset(self, path):
                self._history.clear()
                self.path = path
                return self

            def __enter__(self):
                return self

            def __exit__(self, et, ev, trace):
                if et is None:
                    return False
                for t, msg in self._message.items():
                    if issubclass(et, t):
                        if t not in self._history:
                            self._history.add(t)
                            if msg is not None:
                                self.unit.log_info(msg.format(self.path))
                        # Returning True suppresses the exception.
                        return True
                # The exception matches no entry (it is not derived from Exception).
                return False

        for mask in self.args.filenames:
            mask = meta.format_str(mask, self.codec, [data])
            self.log_debug('scanning for mask:', mask)
            kwargs = dict()
            skip_errors = SkipErrors()
            for path in paths(mask):
                skip_errors.reset(path)
                filesize = None
                with skip_errors:
                    path = path.relative_to(root)
                with skip_errors:
                    if wild and not path.is_file():
                        continue
                with skip_errors:
                    if do_stat:
                        stat = path.stat()
                        filesize = stat.st_size
                    if do_meta:
                        kwargs.update(
                            fsize=filesize,
                            atime=datetime.fromtimestamp(stat.st_atime).isoformat(' ', 'seconds'),
                            ctime=datetime.fromtimestamp(stat.st_ctime).isoformat(' ', 'seconds'),
                            mtime=datetime.fromtimestamp(stat.st_mtime).isoformat(' ', 'seconds')
                        )
                if size is not None and filesize not in size:
                    continue
                with skip_errors:
                    if self.args.list:
                        yield self.labelled(str(path).encode(self.codec), **kwargs)
                        continue
                    with path.open('rb') as stream:
                        if self.args.linewise:
                            yield from self._read_lines(stream)
                        elif self.args.read:
                            yield from self._read_chunks(stream)
                        else:
                            data = stream.read()
                            self.log_info(lambda: F'reading: {path!s} ({len(data)} bytes)')
                            yield self.labelled(data, path=path.as_posix(), **kwargs)
class emit (*data)
-
This unit is implemented in
refinery.units.meta.emit
and has the following commandline Interface:usage: emit [-h] [-L] [-Q] [-0] [-v] [data [data ...]] positional arguments: data Data to be emitted. If no argument is specified, data is retrieved from the clipboard. Multiple arguments are output in framed format. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class emit(Unit):

    def __init__(self, *data: Arg(help=(
        'Data to be emitted. If no argument is specified, data is retrieved from '
        'the clipboard. Multiple arguments are output in framed format.'
    ))):
        super().__init__(data=data)

    @Unit.Requires('pyperclip')
    def _pyperclip():
        # The optional dependency is imported only when this accessor is evaluated.
        import pyperclip
        return pyperclip

    def process(self, data):
        # Explicit arguments take precedence over the clipboard.
        arguments = self.args.data
        if arguments:
            yield from arguments
            return

        if os.name != 'nt':
            # Text from the clipboard is encoded with the codec of this unit; an empty
            # clipboard produces no output.
            text = self._pyperclip.paste()
            if text:
                yield text.encode(self.codec, 'replace')
            return

        # On Windows, the clipboard is read through the winclip library module.
        from refinery.lib.winclip import get_any_data
        mode, data = get_any_data()
        if mode is not None:
            self.log_info(F'retrieved clipboard data in {mode.name} format')
        yield data
class esc (hex=False, unicode=False, greedy=False, unquoted=False, quoted=False, bare=False)
-
This unit is implemented in
refinery.units.encoding.esc
and has the following commandline Interface:usage: esc [-h] [-L] [-Q] [-0] [-v] [-R] [-x] [-u] [-g] [-p | -q] [-b] Encodes and decodes common ASCII escape sequences. optional arguments: -x, --hex Hex encode everything, do not use C escape sequences. -u, --unicode Use unicode escape sequences and UTF-8 encoding. -g, --greedy Replace \x by x and \u by u when not followed by two or four hex digits, respectively. -p, --unquoted Never remove enclosing quotes. -q, --quoted Remove enclosing quotes while decoding and add them for encoding. -b, --bare Do not escape quote characters. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class esc(Unit):
    """
    Encodes and decodes common ASCII escape sequences.
    """
    # Maps byte values to their escape sequence, used for encoding.
    _ESCAPE = {
        0x00: BR'\0',
        0x07: BR'\a',
        0x08: BR'\b',
        0x0C: BR'\f',
        0x0A: BR'\n',
        0x0D: BR'\r',
        0x09: BR'\t',
        0x0B: BR'\v',
        0x5C: BR'\\',
        0x27: BR'\'',
        0x22: BR'\"'
    }
    # Maps the character that follows a backslash to the decoded byte, used for decoding.
    _UNESCAPE = {
        BR'0': B'\x00',
        BR'a': B'\x07',
        BR'b': B'\x08',
        BR'f': B'\x0C',
        BR'n': B'\x0A',
        BR'r': B'\x0D',
        BR't': B'\x09',
        BR'v': B'\x0B',
        B'\\': B'\x5C',
        BR"'": B'\x27',
        BR'"': B'\x22'
    }

    def __init__(self,
        hex     : Arg.Switch('-x', help='Hex encode everything, do not use C escape sequences.') = False,
        unicode : Arg.Switch('-u', help='Use unicode escape sequences and UTF-8 encoding.') = False,
        greedy  : Arg.Switch('-g', help='Replace \\x by x and \\u by u when not followed by two or four hex digits, respectively.') = False,
        unquoted: Arg.Switch('-p', group='Q', help='Never remove enclosing quotes.') = False,
        quoted  : Arg.Switch('-q', group='Q', help='Remove enclosing quotes while decoding and add them for encoding.') = False,
        bare    : Arg.Switch('-b', help='Do not escape quote characters.') = False,
    ) -> Unit: pass  # noqa

    def process(self, data):
        """
        Decode the escape sequences in `data` and return the result. Depending on the quoting
        options, one enclosing pair of quotes is removed first.
        """
        data = memoryview(data)

        if self.args.quoted:
            # Empty input has no first byte to inspect; indexing it would raise an IndexError,
            # so it is reported as incorrectly quoted instead.
            if not data or data[-1] != data[0]:
                self.log_info('string is not correctly quoted')
            else:
                data = data[1:-1]
        elif not self.args.unquoted:
            # NOTE(review): the first byte is used as the quote regardless of its value, it is
            # not checked to be an actual quote character.
            quote = data[:1]
            strip = data[1:-1]
            # Only strip when the last byte equals the first and no unescaped occurrence of
            # that byte exists in between.
            if data[-1:] == quote and not re.search(br'(?<!\\)' + re.escape(quote), strip):
                self.log_info('removing automatically detected quotes')
                data = strip

        def unescape(match):
            c = match[1]
            if len(c) > 1:
                if c[0] == 0x75:
                    # unicode
                    upper = int(c[1:3], 16)
                    lower = int(c[3:5], 16)
                    if self.args.unicode:
                        return bytes((lower, upper)).decode('utf-16le').encode(self.codec)
                    # Without --unicode, only the low byte of the code unit is kept.
                    return bytes((lower,))
                elif c[0] == 0x78:
                    # hexadecimal
                    return bytes((int(c[1:3], 16),))
                else:
                    # octal escape sequence
                    return bytes((int(c, 8) & 0xFF,))
            elif c in B'ux':
                # A bare \u or \x that is not followed by the required hex digits.
                return c if self.args.greedy else match[0]
            # Unknown escapes decode to the escaped character itself.
            return self._UNESCAPE.get(c, c)

        data = re.sub(
            RB'\\(u[a-fA-F0-9]{4}|x[a-fA-F0-9]{1,2}|[0-7]{3}|.)', unescape, data)

        return data

    def reverse(self, data):
        """
        Encode `data` using escape sequences. With --quoted, the result is enclosed in double
        quotes.
        """
        if self.args.unicode:
            string = data.decode(self.codec).encode('UNICODE_ESCAPE')
        else:
            if not self.args.hex:
                def escape(match):
                    c = match[0][0]
                    return self._ESCAPE.get(c, RB'\x%02x' % c)
                # Control characters, both quote characters, the backslash, and all bytes
                # from 0x7F upwards; --bare excludes the quote characters.
                pattern = RB'[\x00-\x1F\x22\x27\x5C\x7F-\xFF]'
                if self.args.bare:
                    pattern = RB'[\x00-\x1F\x5C\x7F-\xFF]'
                string = re.sub(pattern, escape, data)
            else:
                # Every byte becomes a four-character \xNN sequence.
                string = bytearray(B''.join(RB'\x%02x' % byte for byte in data))
        if self.args.quoted:
            string = B'"%s"' % string
        return string
class evtx (raw=False)
-
This unit is implemented in
refinery.units.formats.evtx
and has the following commandline Interface:usage: evtx [-h] [-L] [-Q] [-0] [-v] [-r] Extracts data from Windows Event Log files (EVTX). Each extracted log entry is returned as a single output chunk in XML format. optional arguments: -r, --raw Extract raw event data rather than XML. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class evtx(Unit):
    """
    Extracts data from Windows Event Log files (EVTX). Each extracted log entry is returned as a single
    output chunk in XML format.
    """

    def __init__(self, raw: Unit.Arg.Switch('-r', help='Extract raw event data rather than XML.') = False):
        super().__init__(raw=raw)

    @Unit.Requires('python-evtx', 'formats')
    def _evtx():
        # The optional dependency is imported only when this accessor is evaluated.
        from Evtx.Evtx import Evtx
        return Evtx

    def process(self, data):
        """
        Generate one output for each record of the event log in `data`: the raw record data
        when --raw is set, and the encoded XML representation of the record otherwise.
        """
        with VirtualFileSystem() as vfs:
            want_raw = self.args.raw
            # The parser receives the object created by the virtual file system for the data.
            parser = self._evtx(vfs.new(data))
            with parser as log:
                for record in log.records():
                    if want_raw:
                        yield record.data()
                    else:
                        yield record.xml().encode(self.codec)
class fernet (key)
-
This unit is implemented in
refinery.units.crypto.cipher.fernet
and has the following commandline Interface:usage: fernet [-h] [-L] [-Q] [-0] [-v] key Decrypt Fernet messages. positional arguments: key A fernet key, either in base64 or raw binary. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class fernet(Unit):
    """
    Decrypt Fernet messages.
    """

    def __init__(self, key: Arg(help='A fernet key, either in base64 or raw binary.')):
        super().__init__(key=key)

    def _b64(self, data):
        """
        Return the URL-safe base64 decoding of `data`, or `data` itself when decoding fails.
        """
        try:
            decoded = data | b64(urlsafe=True) | bytearray
        except Exception:
            return data
        return decoded

    def process(self, data):
        """
        Parse the token in `data`, check its signature, and return the decrypted message
        labelled with the timestamp of the token. A signature mismatch only causes a warning.
        """
        key = self._b64(self.args.key)
        key_size = len(key)
        if key_size != 32:
            raise ValueError(F'The given Fernet key has length {key_size}, expected 32 bytes.')
        # The first half of the key is used for signing, the second half for encryption.
        signing_key, encryption_key = key[:16], key[16:]

        token = StructReader(memoryview(self._b64(data)), bigendian=True)
        # Everything except the final 32 bytes is covered by the signature.
        signed_data = token.peek(token.remaining_bytes - 32)
        version = token.u8()
        timestamp = datetime.fromtimestamp(token.u64())
        iv = token.read(16)
        if version != 0x80:
            self.log_warn(F'The Fernet version is 0x{version:02X}, the only documented one is 0x80.')
        ciphertext = token.read(token.remaining_bytes - 32)
        if len(ciphertext) % 16:
            raise ValueError('The encoded ciphertext is not 16-byte block aligned.')
        signature = token.read(32)

        mac = HMAC.new(signing_key, digestmod=SHA256)
        mac.update(signed_data)
        if mac.digest() != signature:
            self.log_warn('HMAC verification failed; the message has been tampered with.')
            self.log_info(F'computed signature: {mac.hexdigest().upper()}')
            self.log_info(F'provided signature: {signature.hex().upper()}')

        decryptor = aes(mode='cbc', iv=iv, key=encryption_key)
        plaintext = ciphertext | decryptor | bytearray
        return self.labelled(plaintext, timestamp=timestamp.isoformat(' ', 'seconds'))
class gost (key, iv=b'', padding=None, mode=None, raw=False, swap=False, sbox=SBOX.R34, *, assoc_len=0, mac_len=0, segment_size=0, little_endian=False)
-
This unit is implemented in
refinery.units.crypto.cipher.gost
and has the following commandline Interface:usage: gost [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-s] [-x SBOX] [-e] [-S N] key GOST encryption and decryption. positional arguments: key The encryption key. optional arguments: -i, --iv IV Specifies the initialization vector. If none is specified, then a block of zero bytes is used. -p, --padding P Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does nothing. By default, all other algorithms are attempted. In most cases, the data was not correctly decrypted if none of these work. -m, --mode M Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB, OFB, PCBC. By default, the CBC mode is used when an IV is provided, and ECB otherwise. -r, --raw Set the padding to raw; ignored when a padding is specified. -s, --swap Decode blocks as big endian rather than little endian. -x, --sbox SBOX Choose an SBOX. The default is R34, which corresponds to the R-34.12.2015 standard. The other option is CBR, which is the SBOX used by the Central Bank of Russia. -e, --little-endian Only for CTR: Use a little endian counter instead of the default big endian. -S, --segment-size N Only for CFB: Number of bits into which data is segmented. It must be a multiple of 8. The default of 0 means that the block size will be used as the segment size. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class gost(StandardBlockCipherUnit, cipher=BlockCipherFactory(GOST)):
    """
    GOST encryption and decryption.
    """
    def __init__(
        self,
        key,
        iv=B'',
        padding=None,
        mode=None,
        raw=False,
        swap: Arg.Switch('-s', help='Decode blocks as big endian rather than little endian.') = False,
        sbox: Arg.Option('-x', choices=SBOX, help=(
            'Choose an SBOX. The default is {default}, which corresponds to the R-34.12.2015 standard. '
            'The other option is CBR, which is the SBOX used by the Central Bank of Russia.'
        )) = SBOX.R34,
        **more
    ):
        # The s-box argument is passed through Arg.AsOption before it reaches the base class.
        selected = Arg.AsOption(sbox, SBOX)
        super().__init__(
            key,
            iv,
            padding=padding,
            mode=mode,
            raw=raw,
            swap=swap,
            sbox=selected,
            **more
        )

    def _new_cipher(self, **optionals) -> CipherInterface:
        """
        Create the cipher via the base class, adding the unit's swap and sbox settings to the
        given optional arguments.
        """
        args = self.args
        return super()._new_cipher(swap=args.swap, sbox=args.sbox, **optionals)
class group (size)
-
This unit is implemented in
refinery.units.meta.group
and has the following commandline Interface:usage: group [-h] [-L] [-Q] [-0] [-v] N Group incoming chunks into frames of the given size. positional arguments: N Size of each group; must be at least 2. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class group(Unit):
    """
    Group incoming chunks into frames of the given size.
    """
    def __init__(self, size: Arg.Number(help='Size of each group; must be at least 2.', bound=(2, None))):
        super().__init__(size=size)

    def process(self, data: Chunk):
        """
        Emit the group head followed by up to size - 1 further chunks, which are pulled from the
        iterator that the filter attached to the head.
        """
        followers = data.temp
        if followers:
            yield data
            yield from islice(followers, self.args.size - 1)

    def filter(self, chunks):
        """
        Hand the shared input iterator to every chunk that starts a group. Chunks consumed from
        that iterator while processing a head are not seen by this loop again.
        """
        stream = iter(chunks)
        for leader in stream:
            leader.temp = stream
            yield leader
class groupby (name)
-
This unit is implemented in
refinery.units.meta.groupby
and has the following commandline Interface:usage: groupby [-h] [-L] [-Q] [-0] [-v] name Group incoming chunks by the contents of a meta variable. Note that the unit blocks and cannot stream any output until the input frame is consumed: It has to read every input chunk to make sure that all groupings are complete. positional arguments: name name of the meta variable generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class groupby(Unit):
    """
    Group incoming chunks by the contents of a meta variable. Note that the unit
    blocks and cannot stream any output until the input frame is consumed: It has
    to read every input chunk to make sure that all groupings are complete.
    """
    def __init__(self, name: Arg(type=str, help='name of the meta variable')):
        super().__init__(name=check_variable_name(name))

    def process(self, data):
        """
        Emit all members of the group that the filter attached to this representative chunk.
        """
        for member in data.temp:
            yield member

    def filter(self, chunks: Iterable[Chunk]) -> Generator[Chunk, None, None]:
        """
        Consume all input chunks, bucket them by the value of the configured meta variable, and
        yield the first chunk of each bucket with the full bucket attached as its temp value.
        Chunks that lack the variable share the bucket for the value None.
        """
        key = self.args.name
        buckets = {}
        for item in chunks:
            try:
                label = item.meta[key]
            except KeyError:
                label = None
            buckets.setdefault(label, []).append(item)
        for bucket in buckets.values():
            representative = bucket[0]
            representative.temp = bucket
            yield representative
class hc128 (key, discard=0, stateful=False)
-
This unit is implemented in
refinery.units.crypto.cipher.hc128
and has the following commandline Interface:usage: hc128 [-h] [-L] [-Q] [-0] [-v] [-R] [-d N] [-s] key HC-128 encryption and decryption. positional arguments: key The encryption key. optional arguments: -d, --discard N Discard the first N bytes of the keystream, 0 by default. -s, --stateful Do not reset the key stream while processing the chunks of one frame. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class hc128(StreamCipherUnit):
    """
    HC-128 encryption and decryption.
    """
    # NOTE(review): presumably the set of accepted key lengths in bytes, checked by the base
    # class - confirm against StreamCipherUnit.
    key_size = {32}

    def keystream(self) -> Iterable[int]:
        """
        Return the HC128 key stream object for the configured key.
        """
        key = self.args.key
        return HC128(key)
class hc256 (key, iv=b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', discard=0, stateful=False)
-
This unit is implemented in
refinery.units.crypto.cipher.hc256
and has the following commandline Interface:usage: hc256 [-h] [-L] [-Q] [-0] [-v] [-R] [-d N] [-s] key [iv] HC-256 encryption and decryption. positional arguments: key The encryption key. iv An initialization vector; the default is a sequence of 32 zero bytes. optional arguments: -d, --discard N Discard the first N bytes of the keystream, 0 by default. -s, --stateful Do not reset the key stream while processing the chunks of one frame. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class hc256(StreamCipherUnit):
    """
    HC-256 encryption and decryption.
    """
    key_size = {32}

    def __init__(
        self,
        key,
        iv: Arg(help='An initialization vector; the default is a sequence of 32 zero bytes.') = bytes(32),
        discard=0,
        stateful=False,
    ):
        super().__init__(key=key, iv=iv, stateful=stateful, discard=discard)
        self._keystream = None

    def keystream(self) -> Iterable[int]:
        """
        Yield the key stream byte by byte; every integer produced by HC256 is serialized as
        4 bytes in little endian order.
        """
        words = HC256(self.args.key, self.args.iv)
        for word in words:
            for byte in word.to_bytes(4, 'little'):
                yield byte
class hex
-
This unit is implemented in
refinery.units.encoding.hex
and has the following commandline Interface:usage: hex [-h] [-L] [-Q] [-0] [-v] [-R] [-F] Hex-decodes and encodes binary data. Non-hex characters are removed from the input. For decoding, any odd trailing hex digits are stripped as two hex digits are required to represent a byte. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class hex(Unit):
    """
    Hex-decodes and encodes binary data. Non-hex characters are removed from
    the input. For decoding, any odd trailing hex digits are stripped as two
    hex digits are required to represent a byte.
    """
    def reverse(self, data):
        """
        Encode the input as upper case hexadecimal digits.
        """
        from base64 import b16encode
        return b16encode(data)

    def process(self, data):
        """
        Remove every byte that is not a hex digit, drop a dangling final digit, and decode.
        """
        import base64
        import re
        digits = re.sub(B'[^A-Fa-f0-9]+', B'', data)
        if len(digits) & 1:
            # an odd number of digits cannot be decoded; the last one is discarded
            digits = digits[:-1]
        return base64.b16decode(digits, casefold=True)

    @classmethod
    def handles(self, data: bytearray):
        """
        Return True when the entire input matches the spaced hex format; otherwise the method
        returns None rather than False.
        """
        from refinery.lib.patterns import formats
        return True if formats.spaced_hex.fullmatch(data) else None
class hexload (blocks=1, dense=False, expand=False, narrow=False, width=0)
-
This unit is implemented in
refinery.units.formats.hexload
and has the following commandline Interface:usage: hexload [-h] [-L] [-Q] [-0] [-v] [-R] [-B N] [-D] [-E] [-N] [-W N] Convert hex dumps back to the original data and vice versa. All options of this unit apply to its reverse operation where binary data is converted to a readable hexdump format. The default mode of the unit expects the input data to contain a readable hexdump and converts it back to binary. optional arguments: -B, --blocks N Group hexadecimal bytes in blocks of the given size; default is 1. -D, --dense Do not insert spaces in hexdump. -E, --expand Do not compress sequences of identical lines in hexdump -N, --narrow Do not show addresses in hexdump -W, --width N Specify the number of hexadecimal characters to use in preview. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class hexload(HexViewer):
    """
    Convert hex dumps back to the original data and vice versa. All options of this unit apply
    to its reverse operation where binary data is converted to a readable hexdump format.
    The default mode of the unit expects the input data to contain a readable hexdump and
    converts it back to binary.
    """
    # The docstring of this class is the pattern source: it is a verbose-mode regular expression,
    # so the line breaks after each inline comment are significant.
    @regex
    class _ENCODED_BYTES:
        R"""
        (?ix)(?:^|(?<=\s))                      # encoded byte patches must be prefixed by white space
        (?:
            (?:                                 # separated chunks of hex data
                [a-f0-9]{2}                     # hexadecimal chunk; single byte (two hexadecimal letters)
                \s{1,2}                         # encoded byte followed by whitespace
                (?:                             # at least one more encoded byte
                    [a-f0-9]{2}                 # followed by more encoded bytes
                    (?:\s{1,2}[a-f0-9]{2})*     # unless it was just a single byte
                )?
            )
            | (?:[a-f0-9]{4}\s{1,2}             # 2-byte chunks
                (?:[a-f0-9]{4}
                (?:\s{1,2}[a-f0-9]{4})*)?)
            | (?:[a-f0-9]{8}\s{1,2}             # 4-byte chunks
                (?:[a-f0-9]{8}
                (?:\s{1,2}[a-f0-9]{8})*)?)
            | (?:(?:[a-f0-9]{2})+)              # continuous line of hexadecimal characters
        )(?=\s|$)                               # terminated by a whitespace or line end
        """

    def __init__(self, blocks=1, dense=False, expand=False, narrow=False, width=0):
        super().__init__(blocks=blocks, dense=dense, expand=expand, narrow=narrow, width=width)
        self._hexline_pattern = re.compile(F'{make_hexline_pattern(1)}(?:[\r\n]|$)', flags=re.MULTILINE)

    def process(self, data: bytearray):
        # Decode a textual hex dump: locate the column range that holds the hex bytes in every
        # line, then decode that slice of each line and emit the concatenated result.
        lines = data.decode(self.codec).splitlines(keepends=False)
        if not lines:
            return None
        decoded_bytes = bytearray()
        # One dictionary per line, mapping the start offset of each encoded byte run to its length.
        encoded_byte_matches: List[Dict[int, int]] = []
        for line in lines:
            matches: Dict[int, int] = {}
            encoded_byte_matches.append(matches)
            for match in self._ENCODED_BYTES.finditer(line):
                a, b = match.span()
                matches[a] = b - a
        # Only offsets at which every single line has a match are candidates for the hex column.
        it = iter(encoded_byte_matches)
        offsets = set(next(it).keys())
        for matches in it:
            offsets.intersection_update(matches.keys())
        if not offsets:
            raise ValueError('unable to determine the position of the hex bytes in this dump')
        lengths: Dict[int, List[int]] = {offset: [] for offset in offsets}
        del offsets
        for matches in encoded_byte_matches:
            for offset in lengths:
                lengths[offset].append(matches[offset])
        for offset in lengths:
            lengths[offset].sort()
        # For each candidate offset, take the length at the middle index of its sorted length
        # list and select the offset where this value is largest.
        midpoint = len(encoded_byte_matches) // 2
        offset, length = max(((offset, lengths[offset][midpoint]) for offset in lengths), key=operator.itemgetter(1))
        end = offset + length
        del lengths
        for k, line in enumerate(lines, 1):
            encoded_line = line[offset:end]
            onlyhex = re.search(r'^[\sA-Fa-f0-9]+', encoded_line)
            if not onlyhex:
                self.log_warn(F'ignoring line without hexadecimal data: {line}')
                continue
            if onlyhex.group(0) != encoded_line:
                # Only the last line is allowed to contain fewer hex characters than the others;
                # it is truncated to its leading hexadecimal part.
                if k != len(lines):
                    self.log_warn(F'ignoring line with mismatching hex data length: {line}')
                    continue
                encoded_line = onlyhex.group(0)
            self.log_debug(F'decoding: {encoded_line.strip()}')
            decoded_line = bytes.fromhex(encoded_line)
            decoded_bytes.extend(decoded_line)
            # Anything after the hex columns is treated as the text preview; it is only used for
            # a plausibility warning and never contributes to the output.
            txt = line[end:]
            txt_stripped = re.sub('\\s+', '', txt)
            if not txt_stripped:
                continue
            if len(decoded_line) not in range(len(txt_stripped), len(txt) + 1):
                self.log_warn(F'preview size {len(txt_stripped)} does not match decoding: {line}')
        if decoded_bytes:
            yield decoded_bytes

    def reverse(self, data):
        # Produce a hex dump of the input, one encoded output chunk per dump line. When no width
        # was given, the metrics are fitted to the available width first.
        metrics = self._get_metrics(len(data))
        if not self.args.width:
            metrics.fit_to_width(allow_increase=True)
        for line in self.hexdump(data, metrics):
            yield line.encode(self.codec)
class HKDF (size, salt, hash='SHA512')
-
This unit is implemented in
refinery.units.crypto.keyderive.hkdf
and has the following commandline Interface:usage: HKDF [-h] [-L] [-Q] [-0] [-v] size salt [hash] HKDF Key derivation positional arguments: size The number of bytes to generate. salt Salt for the derivation. hash Specify one of these algorithms (default is SHA512): md2, md4, md5, sha1, sha256, sha512, sha224, sha384 generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class HKDF(KeyDerivation):
    """HKDF Key derivation"""

    def __init__(self, size, salt, hash='SHA512'):
        super().__init__(size=size, salt=salt, hash=hash)

    def process(self, data):
        """
        Derive key material from the input using the configured size, salt and hash.
        """
        from Cryptodome.Protocol import KDF
        args = self.args
        return KDF.HKDF(data, args.size, args.salt, self.hash)
class hmac (salt, hash='SHA1', size=None)
-
This unit is implemented in
refinery.units.crypto.keyderive.hmac
and has the following commandline Interface:usage: hmac [-h] [-L] [-Q] [-0] [-v] salt [hash] [size] HMAC based key derivation positional arguments: salt Salt for the derivation. hash Specify one of these algorithms (default is SHA1): md2, md4, md5, sha1, sha256, sha512, sha224, sha384 size The number of bytes to generate. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class hmac(KeyDerivation):
    """
    HMAC based key derivation
    """
    def __init__(self, salt, hash='SHA1', size=None):
        super().__init__(salt=salt, size=size, hash=hash)

    def process(self, data):
        """
        Return the HMAC digest computed with the input as the key and the salt as the message.
        The size argument is not read here.
        """
        from Cryptodome.Hash import HMAC
        mac = HMAC.new(data, self.args.salt, digestmod=self.hash)
        return mac.digest()
class htmlesc
-
This unit is implemented in
refinery.units.encoding.htmlesc
and has the following commandline Interface:usage: htmlesc [-h] [-L] [-Q] [-0] [-v] [-R] Encodes and decodes HTML entities. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class htmlesc(Unit):
    """
    Encodes and decodes HTML entities.
    """
    @unicoded
    def process(self, data: str) -> str:
        """
        Replace HTML entities in the input by the characters they represent.
        """
        unescaped = html_entities.unescape(data)
        return unescaped

    @unicoded
    def reverse(self, data: str) -> str:
        """
        Escape the input using HTML entities.
        """
        escaped = html_entities.escape(data)
        return escaped
class httprequest
-
This unit is implemented in
refinery.units.formats.httprequest
and has the following commandline Interface:usage: httprequest [-h] [-L] [-Q] [-0] [-v] [-F] Parses HTTP request data, as you would obtain from a packet dump. The unit extracts POST data in any format; each uploaded file is emitted as a separate chunk. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class httprequest(Unit):
    """
    Parses HTTP request data, as you would obtain from a packet dump. The unit extracts
    POST data in any format; each uploaded file is emitted as a separate chunk.
    """
    def process(self, data):
        """
        Split the request into head and body and emit the payload according to the detected
        format: the raw body, the parts of a multipart message, or one labelled chunk for each
        URL-encoded parameter value.
        """
        def header(line: bytes):
            # Yields a single (lowercase name, value) pair, or nothing if the line has no colon.
            name, colon, value = line.decode('utf8').partition(':')
            if colon:
                yield (name.strip().lower(), value.strip())

        head, _, body = data.partition(b'\r\n\r\n')
        request, *headers = head.splitlines(False)
        headers = dict(t for line in headers for t in header(line))
        method, path, _, *rest = request.split()
        mode = _Fmt.RawBody

        if rest:
            self.log_warn('unexpected rest data while parsing HTTP request:', rest)

        if method == b'GET' and not body:
            mode = _Fmt.UrlEncode
            # partition returns (head, separator, tail): the query string is the tail at index 2.
            # Index 1 is the separator itself and would always yield an empty parameter set.
            body = path.partition(B'?')[2]

        if method == b'POST' and (ct := headers.get('content-type', None)):
            ct, _ = _parse_header(ct)
            # NOTE(review): _Fmt is looked up by content type here; presumably an unknown
            # content type raises - confirm against the definition of _Fmt.
            mode = _Fmt(ct)

        def chunks(upload: Dict[Union[str, bytes], List[bytes]]):
            # Emits every parameter value as a chunk labelled with the parameter name.
            for key, values in upload.items():
                if not isinstance(key, str):
                    key = key.decode('utf8')
                for value in values:
                    yield self.labelled(value, name=key)

        if mode is _Fmt.RawBody:
            yield body
            return

        if mode is _Fmt.Multipart:
            # The request line is cut off; the remainder is parsed as an email-style message.
            _, _, message_data = data.partition(b'\n')
            msg = BytesParser().parsebytes(message_data)
            for part in msg.walk():
                payloads = part.get_payload(decode=True)
                if not isinstance(payloads, list):
                    payloads = [payloads]
                for payload in payloads:
                    if not isbuffer(payload):
                        continue
                    if name := part.get_filename():
                        payload = self.labelled(payload, name=name)
                    yield payload

        if mode is _Fmt.UrlEncode:
            yield from chunks(parse_qs(body, keep_blank_values=1))

    @classmethod
    def handles(self, data: bytearray) -> bool | None:
        """
        Return whether the input starts with a POST or GET request line prefix.
        """
        return data.startswith((B'POST ', B'GET '))
class httpresponse
-
This unit is implemented in
refinery.units.formats.httpresponse
and has the following commandline Interface:usage: httpresponse [-h] [-L] [-Q] [-0] [-v] Parses HTTP response text, as you would obtain from a packet dump. This can be useful if chunked or compressed transfer encoding was used. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class httpresponse(Unit):
    """
    Parses HTTP response text, as you would obtain from a packet dump. This can be
    useful if chunked or compressed transfer encoding was used.
    """
    def process(self, data):
        """
        Feed the input to an HTTPResponse parser and return the response body. An incomplete
        read is converted into a RefineryPartialResult that carries the partial data.
        """
        with SockWrapper(data) as sock:
            sock.seek(0)
            response = HTTPResponse(sock)
            response.begin()
            try:
                body = response.read()
            except IncompleteRead as error:
                partial = error.partial
                msg = F'incomplete read: {len(partial)} bytes processed, {error.expected} more expected'
                raise RefineryPartialResult(msg, partial) from error
            return body
class iemap (legend=False, background=False, block_char='#', *label)
-
This unit is implemented in
refinery.units.sinks.iemap
and has the following commandline Interface:usage: iemap [-h] [-L] [-Q] [-0] [-v] [-l] [-b] [-c C] [label-part [label-part ...]] The information entropy map displays a colored bar on the terminal visualizing the file's local entropy from beginning to end. positional arguments: label-part The remaining command line specifies a format string expression that will be printed over the heat map display of each processed chunk. optional arguments: -l, --legend Show entropy color legend. -b, --background Generate the bar by coloring the background. -c, --block-char C Character used for filling the bar, default is # generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class iemap(Unit):
    """
    The information entropy map displays a colored bar on the terminal visualizing the file's local
    entropy from beginning to end.
    """
    def __init__(
        self,
        legend: Unit.Arg.Switch('-l', help='Show entropy color legend.') = False,
        background: Unit.Arg.Switch('-b', help='Generate the bar by coloring the background.') = False,
        block_char: Unit.Arg('-c', '--block-char', type=str, metavar='C',
            help='Character used for filling the bar, default is {default}') = '#',
        *label: Unit.Arg(type=str, metavar='label-part', help=(
            'The remaining command line specifies a format string expression that will be printed '
            'over the heat map display of each processed chunk.'
        ))
    ):
        # All label parts are joined with spaces into one format string expression.
        super().__init__(label=' '.join(label), background=background, legend=legend, block_char=block_char)

    # NOTE(review): defined without a self parameter and read as an attribute in process;
    # presumably Unit.Requires turns this into a lazily imported dependency - confirm.
    @Unit.Requires('colorama', 'display', 'default', 'extended')
    def _colorama():
        import colorama
        return colorama

    def process(self, data):
        # Draws the entropy bar for one chunk to stderr. The chunk itself is only passed on when
        # the unit is not attached to a terminal (see the end of this method).
        from sys import stderr
        from os import name as os_name
        colorama = self._colorama
        colorama.init(autoreset=False, convert=(os_name == 'nt'))

        nobg = not self.args.background
        meta = metavars(data)

        # A non-empty label is padded with one space on either side.
        label = meta.format_str(self.args.label, self.codec, [data])
        if label:
            if not label.endswith(' '):
                label = F'{label} '
            if not label.startswith(' '):
                label = F' {label}'

        # Color tables for the two rendering modes; the entropy value selects the index.
        bgmap = [
            colorama.Back.BLACK,
            colorama.Back.WHITE,
            colorama.Back.YELLOW,
            colorama.Back.CYAN,
            colorama.Back.BLUE,
            colorama.Back.GREEN,
            colorama.Back.LIGHTRED_EX,
            colorama.Back.MAGENTA,
        ]
        fgmap = [
            colorama.Fore.LIGHTBLACK_EX,
            colorama.Fore.LIGHTWHITE_EX,
            colorama.Fore.LIGHTYELLOW_EX,
            colorama.Fore.LIGHTCYAN_EX,
            colorama.Fore.LIGHTBLUE_EX,
            colorama.Fore.LIGHTGREEN_EX,
            colorama.Fore.LIGHTRED_EX,
            colorama.Fore.LIGHTMAGENTA_EX,
        ]

        _reset = colorama.Back.BLACK + colorama.Fore.WHITE + colorama.Style.RESET_ALL

        clrmap = fgmap if nobg else bgmap
        header = '['
        header_length = 1
        # Number of characters in the footer '] [---.--%]' that is written after the bar.
        footer_length = 4 + 7

        if self.args.legend:
            # The legend shows one numbered cell per color, enclosed in brackets and followed by
            # a space, which accounts for the 3 extra characters.
            header = '[{1}{0}] {2}'.format(_reset, ''.join(F'{bg}{k}' for k, bg in enumerate(clrmap, 1)), header)
            header_length += 3 + len(clrmap)

        _tw = get_terminal_size()
        width = _tw - header_length - footer_length
        if width < 16:
            raise RuntimeError(F'computed terminal width {_tw} is too small for heatmap')

        def entropy_select(value, map):
            # Scales the value to an index into the map, clamped to the last entry.
            index = min(len(map) - 1, math.floor(value * len(map)))
            return map[index]

        view = memoryview(data)
        size = len(data)
        chunk_size = 0

        # Widen the blocks until each one covers more than 1024 bytes of input; if that never
        # happens, the loop ends with a single block that spans the entire bar.
        for block_size in range(1, width + 1):
            block_count = width // block_size
            chunk_size = size // block_count
            if chunk_size > 1024:
                break

        q, remainder = divmod(width, block_size)
        assert q == block_count
        indices = list(range(q))
        # The shuffle is seeded from the data, so the same input always produces the same
        # distribution of leftover columns and bytes. Note that this reseeds the random module.
        random.seed(sum(view[:1024]))
        random.shuffle(indices)

        # Distribute the columns that remain after integer division over the blocks.
        block_sizes = [block_size] * q
        q, r = divmod(remainder, block_count)
        for i in indices:
            block_sizes[i] += q
        for i in indices[:r]:
            block_sizes[i] += 1
        assert sum(block_sizes) == width

        # Likewise distribute the remaining input bytes over the chunks.
        q, remainder = divmod(size, block_count)
        assert q == chunk_size
        chunk_sizes = [chunk_size] * block_count
        for i in indices[:remainder]:
            chunk_sizes[i] += 1
        assert sum(chunk_sizes) == size

        stream = MemoryFile(view)
        filler = self.args.block_char if nobg else ' '

        try:
            stderr.write(header)
            if label is not None:
                stderr.write(colorama.Fore.WHITE)
            stderr.flush()
            # Characters of the bar: three fillers, then the label, then fillers indefinitely.
            it = itertools.chain(itertools.repeat(filler, 3), label, itertools.cycle(filler))
            cp = None
            for chunk_size, block_size in zip(chunk_sizes, block_sizes):
                chunk = stream.read(chunk_size)
                chunk_entropy = entropy(chunk)
                pp = entropy_select(chunk_entropy, clrmap)
                string = ''.join(itertools.islice(it, block_size))
                # A color code is only written when the color changes between blocks.
                if pp != cp:
                    string = F'{pp}{string}'
                    cp = pp
                stderr.write(string)
                stderr.flush()
        except BaseException:
            # Wipe the partially drawn bar before the exception propagates.
            eraser = ' ' * width
            stderr.write(F'\r{_reset}{eraser}\r')
            raise
        else:
            # Write a placeholder footer first, then step back over it and overwrite it with the
            # entropy value obtained from the chunk's meta variables.
            stderr.write(F'{_reset}] [---.--%]')
            te = meta['entropy']
            stderr.write('\b' * footer_length)
            stderr.write(F'] [{te!r:>7}]\n')
            stderr.flush()
        if not self.isatty:
            yield data
class iff (*expression, ge=None, gt=None, le=None, lt=None, ct=None, ne=None, iN=None, eq=None, retain=False)
-
This unit is implemented in
refinery.units.meta.iff
and has the following commandline Interface:usage: iff [-h] [-L] [-Q] [-0] [-v] [-R] [-ge RHS | -gt RHS | -le RHS | -lt RHS | -ct RHS | -ne RHS | -in RHS | -eq RHS] [-r] [token [token ...]] Filter incoming chunks depending on whether a given Python expression evaluates to true. If no expression is given, the unit filters out empty chunks. Note: The reverse operation of a conditional unit uses the logical negation of its condition. positional arguments: token All "token" arguments to this unit are joined with spaces to produce the expression to be evaluated. This is done so that unnecessary shell quoting is avoided. optional arguments: -ge RHS check that the expression is greater or equal to RHS -gt RHS check that the expression is greater than RHS -le RHS check that the expression is less or equal to RHS -lt RHS check that the expression is less than RHS -ct RHS check that the expression contains RHS -ne RHS check that the expression is not equal to RHS -in, -i RHS check that the expression is contained in RHS -eq, -e RHS check that the expression is equal to RHS -r, --retain Move non-matching chunks out of scope rather than discarding them. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class iff(ConditionalUnit, extend_docs=True):
    """
    Filter incoming chunks depending on whether a given Python expression evaluates to true. If no
    expression is given, the unit filters out empty chunks.
    """
    def __init__(
        self,
        *expression: Arg(metavar='token', type=str, help=(
            'All "token" arguments to this unit are joined with spaces to produce the expression '
            'to be evaluated. This is done so that unnecessary shell quoting is avoided.')),
        ge: Arg('-ge', type=str, metavar='RHS', group='OP',
            help='check that the expression is greater or equal to {varname}') = None,
        gt: Arg('-gt', type=str, metavar='RHS', group='OP',
            help='check that the expression is greater than {varname}') = None,
        le: Arg('-le', type=str, metavar='RHS', group='OP',
            help='check that the expression is less or equal to {varname}') = None,
        lt: Arg('-lt', type=str, metavar='RHS', group='OP',
            help='check that the expression is less than {varname}') = None,
        ct: Arg('-ct', type=str, metavar='RHS', group='OP',
            help='check that the expression contains {varname}') = None,
        ne: Arg('-ne', type=str, metavar='RHS', group='OP',
            help='check that the expression is not equal to {varname}') = None,
        iN: Arg('-in', '-i', type=str, metavar='RHS', group='OP',
            help='check that the expression is contained in {varname}') = None,
        eq: Arg('-eq', '-e', type=str, metavar='RHS', group='OP',
            help='check that the expression is equal to {varname}') = None,
        retain=False,
    ):
        def encodings(v: str):
            # Yields the given string encoded with the unit codec, latin1, and utf-16le; yields
            # nothing for values that are not strings.
            if not isinstance(v, str):
                return
            for codec in [self.codec, 'latin1', 'utf-16le']:
                yield v.encode(codec)

        def __br_contains__(container, value):
            # Containment test that additionally compares encoded variants of strings.
            # NOTE(review): the plain membership test raises a TypeError for a str value and a
            # bytes-like container, so the encoded comparison is not reached in that case -
            # confirm whether this is intended.
            if value in container:
                return True
            if isinstance(value, str):
                return any(b in container for b in encodings(value))
            else:
                return any(value == b for v in container for b in encodings(v))

        operators = [
            (ge, operator.__ge__),
            (gt, operator.__gt__),
            (le, operator.__le__),
            (lt, operator.__lt__),
            (eq, operator.__eq__),
            (ne, operator.__ne__),
            (ct, __br_contains__),
            (iN, lambda a, b: __br_contains__(b, a)),
        ]

        # Keep only those operators for which a right hand side was actually specified.
        operators = [
            (rhs, cmp)
            for (rhs, cmp) in operators
            if rhs is not None
        ]

        rhs, cmp, lhs = None, None, '\x20'.join(expression) or None

        if len(operators) > 0:
            if not lhs:
                raise ValueError('Comparison operator with empty left hand side.')
            if len(operators) > 1:
                raise ValueError('Only one comparison operation can be specified.')
            rhs, cmp = operators[0]

        super().__init__(
            lhs=lhs,
            rhs=rhs,
            cmp=cmp,
            retain=retain,
        )

    def match(self, chunk):
        """
        Evaluate the condition for one chunk. Without an expression, the result is the truth
        value of the chunk; without a right hand side, it is the truth value of the evaluated
        expression; otherwise it is the result of the selected comparison.
        """
        meta = metavars(chunk)
        lhs: Optional[str] = self.args.lhs
        rhs: Optional[Any] = self.args.rhs
        cmp: Optional[Callable[[Any, Any], bool]] = self.args.cmp

        if cmp is None and rhs is not None:
            raise ValueError('right hand side defined but no operator')

        if lhs is not None:
            # With a comparison, both sides are parsed the same way; a lone expression is
            # evaluated as a Python expression over the chunk's meta variables instead.
            if rhs is not None:
                lhs = DelayedNumSeqArgument(lhs, additional_types=(float, str))(chunk)
            else:
                lhs = PythonExpression.Evaluate(lhs, meta)

        rhs = rhs and DelayedNumSeqArgument(rhs, additional_types=(float, str))(chunk)

        self.log_info(F'lhs: type={lhs.__class__.__name__}; value={lhs!r}')
        self.log_info(F'rhs: type={rhs.__class__.__name__}; value={rhs!r}')

        if lhs is None:
            return bool(chunk)
        if rhs is None:
            return bool(lhs)

        return cmp(lhs, rhs)
class iffp (*patterns, partial=False, retain=False)
-
This unit is implemented in
refinery.units.meta.iffp
and has the following commandline Interface:usage: iffp [-h] [-L] [-Q] [-0] [-v] [-R] [-p] [-r] [pattern [pattern ...]] Filter incoming chunks depending on whether it matches any of a given set of patterns. The available patterns are the following: integer, float, number, string, multiline_string, cmdstr, ps1str, vbastr, vbaint, printable, urlquote, urlquote_coarse, urlquote_narrow, intarray, numarray, word, letters, wshenc, alphanumeric, b32, b64, b85, b92, b64any, b64url, hex, uppercase_hex, spaced_hex, spaced_b64, spaced_b85, utf8, hexdump, hexarray, uuencode, domain, email, guid, ipv4, ipv6, md5, sha1, sha256, hostname, socket, subdomain, url, btc, pem, xmr, path, winpath, nixpath, environment_variable. Note: The reverse operation of a conditional unit uses the logical negation of its condition. positional arguments: pattern optional arguments: -p, --partial Allow partial matches on the data. -r, --retain Move non-matching chunks out of scope rather than discarding them. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class iffp(ConditionalUnit, extend_docs=True):
    """
    Filter incoming chunks depending on whether it matches any of a given set of patterns. The
    available patterns are the following: {}.
    """
    def __init__(
        self,
        *patterns: Arg.Choice(metavar='pattern', choices=_PATTERNS),
        partial: Arg.Switch('-p', help='Allow partial matches on the data.') = False,
        retain=False
    ):
        super().__init__(retain=retain, patterns=patterns, partial=partial)

    def match(self, chunk):
        """
        Return True as soon as one of the selected patterns matches the chunk. In partial mode
        the pattern's match method is used, otherwise fullmatch.
        """
        def accepted(name):
            expression = _PATTERNS[name]
            test = expression.match if self.args.partial else expression.fullmatch
            return bool(test(chunk))

        return any(map(accepted, self.args.patterns))
class iffs (needle, retain=False)
-
This unit is implemented in
refinery.units.meta.iffs
and has the following commandline Interface:usage: iffs [-h] [-L] [-Q] [-0] [-v] [-R] [-r] needle Filter incoming chunks depending on whether they contain a given binary substring. Note: The reverse operation of a conditional unit uses the logical negation of its condition. positional arguments: needle the string to search for optional arguments: -r, --retain Move non-matching chunks out of scope rather than discarding them. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class iffs(ConditionalUnit, extend_docs=True):
    """
    Filter incoming chunks depending on whether they contain a given binary substring.
    """
    def __init__(self, needle: Arg(help='the string to search for'), retain=False):
        super().__init__(needle=needle, retain=retain)

    def match(self, chunk):
        """
        Return whether the configured needle occurs in the chunk.
        """
        wanted = self.args.needle
        return wanted in chunk
class iffx (regex, count=0, fullmatch=False, multiline=False, ignorecase=False)
-
This unit is implemented in
refinery.units.meta.iffx
and has the following commandline Interface:usage: iffx [-h] [-L] [-Q] [-0] [-v] [-R] [-c N] [-U] [-M] [-I] regex Filter incoming chunks by discarding those that do not match the given regular expression. Note: The reverse operation of a conditional unit uses the logical negation of its condition. positional arguments: regex Regular expression to match. optional arguments: -c, --count N Specify the maximum number of operations to perform. -U, --fullmatch Regular expressions are matched against the full input, not substrings of it. -M, --multiline Caret and dollar in regular expressions match the beginning and end of a line and a dot does not match line breaks. -I, --ignorecase Ignore capitalization for alphabetic characters in regular expressions. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class iffx(SingleRegexUnit, ConditionalUnit, extend_docs=True):
    """
    Filter incoming chunks by discarding those that do not match the given regular expression.
    """
    def match(self, chunk):
        """
        Return True if the unit's matcher produces a truthy result for the chunk.
        """
        outcome = self.matcher(chunk)
        return bool(outcome)
class ifps
-
This unit is implemented in
refinery.units.formats.ifps
and has the following commandline Interface:usage: ifps [-h] [-L] [-Q] [-0] [-v] [-F] Disassembles compiled Pascal script files that start with the magic sequence "IFPS". These scripts can be found, for example, when unpacking InnoSetup installers using innounp. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class ifps(Unit):
    """
    Disassembles compiled Pascal script files that start with the magic sequence "IFPS". These scripts can
    be found, for example, when unpacking InnoSetup installers using innounp.
    """
    def process(self, data):
        """
        Parse the input as an IFPS file and return its string representation, encoded with the unit codec.
        """
        script = IFPSFile(data)
        listing = str(script)
        return listing.encode(self.codec)

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        """
        Return whether the data begins with the IFPS magic sequence.
        """
        magic = IFPSFile.Magic
        return data.startswith(magic)
class ifpsstr
-
This unit is implemented in
refinery.units.formats.ifpsstr
and has the following commandline Interface:usage: ifpsstr [-h] [-L] [-Q] [-0] [-v] [-F] Extracts strings from compiled Pascal script files that start with the magic sequence "IFPS". These scripts can be found, for example, when unpacking InnoSetup installers using innounp. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class ifpsstr(Unit):
    """
    Extracts strings from compiled Pascal script files that start with the magic sequence "IFPS". These
    scripts can be found, for example, when unpacking InnoSetup installers using innounp.
    """
    def process(self, data):
        """
        Parse the input as an IFPS file and emit each of its strings as one encoded output chunk.
        """
        parsed = IFPSFile(data)
        yield from (text.encode(self.codec) for text in parsed.strings)

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        """
        Return whether the data begins with the IFPS magic sequence.
        """
        magic = IFPSFile.Magic
        return data.startswith(magic)
class imphash (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.imphash
and has the following commandline Interface:usage: imphash [-h] [-L] [-Q] [-0] [-v] [-t] Implements the import hash for PE files. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class imphash(HashUnit):
    """
    Implements the import hash for PE files.
    """
    def _algorithm(self, data):
        """
        Compute the import hash of the given PE data and return it as raw bytes.

        Raises ValueError when the parser yields no (or an empty) import hash.
        """
        # Only the import directory is parsed after the fast load.
        parsed = PE(data=data, fast_load=True)
        parsed.parse_data_directories(directories=[IMAGE_DIRECTORY_ENTRY_IMPORT])
        digest = parsed.get_imphash()
        if digest:
            return bytes.fromhex(digest)
        raise ValueError('no import directory.')
class isaac (key, discard=0, stateful=False)
-
This unit is implemented in
refinery.units.crypto.cipher.isaac
and has the following commandline Interface:usage: isaac [-h] [-L] [-Q] [-0] [-v] [-R] [-d N] [-s] key The ISAAC (Indirection, Shift, Accumulate, Add, Count) cipher. positional arguments: key The encryption key. optional arguments: -d, --discard N Discard the first N bytes of the keystream, 0 by default. -s, --stateful Do not reset the key stream while processing the chunks of one frame. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class isaac(StreamCipherUnit):
    """
    The ISAAC (Indirection, Shift, Accumulate, Add, Count) cipher.
    """
    def keystream(self) -> Iterable[int]:
        # Generator producing the key stream; the final `while True` loop never terminates
        # on its own, so the consumer decides how many values to draw.
        key = self.args.key
        # Three running 32-bit values of the generator; presumably the accumulator, the
        # previous result and the counter of ISAAC - confirm against the reference.
        A: int = 0
        B: int = 0
        C: int = 0
        # Eight-word mixing state, every word seeded with the same constant.
        S: List[int] = [0x9E3779B9] * 8
        # Internal state table; populated (0x100 words) by _initialize_with below.
        T: List[int] = []
        # The key is zero-padded to 0x400 bytes and unpacked into 4-byte integers. A key
        # longer than 0x400 bytes makes bytearray() raise ValueError (negative size).
        K = list(chunks.unpack(key + bytearray(0x400 - len(key)), 4, bigendian=False))
        # Mask used to truncate every intermediate result to 32 bits.
        U = 0xFFFFFFFF

        def _mix_state():
            # Scramble the eight words of S in place and return S itself.
            a, b, c, d, e, f, g, h = S
            a ^= (b << 0x0B) & U; d = d + a & U; b = b + c & U # noqa
            b ^= (c >> 0x02) & U; e = e + b & U; c = c + d & U # noqa
            c ^= (d << 0x08) & U; f = f + c & U; d = d + e & U # noqa
            d ^= (e >> 0x10) & U; g = g + d & U; e = e + f & U # noqa
            e ^= (f << 0x0A) & U; h = h + e & U; f = f + g & U # noqa
            f ^= (g >> 0x04) & U; a = a + f & U; g = g + h & U # noqa
            g ^= (h << 0x08) & U; b = b + g & U; h = h + a & U # noqa
            h ^= (a >> 0x09) & U; c = c + h & U; a = a + b & U # noqa
            S[:] = a, b, c, d, e, f, g, h
            return S

        def _initialize_with(R: List[int]):
            # For each group of eight words: add the corresponding words of R to S, mix,
            # and store the mixed state as the next eight entries of T.
            for i in range(0, 0x100, 8):
                # The generator is fully consumed before the slice assignment replaces S.
                S[:] = (x + R[j] & U for j, x in enumerate(S, i))
                T[i:i + 8] = _mix_state()

        # Four warm-up rounds of mixing before any key material is absorbed.
        for _ in range(4):
            _mix_state()

        # First pass absorbs the key, second pass absorbs the table produced by the first.
        _initialize_with(K)
        _initialize_with(T)

        # Shift function and shift amount applied to A, cycling with period four.
        operations = [
            (__lshift__, 0x0D),
            (__rshift__, 0x06),
            (__lshift__, 0x02),
            (__rshift__, 0x10),
        ]

        while True:
            C = (C + 1) & U
            B = (B + C) & U
            for i in range(0x100):
                X = T[i]
                shift, k = operations[i % 4]
                A = (A ^ shift(A, k)) & U
                A = (A + T[i ^ 0x80]) & U
                Y = T[+i] = T[X // 4 & 0xFF] + A + B & U
                # K is reused as the result buffer and is filled from its end (K[~i]).
                B = K[~i] = X + T[Y // 1024 & 0xFF] & U
            # Emit the 0x100 result words as bytes, 4 bytes per word.
            # NOTE(review): the third positional argument presumably selects big-endian
            # packing - confirm against chunks.pack.
            yield from chunks.pack(K, 4, True)
class jcalg (ignore_header=False)
-
This unit is implemented in
refinery.units.compression.jcalg
and has the following commandline Interface:usage: jcalg [-h] [-L] [-Q] [-0] [-v] [-F] [-g] JCALG decompression. optional arguments: -g, --ignore-header Keep decompressing even after the output has reached the final size as given by the header value. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class jcalg(Unit):
    """
    JCALG decompression.
    """
    def __init__(
        self,
        ignore_header: Unit.Arg('-g', help=(
            'Keep decompressing even after the output has reached the final size as given by the header value.')) = False,
    ):
        super().__init__(ignore_header=ignore_header)

    def process(self, data: bytearray):
        """
        Decompress a JCALG stream.

        When the data starts with the magic sequence `JC`, two 32-bit header values (size and
        checksum) are read and used to bound and verify the output. Otherwise the header is
        assumed to be missing and the stream is decompressed from offset 0 without any bound.
        """
        with MemoryFile() as output, StructReader(data) as reader:
            if reader.read(2) != B'JC':
                self.log_warn('data does not begin with magic sequence, assuming that header is missing')
                reader.seek(0)
                size = checksum = None
            else:
                size = reader.u32()
                checksum = reader.u32()
            if self.args.ignore_header:
                # The checksum from the header is still verified, only the size bound is dropped.
                size = None
            self._decompress(output, reader, size)
            if size is not None:
                if len(output) > size:
                    self.log_info(F'truncating to size {size}')
                    output.truncate(size)
                elif len(output) < size:
                    # Report the size of the decompressed output, not of the compressed input.
                    self.log_warn(F'header size was {size}, but only {len(output)} bytes were decompressed')
            data = output.getvalue()
            if checksum:
                c = self._checksum(data)
                if c != checksum:
                    self.log_warn(F'header checksum was {checksum:08X}, computed value is {c:08X}')
            return data

    @classmethod
    def handles(cls, data: bytearray):
        """
        Return True when the data starts with the `JC` magic; otherwise no verdict (None) is given.
        """
        if data[:2] == B'JC':
            return True

    def _checksum(self, data):
        """
        Compute the 32-bit checksum over the decompressed data, processed in 4-byte chunks.
        """
        from refinery.lib import chunks
        checksum = 0
        it = chunks.unpack(data, 4)
        if len(data) % 4:
            import itertools
            # For lengths that are not a multiple of four, one more value built from the
            # last four bytes of the data is appended to the sequence.
            it = itertools.chain(it, (int.from_bytes(data[-4:], 'little'),))
        for chunk in it:
            checksum += chunk
            checksum ^= ((chunk & 0x7FFFFFFF) << 1) + (chunk >> 31) + 1
            checksum &= 0xFFFFFFFF
        return checksum

    def _decompress(self, writer: MemoryFile, reader_: StructReader[bytearray], size: Optional[int] = None):
        """
        Decode the bit stream from `reader_` into `writer`. Decoding stops at the end marker
        or, when `size` is given and nonzero, once at least `size` bytes have been written.
        """
        index = 1
        base = 8
        # Both are set by a dedicated command in the stream before the first literal is decoded.
        literal_bits = None
        literal_offset = None
        flags = BitBufferedReader(reader_, 32)

        while True:
            if size and len(writer) >= size:
                break
            if flags.next():
                # Literal byte, encoded with the current bit width and offset.
                b = flags.read(literal_bits) + literal_offset
                b = b & 0xFF
                writer.write_byte(b)
                continue
            if flags.next():
                # Long match; a high value of 2 reuses the previous index.
                high = flags.variable_length_integer()
                if high == 2:
                    match_length = flags.variable_length_integer()
                else:
                    index = ((high - 3) << base) + flags.read(base)
                    match_length = flags.variable_length_integer()
                    # The length is adjusted depending on the magnitude of the index.
                    if index >= 0x10000:
                        match_length += 3
                    elif index >= 0x37FF:
                        match_length += 2
                    elif index >= 0x27F:
                        match_length += 1
                    elif index <= 127:
                        match_length += 4
                writer.replay(index, match_length)
                continue
            if not flags.next():
                # Short match with a 7-bit index and a 2-bit length.
                new_index = flags.read(7)
                match_length = 2 + flags.read(2)
                if new_index == 0:
                    # Index zero is a control code: either end of stream or a new index base.
                    if match_length == 2:
                        break
                    base = flags.read(match_length + 1)
                else:
                    index = new_index
                    writer.replay(index, match_length)
                continue
            one_byte_phrase_value = flags.read(4) - 1
            if one_byte_phrase_value == 0:
                writer.write_byte(0)
            elif one_byte_phrase_value > 0:
                # Repeat a single byte from a short distance back in the output.
                b = writer.getbuffer()[-one_byte_phrase_value]
                writer.write_byte(b)
            else:
                if not flags.next():
                    # Reconfigure how subsequent literals are encoded.
                    literal_bits = 7 + flags.next()
                    literal_offset = 0
                    if literal_bits != 8:
                        literal_offset = flags.read(8)
                    continue
                # Runs of 0x100 raw bytes, each run followed by a continuation flag.
                while True:
                    for _ in range(0x100):
                        b = flags.read(8)
                        writer.write_byte(b)
                    if not flags.next():
                        break
class jvdasm (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
This unit is implemented in
refinery.units.formats.java.jvdasm
and has the following commandline Interface:usage: jvdasm [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]] Disassembles the JVM bytecode instructions of methods of classes defined in Java class files. The unit is implemented as a path extractor and each path name corresponds to the name of one method defined in the class file. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class jvdasm(PathExtractorUnit):
    """
    Disassembles the JVM bytecode instructions of methods of classes defined in Java class files. The unit
    is implemented as a path extractor and each path name corresponds to the name of one method defined in
    the class file.
    """
    # Width of the longest opcode name; used to align the opcode column of the listing.
    _OPC_STRLEN = max(len(op.name) for op in opc)

    def _hex(self, bytestring, sep=''):
        # Render each byte as two lowercase hex digits, joined by the given separator.
        return sep.join(F'{x:02x}' for x in bytestring)

    def unpack(self, data):
        # Yields one UnpackResult per method that has a 'Code' attribute; the result is
        # named after the method with the extension '.jd'.
        jc = JvClassFile(data)
        # Indentation prefix used for every instruction line of the listing.
        tt = ' '
        opcw = self._OPC_STRLEN
        for method in jc.methods:
            # Find the 'Code' attribute; the for/else skips methods that have none.
            for attribute in method.attributes:
                if attribute.name == 'Code':
                    break
            else:
                self.log_warn(F'no code found for method: {method.name}')
                continue
            code: JvCode = attribute.parse(JvCode)
            with io.StringIO() as display:
                # NOTE(review): the values parsed here are not used afterwards (args is
                # reassigned per instruction below); a descriptor that does not match the
                # expression would make re.match return None and raise AttributeError.
                args, retval = re.match(R'^\((.*?)\)(.*?)$', method.descriptor).groups()
                print(F'{jc.this!s}::{method!s}{method.descriptor}', file=display)
                for op in code.disassembly:
                    olen = len(op.raw)
                    if op.table is None:
                        args = ', '.join(repr(a) for a in op.arguments)
                    else:
                        # Switch instruction: entries are 4 bytes wide for tableswitch and
                        # 8 bytes wide otherwise. olen is reduced to the bytes preceding
                        # the table entries (the table also holds one None default key).
                        ow = 4 if op.code is opc.tableswitch else 8
                        olen = olen - (len(op.table) - 1) * ow
                        args = F'defaultjmp => {op.table[None]:#010x}'
                        jmps = []
                        for k, (key, jmp) in enumerate(op.table.items()):
                            if key is None:
                                continue
                            raw = self._hex(op.raw[olen + k * ow: olen + k * ow + ow], ' ')
                            jmps.append(F'{tt}{raw!s:<{opcw + 15}} {key:#010x} => {jmp:#010x}')
                        args = '\n'.join((args, *jmps))
                    opch = self._hex(op.raw[:olen], ' ')
                    if len(opch) > 14:
                        # Hex dump too wide for its column: continue on a fresh padded line.
                        opch += F'\n{tt}{tt:<15}'
                    print(F'{tt}{opch:<15}{op.code!r:<{opcw}} {args}', file=display)
                name = method.name
                if name.startswith('<'):
                    # Names in angle brackets are rewritten as '<class>$<name>' using the
                    # last component of the class path.
                    this = jc.this.value.split('/')
                    this = this[-1]
                    name = F'{this}${name[1:-1]}'
                yield UnpackResult(F'{name}.jd', display.getvalue().encode(self.codec))
class jvstr
-
This unit is implemented in
refinery.units.formats.java.jvstr
and has the following commandline Interface:usage: jvstr [-h] [-L] [-Q] [-0] [-v] Extract string constants from Java class files. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class jvstr(Unit):
    """
    Extract string constants from Java class files.
    """
    def process(self, data):
        """
        Parse the input as a Java class file and emit each of its strings as one encoded chunk.
        """
        class_file = JvClassFile(data)
        yield from (text.encode(self.codec) for text in class_file.strings)
class kblob
-
This unit is implemented in
refinery.units.crypto.keyderive.kblob
and has the following commandline Interface:usage: kblob [-h] [-L] [-Q] [-0] [-v] Extracts a key from a Microsoft Crypto API BLOB structure. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class kblob(Unit):
    """
    Extracts a key from a Microsoft Crypto API BLOB structure.
    """
    def process(self, data):
        """
        Parse the blob and return its key bytes, labelled with the blob type and algorithm names.

        Raises ValueError when the parsed blob lacks one of the required attributes.
        """
        blob = CRYPTOKEY(data)
        try:
            key = bytes(blob.key)
            header = blob.header
            labels = dict(type=header.type.name, algorithm=header.algorithm.name)
            return self.labelled(key, **labels)
        except AttributeError as A:
            raise ValueError(F'unable to derive key from {blob.header.type!s}') from A
class keccak256 (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.cryptographic
and has the following commandline Interface:usage: keccak256 [-h] [-L] [-Q] [-0] [-v] [-t] Returns the KECCAK256 hash of the input data. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
class kramer
-
This unit is implemented in
refinery.units.malware.kramer
and has the following commandline Interface:usage: kramer [-h] [-L] [-Q] [-0] [-v] Deobfuscate Python samples obfuscated with Kramer. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class kramer(Unit):
    """
    Deobfuscate Python samples obfuscated with Kramer.
    """

    # Code point that decrypt() maps to a line break (0x0A) when it is below the key.
    _LINEBREAK_MAGIC = 950

    def process(self, data):
        # Longest string constant found in the code; treated as the encoded payload.
        kramer = str()
        # All integer constants found in the code; these are the key candidates.
        secret = set()
        _pyver = None

        def crawl(code: CodeType, depth=1):
            # Walk the instructions of a code object, collecting string and integer
            # arguments, and recurse into every other kind of argument.
            nonlocal kramer
            nonlocal secret
            for instruction in disassemble_code(code, _pyver):
                arg = instruction.argval
                if arg is None:
                    continue
                if isinstance(arg, tuple):
                    continue
                if isinstance(arg, str):
                    if len(arg) > len(kramer):
                        kramer = arg
                    continue
                if isinstance(arg, int):
                    secret.add(arg)
                    continue
                try:
                    crawl(arg, depth + 1)
                except Exception as E:
                    # Best effort: arguments that cannot be crawled are only logged.
                    self.log_info(F'error crawling arg of type {type(arg).__name__} at depth {depth}: {E}')

        for code in extract_code_from_buffer(bytes(data)):
            _pyver = code.version
            crawl(code.container)

        if not kramer:
            raise ValueError('could not find the encoded string')

        # The payload consists of hex strings joined by a non-hex separator.
        separator = re.search('[^a-fA-F0-9]+', kramer)

        if not separator:
            raise ValueError('no separator detected; encoding method may have changed')

        def rotchar(c: int):
            # Advance lowercase letters and digits by one; 'z' wraps to '0' and '9' wraps
            # to 'a'. Every other value is returned unchanged.
            if c in range(0x61, 0x7a) or c in range(0x30, 0x39):
                return c + 1
            if c == 0x7a:
                return 0x30
            if c == 0x39:
                return 0x61
            return c

        def decrypt(c: int, k: int):
            # Decrypt a single code point with key k; raises _WrongKey if the result is
            # not a byte value or if c is below the key and is not the line break marker.
            if c >= k:
                out = rotchar(c - k)
                if out not in range(0x100):
                    raise _WrongKey
                return out
            if c == self._LINEBREAK_MAGIC:
                return 0x0A
            raise _WrongKey

        def decrypt_with_key(key: int):
            # A key is only accepted if the output consists entirely of whitespace and
            # characters in the range '!' to '~'.
            decrypted = bytearray(decrypt(c, key) for c in encrypted)
            if not re.fullmatch(B'[\\s!-~]+', decrypted):
                raise _WrongKey
            return decrypted

        separator = separator.group(0)
        # Each hex-encoded token decodes to exactly one character; keep its code point.
        encrypted = [ord(bytes.fromhex(e).decode()) for e in kramer.split(separator)]

        # Search window for the key, derived from the smallest code point that is not
        # the line break marker.
        ubound = min(x for x in encrypted if x != self._LINEBREAK_MAGIC)
        lbound = ubound - 0xFF
        secret = {k for k in secret if k > lbound and k < ubound}
        self.log_debug('potential secrets from code:', secret)

        # First try the integer constants found in the code, largest first.
        for key in sorted(secret, reverse=True):
            try:
                return decrypt_with_key(key)
            except _WrongKey:
                pass

        self.log_info(F'all candidates failed, searching [{lbound}, {ubound}]')

        # Fall back to an exhaustive search of the window, from ubound down to lbound.
        for key in range(ubound, lbound - 1, -1):
            try:
                self.log_debug('attempting key:', key)
                return decrypt_with_key(key)
            except _WrongKey:
                pass

        raise RuntimeError('could not find decryption key')
class lnk (tabular=False)
-
This unit is implemented in
refinery.units.formats.lnk
and has the following commandline Interface:usage: lnk [-h] [-L] [-Q] [-0] [-v] [-t] Parses Windows Shortcuts (LNK files) and returns the parsed information in JSON format. This unit is a thin wrapper around the LnkParse3 library. optional arguments: -t, --tabular Print information in a table rather than as JSON generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class lnk(Unit):
    """
    Parse Windows Shortcuts (LNK files) and returns the parsed information in JSON format. This unit is a
    thin wrapper around the LnkParse3 library.
    """

    @Unit.Requires('LnkParse3>=1.4.0', 'formats', 'default', 'extended')
    def _LnkParse3():
        # The import is deferred until the dependency is first requested.
        import LnkParse3 as module
        return module

    def __init__(self, tabular: Unit.Arg('-t', help='Print information in a table rather than as JSON') = False):
        super().__init__(tabular=tabular)

    def process(self, data):
        """
        Parse the shortcut with logging suppressed and emit the pretty-printed result.
        """
        with NoLogging():
            shortcut = self._LnkParse3.lnk_file(MemoryFile(data))
            parsed = shortcut.get_json()
        with JSONEncoderEx as encoder:
            printer = ppjson(tabular=self.args.tabular)
            options = dict(indent=4, cls=encoder, ensure_ascii=False)
            yield from printer._pretty_output(parsed, **options)
class loop (count, suffix, do_while, do_until, fullmatch=False, multiline=False, ignorecase=False)
-
This unit is implemented in
refinery.units.meta.loop
and has the following commandline Interface:usage: loop [-h] [-L] [-Q] [-0] [-v] [-w RE] [-u RE] [-U] [-M] [-I] count suffix Applies a given multibin suffix to the input chunk repeatedly. For example, the following command would carve the largest base64-encoded buffer from the input, decode it, and then decompress the result 20 times: emit data | loop 20 csd[b64]:zl Notably, the argument after the count is a suffix, which means that handlers are applied from left to right (not from right to left). The loop is aborted and the previous result returned if the newly computed result is empty. If an error occurs while computing the suffix and the unit is lenient (i.e. the -L switch is set), the last known result is returned. positional arguments: count The number of repeated applications of the suffix. suffix A multibin expression suffix. optional arguments: -w, --while RE Halt when the given regular expression does not match the data. -u, --until RE Halt when the given regular expression matches the data. -U, --fullmatch Regular expressions are matched against the full input, not substrings of it. -M, --multiline Caret and dollar in regular expressions match the beginning and end of a line and a dot does not match line breaks. -I, --ignorecase Ignore capitalization for alphabetic characters in regular expressions. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class loop(RegexUnit):
    """
    Applies a given multibin suffix to the input chunk repeatedly. For example, the following command would
    carve the largest base64-encoded buffer from the input, decode it, and then decompress the result 20
    times:

        emit data | loop 20 csd[b64]:zl

    Notably, the argument after the count is a suffix, which means that handlers are applied from left to
    right (not from right to left). The loop is aborted and the previous result returned if the newly
    computed result is empty. If an error occurs while computing the suffix and the unit is lenient (i.e.
    the `-L` switch is set), the last known result is returned.
    """

    def __init__(
        self,
        count: Arg.Number(metavar='count', help='The number of repeated applications of the suffix.'),
        suffix: Arg(type=str, help='A multibin expression suffix.'),
        # Both conditions are optional switches; process() skips a condition whose matcher
        # is falsy, so they default to None rather than being required arguments.
        do_while: Arg('-w', '--while', type=regexp, metavar='RE',
            help='Halt when the given regular expression does not match the data.') = None,
        do_until: Arg('-u', '--until', type=regexp, metavar='RE',
            help='Halt when the given regular expression matches the data.') = None,
        fullmatch=False, multiline=False, ignorecase=False,
    ):
        super().__init__(
            count=count,
            suffix=suffix,
            do_while=do_while,
            do_until=do_until,
            fullmatch=fullmatch,
            multiline=multiline,
            ignorecase=ignorecase,
        )

    def process(self, data):
        """
        Apply the suffix up to `count` times, updating `data` in place, and return it.

        The loop stops early when the while-condition fails, the until-condition holds, or
        a step produces an empty result. An error while evaluating the suffix is re-raised
        as RefineryPartialResult carrying the last known data.
        """
        _count = self.args.count
        # Step numbers in log messages are zero-padded to the width of the count.
        _width = len(str(_count))
        _while = self._while
        _until = self._until

        for k in range(_count):
            if _while and not _while(data):
                self.log_info(F'step {k:0{_width}}: stopping, while-condition violated')
                break
            if _until and _until(data):
                self.log_info(F'step {k:0{_width}}: stopping, until-condition satisfied')
                break
            try:
                out = DelayedBinaryArgument(
                    self.args.suffix, reverse=True, seed=data)(data)
            except Exception as error:
                self.log_info(F'step {k:0{_width}}: error;', exception_to_string(error))
                msg = F'Stopped after {k} steps, increase verbosity for additional details.'
                raise RefineryPartialResult(msg, data) from error
            if not out:
                self.log_info(F'step {k:0{_width}}: stopping after empty result')
                break
            data[:] = out
            self.log_debug(F'step {k:0{_width}}: data =', data, clip=True)

        return data

    @property
    def _while(self):
        # Matcher built from the --while expression.
        return self._make_matcher(self.args.do_while)

    @property
    def _until(self):
        # Matcher built from the --until expression.
        return self._make_matcher(self.args.do_until)
class lz4
-
This unit is implemented in
refinery.units.compression.lz4
and has the following commandline Interface:usage: lz4 [-h] [-L] [-Q] [-0] [-v] LZ4 block decompression. See also: https://github.com/lz4/lz4/blob/master/doc/lz4_Block_format.md#compressed-block-format generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class lz4(Unit):
    """
    LZ4 block decompression. See also:
    https://github.com/lz4/lz4/blob/master/doc/lz4_Block_format.md#compressed-block-format
    """
    def _read_block(self, reader: StructReader, output: io.BytesIO, ubound=None):
        # Decode one block of sequences from reader into output. When ubound is given, it
        # is the exact number of input bytes that the block is expected to consume.
        entry = reader.tell()
        # Output position used by the end-of-stream heuristic below; updated per match.
        lastend = 0

        def ubound_check():
            # True when exactly ubound bytes have been consumed; raises ValueError if more
            # were consumed. Always False when no bound was given.
            if ubound is None:
                return False
            consumed = reader.tell() - entry
            if consumed > ubound:
                raise ValueError(F'upper bound {ubound} exceeded by {consumed - ubound} in LZ4 block')
            return consumed == ubound

        while not reader.eof:
            # Token: two nibbles holding the match length and the literal length.
            reflen = reader.read_nibble()
            litlen = reader.read_nibble()
            litlen = reader.read_size(litlen)
            literal = reader.read(litlen)
            output.write(literal)
            if ubound_check():
                break
            try:
                refpos = reader.u16()
            except EOFError:
                # The final sequence consists of literals only.
                break
            if refpos - 1 not in range(output.tell()):
                # The offset is zero or points before the start of the output.
                with StreamDetour(output, lastend):
                    if output.read(len(literal)) == literal:
                        # This literal could have been encoded in the last match, but it wasn't.
                        # Therefore, it is very likely that we have reached the end of the stream.
                        break
                position = reader.tell()
                # NOTE(review): this subtracts the reader position from the length of the
                # last literal, which does not look like a count of remaining input bytes
                # - confirm what is intended here.
                remaining = len(literal) - position
                raise RefineryPartialResult(
                    F'encountered invalid match offset value {refpos} at position {position} with {remaining} bytes remaining',
                    partial=output.getvalue())
            reflen = reader.read_size(reflen)
            if ubound_check():
                raise ValueError('last sequence in block contained a match')
            reflen += 4
            # A match may be longer than its offset; in that case the available bytes are
            # repeated until the requested length is reached.
            available_bytes = min(refpos, reflen)
            q, r = divmod(reflen, available_bytes)
            with StreamDetour(output, -refpos, io.SEEK_CUR):
                match = output.read(available_bytes)
                match = q * match + match[:r]
                assert len(match) == reflen
                lastend = output.tell() - available_bytes + r
            output.write(match)

    def process(self, data):
        # Data starting with the magic 0x184D2204 is parsed as an LZ4 frame; anything else
        # is decoded as a single raw block.
        output = io.BytesIO()
        reader = LZ4Reader(memoryview(data))
        try:
            magic = reader.u32() == 0x184D2204
        except EOFError:
            magic = False
        if not magic:
            reader.seek(0)
            self._read_block(reader, output)
            return output.getbuffer()

        # Frame descriptor flags; the unpacking order follows the order in which
        # read_bits returns the bits.
        (dict_id, rsrv1, content_checksummed, content_size,
            blocks_checksummed, blocks_independent, v2, v1) = reader.read_bits(8)
        rsrv2 = reader.read_nibble()
        try:
            block_maximum = {
                7: 0x400000,
                6: 0x100000,
                5: 0x040000,
                4: 0x010000,
            }[reader.read_integer(3)]
        except KeyError:
            raise ValueError('unknown maximum block size value in LZ4 frame header')
        rsrv3 = reader.read_bit()
        if any((rsrv1, rsrv2, rsrv3)):
            self.log_warn('nonzero reserved value in LZ4 frame header')
        if (v1, v2) != (0, 1):
            self.log_warn(F'invalid version ({v1},{v2}) in LZ4 frame header')
        # The flags are replaced by the corresponding header values when they are set.
        # NOTE(review): with this and/or idiom, a stored value of zero also becomes None.
        content_size = content_size and reader.u64() or None
        dict_id = dict_id and reader.u32() or None
        # Header Checksum
        xxh = xxhash(data[4:reader.tell()]).intdigest() >> 8 & 0xFF
        chk = reader.read_byte()
        if chk != xxh:
            self.log_warn(F'header checksum {chk:02X} does not match computed value {xxh:02X}')

        self.log_debug(lambda: F'dictionary id: {dict_id}')
        self.log_debug(lambda: F'block max: 0x{block_maximum:X}')
        if content_size is not None:
            self.log_debug(lambda: F'chunk max: 0x{content_size:X}')
        self.log_debug(lambda: F'blocks independent: {bool(blocks_independent)}')
        self.log_debug(lambda: F'blocks checksummed: {bool(blocks_checksummed)}')

        blockindex = 0

        while True:
            blockindex += 1
            # Block header: a 31-bit size and one bit marking uncompressed blocks. A size
            # of zero ends the sequence of blocks.
            size = reader.read_integer(31)
            uncompressed = reader.read_bit()
            if not size:
                assert not uncompressed
                break
            self.log_info(F'reading block of size 0x{size:06X}')
            assert reader.byte_aligned
            assert size <= block_maximum, 'block size exceeds maximum size'
            if uncompressed:
                output.write(reader.read_exactly(size))
            else:
                self._read_block(reader, output, size)
            if blocks_checksummed:
                # Step back over the block to hash its bytes as stored in the input; the
                # stored checksum is read after the detour has ended.
                with StreamDetour(reader, -size, io.SEEK_CUR):
                    xxh = xxhash(reader.read_exactly(size)).intdigest()
                chk = reader.u32()
                if chk != xxh:
                    self.log_warn(F'block {blockindex} had checksum {chk:08X} which did not match computed value {xxh:08X}')
        if content_checksummed:
            self.log_info('computing checksum')
            xxh = xxhash(output.getbuffer()).intdigest()
            chk = reader.u32()
            if chk != xxh:
                self.log_warn(F'the given checksum {chk:08X} did not match the computed checksum {xxh:08X}')
        if not reader.eof:
            pos = reader.tell()
            self.log_warn(F'found {len(data) - pos} additional bytes starting at position 0x{pos:X} after compressed data')
        return output.getbuffer()
class lzf (fast=False)
-
This unit is implemented in
refinery.units.compression.lzf
and has the following commandline Interface:usage: lzf [-h] [-L] [-Q] [-0] [-v] [-R] [-F] [-x] This unit implements LZF compression and decompression. optional arguments: -x, --fast Enable fast compression mode. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class lzf(Unit):
    """
    This unit implements LZF compression and decompression.
    """

    def __init__(self, fast: Arg.Switch('-x', help='Enable fast compression mode.') = False):
        super().__init__(fast=fast)

    def reverse(self, data):
        # Compress data into a raw LZF stream (no chunk headers) and return it as a
        # bytearray. Empty input is returned unchanged.

        def FRST(p: memoryview) -> int:
            # Initial hash input: the first two bytes at p.
            return ((p[0]) << 8) | p[1]

        def NEXT(v: int, p: memoryview) -> int:
            # Shift the third byte at p into the hash input, truncated to 32 bits.
            return ((v << 8) | p[2]) & 0xFFFFFFFF

        def DELTA(p: memoryview):
            # Offset of p from the start of the input, derived from the remaining length.
            return view.nbytes - p.nbytes

        # Hash table index of a hash input; the fast mode uses a simpler function.
        if self.args.fast:
            def HIDX(h: int) -> int:
                return (((h >> (3 * 8 - _HSLOG)) - h * 5) & (_HSIZE - 1))
        else:
            def HIDX(h: int) -> int:
                q = (h ^ (h << 5))
                return (((q >> (3 * 8 - _HSLOG)) - h * 5) & (_HSIZE - 1))

        if not data:
            return data

        # ip is the unprocessed remainder of the input, view always spans all of it.
        ip = view = memoryview(data)
        op = bytearray()

        if len(data) == 1:
            # A single byte is stored as a literal run of length one.
            op.append(0)
            op.extend(data)
            return op

        hval = FRST(ip)
        htab = [0] * _HSIZE
        fast = 1 if self.args.fast else 0

        # Number of bytes in the literal run that is currently being written.
        lit = 0

        def begin_literal():
            # Reserve the control byte of a new literal run.
            nonlocal lit
            op.append(0)
            lit = 0

        def advance_literal():
            # Copy one input byte into the current run; a run that reaches the maximum
            # length is closed and a new one is started.
            nonlocal lit, ip
            lit += 1
            op.append(ip[0])
            ip = ip[1:]
            if lit == _MAX_LIT:
                op[-lit - 1] = lit - 1
                begin_literal()

        def commit_literal():
            # Patch the reserved control byte with the run length minus one, or remove
            # the control byte again if the run is empty.
            if lit > 0:
                op[-lit - 1] = lit - 1
            else:
                op.pop()

        begin_literal()

        while ip.nbytes > 2:
            hval = NEXT(hval, ip)
            hpos = HIDX(hval)
            ipos = DELTA(ip)
            length = 2
            # Fetch the previous position stored for this hash and store the current one.
            r, htab[hpos] = htab[hpos], ipos
            off = ipos - r - 1
            ref = view[r:]

            # No usable match: offset too large, no earlier position recorded, or the
            # first three bytes differ.
            if off >= _MAX_OFF or r <= 0 or ref[:3] != ip[:3]:
                advance_literal()
                continue
            else:
                commit_literal()

            maxlen = min(_MAX_REF, ip.nbytes - length)

            # Extend the match as far as possible.
            while True:
                length += 1
                if length >= maxlen or ref[length] != ip[length]:
                    length -= 2
                    break

            # Short lengths are stored in the control byte, long ones use an extra byte.
            if length < 7:
                op.append((off >> 8) + (length << 5))
            else:
                op.append((off >> 8) + (7 << 5))
                op.append(length - 7)
            op.append(off & 0xFF)
            begin_literal()

            if ip.nbytes <= length + 3:
                ip = ip[length + 2:]
                break

            # Advance past the match. The fast mode hashes only the last two positions,
            # the default mode hashes every position that is skipped.
            if fast:
                ip = ip[length:]
                hval = FRST(ip)
                for _ in range(2):
                    hval = NEXT(hval, ip)
                    htab[HIDX(hval)] = DELTA(ip)
                    ip = ip[1:]
            else:
                ip = ip[1:]
                for _ in range(length + 1):
                    hval = NEXT(hval, ip)
                    htab[HIDX(hval)] = DELTA(ip)
                    ip = ip[1:]

        # Whatever remains is emitted as literals.
        while ip.nbytes:
            advance_literal()
        commit_literal()
        return op

    def _decompress_chunk(self, data: memoryview, out: MemoryFile):
        # Decode one raw LZF stream from data and append the result to out.
        ip = StructReader(data)
        while not ip.eof:
            ctrl = ip.u8()
            if ctrl < 0B100000:
                # Literal run of ctrl + 1 bytes.
                ctrl += 1
                out.write(ip.read_exactly(ctrl))
            else:
                # Back reference: length in the upper three bits (7 means that an extra
                # length byte follows), offset from the lower five bits and the next byte.
                length = ctrl >> 5
                offset = 1 + ((ctrl & 0B11111) << 8)
                if length == 7:
                    length += ip.u8()
                offset += ip.u8()
                length += 2
                out.replay(offset, length)

    def process(self, data):
        # Decompress either a sequence of chunks that each start with a header, or a raw
        # stream when no header can be parsed at the start of the data.
        mem = memoryview(data)
        out = MemoryFile()
        try:
            reader = StructReader(mem)
            header = LZFHeader(reader)
        except Exception:
            self.log_info('no header detected, decompressing as raw stream')
            self._decompress_chunk(mem, out)
            return out.getvalue()
        for k in itertools.count(1):
            self.log_info(F'chunk: e=0x{header.encoded_size:04X} d=0x{header.decoded_size:04X}')
            chunk = reader.read(header.encoded_size)
            if header.compressed:
                self._decompress_chunk(chunk, out)
            else:
                out.write(chunk)
            if reader.eof:
                break
            try:
                header = LZFHeader(reader)
            except Exception as E:
                # Everything decoded so far is handed over as a partial result.
                msg = F'failed parsing next header after {k} chunks: {E!s}'
                raise RefineryPartialResult(msg, out.getvalue())
        return out.getvalue()

    @classmethod
    def handles(self, data: bytearray):
        # True when the data starts with the header magic; otherwise the method falls
        # through and returns None.
        if data[:2] == LZFHeader.MAGIC:
            return True
class lzg
-
This unit is implemented in
refinery.units.compression.lzg
and has the following command-line interface: usage: lzg [-h] [-L] [-Q] [-0] [-v] [-F] LZG decompression. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class lzg(Unit):
    """
    LZG decompression.
    """

    def process(self, data: bytearray):
        # Decode the whole stream, then compare the result against the size that the
        # stream object reports for the decoded data.
        lzg_stream = LZGStream(data)
        decoded = lzg_stream.decompress()
        if len(decoded) == lzg_stream.decoded_size:
            return decoded
        # Size mismatch: hand the decoded bytes back as a partial result.
        raise RefineryPartialResult(
            F'LZG header announced {lzg_stream.decoded_size} bytes, but decompressed buffer had size {len(decoded)}.',
            decoded,
        )

    @classmethod
    def handles(cls, data: bytearray):
        # True for inputs starting with the three-byte signature, None for everything else.
        return True if data[:3] == B'LZG' else None
class lzip
-
This unit is implemented in
refinery.units.compression.lzip
and has the following command-line interface: usage: lzip [-h] [-L] [-Q] [-0] [-v] [-F] LZIP decompression generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class lzip(Unit):
    """
    LZIP decompression
    """

    def process(self, data: bytearray):
        # Decode all LZIP members in `data` back to back and return the concatenated output.
        #
        # Each member consists of a six-byte header (signature, version, coded dictionary
        # size), the compressed stream, and a trailer holding checksum, data size and member
        # size. Mismatches in the trailer are only logged as warnings. Data after the first
        # member that does not parse as another member is reported and ignored.
        view = memoryview(data)
        with MemoryFile() as output, StructReader(view) as reader:
            for k in count(1):
                if reader.eof:
                    break
                # Captured before reading so that the log message below can report how
                # much input was left when this member started.
                trailing_size = len(data) - reader.tell()
                try:
                    ID, VN, DS = reader.read_struct('4sBB')
                    if ID != B'LZIP':
                        if k > 1:
                            # Anything after the first member that lacks the signature is
                            # handled as trailing data by the except clause below.
                            raise EOF
                        else:
                            self.log_warn(F'ignoring invalid LZIP signature: {ID.hex()}')
                    if VN != 1:
                        self.log_warn(F'ignoring invalid LZIP version: {VN}')
                    # The low five bits are a power of two, the top three bits subtract
                    # a multiple of one sixteenth of that value.
                    dict_size = 1 << (DS & 0x1F)
                    dict_size -= (dict_size // 16) * ((DS >> 5) & 7)
                    if dict_size not in range(_MIN_DICT_SIZE, _MAX_DICT_SIZE + 1):
                        raise ValueError(
                            F'The dictionary size {dict_size} is out of the valid range '
                            F'[{_MIN_DICT_SIZE}, {_MAX_DICT_SIZE}]; unable to proceed.'
                        )
                    decoder = MemberDecoder(dict_size, reader, output)
                    if not decoder():
                        raise ValueError(F'Data error in stream {k}.')
                    crc32, data_size, member_size = reader.read_struct('<LQQ')
                    if crc32 != decoder.crc32:
                        # Report the same attribute that was compared; the message used
                        # to format `decoder.crc` instead.
                        self.log_warn(F'checksum in stream {k} was {decoder.crc32:08X}, should have been {crc32:08X}.')
                    if member_size - 20 != decoder.member_position:
                        self.log_warn(F'member size in stream {k} was {decoder.member_position}, should have been {member_size}.')
                    if data_size != decoder.data_position:
                        self.log_warn(F'data size in stream {k} was {decoder.data_position}, should have been {data_size}.')
                except EOFError:
                    # Running out of data in the very first member is a genuine error.
                    if k <= 1:
                        raise
                    self.log_info(F'silently ignoring {trailing_size} bytes of trailing data')
                    break
            return output.getvalue()

    @classmethod
    def handles(self, data: bytearray):
        # Inputs are recognized by the four-byte LZIP signature.
        return data[:4] == B'LZIP'
class lzjb
-
This unit is implemented in
refinery.units.compression.lzjb
and has the following command-line interface: usage: lzjb [-h] [-L] [-Q] [-0] [-v] [-R] LZJB compression and decompression. This LZ-type compression is used in the ZFS file system. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class lzjb(Unit):
    """
    LZJB compression and decompression. This LZ-type compression is used in the ZFS file system.
    """

    def reverse(self, src):
        # https://web.archive.org/web/20100807223517/ ..
        # .. http://cvs.opensolaris.org/source/xref/onnv/onnv-gate/usr/src/uts/common/fs/zfs/lzjb.c
        #
        # The output is a sequence of groups: one flag byte followed by up to eight items.
        # A set flag bit marks a two-byte match, a clear bit marks a literal byte.
        packed = bytearray()
        table = [0] * _LEMPEL_SIZE
        flag_bit = 0x80
        cursor = 0
        while cursor < len(src):
            flag_bit <<= 1
            if flag_bit >= 0x100:
                # All eight bits of the previous flag byte are used; start a new group.
                flag_bit = 1
                flag_index = len(packed)
                packed.append(0)
            if cursor > len(src) - _MATCH_MAX:
                # Close to the end of the input only literals are emitted.
                packed.append(src[cursor])
                cursor += 1
                continue
            key = (src[cursor] << 16) + (src[cursor + 1] << 8) + src[cursor + 2]
            key += key >> 9
            key += key >> 5
            key %= len(table)
            distance = (cursor - table[key]) & _OFFSET_MASK
            table[key] = cursor
            origin = cursor - distance
            usable = (
                origin >= 0
                and origin != cursor
                and src[cursor:cursor + 3] == src[origin:origin + 3]
            )
            if not usable:
                packed.append(src[cursor])
                cursor += 1
                continue
            packed[flag_index] |= flag_bit
            # Extend the match; the loop variable keeps its last value after the loop.
            for run in range(_MATCH_MIN, min(len(src) - cursor, _MATCH_MAX)):
                if src[cursor + run] != src[origin + run]:
                    break
            packed.append(((run - _MATCH_MIN) << (8 - _MATCH_LEN)) | (distance >> 8))
            packed.append(distance & 255)
            cursor += run
        return packed

    def process(self, data):
        # Decode flag-byte groups: each flag bit, lowest first, selects between a literal
        # byte and a match that is read as a 6-bit length and a 10-bit backwards distance.
        decoded = bytearray()
        reader = StructReader(data)
        while not reader.eof:
            flags = reader.read_byte()
            for bit in range(8):
                if reader.eof:
                    break
                if not flags & (1 << bit):
                    decoded.append(reader.read_byte())
                    continue
                if not decoded:
                    raise ValueError('copy requested against empty buffer')
                with reader.be:
                    remaining = reader.read_integer(6) + _MATCH_MIN
                    distance = reader.read_integer(10)
                if not distance or distance > len(decoded):
                    raise RuntimeError(F'invalid match offset at position {reader.tell()}')
                start = len(decoded) - distance
                # Copy in pieces so that a match may overlap the bytes it produces.
                while remaining > 0:
                    piece = decoded[start:start + remaining]
                    decoded.extend(piece)
                    start += len(piece)
                    remaining -= len(piece)
        return decoded
class lzma (filter=None, raw=False, alone=False, xz=False, level=9, delta=None)
-
This unit is implemented in
refinery.units.compression.lz
and has the following command-line interface: usage: lzma [-h] [-L] [-Q] [-0] [-v] [-R] [-F] [-r | -a | -x] [-l N] [-d N] [FILTER] LZMA compression and decompression. positional arguments: FILTER Specifies a bcj filter to be applied. Possible values are: ARM, ARMTHUMB, IA64, LZMA1, LZMA2, POWERPC, SPARC, X86 optional arguments: -r, --raw Use raw (no container) format. -a, --alone Use the lzma container format. -x, --xz Use the default xz format. -l, --level N The compression level preset; between 0 and 9. -d, --delta N Add a delta filter when compressing. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class lzma(Unit):
    """
    LZMA compression and decompression.
    """
    _LZMA_FILTER = extract_options(std_lzma, 'FILTER_', 'DELTA')
    _LZMA_PARSER = OptionFactory(_LZMA_FILTER)

    def __init__(
        self,
        filter: Arg.Choice(choices=list(_LZMA_FILTER), metavar='FILTER', help=(
            'Specifies a bcj filter to be applied. Possible values are: {choices}')) = None,
        raw: Arg.Switch('-r', group='MODE', help='Use raw (no container) format.') = False,
        alone: Arg.Switch('-a', group='MODE', help='Use the lzma container format.') = False,
        xz: Arg.Switch('-x', group='MODE', help='Use the default xz format.') = False,
        level: Arg.Number('-l', bound=(0, 9), help='The compression level preset; between 0 and 9.') = 9,
        delta: Arg.Number('-d', help='Add a delta filter when compressing.') = None,
    ):
        # A filter name, if given, is converted to the corresponding option object.
        filter = filter and self._LZMA_PARSER(filter)
        if (raw, alone, xz).count(True) > 1:
            raise ValueError('Only one container format can be enabled.')
        if level not in range(10):
            raise ValueError('Compression level must be a number between 0 and 9.')
        # The stored level always carries the PRESET_EXTREME flag.
        super().__init__(filter=filter, raw=raw, alone=alone, xz=xz, delta=delta,
            level=level | std_lzma.PRESET_EXTREME)

    def _get_lz_mode_and_filters(self, reverse=False):
        # Translate the unit arguments into a container format constant and a filter chain.
        # When `reverse` is set and no container was chosen, the xz container is used;
        # otherwise the format stays FORMAT_AUTO.
        mode = std_lzma.FORMAT_AUTO
        filters = []
        if self.args.filter is not None:
            filters.append({'id': self.args.filter.value})
        if self.args.delta is not None:
            self.log_debug('adding delta filter')
            filters.append({
                'id': std_lzma.FILTER_DELTA,
                'dist': self.args.delta
            })
        if self.args.alone:
            self.log_debug('setting alone format')
            mode = std_lzma.FORMAT_ALONE
            filters.append({
                'id': std_lzma.FILTER_LZMA1,
                'preset': self.args.level
            })
        elif self.args.raw:
            self.log_debug('setting raw format')
            mode = std_lzma.FORMAT_RAW
            filters.append({
                'id': std_lzma.FILTER_LZMA2,
                'preset': self.args.level
            })
        elif self.args.xz or reverse:
            # The info message is only emitted when the debug message was not.
            if reverse and not self.log_debug('setting xz container format'):
                self.log_info('choosing default .xz container format for compression.')
            mode = std_lzma.FORMAT_XZ
            filters.append({
                'id': std_lzma.FILTER_LZMA2,
                'preset': self.args.level
            })
        return mode, filters

    def reverse(self, data):
        # Compress `data` in one pass with the configured container and filters.
        mode, filters = self._get_lz_mode_and_filters(True)
        lz = std_lzma.LZMACompressor(mode, filters=filters)
        output = lz.compress(data)
        output += lz.flush()
        return output

    def _process_stream(self, data: ByteString, strategy: F, keywords):
        # Decompress `data` with one strategy and return the output.
        #
        # With F.STEPWISE the input is fed one byte at a time, otherwise in a single read.
        # With F.INJECT the first five input bytes are fed, followed by eight 0xFF bytes
        # that are not part of the input.
        # NOTE(review): presumably this supplies an "unknown size" field for .lzma headers
        # that lack one; confirm against the container specification.
        #
        # Raises RefineryPartialResult with everything decoded so far when the decompressor
        # reports an error.
        if strategy & F.STEPWISE:
            sizes = repeat(1)
        else:
            sizes = [len(data)]
        lz = std_lzma.LZMADecompressor(**keywords)
        with MemoryFile() as output:
            with MemoryFile(data) as stream:
                if strategy & F.INJECT:
                    output.write(lz.decompress(stream.read(5)))
                    output.write(lz.decompress(B'\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF'))
                for size in sizes:
                    if stream.eof or stream.closed:
                        break
                    try:
                        position = stream.tell()
                        output.write(lz.decompress(stream.read(size)))
                    except (EOFError, std_lzma.LZMAError) as error:
                        msg = error.args[0] if len(error.args) == 1 else error.__class__.__name__
                        # This is the decompression path; the message used to say
                        # "compression failed".
                        raise RefineryPartialResult(F'decompression failed at offset {position}: {msg!s}', output.getvalue())
            return output.getvalue()

    def process(self, data: bytearray):
        # Try the decompression strategies in order and return the first success. If all
        # of them fail, the failure that carries the longest partial output is raised.
        errors: List[RefineryPartialResult] = []
        view = memoryview(data)
        keywords = {}
        keywords['format'], filters = self._get_lz_mode_and_filters(False)
        if self.args.raw:
            # The filter chain is only passed to the decompressor for raw streams.
            keywords['filters'] = filters
        for strategy in (F.DEFAULT, F.INJECT, F.STEPWISE, F.INJECT | F.STEPWISE):
            try:
                return self._process_stream(view, strategy, keywords)
            except RefineryPartialResult as p:
                self.log_info(F'decompression failed with strategy {strategy}: {p.message}')
                errors.append(p)
        raise max(errors, key=lambda e: len(e.partial))

    @classmethod
    def handles(self, data: bytearray):
        # True for the two byte prefixes checked below, None otherwise.
        if data[:4] == B'\x5D\0\0\0':
            return True
        if data[:5] == B'\xFD7zXZ':
            return True
class lznt1 (chunk_size=4096)
-
This unit is implemented in
refinery.units.compression.lznt1
and has the following command-line interface: usage: lznt1 [-h] [-L] [-Q] [-0] [-v] [-R] [-c N] LZNT1 compression and decompression. This compression algorithm is expected by the Win32 API routine RtlDecompressBuffer, for example. optional arguments: -c, --chunk-size N Optionally specify the chunk size for compression, default is 0x1000. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class lznt1(Unit):
    """
    LZNT1 compression and decompression. This compression algorithm is expected
    by the Win32 API routine `RtlDecompressBuffer`, for example.
    """

    def _decompress_chunk(self, chunk):
        # Decode the payload of one compressed chunk. The payload is a sequence of groups,
        # each made of a flag byte and up to eight items; a clear flag bit (lowest bit
        # first) marks a literal byte, a set bit marks a 16-bit little-endian match token.
        out = B''
        while chunk:
            flags = chunk[0]
            chunk = chunk[1:]
            for i in range(8):
                if not (flags >> i & 1):
                    out += chunk[:1]
                    chunk = chunk[1:]
                else:
                    flag = struct.unpack('<H', chunk[:2])[0]
                    # The split between length bits and offset bits depends on how much
                    # output exists: every doubling beyond 0x10 bytes moves one bit from
                    # the length field to the offset field.
                    pos = len(out) - 1
                    l_mask = 0xFFF
                    o_shift = 12
                    while pos >= 0x10:
                        l_mask >>= 1
                        o_shift -= 1
                        pos >>= 1
                    length = (flag & l_mask) + 3
                    offset = (flag >> o_shift) + 1
                    if length >= offset:
                        # The match overlaps its own output: repeat the tail as a pattern.
                        tmp = out[-offset:] * (0xFFF // len(out[-offset:]) + 1)
                        out += tmp[:length]
                    else:
                        out += out[-offset:length - offset]
                    chunk = chunk[2:]
                if len(chunk) == 0:
                    break
        return out

    def _find(self, src, target, max_len):
        # Search `src` for the longest prefix of `target` and return the pair
        # (distance from the end of `src`, match length), or (0, 0) when the best
        # match is shorter than three bytes.
        result_offset = 0
        result_length = 0
        for i in range(1, max_len):
            offset = src.rfind(target[:i])
            if offset == -1:
                break
            tmp_offset = len(src) - offset
            tmp_length = i
            if tmp_offset == tmp_length:
                # The match reaches the end of `src`, so it may continue into the data
                # it produces; probe against the repeated tail.
                tmp = src[offset:] * (0xFFF // len(src[offset:]) + 1)
                for j in range(i, max_len + 1):
                    offset = tmp.rfind(target[:j])
                    if offset == -1:
                        break
                    tmp_length = j
            if tmp_length > result_length:
                result_offset = tmp_offset
                result_length = tmp_length
        if result_length < 3:
            return 0, 0
        return result_offset, result_length

    def _compress_chunk(self, chunk):
        # Produce the compressed payload for one chunk, in the group layout that
        # `_decompress_chunk` reads.
        blob = copy.copy(chunk)
        out = B''
        pow2 = 0x10
        l_mask3 = 0x1002
        o_shift = 12
        while len(blob) > 0:
            bits = 0
            tmp = B''
            for i in range(8):
                bits >>= 1
                # Track the same position-dependent bit split as the decoder.
                while pow2 < (len(chunk) - len(blob)):
                    pow2 <<= 1
                    l_mask3 = (l_mask3 >> 1) + 1
                    o_shift -= 1
                if len(blob) < l_mask3:
                    max_len = len(blob)
                else:
                    max_len = l_mask3
                offset1, length1 = self._find(
                    chunk[:len(chunk) - len(blob)], blob, max_len)
                # try to find more compressed pattern
                offset2, length2 = self._find(
                    chunk[:len(chunk) - len(blob) + 1], blob[1:], max_len)
                if length1 < length2:
                    # A longer match starts one byte later: emit a literal now.
                    length1 = 0
                if length1 > 0:
                    symbol = ((offset1 - 1) << o_shift) | (length1 - 3)
                    tmp += struct.pack('<H', symbol)
                    bits |= 0x80  # set the highest bit
                    blob = blob[length1:]
                else:
                    tmp += blob[:1]
                    blob = blob[1:]
                if len(blob) == 0:
                    break
            # Right-align the flag bits of a group that holds fewer than eight items.
            out += struct.pack('B', bits >> (7 - i))
            out += tmp
        return out

    def reverse(self, buf):
        # Compress `buf` in pieces of `chunk_size` bytes. Each piece is stored compressed
        # (header flags 0xB000) when that is shorter, and verbatim (0x3000) otherwise; the
        # low twelve header bits hold the payload size minus one.
        out = B''
        while buf:
            chunk = buf[:self.args.chunk_size]
            compressed = self._compress_chunk(chunk)
            if len(compressed) < len(chunk):  # chunk is compressed
                flags = 0xB000
                header = struct.pack('<H', flags | (len(compressed) - 1))
                out += header + compressed
            else:
                flags = 0x3000
                header = struct.pack('<H', flags | (len(chunk) - 1))
                out += header + chunk
            buf = buf[self.args.chunk_size:]
        return out

    def process(self, data):
        # Decompress a sequence of chunks, each introduced by a 16-bit little-endian
        # header. Raises RefineryPartialResult, carrying the output produced so far, when
        # a header cannot be read or announces more payload than the input still holds.
        out = io.BytesIO()
        offset = 0
        while offset < len(data):
            try:
                header, = struct.unpack('<H', data[offset:offset + 2])
            except struct.error as err:
                raise RefineryPartialResult(str(err), partial=out.getvalue())
            offset += 2
            size = (header & 0xFFF) + 1
            # Compare against what is left after the header rather than against the total
            # input length; the old check only held for the first chunk and printed the
            # total length as the remaining byte count.
            remaining = len(data) - offset
            if size > remaining:
                raise RefineryPartialResult(
                    F'chunk header indicates size {size}, but only {remaining} bytes remain.',
                    partial=out.getvalue()
                )
            chunk = data[offset:offset + size]
            offset += size
            if header & 0x8000:
                # The top header bit marks a compressed payload.
                chunk = self._decompress_chunk(chunk)
            out.write(chunk)
        return out.getvalue()

    def __init__(self, chunk_size: Arg.Number('-c', help='Optionally specify the chunk size for compression, default is 0x1000.') = 0x1000):
        super().__init__(chunk_size=chunk_size)
class lzo
-
This unit is implemented in
refinery.units.compression.lzo
and has the following command-line interface: usage: lzo [-h] [-L] [-Q] [-0] [-v] [-F] LZO decompression. The code works against simple test cases, but it is known to fail for certain outputs produced by the lzop command-line tool when high compression ratio is favoured (i.e. when the -9 switch is used). generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class lzo(Unit):
    """
    LZO decompression. The code works against simple test cases, but it is known to fail for certain outputs produced by the lzop
    command-line tool when high compression ratio is favoured (i.e. when the -9 switch is used).
    """

    def decompress_stream(self, data: ByteString, LZOv1: bool = False) -> bytearray:
        """
        An implementation of LZO decompression. We use the article
        "[LZO stream format as understood by Linux's LZO decompressor](https://www.kernel.org/doc/html/latest/staging/lzo.html)"
        as a reference since no proper specification is available.
        """
        def integer() -> int:
            # Variable-length integer: every zero byte adds 255, the first nonzero byte
            # terminates the encoding and is added as well.
            length = 0
            while True:
                byte = src.read_byte()
                if byte:
                    return length + byte
                length += 0xFF
                if length > 0x100000:
                    raise LZOError('Too many zeros in integer encoding.')

        def literal(count):
            # Copy `count` bytes verbatim from the input to the output.
            dst.write(src.read_bytes(count))

        def copy(distance: int, length: int):
            # Append `length` bytes that start `distance` bytes before the end of the
            # output; when the two overlap, the referenced block is repeated.
            if distance > len(dst):
                raise LZOError(F'Distance {distance} > bufsize {len(dst)}')
            buffer = dst.getbuffer()
            if distance > length:
                start = len(buffer) - distance
                end = start + length
                dst.write(buffer[start:end])
            else:
                block = buffer[-distance:]
                while len(block) < length:
                    block += block[:length - len(block)]
                if len(block) > length:
                    block[length:] = ()
                dst.write(block)

        src = StructReader(memoryview(data))
        dst = MemoryFile()

        # `state` holds the number of literals copied by the most recent instruction,
        # where any value of 4 or more means "four or more".
        state = 0
        first = src.read_byte()

        if first == 0x10:
            raise LZOError('Invalid first stream byte 0x10.')
        elif first <= 0x12:
            # Not a special first byte: treat it as a regular instruction.
            src.seekrel(-1)
        elif first <= 0x15:
            state = first - 0x11
            literal(state)
        else:
            state = 4
            literal(first - 0x11)

        while True:
            instruction = src.read_byte()
            if instruction < 0x10:
                if state == 0:
                    # Run of literals; it is copied by the `literal(state)` call at the
                    # end of the loop body and leaves the state at 4 or more.
                    length = instruction or integer() + 15
                    state = length + 3
                    if state < 4:
                        raise LZOError('Literal encoding is too short.')
                else:
                    # How this instruction is read depends on the literal count of the
                    # PREVIOUS instruction, so that count has to be saved before `state`
                    # is overwritten with the trailing literal count of this one.
                    previous_state = state
                    state = instruction & 0b0011
                    D = (instruction & 0b1100) >> 2
                    H = src.read_byte()
                    distance = (H << 2) + D + 1
                    if previous_state >= 4:
                        # After four or more literals: 3-byte copy from a far distance.
                        distance += 0x800
                        length = 3
                    else:
                        # After one to three literals: 2-byte copy from a near distance.
                        length = 2
                    copy(distance, length)
            elif instruction < 0x20:
                L = instruction & 0b0111
                H = instruction & 0b1000
                length = L or integer() + 7
                argument = src.u16()
                state = argument & 3
                distance = (H << 11) + (argument >> 2)
                if not distance:
                    # A zero distance in this instruction class marks the end of the stream.
                    return dst.getbuffer()
                if LZOv1 and distance & 0x803F == 0x803F and length in range(261, 265):
                    raise LZOError('Compressed data contains sequence that is banned in LZOv1.')
                if LZOv1 and distance == 0xBFFF:
                    X = src.read_byte()
                    count = ((X << 3) | L) + 4
                    self.log_debug(F'Writing run of {X} zero bytes according to LZOv1.')
                    dst.write(B'\0' * count)
                else:
                    copy(distance + 0x4000, length + 2)
            elif instruction < 0x40:
                L = instruction & 0b11111
                length = L or integer() + 31
                argument = src.u16()
                state = argument & 3
                distance = (argument >> 2) + 1
                copy(distance, length + 2)
            else:
                if instruction < 0x80:
                    length = 3 + ((instruction >> 5) & 1)
                else:
                    length = 5 + ((instruction >> 5) & 3)
                H = src.read_byte()
                D = (instruction & 0b11100) >> 2
                state = instruction & 3
                distance = (H << 3) + D + 1
                copy(distance, length)
            if state:
                literal(state)

    def process(self, data):
        # Parse the input as an LZO archive and decompress each chunk; inputs that do not
        # parse as an archive are decompressed as one raw stream.
        try:
            lzo = LZO(data)
        except LZOError:
            self.log_info('Not an LZO archive, processing raw stream.')
            return self.decompress_stream(data)
        with MemoryFile() as output:
            for k, chunk in enumerate(lzo, 1):
                self.log_debug(F'decompressing chunk {k}')
                output.write(self.decompress_stream(chunk.data))
            # The archive's name and timestamp are attached to the output as metadata.
            return self.labelled(
                output.getbuffer(),
                path=lzo.name,
                date=date_from_timestamp(lzo.mtime)
            )

    @classmethod
    def handles(self, data: bytearray) -> Optional[bool]:
        # True when the input starts with the LZO archive signature, None otherwise.
        sig = LZO.SIGNATURE
        if data[:len(sig)] == sig:
            return True
Methods
def decompress_stream(self, data, LZOv1=False)
-
An implementation of LZO decompression. We use the article "LZO stream format as understood by Linux's LZO decompressor" as a reference since no proper specification is available.
Expand source code Browse git
def decompress_stream(self, data: ByteString, LZOv1: bool = False) -> bytearray:
    """
    An implementation of LZO decompression. We use the article
    "[LZO stream format as understood by Linux's LZO decompressor](https://www.kernel.org/doc/html/latest/staging/lzo.html)"
    as a reference since no proper specification is available.
    """
    def integer() -> int:
        # Variable-length integer: every zero byte adds 255, the first nonzero byte
        # terminates the encoding and is added as well.
        length = 0
        while True:
            byte = src.read_byte()
            if byte:
                return length + byte
            length += 0xFF
            if length > 0x100000:
                raise LZOError('Too many zeros in integer encoding.')

    def literal(count):
        # Copy `count` bytes verbatim from the input to the output.
        dst.write(src.read_bytes(count))

    def copy(distance: int, length: int):
        # Append `length` bytes that start `distance` bytes before the end of the
        # output; when the two overlap, the referenced block is repeated.
        if distance > len(dst):
            raise LZOError(F'Distance {distance} > bufsize {len(dst)}')
        buffer = dst.getbuffer()
        if distance > length:
            start = len(buffer) - distance
            end = start + length
            dst.write(buffer[start:end])
        else:
            block = buffer[-distance:]
            while len(block) < length:
                block += block[:length - len(block)]
            if len(block) > length:
                block[length:] = ()
            dst.write(block)

    src = StructReader(memoryview(data))
    dst = MemoryFile()

    # `state` holds the number of literals copied by the most recent instruction,
    # where any value of 4 or more means "four or more".
    state = 0
    first = src.read_byte()

    if first == 0x10:
        raise LZOError('Invalid first stream byte 0x10.')
    elif first <= 0x12:
        # Not a special first byte: treat it as a regular instruction.
        src.seekrel(-1)
    elif first <= 0x15:
        state = first - 0x11
        literal(state)
    else:
        state = 4
        literal(first - 0x11)

    while True:
        instruction = src.read_byte()
        if instruction < 0x10:
            if state == 0:
                # Run of literals; it is copied by the `literal(state)` call at the end
                # of the loop body and leaves the state at 4 or more.
                length = instruction or integer() + 15
                state = length + 3
                if state < 4:
                    raise LZOError('Literal encoding is too short.')
            else:
                # How this instruction is read depends on the literal count of the
                # PREVIOUS instruction, so that count has to be saved before `state` is
                # overwritten with the trailing literal count of this one.
                previous_state = state
                state = instruction & 0b0011
                D = (instruction & 0b1100) >> 2
                H = src.read_byte()
                distance = (H << 2) + D + 1
                if previous_state >= 4:
                    # After four or more literals: 3-byte copy from a far distance.
                    distance += 0x800
                    length = 3
                else:
                    # After one to three literals: 2-byte copy from a near distance.
                    length = 2
                copy(distance, length)
        elif instruction < 0x20:
            L = instruction & 0b0111
            H = instruction & 0b1000
            length = L or integer() + 7
            argument = src.u16()
            state = argument & 3
            distance = (H << 11) + (argument >> 2)
            if not distance:
                # A zero distance in this instruction class marks the end of the stream.
                return dst.getbuffer()
            if LZOv1 and distance & 0x803F == 0x803F and length in range(261, 265):
                raise LZOError('Compressed data contains sequence that is banned in LZOv1.')
            if LZOv1 and distance == 0xBFFF:
                X = src.read_byte()
                count = ((X << 3) | L) + 4
                self.log_debug(F'Writing run of {X} zero bytes according to LZOv1.')
                dst.write(B'\0' * count)
            else:
                copy(distance + 0x4000, length + 2)
        elif instruction < 0x40:
            L = instruction & 0b11111
            length = L or integer() + 31
            argument = src.u16()
            state = argument & 3
            distance = (argument >> 2) + 1
            copy(distance, length + 2)
        else:
            if instruction < 0x80:
                length = 3 + ((instruction >> 5) & 1)
            else:
                length = 5 + ((instruction >> 5) & 3)
            H = src.read_byte()
            D = (instruction & 0b11100) >> 2
            state = instruction & 3
            distance = (H << 3) + D + 1
            copy(distance, length)
        if state:
            literal(state)
class lzw
-
This unit is implemented in
refinery.units.compression.lzw
and has the following command-line interface: usage: lzw [-h] [-L] [-Q] [-0] [-v] [-F] LZW decompression based on ancient Unix sources. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class lzw(Unit):
    '''
    LZW decompression based on ancient Unix sources.
    '''

    _MAGIC = B'\x1F\x9D'

    def process(self, data: bytearray):
        # Decode an LZW code stream. With the two-byte signature present, the following
        # header byte supplies the maximum code width and the block mode flag; without
        # it, the input is decoded as a raw stream using LZW.BITS and block mode.
        out = MemoryFile()
        inf = StructReader(memoryview(data))

        if inf.peek(2) != self._MAGIC:
            self.log_info('No LZW signature found, assuming raw stream.')
            maxbits = LZW.BITS
            block_mode = True
        else:
            inf.seekrel(2)
            maxbits = inf.read_integer(5)
            if inf.read_integer(2) != 0:
                self.log_info('reserved bits were set in LZW header')
            block_mode = bool(inf.read_bit())

        if maxbits > LZW.BITS:
            raise ValueError(F'Compressed with {maxbits} bits; cannot handle file.')

        maxmaxcode = 1 << maxbits

        ibuf = inf.read()

        # Code table: tab_prefix[c] is the code of the string without its last byte,
        # tab_suffix[c] is that last byte.
        tab_suffix = bytearray(LZW.WSIZE * 2)
        tab_prefix = array('H', itertools.repeat(0, 1 << LZW.BITS))

        n_bits = LZW.INIT_BITS
        maxcode = (1 << n_bits) - 1
        bitmask = (1 << n_bits) - 1
        oldcode = ~0  # -1 marks "no code read yet"
        finchar = +0
        posbits = +0

        free_entry = LZW.FIRST if block_mode else 0x100
        # Codes below 0x100 stand for the byte of the same value.
        tab_suffix[:0x100] = range(0x100)
        resetbuf = True

        # Each pass of the outer loop re-bases the input buffer at the current bit position
        # after the code width changed or the table was cleared.
        while resetbuf:
            resetbuf = False

            ibuf = ibuf[posbits >> 3:]
            insize = len(ibuf)
            posbits = 0
            inbits = (insize << 3) - (n_bits - 1)

            while inbits > posbits:
                if free_entry > maxcode:
                    # The table is full for the current width: move the bit position up
                    # to the next multiple of n_bits * 8 bits (it stays put if already
                    # aligned) and continue with codes that are one bit wider.
                    n = n_bits << 3
                    p = posbits - 1
                    posbits = p + (n - (p + n) % n)
                    n_bits += 1
                    if (n_bits == maxbits):
                        maxcode = maxmaxcode
                    else:
                        maxcode = (1 << n_bits) - 1
                    bitmask = (1 << n_bits) - 1
                    resetbuf = True
                    break

                # Extract the next code from up to three little-endian bytes.
                p = ibuf[posbits >> 3:]
                code = int.from_bytes(p[:3], 'little') >> (posbits & 7) & bitmask
                posbits += n_bits

                if oldcode == -1:
                    # The very first code has to be a plain byte value.
                    if code >= 256:
                        raise ValueError('corrupt input.')
                    oldcode = code
                    finchar = oldcode
                    out.write_byte(finchar)
                    continue

                if code == LZW.CLEAR and block_mode:
                    # Clear code: reset the table, re-align as above, and return to the
                    # initial code width.
                    tab_prefix[:0x100] = array('H', itertools.repeat(0, 0x100))
                    free_entry = LZW.FIRST - 1
                    n = n_bits << 3
                    p = posbits - 1
                    posbits = p + (n - (p + n) % n)
                    n_bits = LZW.INIT_BITS
                    maxcode = (1 << n_bits) - 1
                    bitmask = (1 << n_bits) - 1
                    resetbuf = True
                    break

                incode = code
                stack = bytearray()

                if code >= free_entry:
                    # Only the code that is about to be defined may be referenced ahead
                    # of its definition; anything beyond it is invalid.
                    if code > free_entry:
                        raise RefineryPartialResult('corrupt input.', out.getbuffer())
                    stack.append(finchar)
                    code = oldcode
                # Walk the prefix chain; bytes are collected last-to-first.
                while code >= 256:
                    stack.append(tab_suffix[code])
                    code = tab_prefix[code]

                finchar = tab_suffix[code]
                stack.append(finchar)
                stack.reverse()
                out.write(stack)
                code = free_entry

                # Define a new code: the previous string extended by the first byte of
                # the string that was just written.
                if code < maxmaxcode:
                    tab_prefix[code] = oldcode & 0xFFFF
                    tab_suffix[code] = finchar & 0x00FF
                    free_entry = code + 1

                oldcode = incode

        return out.getvalue()

    @classmethod
    def handles(self, data: bytearray) -> Optional[bool]:
        # True when the input starts with the two-byte signature, None otherwise.
        sig = self._MAGIC
        if data[:len(sig)] == sig:
            return True
class machometa (all=True, header=False, linked_images=False, signatures=False, version=False, load_commands=False, exports=False, imports=False, tabular=False)
-
This unit is implemented in
refinery.units.formats.macho.machometa
and has the following command-line interface: usage: machometa [-h] [-L] [-Q] [-0] [-v] [-c] [-H] [-K] [-S] [-V] [-D] [-E] [-I] [-t] Extract metadata from Mach-O files. optional arguments: -c, --custom Unless enabled, all default categories will be extracted. -H, --header Parse basic data from the Mach-O header. -K, --linked-images Parse all library images linked by the Mach-O. -S, --signatures Parse signature and entitlement information. -V, --version Parse version information from the Mach-O load commands. -D, --load-commands Parse load commands from the Mach-O header. -E, --exports List all exported functions. -I, --imports List all imported functions. -t, --tabular Print information in a table rather than as JSON generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class machometa(Unit):
    """
    Extract metadata from Mach-O files.
    """
    def __init__(
        self, all: Arg('-c', '--custom',
            help='Unless enabled, all default categories will be extracted.') = True,
        header: Arg('-H', help='Parse basic data from the Mach-O header.') = False,
        linked_images: Arg('-K', help='Parse all library images linked by the Mach-O.') = False,
        signatures: Arg('-S', help='Parse signature and entitlement information.') = False,
        version: Arg('-V', help='Parse version information from the Mach-O load commands.') = False,
        load_commands: Arg('-D', help='Parse load commands from the Mach-O header.') = False,
        exports: Arg('-E', help='List all exported functions.') = False,
        imports: Arg('-I', help='List all imported functions.') = False,
        tabular: Arg('-t', help='Print information in a table rather than as JSON') = False,
    ):
        # Header, linked images, version and signatures are the categories switched on by
        # `all`; load commands, imports and exports are only produced on explicit request.
        super().__init__(
            header=all or header,
            linked_images=all or linked_images,
            version=all or version,
            signatures=all or signatures,
            load_commands=load_commands,
            imports=imports,
            exports=exports,
            tabular=tabular,
        )

    @Unit.Requires('k2l>=2.0', 'all')
    def _ktool():
        # Imports the ktool package together with the submodules used by this unit.
        import ktool
        import ktool.macho
        import ktool.codesign
        return ktool

    def compute_symhash(self, macho_image: Image) -> str:
        # Returns the MD5 hex digest of the sorted, de-duplicated, comma-joined full names
        # of all external symbols whose `types` attribute is falsy.
        def _symbols(symbols: Iterable[Symbol]):
            for sym in symbols:
                if sym.types:
                    continue
                yield sym.fullname
        symbols = sorted(set(_symbols(macho_image.symbol_table.ext)))
        symbols: str = ','.join(symbols)
        return md5(symbols.encode('utf8')).hexdigest()

    def parse_macho_header(self, macho_image: Image, data=None) -> Dict:
        # Collects basic header fields; the result is empty when there is no dyld header.
        info = {}
        macho_header = macho_image.macho_header
        dyld_header = macho_image.macho_header.dyld_header
        if dyld_header is not None:
            info['Type'] = dyld_header.type_name
            info['Magic'] = dyld_header.magic
            info['CPUType'] = macho_image.slice.type.name
            info['CPUSubType'] = macho_image.slice.subtype.name
            info['FileType'] = macho_image.macho_header.filetype.name
            info['LoadCount'] = dyld_header.loadcnt
            info['LoadSize'] = dyld_header.loadsize
            info['Flags'] = [flag.name for flag in macho_header.flags]
            info['Reserved'] = dyld_header.reserved
        return info

    def parse_linked_images(self, macho_image: Image, data=None) -> Dict:
        # Groups the install names of all linked images by the name of the load command
        # that references them.
        load_command_images = {}
        linked_images = macho_image.linked_images
        LOAD_COMMAND = self._ktool.macho.LOAD_COMMAND
        for linked_image in linked_images:
            load_command_name = LOAD_COMMAND(linked_image.cmd.cmd).name
            load_command_images.setdefault(load_command_name, []).append(linked_image.install_name)
        return load_command_images

    def parse_signature(self, macho_image: Image, data=None) -> Dict:
        # Extracts code signing information: the ad-hoc flag and identifier from the code
        # directory, the parsed CMS signature, the raw requirements blob as hex, and the
        # entitlements parsed as a property list.

        _km = self._ktool.macho
        _kc = self._ktool.codesign

        class CodeDirectoryBlob(_km.Struct):
            # Field layout used below to read a code directory blob from the image.
            FIELDS = {
                'magic': _km.uint32_t,
                'length': _km.uint32_t,
                'version': _km.uint32_t,
                'flags': _km.uint32_t,
                'hashOffset': _km.uint32_t,
                'identOffset': _km.uint32_t,
                'nSpecialSlots': _km.uint32_t,
                'nCodeSlots': _km.uint32_t,
                'codeLimit': _km.uint32_t,
                'hashSize': _km.uint8_t,
                'hashType': _km.uint8_t,
                'platform': _km.uint8_t,
                'pageSize': _km.uint8_t,
                'spare2': _km.uint32_t
            }

            def __init__(self, byte_order='little'):
                super().__init__(byte_order=byte_order)
                self.magic = 0
                self.length = 0
                self.version = 0
                self.flags = 0
                self.hashOffset = 0
                self.identOffset = 0
                self.nSpecialSlots = 0
                self.nCodeSlots = 0
                self.codeLimit = 0
                self.hashSize = 0
                self.hashType = 0
                self.platform = 0
                self.pageSize = 0
                self.spare2 = 0

        info = {}
        if macho_image.codesign_info is not None:
            superblob: SuperBlob = macho_image.codesign_info.superblob

            for blob in macho_image.codesign_info.slots:
                blob: BlobIndex
                # ktool does not include code for extracting Blobs of types
                # CSSLOT_CODEDIRECTORY, CSSLOT_CMS_SIGNATURE
                # so we must do it ourselves here.
                if blob.type == _kc.CSSLOT_CODEDIRECTORY:
                    start = superblob.off + blob.offset
                    codedirectory_blob = macho_image.read_struct(start, CodeDirectoryBlob)

                    # Ad-hoc signing
                    flags = _kc.swap_32(codedirectory_blob.flags)
                    if flags & CS_ADHOC != 0:
                        info['AdHocSigned'] = True
                    else:
                        info['AdHocSigned'] = False

                    # Signature identifier
                    identifier_offset = _kc.swap_32(codedirectory_blob.identOffset)
                    identifier_data = macho_image.read_cstr(start + identifier_offset)
                    info['SignatureIdentifier'] = identifier_data

                if blob.type == 0x10000:  # CSSLOT_CMS_SIGNATURE
                    start = superblob.off + blob.offset
                    blob_data = macho_image.read_struct(start, _kc.Blob)
                    blob_data.magic = _kc.swap_32(blob_data.magic)
                    blob_data.length = _kc.swap_32(blob_data.length)
                    # The signature payload follows the blob header.
                    cms_signature = macho_image.read_bytearray(start + _kc.Blob.SIZE, blob_data.length - _kc.Blob.SIZE)

                    if len(cms_signature) != 0:
                        try:
                            parsed_cms_signature = pemeta.parse_signature(bytearray(cms_signature))
                            info['Signature'] = parsed_cms_signature
                        except ValueError as pkcs7_parse_error:
                            self.log_warn(F'Could not parse the data in CSSLOT_CMS_SIGNATURE as valid PKCS7 data: {pkcs7_parse_error!s}')

            if macho_image.codesign_info.req_dat is not None:
                # TODO: Parse the requirements blob,
                # which is encoded according to the code signing requirements language:
                # https://developer.apple.com/library/archive/documentation/Security/Conceptual/CodeSigningGuide/RequirementLang/RequirementLang.html
                info['Requirements'] = macho_image.codesign_info.req_dat.hex()
            if macho_image.codesign_info.entitlements is not None:
                entitlements: str = macho_image.codesign_info.entitlements
                if entitlements:
                    try:
                        entitlements = plistlib.loads(entitlements.encode('utf8'))
                    except Exception as error:
                        self.log_warn(F'failed to parse entitlements: {error!s}')
                    else:
                        info['Entitlements'] = entitlements

        return info

    def parse_version(self, macho_image: Image, data=None) -> Dict:
        # Reads source and build version information from the load commands. Only the
        # first command of each kind is used; further ones trigger a warning.
        info = {}
        load_commands = macho_image.macho_header.load_commands
        SVC = self._ktool.macho.source_version_command
        BVC = self._ktool.macho.build_version_command
        for load_command in load_commands:
            if isinstance(load_command, SVC):
                if 'SourceVersion' not in info:
                    info['SourceVersion'] = load_command.version
                else:
                    self.log_warn('More than one load command of type source_version_command found; the MachO file is possibly malformed')
            elif isinstance(load_command, BVC):
                if 'BuildVersion' not in info:
                    info['BuildVersion'] = {}
                    info['BuildVersion']['Platform'] = macho_image.platform.name
                    info['BuildVersion']['MinOS'] = F'{macho_image.minos.x}.{macho_image.minos.y}.{macho_image.minos.z}'
                    info['BuildVersion']['SDK'] = F'{macho_image.sdk_version.x}.{macho_image.sdk_version.y}.{macho_image.sdk_version.z}'
                    info['BuildVersion']['Ntools'] = load_command.ntools
                else:
                    self.log_warn('More than one load command of type build_version_command found; the MachO file is possibly malformed')
        return info

    def parse_load_commands(self, macho_image: Image, data=None) -> List:
        # Returns the serialized form of every load command in the header.
        info = []
        load_commands = macho_image.macho_header.load_commands
        for load_command in load_commands:
            info.append(load_command.serialize())
        return info

    def parse_imports(self, macho_image: Image, data=None) -> List:
        # Returns the names of all imports of the image.
        info = []
        for imp in macho_image.imports:
            info.append(imp.name)
        return info

    def parse_exports(self, macho_image: Image, data=None) -> List:
        # Returns the names of all exports of the image.
        info = []
        for exp in macho_image.exports:
            info.append(exp.name)
        return info

    def process(self, data: bytearray):
        # Loads the input with ktool, runs every enabled parser on each slice, and yields
        # the collected result through the ppjson unit.
        result = {}
        ktool = self._ktool
        with NoLogging(NoLogging.Mode.ALL):
            macho = ktool.load_macho_file(fp=MemoryFile(memoryview(data)), use_mmaped_io=False)
        if macho.type is ktool.MachOFileType.FAT:
            result['FileType'] = 'FAT'
        elif macho.type is ktool.MachOFileType.THIN:
            result['FileType'] = 'THIN'
        slices = []
        for macho_slice in macho.slices:
            slice_result = {}
            macho_image = ktool.load_image(fp=macho_slice)
            for switch, resolver, name in [
                (self.args.header, self.parse_macho_header, 'Header'),
                (self.args.linked_images, self.parse_linked_images, 'LinkedImages'),
                (self.args.signatures, self.parse_signature, 'Signatures'),
                (self.args.version, self.parse_version, 'Version'),
                (self.args.load_commands, self.parse_load_commands, 'LoadCommands'),
                (self.args.imports, self.parse_imports, 'Imports'),
                (self.args.exports, self.parse_exports, 'Exports'),
            ]:
                if not switch:
                    continue
                self.log_debug(F'parsing: {name}')
                try:
                    info = resolver(macho_image, data)
                except Exception as E:
                    # A failing parser only costs its own category.
                    self.log_info(F'failed to obtain {name}: {E!s}')
                    continue
                if info:
                    slice_result[name] = info
            if macho_image.uuid is not None:
                uuid: bytes = macho_image.uuid
                slice_result['UUID'] = uuid.hex()
            slice_result['SymHash'] = self.compute_symhash(macho_image)
            slice_result['BaseName'] = macho_image.base_name
            slice_result['InstallName'] = macho_image.install_name
            slices.append(slice_result)
        if slices:
            result['Slices'] = slices
        yield from ppjson(tabular=self.args.tabular)._pretty_output(result, indent=4, ensure_ascii=False)
Methods
def compute_symhash(self, macho_image)
-
Expand source code Browse git
def compute_symhash(self, macho_image: Image) -> str:
    """
    Compute the symbol hash of a Mach-O image.

    The names (`fullname`) of all entries in `macho_image.symbol_table.ext` whose `types`
    attribute is empty are de-duplicated, sorted, joined with commas, encoded as UTF-8 and
    hashed with MD5.

    Returns the hexadecimal MD5 digest as a string.
    """
    names = {
        symbol.fullname
        for symbol in macho_image.symbol_table.ext
        # symbols that carry any type information do not contribute to the hash
        if not symbol.types
    }
    joined = ','.join(sorted(names))
    return md5(joined.encode('utf8')).hexdigest()
def parse_macho_header(self, macho_image, data=None)
-
Expand source code Browse git
def parse_macho_header(self, macho_image: Image, data=None) -> Dict:
    """
    Collect the basic Mach-O header fields of one image.

    Returns an empty dictionary when the image has no dyld header; otherwise a dictionary
    with the keys Type, Magic, CPUType, CPUSubType, FileType, LoadCount, LoadSize, Flags
    and Reserved, in that order. The `data` argument is accepted but not used.
    """
    header = macho_image.macho_header
    dyld = macho_image.macho_header.dyld_header
    if dyld is None:
        return {}
    # The literal is evaluated top to bottom, so key order matches insertion order.
    return {
        'Type': dyld.type_name,
        'Magic': dyld.magic,
        'CPUType': macho_image.slice.type.name,
        'CPUSubType': macho_image.slice.subtype.name,
        'FileType': macho_image.macho_header.filetype.name,
        'LoadCount': dyld.loadcnt,
        'LoadSize': dyld.loadsize,
        'Flags': [flag.name for flag in header.flags],
        'Reserved': dyld.reserved,
    }
def parse_linked_images(self, macho_image, data=None)
-
Expand source code Browse git
def parse_linked_images(self, macho_image: Image, data=None) -> Dict:
    """
    Group the install names of all linked images by the name of the load command that
    references them.

    Returns a dictionary mapping load command names to lists of install names, in the
    order in which the linked images are listed. The `data` argument is not used.
    """
    images = macho_image.linked_images
    command_type = self._ktool.macho.LOAD_COMMAND
    grouped = {}
    for image in images:
        command_name = command_type(image.cmd.cmd).name
        if command_name not in grouped:
            grouped[command_name] = []
        grouped[command_name].append(image.install_name)
    return grouped
def parse_signature(self, macho_image, data=None)
-
Expand source code Browse git
def parse_signature(self, macho_image: Image, data=None) -> Dict:
    """
    Extract code signing information from a Mach-O image.

    Returns a dictionary that may contain the keys AdHocSigned, SignatureIdentifier,
    Signature, Requirements and Entitlements, depending on what the image provides. The
    dictionary is empty when the image has no code signing information. The `data`
    argument is not used.
    """
    _km = self._ktool.macho
    _kc = self._ktool.codesign

    # Local struct definition for the code directory blob; ktool does not provide one.
    class CodeDirectoryBlob(_km.Struct):
        FIELDS = {
            'magic': _km.uint32_t,
            'length': _km.uint32_t,
            'version': _km.uint32_t,
            'flags': _km.uint32_t,
            'hashOffset': _km.uint32_t,
            'identOffset': _km.uint32_t,
            'nSpecialSlots': _km.uint32_t,
            'nCodeSlots': _km.uint32_t,
            'codeLimit': _km.uint32_t,
            'hashSize': _km.uint8_t,
            'hashType': _km.uint8_t,
            'platform': _km.uint8_t,
            'pageSize': _km.uint8_t,
            'spare2': _km.uint32_t
        }

        def __init__(self, byte_order='little'):
            super().__init__(byte_order=byte_order)
            self.magic = 0
            self.length = 0
            self.version = 0
            self.flags = 0
            self.hashOffset = 0
            self.identOffset = 0
            self.nSpecialSlots = 0
            self.nCodeSlots = 0
            self.codeLimit = 0
            self.hashSize = 0
            self.hashType = 0
            self.platform = 0
            self.pageSize = 0
            self.spare2 = 0

    info = {}
    if macho_image.codesign_info is not None:
        superblob: SuperBlob = macho_image.codesign_info.superblob

        for blob in macho_image.codesign_info.slots:
            blob: BlobIndex
            # ktool does not include code for extracting Blobs of types
            # CSSLOT_CODEDIRECTORY, CSSLOT_CMS_SIGNATURE
            # so we must do it ourselves here.
            if blob.type == _kc.CSSLOT_CODEDIRECTORY:
                # Blob offsets are relative to the start of the super blob.
                start = superblob.off + blob.offset
                codedirectory_blob = macho_image.read_struct(start, CodeDirectoryBlob)

                # Ad-hoc signing
                # The struct is read with its default byte order, fields are byte-swapped
                # before use.
                flags = _kc.swap_32(codedirectory_blob.flags)
                if flags & CS_ADHOC != 0:
                    info['AdHocSigned'] = True
                else:
                    info['AdHocSigned'] = False

                # Signature identifier
                # The identifier offset is relative to the start of the code directory.
                identifier_offset = _kc.swap_32(codedirectory_blob.identOffset)
                identifier_data = macho_image.read_cstr(start + identifier_offset)
                info['SignatureIdentifier'] = identifier_data

            if blob.type == 0x10000:  # CSSLOT_CMS_SIGNATURE
                start = superblob.off + blob.offset
                blob_data = macho_image.read_struct(start, _kc.Blob)
                blob_data.magic = _kc.swap_32(blob_data.magic)
                blob_data.length = _kc.swap_32(blob_data.length)
                # The blob length includes the blob header, the payload follows it.
                cms_signature = macho_image.read_bytearray(start + _kc.Blob.SIZE, blob_data.length - _kc.Blob.SIZE)

                if len(cms_signature) != 0:
                    try:
                        parsed_cms_signature = pemeta.parse_signature(bytearray(cms_signature))
                        info['Signature'] = parsed_cms_signature
                    except ValueError as pkcs7_parse_error:
                        self.log_warn(F'Could not parse the data in CSSLOT_CMS_SIGNATURE as valid PKCS7 data: {pkcs7_parse_error!s}')

        if macho_image.codesign_info.req_dat is not None:
            # TODO: Parse the requirements blob,
            # which is encoded according to the code signing requirements language:
            # https://developer.apple.com/library/archive/documentation/Security/Conceptual/CodeSigningGuide/RequirementLang/RequirementLang.html
            info['Requirements'] = macho_image.codesign_info.req_dat.hex()
        if macho_image.codesign_info.entitlements is not None:
            entitlements: str = macho_image.codesign_info.entitlements
            if entitlements:
                # Entitlements are only reported when they parse as a property list.
                try:
                    entitlements = plistlib.loads(entitlements.encode('utf8'))
                except Exception as error:
                    self.log_warn(F'failed to parse entitlements: {error!s}')
                else:
                    info['Entitlements'] = entitlements

    return info
def parse_version(self, macho_image, data=None)
-
Expand source code Browse git
def parse_version(self, macho_image: Image, data=None) -> Dict:
    """
    Extract version information from the load commands of a Mach-O image.

    The first source version command populates the key SourceVersion and the first build
    version command populates the key BuildVersion (Platform, MinOS, SDK, Ntools). Any
    further command of either kind only triggers a warning. The `data` argument is not
    used.
    """
    result = {}
    commands = macho_image.macho_header.load_commands
    source_version_type = self._ktool.macho.source_version_command
    build_version_type = self._ktool.macho.build_version_command
    for command in commands:
        if isinstance(command, source_version_type):
            if 'SourceVersion' in result:
                self.log_warn('More than one load command of type source_version_command found; the MachO file is possibly malformed')
                continue
            result['SourceVersion'] = command.version
        elif isinstance(command, build_version_type):
            if 'BuildVersion' in result:
                self.log_warn('More than one load command of type build_version_command found; the MachO file is possibly malformed')
                continue
            build = result['BuildVersion'] = {}
            build['Platform'] = macho_image.platform.name
            build['MinOS'] = F'{macho_image.minos.x}.{macho_image.minos.y}.{macho_image.minos.z}'
            build['SDK'] = F'{macho_image.sdk_version.x}.{macho_image.sdk_version.y}.{macho_image.sdk_version.z}'
            build['Ntools'] = command.ntools
    return result
def parse_load_commands(self, macho_image, data=None)
-
Expand source code Browse git
def parse_load_commands(self, macho_image: Image, data=None) -> List:
    """
    Return the serialized form of every load command in the image header, in order.
    The `data` argument is not used.
    """
    commands = macho_image.macho_header.load_commands
    return [command.serialize() for command in commands]
def parse_imports(self, macho_image, data=None)
-
Expand source code Browse git
def parse_imports(self, macho_image: Image, data=None) -> List:
    """
    Return the names of all imports of the image, in order. The `data` argument is not
    used.
    """
    return [imported.name for imported in macho_image.imports]
def parse_exports(self, macho_image, data=None)
-
Expand source code Browse git
def parse_exports(self, macho_image: Image, data=None) -> List:
    """
    Return the names of all exports of the image, in order. The `data` argument is not
    used.
    """
    return [exported.name for exported in macho_image.exports]
class map (index, image, blocksize=None)
-
This unit is implemented in
refinery.units.blockwise.map
and has the following commandline Interface:usage: map [-h] [-L] [-Q] [-0] [-v] [-R] [-B N] index image Each block of the input data which occurs as a block of the index argument is replaced by the corresponding block of the image argument. If a block size is specified, and if the index or image argument are byte sequences, they are unpacked into chunks of that size, and excess bytes that are not an integer multiple of the block size are discarded. To prevent any automatic chunking, the btoi handler can be used. positional arguments: index index characters image image characters optional arguments: -B, --blocksize N The size of each block in bytes, default is 1. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class map(BlockTransformation):
    """
    Each block of the input data which occurs as a block of the index argument is replaced by the
    corresponding block of the image argument. If a block size is specified, and if the index or
    image argument are byte sequences, they are unpacked into chunks of that size, and excess bytes
    that are not an integer multiple of the block size are discarded. To prevent any automatic
    chunking, the `refinery.lib.argformats.DelayedArgument.btoi` handler can be used.
    """
    _map: Optional[Dict[int, int]]

    def __init__(
        self,
        index: Arg.NumSeq(help='index characters'),
        image: Arg.NumSeq(help='image characters'),
        blocksize=None
    ):
        super().__init__(blocksize=blocksize, index=index, image=image, _truncate=2)
        self._map = None

    def reverse(self, data):
        # The reverse operation simply swaps the roles of index and image.
        return self._process(data, self.args.image, self.args.index)

    def process(self, data):
        return self._process(data, self.args.index, self.args.image)

    def _process(self, data: bytearray, index: Sequence[int], image: Sequence[int]):
        def unpacked(label, sequence):
            # Buffers are split into blocks; any other sequence is used as given.
            if not isbuffer(sequence):
                return sequence
            self.log_info(F'chunking {label} sequence into blocks of size {self.blocksize}')
            blocks = list(self.chunk(sequence))
            self.log_debug(F'{label} sequence: {blocks}')
            return blocks

        if not self.bytestream:
            index = unpacked('index', index)
            image = unpacked('image', image)

        if len(set(index)) != len(index):
            raise ValueError('The index sequence contains duplicates.')
        if len(index) > len(image):
            raise ValueError('The index sequence is longer than the image sequence.')

        if self.bytestream:
            # Single byte blocks: build a 256 entry lookup table and rewrite data in place.
            lookup = dict(zip(index, image))
            table = bytes(lookup.get(code, code) for code in range(0x100))
            if not isinstance(data, bytearray):
                data = bytearray(data)
            data[:] = (table[byte] for byte in data)
            return data

        try:
            # The mapping is only available to process_block while the parent runs.
            self._map = dict(zip(index, image))
            return super().process(data)
        finally:
            self._map = None

    def process_block(self, token):
        # Blocks without an entry in the mapping pass through unchanged.
        return self._map.get(token, token)
class max_ (key=None)
-
This unit is implemented in
refinery.units.meta.max
and has the following commandline Interface:usage: max [-h] [-L] [-Q] [-0] [-v] [key] Picks the maximum of all elements in the current frame. positional arguments: key A meta variable expression to sort by instead of sorting the content. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class max_(Unit):
    """
    Picks the maximum of all elements in the current `refinery.lib.frame`.
    """

    def __init__(
        self,
        key: Arg('key', type=str, help='A meta variable expression to sort by instead of sorting the content.') = None,
    ):
        super().__init__(key=key)

    def filter(self, chunks: Iterable[Chunk]):
        """
        Yield every invisible chunk unchanged, followed by the single visible chunk with the
        largest value. The value of a chunk is the chunk itself, or the meta variable named
        by the key argument if one was given. Nothing is yielded for visible chunks when
        there are none.
        """
        def get_value(chunk: Chunk):
            if key is None:
                return chunk
            return metavars(chunk).get(key)

        key = self.args.key
        it = iter(chunks)

        # Find the first visible chunk; it is the initial maximum.
        for max_chunk in it:
            if not max_chunk.visible:
                yield max_chunk
            else:
                max_index = 0
                max_value = get_value(max_chunk)
                break
        else:
            return

        # Continue on the same iterator: iterating `chunks` again would restart from the
        # beginning whenever it is a re-iterable container rather than a one-shot iterator.
        for index, chunk in enumerate(it, 1):
            if not chunk.visible:
                yield chunk
                continue
            value = get_value(chunk)
            try:
                is_max = value > max_value
            except TypeError:
                # Values that cannot be compared: an unset maximum always loses, any other
                # incomparable value is discarded.
                if max_value is None:
                    self.log_info(
                        F'Discarding chunk {max_index} in favor of {index} because {key} was not '
                        F'set on the former; new maximum is {value!r}.')
                    is_max = True
                else:
                    self.log_info(
                        F'Discarding chunk {index} because {key} had value {value!r}; it could not '
                        F'be compared to the current maximum {max_value!r} on chunk {max_index}.')
                    is_max = False
            if is_max:
                max_value = value
                max_chunk = chunk
                max_index = index

        yield max_chunk
class md2 (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.cryptographic
and has the following commandline Interface:usage: md2 [-h] [-L] [-Q] [-0] [-v] [-t] Returns the MD2 hash of the input data. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
class md4 (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.cryptographic
and has the following commandline Interface:usage: md4 [-h] [-L] [-Q] [-0] [-v] [-t] Returns the MD4 hash of the input data. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
class md5 (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.cryptographic
and has the following commandline Interface:usage: md5 [-h] [-L] [-Q] [-0] [-v] [-t] Returns the MD5 hash of the input data. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
class mimewords
-
This unit is implemented in
refinery.units.pattern.mimewords
and has the following commandline Interface:usage: mimewords [-h] [-L] [-Q] [-0] [-v] Implements the decoding of MIME encoded-word syntax from RFC-2047. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class mimewords(Unit):
    """
    Implements the decoding of MIME encoded-word syntax from RFC-2047.
    """

    @unicoded
    def process(self, data: str) -> str:
        def decode_word(match):
            # Each match is a single encoded word, so decode_header yields exactly one part.
            word = match[0]
            self.log_info('encoded mime word:', word)
            (raw, codec), = decode_header(word)
            return codecs.decode(raw, codec, errors='surrogateescape')

        pattern = R"=(?:\?[^\?]*){3}\?="
        return re.sub(pattern, decode_word, data)
class min_ (key=None)
-
This unit is implemented in
refinery.units.meta.min
and has the following commandline Interface:usage: min [-h] [-L] [-Q] [-0] [-v] [key] Picks the minimum of all elements in the current frame. positional arguments: key A meta variable expression to sort by instead of sorting the content. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class min_(Unit):
    """
    Picks the minimum of all elements in the current `refinery.lib.frame`.
    """

    def __init__(
        self,
        key: Arg('key', type=str, help='A meta variable expression to sort by instead of sorting the content.') = None,
    ):
        super().__init__(key=key)

    def filter(self, chunks: Iterable[Chunk]):
        """
        Yield every invisible chunk unchanged, followed by the single visible chunk with the
        smallest value. The value of a chunk is the chunk itself, or the meta variable named
        by the key argument if one was given. Nothing is yielded for visible chunks when
        there are none.
        """
        def get_value(chunk: Chunk):
            if key is None:
                return chunk
            return metavars(chunk).get(key)

        key = self.args.key
        it = iter(chunks)

        # Find the first visible chunk; it is the initial minimum.
        for min_chunk in it:
            if not min_chunk.visible:
                yield min_chunk
            else:
                min_index = 0
                min_value = get_value(min_chunk)
                break
        else:
            return

        # Continue on the same iterator: iterating `chunks` again would restart from the
        # beginning whenever it is a re-iterable container rather than a one-shot iterator.
        for index, chunk in enumerate(it, 1):
            if not chunk.visible:
                yield chunk
                continue
            value = get_value(chunk)
            try:
                is_min = value < min_value
            except TypeError:
                # Values that cannot be compared: an unset minimum always loses, any other
                # incomparable value is discarded.
                if min_value is None:
                    self.log_info(
                        F'Discarding chunk {min_index} in favor of {index} because {key} was not '
                        F'set on the former; new minimum is {value!r}.')
                    is_min = True
                else:
                    self.log_info(
                        F'Discarding chunk {index} because {key} had value {value!r}; it could not '
                        F'be compared to the current minimum {min_value!r} on chunk {min_index}.')
                    is_min = False
            if is_min:
                min_value = value
                min_chunk = chunk
                min_index = index

        yield min_chunk
class mmh128x32 (seed=0, text=False)
-
This unit is implemented in
refinery.units.crypto.hash.murmur
and has the following commandline Interface:usage: mmh128x32 [-h] [-L] [-Q] [-0] [-v] [-t] [N] Returns the 128bit Murmur Hash of the input data, 32bit variant. positional arguments: N optional seed value optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class mmh128x32(MurMurHash):
    """
    Returns the 128bit Murmur Hash of the input data, 32bit variant.
    """
    def _algorithm(self, data: bytes) -> bytes:
        # The seed comes from the unit arguments.
        return mmh128digest32(data, self.args.seed)
class mmh128x64 (seed=0, text=False)
-
This unit is implemented in
refinery.units.crypto.hash.murmur
and has the following commandline Interface:usage: mmh128x64 [-h] [-L] [-Q] [-0] [-v] [-t] [N] Returns the 128bit Murmur Hash of the input data, 64bit variant. positional arguments: N optional seed value optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class mmh128x64(MurMurHash):
    """
    Returns the 128bit Murmur Hash of the input data, 64bit variant.
    """
    def _algorithm(self, data: bytes) -> bytes:
        # The seed comes from the unit arguments.
        seed = self.args.seed
        digest = mmh128digest64(data, seed)
        return digest
class mmh32 (seed=0, text=False)
-
This unit is implemented in
refinery.units.crypto.hash.murmur
and has the following commandline Interface:usage: mmh32 [-h] [-L] [-Q] [-0] [-v] [-t] [N] Returns the 32bit Murmur Hash of the input data. positional arguments: N optional seed value optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class mmh32(MurMurHash):
    """
    Returns the 32bit Murmur Hash of the input data.
    """
    def _algorithm(self, data: bytes) -> bytes:
        # The seed comes from the unit arguments.
        seed = self.args.seed
        digest = mmh32digest(data, seed)
        return digest
class morse (language=None)
-
This unit is implemented in
refinery.units.encoding.morse
and has the following commandline Interface:usage: morse [-h] [-L] [-Q] [-0] [-v] [-R] [-F] [MorseLanguage] Morse encoding and decoding. All tokens in the input data which consist of dashes and dots are replaced by their Morse decoding. positional arguments: MorseLanguage Optionally choose a language. If none is specified, the unit will attempt to detect the language automatically. Options are: ar, de, en, es, fr, he, ru, ua generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class morse(Unit):
    """
    Morse encoding and decoding. All tokens in the input data which consist of dashes and dots
    are replaced by their Morse decoding.
    """
    def __init__(
        self,
        language: Arg.Option(choices=MorseLanguage, help=(
            'Optionally choose a language. If none is specified, the unit will attempt to detect '
            'the language automatically. Options are: {choices}')) = None,
    ):
        super().__init__(language=Arg.AsOption(language, MorseLanguage))

    @classmethod
    def handles(self, data: bytearray):
        # Input is handled when it consists only of dashes, dots and whitespace; otherwise
        # the method falls through and returns None.
        if re.fullmatch(BR'[-.\s]+', data, re.DOTALL):
            return True

    @unicoded
    def process(self, data: str):
        """
        Decode Morse code. Raises LookupError when no language was given and none of the
        language tables covers all tokens; raises ValueError for a token that is found in
        none of the selected tables.
        """
        language: MorseLanguage = self.args.language
        # Splitting on a capturing group keeps the separators: even positions hold tokens,
        # odd positions hold the whitespace between them.
        parsed = re.split('(\\s+)', data)
        tokens = {t for t in parsed[::2] if t}
        tables = [
            self._DECODE_SYMBOL,
            self._DECODE_DIGITS,
        ]
        if language is not None:
            tables.append(self._DECODE[language])
        else:
            # Language detection: only tables that cover all tokens are candidates. English
            # wins immediately; otherwise the table with the fewest unused codes is chosen.
            special = set(self._DECODE_SYMBOL) | set(self._DECODE_DIGITS)
            best_ratio = 1  # number of unused codes
            best_table = None
            for language in MorseLanguage:
                table = self._DECODE[language]
                codes = set(table)
                if not tokens <= codes | special:
                    continue
                if language == MorseLanguage.EN:
                    best_table = table
                    break
                ratio = len(codes - tokens) / len(codes)
                if ratio < best_ratio:
                    best_ratio = ratio
                    best_table = table
            if best_table is None:
                raise LookupError('Unable to determine language, please specify it manually.')
            tables.append(best_table)
        with io.StringIO() as out:
            for k, string in enumerate(parsed):
                if k % 2 == 1:
                    # Whitespace separator: drop the first character and, if more than one
                    # character remains, also the last one. This mirrors the padding that
                    # the reverse operation adds around word separators.
                    string = string[1:]
                    if len(string) > 1:
                        string = string[:-1]
                    out.write(string)
                    continue
                if not string:
                    continue
                for table in tables:
                    try:
                        out.write(table[string])
                        break
                    except KeyError:
                        continue
                else:
                    raise ValueError(F'invalid token: {string}')
            return out.getvalue()

    @unicoded
    def reverse(self, data: str):
        """
        Encode text as Morse code. Raises ValueError for a letter that is found in none of
        the selected tables.
        """
        language: MorseLanguage = self.args.language
        tables = [
            self._ENCODE_SYMBOL,
            self._ENCODE_DIGITS,
        ]
        if language is not None:
            tables.append(self._ENCODE[language])
        else:
            # Without a language, all language tables are searched in definition order.
            tables.extend(self._ENCODE.values())

        def _encode(letter):
            for table in tables:
                try:
                    return table[letter]
                except KeyError:
                    continue
            else:
                raise ValueError(F'cannot encode letter "{letter}"')

        with io.StringIO() as out:
            for k, word in enumerate(re.split('(\\s+)', data)):
                if k % 2 == 1:
                    # Original whitespace is kept and padded with one space on either side.
                    out.write(F' {word} ')
                    continue
                out.write(' '.join(_encode(letter) for letter in word.lower()))
            return out.getvalue()

    # Encoding tables, one per language. The decoding tables below are derived from these.
    _ENCODE = {
        MorseLanguage.EN: {
            'a': '.-',
            'b': '-...',
            'c': '-.-.',
            'd': '-..',
            'e': '.',
            'f': '..-.',
            'g': '--.',
            'h': '....',
            'i': '..',
            'j': '.---',
            'k': '-.-',
            'l': '.-..',
            'm': '--',
            'n': '-.',
            'o': '---',
            'p': '.--.',
            'q': '--.-',
            'r': '.-.',
            's': '...',
            't': '-',
            'u': '..-',
            'v': '...-',
            'w': '.--',
            'x': '-..-',
            'y': '-.--',
            'z': '--..',
        }
    }
    _ENCODE[MorseLanguage.ES] = _extend_dictionary(_ENCODE[MorseLanguage.EN], {
        'á': '.--.-',
        'é': '..-..',
        'í': '..',
        'ñ': '--.--',
        'ó': '---.',
        'ú': '..-',
        'ü': '..--',
        '¿': '..-.-',
        '¡': '--...-',
    })
    _ENCODE[MorseLanguage.DE] = _extend_dictionary(_ENCODE[MorseLanguage.EN], {
        'ä': '.-.-',
        'ö': '---.',
        'ü': '..--',
        'ß': '...--..',
    })
    _ENCODE[MorseLanguage.FR] = _extend_dictionary(_ENCODE[MorseLanguage.EN], {
        'à': '.--.-',
        'â': '.--.-',
        'ç': '-.-..',
        'è': '.-..-',
        'é': '..-..',
        'ê': '-..-.',
        'ë': '..-..',
        'î': '..',
        'ï': '-..--',
        'ô': '---',
        'ù': '..-',
        'ü': '..--',
    })
    _ENCODE[MorseLanguage.RU] = {
        'а': '.-',
        'б': '-...',
        'в': '.--',
        'г': '--.',
        'д': '-..',
        'е': '.',
        'ё': '.',
        'ж': '...-',
        'з': '--..',
        'и': '..',
        'й': '.---',
        'к': '-.-',
        'л': '.-..',
        'м': '--',
        'н': '-.',
        'о': '---',
        'п': '.--.',
        'р': '.-.',
        'с': '...',
        'т': '-',
        'у': '..-',
        'ф': '..-.',
        'х': '....',
        'ц': '-.-.',
        'ч': '---.',
        'ш': '----',
        'щ': '--.-',
        'ъ': '--.--',
        'ы': '-.--',
        'ь': '-..-',
        'э': '..-..',
        'ю': '..--',
        'я': '.-.-',
    }
    _ENCODE[MorseLanguage.UA] = _extend_dictionary(_ENCODE[MorseLanguage.RU], {
        'ґ': '--.',
        'и': '-.--',
        'ї': '.---.',
    })
    # NOTE(review): the two statements below rename keys of the Ukrainian table. Which code
    # ends up on the second renamed key depends on how _extend_dictionary merges the override
    # given just above - confirm the resulting entries are the intended ones.
    _ENCODE[MorseLanguage.UA]['є'] = _ENCODE[MorseLanguage.UA].pop('э')
    _ENCODE[MorseLanguage.UA]['і'] = _ENCODE[MorseLanguage.UA].pop('и')
    _ENCODE[MorseLanguage.HE] = {
        'א': '.-',
        'ב': '-...',
        'ג': '--.',
        'ד': '-..',
        'ה': '---',
        'ו': '.',
        'ז': '--..',
        'ח': '....',
        'ט': '..--',
        'י': '..',
        'כ': '-.-',
        'ל': '.-..',
        'מ': '--',
        'נ': '-.',
        'ס': '-.-.',
        'ע': '.---',
        'פ': '.--.',
        'צ': '.--',
        'ק': '--.-',
        'ר': '.-.',
        'ש': '...',
        'ת': '-',
    }
    _ENCODE[MorseLanguage.AR] = {
        'ا': '.-',
        'ب': '-...',
        'ت': '-',
        'ث': '-.-.',
        'ج': '.---',
        'ح': '....',
        'خ': '---',
        'د': '-..',
        'ذ': '--..',
        'ر': '.-.',
        'ز': '---.',
        'س': '...',
        'ش': '----',
        'ص': '-..-',
        'ض': '...-',
        'ط': '..-',
        'ظ': '-.--',
        'ع': '.-.-',
        'غ': '--.',
        'ف': '..-.',
        'ق': '--.-',
        'ك': '-.-',
        'ل': '.-..',
        'م': '--',
        'ن': '-.',
        'ه': '..-..',
        'و': '.--',
        'ي': '..',
        'ﺀ': '.',
    }
    # Digits and punctuation are shared by all languages.
    _ENCODE_DIGITS = {
        '0': '-----',
        '1': '.----',
        '2': '..---',
        '3': '...--',
        '4': '....-',
        '5': '.....',
        '6': '-....',
        '7': '--...',
        '8': '---..',
        '9': '----.'
    }
    _ENCODE_SYMBOL = {
        '_': '..--.-',
        '-': '-....-',
        ',': '--..--',
        ';': '-.-.-.',
        ':': '---...',
        '!': '-.-.--',
        '?': '..--..',
        '.': '.-.-.-',
        '"': '.-..-.',
        '(': '-.--.',
        ')': '-.--.-',
        '@': '.--.-.',
        '/': '-..-.',
        '\\': '-..-.',
        '&': '.-...',
        '+': '.-.-.',
        '=': '-...-',
        '$': '...-..-',
        "'": '.----.',
    }
    _DECODE = {
        lng: _reverse_dictionary(tbl) for lng, tbl in _ENCODE.items()}
    _DECODE_SYMBOL = _reverse_dictionary(_ENCODE_SYMBOL)
    _DECODE_DIGITS = _reverse_dictionary(_ENCODE_DIGITS)
class mscdk (size, hash='MD5')
-
This unit is implemented in
refinery.units.crypto.keyderive.mscdk
and has the following commandline Interface:usage: mscdk [-h] [-L] [-Q] [-0] [-v] size [hash] An implementation of the CryptDeriveKey routine available from the Win32 API. positional arguments: size The number of bytes to generate. hash Specify one of these algorithms (default is MD5): md2, md4, md5, sha1, sha256, sha512, sha224, sha384 generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class mscdk(KeyDerivation):
    """
    An implementation of the CryptDeriveKey routine available from the Win32 API.
    """
    def __init__(self, size, hash='MD5'):
        super().__init__(size=size, salt=None, hash=hash)

    def process(self, data):
        def digest(x):
            return self.hash.new(x).digest()

        requested = self.args.size
        sha2_family = (HASH.SHA224, HASH.SHA256, HASH.SHA384, HASH.SHA512)

        if self.args.hash in sha2_family:
            # For these algorithms the key material is the plain digest of the input.
            material = digest(data)
            available = len(material)
        else:
            # Otherwise the digest is XOR-ed into two 64 byte pads filled with 0x36 and 0x5C
            # and the key material is the concatenation of the digests of both pads.
            available = 2 * self.hash.digest_size
            seed = digest(data)
            del data
            inner_pad = bytearray([0x36] * 64)
            outer_pad = bytearray([0x5C] * 64)
            for position, byte in enumerate(seed):
                inner_pad[position] ^= byte
                outer_pad[position] ^= byte
            material = digest(inner_pad) + digest(outer_pad)

        if requested > available:
            raise RefineryPartialResult(F'too many bytes requested, can only provide {available}', partial=material)
        return material[:requested]
class mscf (mode=None)
-
This unit is implemented in
refinery.units.compression.mscf
and has the following commandline Interface:usage: mscf [-h] [-L] [-Q] [-0] [-v] [-F] [MODE] The Microsoft Compression Format unit implements the format and algorithms used by the Microsoft Compression API. The implementation for LZMS is currently missing, but MSZIP and XPRESS (both with and without Huffman table) are supported. This pure Python implementation is very slow when compared to native code, so decompressing very large inputs can take several minutes. positional arguments: MODE Manually select decompression mode (mszip, xpress, xpress-huff, lzms); by default the unit attempts to derive the mode from the header, but this will fail for raw streams. However, even if a header is found, a manually specified mode will take precedence. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class mscf(Unit):
    """
    The Microsoft Compression Format unit implements the format and algorithms used by the Microsoft
    Compression API. The implementation for LZMS is currently missing, but MSZIP and XPRESS (both
    with and without Huffman table) are supported. This pure Python implementation is very slow when
    compared to native code, so decompressing very large inputs can take several minutes.
    """

    # Byte sequence expected at the start of data that carries a header.
    _SIGNATURE = B'\x0A\x51\xE5\xC0'

    def __init__(
        self,
        mode: Unit.Arg.Option(choices=MODE, help=(
            'Manually select decompression mode ({choices}); by default the unit attempts to derive the '
            'mode from the header, but this will fail for raw streams. However, even if a header is '
            'found, a manually specified mode will take precedence.')) = None,
    ):
        mode = Unit.Arg.AsOption(mode, MODE)
        super().__init__(mode=mode)

    def process(self, data):
        """
        Decompress the input. When the signature is missing and a mode was specified, the
        input is treated as a raw stream. Otherwise the header is parsed and the chunks that
        follow it are decompressed one by one.
        """
        mode: MODE = self.args.mode
        with StructReader(memoryview(data)) as reader, MemoryFile() as writer:
            reader: StructReader[memoryview]
            # The check byte covers the first 6 bytes and the 0x11 bytes after the check byte.
            check = zlib.crc32(reader.peek(6))
            magic = reader.read(4)
            if magic != self._SIGNATURE:
                if mode is None:
                    self.log_warn(
                        F'data starts with {magic.hex().upper()} rather than the expected sequence '
                        F'{self._SIGNATURE.hex().upper()}; this could be a raw stream.')
                else:
                    reader.seek(0)
                    handler = self._get_handler(mode)
                    handler(reader, writer, None)
                    return writer.getbuffer()

            header_size = reader.u16()
            if header_size != 24:
                self.log_warn(F'the header size {header_size} was not equal to 24')

            crc32byte = reader.u8()
            check = zlib.crc32(reader.peek(0x11), check) & 0xFF
            if check != crc32byte:
                self.log_warn(F'the CRC32 check byte was {crc32byte}, computed value was {check}')

            _mode_code = reader.u8()

            try:
                _mode = MODE(_mode_code)
            except ValueError:
                msg = F'header contains unknown compression type code {_mode_code}'
                if mode is None:
                    raise ValueError(msg)
                else:
                    self.log_warn(msg)
            else:
                # A manually specified mode takes precedence over the header value.
                if mode is not None and mode != _mode:
                    logger = self.log_warn
                else:
                    logger = self.log_info
                    mode = _mode
                logger(F'header specifies algorithm {_mode.name}')

            self.log_info(F'using algorithm {mode.name}')
            decompress = self._get_handler(mode)

            final_size = reader.u32()
            _unknown_1 = reader.u32()
            chunk_size = reader.u32()
            _unknown_2 = reader.u32()

            if _unknown_1 != 0:
                self.log_warn(F'unknown value 1 was unexpectedly nonzero: 0x{_unknown_1:08X}')
            if _unknown_2 != 0:
                self.log_warn(F'unknown value 2 was unexpectedly nonzero: 0x{_unknown_2:08X}')

            self.log_debug(F'final size: 0x{final_size:08X}')
            self.log_debug(F'chunk size: 0x{chunk_size:08X}')

            if chunk_size > COMPRESS_MAX_CHUNK:
                raise ValueError('the header chunk size is greater than the maximum value')

            while len(writer) < final_size:
                src_size = reader.u32()
                src_data = reader.read(src_size)
                if len(src_data) != src_size:
                    raise IndexError(F'Attempted to read {src_size} bytes, but got only {len(src_data)}.')
                # A chunk that exactly fills the remaining output is stored uncompressed.
                if src_size + len(writer) == final_size:
                    self.log_debug(F'final chunk is uncompressed, appending {src_size} raw bytes to output')
                    writer.write(src_data)
                    break
                self.log_debug(F'reading chunk of size {src_size}')
                start = writer.tell()
                chunk = StructReader(src_data)
                # The last chunk may be shorter than the chunk size from the header.
                target = min(chunk_size, final_size - len(writer))
                decompress(chunk, writer, target)
                writer.flush()
                written = writer.tell() - start
                if written != target:
                    # Report the size that was actually expected for this chunk.
                    raise RuntimeError(F'decompressed output had unexpected size {written} instead of {target}')

            if not reader.eof:
                self.log_info(F'decompression complete with {reader.remaining_bytes} bytes remaining in input')
            return writer.getbuffer()

    def _get_handler(self, mode: MODE) -> Callable[[StructReader, MemoryFile, Optional[int]], None]:
        """
        Return the decompression routine for the given mode. Raises NotImplementedError for
        a mode that has no implementation.
        """
        # Members are read from the enumeration class rather than from the `mode` instance.
        decompress = {
            MODE.MSZIP       : self._decompress_mszip,
            MODE.XPRESS_HUFF : self._decompress_xpress_huffman,
            MODE.XPRESS      : self._decompress_xpress,
        }.get(mode, None)
        if decompress is None:
            raise NotImplementedError(F'algorithm {mode.name} is not yet implemented')
        return decompress

    def _decompress_mszip(self, reader: StructReader, writer: MemoryFile, target: Optional[int] = None):
        """
        Decompress one MSZIP chunk: a CK header followed by a raw deflate stream. The output
        written so far is passed to zlib as the preset dictionary.
        """
        header = bytes(reader.read(2))
        if header != B'CK':
            raise ValueError(F'chunk did not begin with CK header, got {header!r} instead')
        decompress = zlib.decompressobj(-zlib.MAX_WBITS, zdict=writer.getbuffer())
        writer.write(decompress.decompress(reader.read()))
        writer.write(decompress.flush())

    def _decompress_xpress_huffman(
        self,
        reader: StructReader,
        writer: MemoryFile,
        target: Optional[int] = None,
        max_chunk_size: int = 0x10000
    ) -> None:
        """
        Decompress XPRESS data with Huffman coding. A new Huffman table is read from the
        input for every block of at most `max_chunk_size` output bytes. Decompression stops
        when the output position reaches `target` (relative to the position on entry) or
        when the input is exhausted.
        """
        limit = writer.tell()
        if target is not None:
            target += limit

        while not reader.eof:

            if reader.remaining_bytes < XPRESS_NUM_SYMBOLS // 2:
                raise IndexError(
                    F'There are only {reader.remaining_bytes} bytes remaining in the input buffer,'
                    F' but at least {XPRESS_NUM_SYMBOLS // 2} are required to read a Huffman table.')

            # The table stores one 4 bit code length per symbol.
            table = bytearray(reader.read_integer(4) for _ in range(XPRESS_NUM_SYMBOLS))
            table = make_huffman_decode_table(table, XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN)
            limit = limit + max_chunk_size
            flags = BitBufferedReader(reader, 16)

            while True:
                position = writer.tell()
                if position == target:
                    if reader.remaining_bytes:
                        self.log_info(F'chunk decompressed with {reader.remaining_bytes} bytes remaining in input buffer')
                    return
                if position >= limit:
                    if position > limit:
                        limit = position
                        self.log_info(F'decompression of one chunk generated more than the limit of {max_chunk_size} bytes')
                    flags.collect()
                    break
                try:
                    sym = read_huffman_symbol(flags, table, XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN)
                except EOFError:
                    self.log_debug('end of file while reading huffman symbol')
                    break
                if sym < XPRESS_NUM_CHARS:
                    # literal byte
                    writer.write_byte(sym)
                    continue
                # match: low nibble is the length, next nibble the bit size of the offset
                length = sym & 0xF
                offsetlog = (sym >> 4) & 0xF
                flags.collect()
                if reader.eof:
                    break
                offset = (1 << offsetlog) | flags.read(offsetlog)
                if length == 0xF:
                    # extended length encoding
                    nudge = reader.read_byte()
                    if nudge < 0xFF:
                        length += nudge
                    else:
                        length = reader.u16() or reader.u32()
                length += XPRESS_MIN_MATCH_LEN
                writer.replay(offset, length)

    def _decompress_xpress(self, reader: StructReader, writer: MemoryFile, target: Optional[int] = None) -> None:
        """
        Decompress XPRESS data without Huffman coding. Decompression stops when the output
        position reaches `target` (relative to the position on entry) or when the input is
        exhausted.
        """
        if target is not None:
            target += writer.tell()
        flags = BitBufferedReader(reader)
        # Extended lengths are stored as nibbles; the high nibble of a byte is kept here
        # until the next match needs it.
        nibble_cache = None
        while not reader.eof:
            if target is not None and writer.tell() >= target:
                return
            if not flags.next():
                # literal byte
                writer.write(reader.read(1))
                continue
            offset, length = divmod(reader.u16(), 8)
            offset += 1
            if length == 7:
                length = nibble_cache
                if length is None:
                    length_pair = reader.u8()
                    nibble_cache = length_pair >> 4
                    length = length_pair & 0xF
                else:
                    nibble_cache = None
                if length == 15:
                    length = reader.u8()
                    if length == 0xFF:
                        length = reader.u16() or reader.u32()
                        length -= 22
                        if length < 0:
                            raise RuntimeError(F'Invalid match length of {length} for long delta sequence')
                    length += 15
                length += 7
            length += 3
            writer.replay(offset, length)

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        # Only data that starts with the signature is recognized; otherwise the method falls
        # through and returns None.
        sig = cls._SIGNATURE
        if data[:len(sig)] == sig:
            return True
class msgpack
-
This unit is implemented in
refinery.units.formats.msgpack
and has the following commandline Interface:usage: msgpack [-h] [-L] [-Q] [-0] [-v] [-R] Converts a message-pack (msgpack) buffer to JSON and vice-versa. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class msgpack(Unit):
    """
    Converts a message-pack (msgpack) buffer to JSON and vice-versa.
    """

    def reverse(self, data):
        """Parse the input as JSON and return its msgpack serialization."""
        document = json.loads(data)
        return mp.dumps(document)

    def process(self, data):
        """
        Unpack msgpack items from the input one after another and yield each of them as an
        encoded JSON document. Any failure is reported as a `RefineryPartialResult` carrying
        the bytes starting at the position where the failing item began.
        """
        stream = MemoryFile(data, read_as_bytes=True)
        unpacker: mp.fallback.Unpacker = mp.Unpacker(stream)
        emitted = 0
        while True:
            try:
                # Remember where this item starts so the unparsed tail can be reported.
                last = unpacker.tell()
                item = unpacker.unpack()
            except Exception as E:
                # NOTE(review): running out of data is only treated as a clean end of input
                # after exactly one item was unpacked; confirm that buffers containing several
                # items are meant to end with a partial-result error.
                if emitted == 1 and isinstance(E, mp.OutOfData):
                    return
                raise RefineryPartialResult(str(E), memoryview(data)[last:]) from E
            yield json.dumps(item).encode(self.codec)
            emitted += 1
class mspdb (size, salt, iter=100, hash='SHA1')
-
This unit is implemented in
refinery.units.crypto.keyderive.mspdb
and has the following commandline Interface:usage: mspdb [-h] [-L] [-Q] [-0] [-v] size salt [iter] [hash] An implementation of the PasswordDeriveBytes routine available from the .NET standard library. According to documentation, it is an extension of PBKDF1. positional arguments: size The number of bytes to generate. salt Salt for the derivation. iter Number of iterations; default is 100. hash Specify one of these algorithms (default is SHA1): md2, md4, md5, sha1, sha256, sha512, sha224, sha384 generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class mspdb(KeyDerivation):
    """
    An implementation of the PasswordDeriveBytes routine available from the .NET standard library.
    According to documentation, it is an extension of PBKDF1.
    """

    def __init__(self, size, salt, iter=100, hash='SHA1'):
        # vars() forwards exactly the constructor arguments; do not add locals before this call.
        self.superinit(super(), **vars())

    def process(self, data):
        """
        Derive `size` bytes from the input (the password): hash password + salt repeatedly,
        then extend the result with hashes of a decimal counter prefixed to the seed hash
        until enough bytes are available.
        """
        def digest(blob):
            return self.hash.new(blob).digest()

        size = self.args.size
        if self.codec != 'UTF8':
            # Passwords are hashed in their UTF8 representation.
            data = data.decode(self.codec).encode('UTF8')
        data += self.args.salt
        for _ in range(self.args.iter - 1):
            data = digest(data)
        # The value before the final hashing round seeds all extension blocks.
        seedhash = data
        derived = digest(seedhash)
        counter = 1
        while len(derived) < size:
            derived += digest(B'%d%s' % (counter, seedhash))
            counter += 1
        return derived[:size]
class mvg (*names, top=False)
-
This unit is implemented in
refinery.units.meta.mvg
and has the following commandline Interface:usage: mvg [-h] [-L] [-Q] [-0] [-v] [-t] [name [name ...]] Short for "Make Variable Global": This unit can move meta variables into the scope of the parent frame. If used at the end of a frame, the variables will be moved to the scope of the frame that the pipeline will return to. Otherwise and if the --top switch is being used, variables will be moved to scope 0, i.e. to the topmost frame in the current tree. Note that it is not possible to promote a variable to a parent frame if that variable does not have the same value on all chunks in the current frame - such variables will always be removed when the frame closes. positional arguments: name Name of a variable to be removed. If no variables are explicitly specified, all variables in the current chunk will be rescoped. optional arguments: -t, --top Move the variable(s) to the topmost frame layer. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class mvg(Unit):
    """
    Short for "Make Variable Global": This unit can move meta variables into the scope of the
    parent frame. If used at the end of a frame, the variables will be moved to the scope of the
    frame that the pipeline will return to. Otherwise and if the --top switch is being used,
    variables will be moved to scope 0, i.e. to the topmost frame in the current tree. Note that
    it is not possible to promote a variable to a parent frame if that variable does not have the
    same value on all chunks in the current frame - such variables will always be removed when
    the frame closes.
    """

    def __init__(
        self,
        *names: Arg(type=str, metavar='name', help=(
            'Name of a variable to be removed. If no variables are explicitly specified, all '
            'variables in the current chunk will be rescoped.'
        )),
        top: Arg.Switch('-t', help='Move the variable(s) to the topmost frame layer.') = False
    ):
        super().__init__(names=names, top=top)

    def process(self, data):
        """
        Lower the scope of the selected variables (all variables of the chunk when none were
        named) to the computed target scope and return the chunk unchanged otherwise.
        """
        meta = metavars(data)
        nesting = self.args.nesting
        if self.args.top or nesting >= 0:
            # NOTE(review): the class documentation speaks of scope 0, the code uses 1 here;
            # confirm which numbering the metadata scopes follow.
            target = 1
        else:
            target = meta.scope + nesting
        selected = self.args.names or meta.variable_names()
        for name in selected:
            try:
                # Variables that already live at or above the target scope stay untouched.
                if meta.get_scope(name) > target:
                    meta.set_scope(name, target)
            except KeyError:
                self.log_info(F'variable not defined: {name}')
        return data
class n40 (key)
-
This unit is implemented in
refinery.units.malware.n40
and has the following commandline Interface:usage: n40 [-h] [-L] [-Q] [-0] [-v] key Decrypts hex-encoded strings in various latin-american banker families, including N40. positional arguments: key Decryption key. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class n40(Unit):
    """
    Decrypts hex-encoded strings in various latin-american banker families, including N40.
    """

    def __init__(self, key: Arg(help='Decryption key.')):
        ...

    def process(self, data):
        """
        Hex-decode the input when possible, XOR everything after the first byte with the key,
        and subtract each byte of the data from the XOR-ed byte that follows it.
        """
        try:
            data = b16decode(data, casefold=True)
        except Error:
            self.log_info('Input was not hex-encoded; ignoring this step.')
        mask = data[1:] | xor(self.args.key) | bytearray
        decoded = bytearray()
        for previous, current in zip(data, mask):
            delta = current - previous
            if delta <= 0:
                # Non-positive differences wrap around by 0xFF.
                delta += 0xFF
            decoded.append(delta)
        return decoded
class neg (bigendian=False, blocksize=None)
-
This unit is implemented in
refinery.units.blockwise.neg
and has the following commandline Interface:usage: neg [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] Each block of the input data is negated bitwise. This is sometimes also called the bitwise complement or inverse. optional arguments: -E, --bigendian Read chunks in big endian. -B, --blocksize N The size of each block in bytes, default is 1. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class neg(UnaryOperation):
    """
    Each block of the input data is negated bitwise. This is sometimes also called the bitwise
    complement or inverse.
    """

    def operate(self, a):
        # Return the bitwise complement of a single block value.
        return ~a

    def inplace(self, a):
        # Flip bits by XOR-ing the operand with self.fmask in place; nothing is returned.
        # NOTE(review): presumably fmask has all bits of one block set - confirm in the base class.
        a ^= self.fmask
class netbios (key=b'A')
-
This unit is implemented in
refinery.units.encoding.netbios
and has the following commandline Interface:usage: netbios [-h] [-L] [-Q] [-0] [-v] [-R] [key] Encodes and decodes strings using the same algorithm that is used for NetBIOS labels. Each byte 0xUL is encoded as two bytes, which are the sum of 0xU and 0xL with an offset character, respectively. The default offset is the capital letter A. positional arguments: key Provide a single letter to use as the offset. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class netbios(Unit):
    """
    Encodes and decodes strings using the same algorithm that is used for NetBIOS labels.
    Each byte 0xUL is encoded as two bytes, which are the sum of 0xU and 0xL with an offset
    character, respectively. The default offset is the capital letter A.
    """

    def __init__(self, key: Arg(help="Provide a single letter to use as the offset.") = B'A'):
        if len(key) != 1:
            raise ValueError("The key must be a binary string of length exactly 1")
        # Only the integer value of the offset character is stored.
        super().__init__(key=key[0])

    def reverse(self, data):
        """Encode: emit the offset high nibble followed by the offset low nibble of each byte."""
        base = self.args.key
        encoded = bytearray()
        for byte in data:
            encoded.append((byte >> 4) + base)
            encoded.append((byte & 15) + base)
        return encoded

    def process(self, data):
        """
        Decode: combine each pair of input bytes into one output byte. An unpaired trailing
        byte is ignored; a value outside of the nibble range raises a ValueError.
        """
        base = self.args.key
        nibbles = range(16)

        def merge(stream):
            # Zipping an iterator with itself consumes the input two bytes at a time.
            for first, second in zip(stream, stream):
                hi = first - base
                lo = second - base
                if not (hi in nibbles and lo in nibbles):
                    raise ValueError(F'Invalid character encoding detected: hi={hi:X}, lo={lo:X}.')
                yield (hi << 4) | lo

        return bytearray(merge(iter(data)))
class ngrams (size=slice(2, None, None))
-
This unit is implemented in
refinery.units.strings.ngrams
and has the following commandline Interface:usage: ngrams [-h] [-L] [-Q] [-0] [-v] [start:end:step] Extract all n-grams from the input. The algorithm is naive, i.e. it simply iterates all n-grams and deduplicates using a set data structure. The number n is taken from an arbitrary range given as a Python slice expression. positional arguments: start:end:step Specifies the sizes of each n-gram, i.e. the number n. Defaults to 2:. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class ngrams(Unit):
    """
    Extract all n-grams from the input. The algorithm is naive, i.e. it simply iterates all n-grams
    and deduplicates using a set data structure. The number n is taken from an arbitrary range
    given as a Python slice expression.
    """

    def __init__(
        self,
        size: Arg.Bounds(
            help='Specifies the sizes of each n-gram, i.e. the number n. Defaults to {default}.'
        ) = slice(2, None),
    ):
        super().__init__(size=size)

    def process(self, data: bytearray):
        """
        For every requested size n, yield each distinct n-byte window of the input once,
        labelled with the offset of its first occurrence. Stops at the first n that exceeds
        the input length.
        """
        for n in integers_of_slice(self.args.size):
            self.log_info(F'emitting {n}-grams')
            total = len(data)
            if total < n:
                break
            seen = set()
            view = memoryview(data)
            for index in range(total - n + 1):
                block = bytes(view[index:index + n])
                if block not in seen:
                    seen.add(block)
                    yield self.labelled(block, offset=index)
class nop
-
This unit is implemented in
refinery.units.misc.nop
and has the following commandline Interface:usage: nop [-h] [-L] [-Q] [-0] [-v] The unit generates the exact output that was received as input. All unknown arguments passed to nop are completely ignored, which is different from the behavior of other units. As such, nop can be used to comment out other units in longer refinery pipelines by simply prefixing a command with nop. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class nop(Unit):
    """
    The unit generates the exact output that was received as input. All unknown arguments passed
    to nop are completely ignored, which is different from the behavior of other units. As such,
    nop can be used to comment out other units in longer refinery pipelines by simply prefixing a
    command with nop.
    """

    @classmethod
    def argparser(cls, **keywords):
        """Build the argument parser for this unit on top of `NopArgParser`."""
        options = dict(
            prog=cls.name,
            description=documentation(cls),
            add_help=False,
        )
        parser = NopArgParser(keywords, **options)
        parser.set_defaults(nesting=0)
        return cls._interface(parser)
Static methods
def argparser(**keywords)
-
Expand source code Browse git
@classmethod
def argparser(cls, **keywords):
    """Build the argument parser for this unit on top of `NopArgParser`."""
    options = dict(
        prog=cls.name,
        description=documentation(cls),
        add_help=False,
    )
    parser = NopArgParser(keywords, **options)
    parser.set_defaults(nesting=0)
    return cls._interface(parser)
class nrv2b (bits=32)
-
This unit is implemented in
refinery.units.compression.nrv
and has the following commandline Interface:usage: nrv2b [-h] [-L] [-Q] [-0] [-v] [N] Decompress data using the NRV2B algorithm. positional arguments: N Specify the number of codec bits. The default is 32. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class nrv2b(NRVUnit):
    """
    Decompress data using the NRV2B algorithm.
    """
    def _decompress(self, src: StructReader, dst: MemoryFile, bb: BitBufferedReader):
        # Decoder loop: bytes are read from src, control bits from bb, output goes to dst.
        # The statement order matters because every next(bb) consumes one control bit.
        last_offset = 1
        while not src.eof:
            # While the control bit is set, copy one literal byte from input to output.
            while next(bb):
                dst.write_byte(src.read_byte())
            # Variable-length offset prefix: one data bit per round until a set bit ends it.
            offset = 2 + next(bb)
            while not next(bb):
                offset = 2 * offset + next(bb)
            if offset == 2:
                # The smallest prefix value means: reuse the offset of the previous match.
                offset = last_offset
            else:
                # Otherwise the prefix (minus 3) forms the high part and the next input
                # byte the low 8 bits of the offset.
                offset = (offset - 3) * 0x100 + src.read_byte()
                # NOTE(review): presumably the end-of-stream marker of the format - confirm.
                if offset & 0xFFFFFFFF == 0xFFFFFFFF:
                    break
                offset += 1
                last_offset = offset
            # Two bits encode short lengths directly; zero selects the variable-length form.
            length = next(bb)
            length = 2 * length + next(bb)
            if length == 0:
                length = 2 + next(bb)
                while not next(bb):
                    length = 2 * length + next(bb)
                length += 2
            # Offsets above 0xD00 add one to the length.
            length += int(bool(offset > 0xD00))
            # NOTE(review): replay presumably copies length + 1 bytes starting offset bytes
            # back in the output - confirm against MemoryFile.replay.
            dst.replay(offset, length + 1)
class nrv2d (bits=32)
-
This unit is implemented in
refinery.units.compression.nrv
and has the following commandline Interface:usage: nrv2d [-h] [-L] [-Q] [-0] [-v] [N] Decompress data using the NRV2D algorithm. positional arguments: N Specify the number of codec bits. The default is 32. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class nrv2d(NRVUnit):
    """
    Decompress data using the NRV2D algorithm.
    """
    def _decompress(self, src: StructReader, dst: MemoryFile, bb: BitBufferedReader):
        # Decoder loop: bytes are read from src, control bits from bb, output goes to dst.
        # The statement order matters because every next(bb) consumes one control bit.
        last_offset = 1
        while not src.eof:
            # While the control bit is set, copy one literal byte from input to output.
            while next(bb):
                dst.write_byte(src.read_byte())
            # Variable-length offset prefix: each round consumes two data bits, a set
            # control bit in between terminates the prefix.
            offset = 2 + next(bb)
            while not next(bb):
                offset = 2 * (offset - 1) + next(bb) # noqa
                offset = 2 * offset + next(bb)       # noqa
            if offset == 2:
                # The smallest prefix value means: reuse the previous offset; the first
                # length bit is read from the bit stream.
                offset = last_offset
                length = next(bb)
            else:
                # The prefix (minus 3) forms the high part, the next input byte the low 8 bits.
                offset = (offset - 3) * 0x100 + src.read_byte()
                # NOTE(review): presumably the end-of-stream marker of the format - confirm.
                if offset & 0xFFFFFFFF == 0xFFFFFFFF:
                    break
                # The inverted lowest bit of the combined value is the first length bit,
                # the remaining bits (plus one) are the actual offset.
                length = (offset ^ 1) & 1 # noqa
                offset = (offset >> 1) + 1
                last_offset = offset
            # A second length bit; a total of zero selects the variable-length form.
            length = 2 * length + next(bb)
            if length == 0:
                length = 2 + next(bb)
                while not next(bb):
                    length = 2 * length + next(bb)
                length += 2
            # Offsets above 0x500 add one to the length.
            length += int(bool(offset > 0x500))
            # NOTE(review): replay presumably copies length + 1 bytes starting offset bytes
            # back in the output - confirm against MemoryFile.replay.
            dst.replay(offset, length + 1)
class nrv2e (bits=32)
-
This unit is implemented in
refinery.units.compression.nrv
and has the following commandline Interface:usage: nrv2e [-h] [-L] [-Q] [-0] [-v] [N] Decompress data using the NRV2E algorithm. positional arguments: N Specify the number of codec bits. The default is 32. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class nrv2e(NRVUnit):
    """
    Decompress data using the NRV2E algorithm.
    """
    def _decompress(self, src: StructReader, dst: MemoryFile, bb: BitBufferedReader):
        # Decoder loop: bytes are read from src, control bits from bb, output goes to dst.
        # The statement order matters because every next(bb) consumes one control bit.
        last_offset = 1
        while not src.eof:
            # While the control bit is set, copy one literal byte from input to output.
            while next(bb):
                dst.write_byte(src.read_byte())
            # Variable-length offset prefix: each round consumes two data bits, a set
            # control bit in between terminates the prefix.
            offset = 2 + next(bb)
            while not next(bb):
                offset = 2 * (offset - 1) + next(bb) # noqa
                offset = 2 * offset + next(bb)       # noqa
            if offset == 2:
                # The smallest prefix value means: reuse the previous offset; the length
                # selector bit is read from the bit stream.
                offset = last_offset
                length = next(bb)
            else:
                # The prefix (minus 3) forms the high part, the next input byte the low 8 bits.
                offset = (offset - 3) * 0x100 + src.read_byte()
                # NOTE(review): presumably the end-of-stream marker of the format - confirm.
                if offset & 0xFFFFFFFF == 0xFFFFFFFF:
                    break
                # The inverted lowest bit of the combined value is the length selector,
                # the remaining bits (plus one) are the actual offset.
                length = (offset ^ 1) & 1 # noqa
                offset = (offset >> 1) + 1
                last_offset = offset
            # Three length encodings: selector set -> 1 or 2; next bit set -> 3 or 4;
            # otherwise a variable-length value with 3 added.
            if length:
                length = 1 + next(bb)
            elif next(bb):
                length = 3 + next(bb)
            else:
                length = 2 + next(bb)
                while not next(bb):
                    length = 2 * length + next(bb)
                length += 3
            # Offsets above 0x500 add one to the length.
            length += int(bool(offset > 0x500))
            # NOTE(review): replay presumably copies length + 1 bytes starting offset bytes
            # back in the output - confirm against MemoryFile.replay.
            dst.replay(offset, length + 1)
class ntlm (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.password_hashes
and has the following commandline Interface:usage: ntlm [-h] [-L] [-Q] [-0] [-v] [-t] Returns the Windows NTLM hash of the input. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class ntlm(HashUnit):
    """
    Returns the Windows NTLM hash of the input.
    """

    def _algorithm(self, data: bytes) -> bytes:
        """Compute MD4 over the input text re-encoded as UTF-16LE."""
        from Cryptodome.Hash import MD4
        password = data.decode(self.codec)
        encoded = password.encode('utf-16le')
        # NOTE(review): this returns the MD4 hash object rather than raw bytes; presumably
        # the base class extracts the digest - confirm in HashUnit.
        return MD4.new(encoded)
class officecrypt (password=b'VelvetSweatshop')
-
This unit is implemented in
refinery.units.formats.office.officecrypt
and has the following commandline Interface:usage: officecrypt [-h] [-L] [-Q] [-0] [-v] [password] A simple proxy for the msoffcrypto package to decrypt office documents. positional arguments: password The document password. By default, the Excel default password "VelvetSweatshop" is used. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class officecrypt(Unit):
    """
    A simple proxy for the `msoffcrypto` package to decrypt office documents.
    """

    def __init__(
        self,
        password: Arg.Binary(help=(
            'The document password. By default, the Excel default password "{default}" is used.'
        )) = b'VelvetSweatshop'
    ):
        super().__init__(password=password)

    @Unit.Requires('msoffcrypto-tool', 'formats', 'office')
    def _msoffcrypto():
        import msoffcrypto
        return msoffcrypto

    def process(self, data):
        """
        Decrypt the input document with the configured password and return the decrypted
        bytes. Input that is not encrypted is returned unchanged after a warning.
        """
        password: bytes = self.args.password
        with MemoryFile(data) as stream:
            document = self._msoffcrypto.OfficeFile(stream)
            if document.is_encrypted():
                # An empty password skips loading a key entirely.
                if password:
                    document.load_key(password=password.decode(self.codec))
                with MemoryFile(bytearray()) as output:
                    document.decrypt(output)
                    return output.getvalue()
            self.log_warn('the document is not encrypted; returning input')
            return data
class opc (mode='x32', *, count=None, until=None, nvar='name', avar='addr', ovar='arg')
-
This unit is implemented in
refinery.units.formats.exe.opc
and has the following commandline Interface:usage: opc [-h] [-L] [-Q] [-0] [-v] [-c N] [-u STR] [-n STR] [-a STR] [-o STR] [[x32|x64|..]] Disassembles the input data using capstone and generates opcodes with metadata as output. This is useful for programmatic disassembly, while the asm unit outputs a human-readable representation. Internally, asm uses this unit and pretty-prints the output. positional arguments: [x32|x64|..] Machine code architecture, default is x32. Select from the following list: x16, x32, x64, ppc32, ppc64, mips32, mips64. optional arguments: -c, --count N Maximum number of bytes to disassemble, infinite by default. -u, --until STR Disassemble until the given string appears among the disassembly. -n, --nvar STR Variable to receive the disassembled mnemonic. Default is "name". -a, --avar STR Variable to receive the address of the instruction. Default is "addr". -o, --ovar STR Variable prefix for instruction operands. Default is "arg". The complete operand string will be in args, the first argument in arg1, the second in arg2, and so on. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class opc(Unit):
    """
    Disassembles the input data using capstone and generates opcodes with metadata as output. This
    is useful for programmatic disassembly, while the `refinery.asm` unit outputs a human-readable
    representation. Internally, `refinery.asm` uses this unit and pretty-prints the output.
    """

    def __init__(
        self,
        mode: Arg.Choice(
            help='Machine code architecture, default is {default}. Select from the following list: {choices}.',
            choices=_ARCHES, metavar='[x32|x64|..]') = 'x32', *,
        count: Arg.Number('-c', help='Maximum number of bytes to disassemble, infinite by default.') = None,
        until: Arg.String('-u', help='Disassemble until the given string appears among the disassembly.') = None,
        nvar: Arg.String('-n', help=(
            'Variable to receive the disassembled mnemonic. Default is "{default}".')) = 'name',
        avar: Arg.String('-a', help=(
            'Variable to receive the address of the instruction. Default is "{default}".')) = 'addr',
        ovar: Arg.String('-o', help=(
            'Variable prefix for instruction operands. Default is "{default}". The complete operand '
            'string will be in {default}s, the first argument in {default}1, the second in {default}2, '
            'and so on.')) = 'arg',
        **more
    ):
        super().__init__(
            mode=mode,
            count=count,
            until=until,
            nvar=nvar,
            avar=avar,
            ovar=ovar,
            **more)

    @Unit.Requires('capstone')
    def _capstone():
        import capstone
        return capstone

    @property
    def _capstone_engine(self) -> Cs:
        """Create a capstone engine for the architecture and mode selected by `mode`."""
        cs = self._capstone
        return cs.Cs(*{
            'arm'    : (cs.CS_ARCH_ARM, cs.CS_MODE_ARM),
            'mips32' : (cs.CS_ARCH_MIPS, cs.CS_MODE_MIPS32),
            'mips64' : (cs.CS_ARCH_MIPS, cs.CS_MODE_MIPS64),
            'ppc32'  : (cs.CS_ARCH_PPC, cs.CS_MODE_32),
            'ppc64'  : (cs.CS_ARCH_PPC, cs.CS_MODE_64),
            'x16'    : (cs.CS_ARCH_X86, cs.CS_MODE_16),
            'x32'    : (cs.CS_ARCH_X86, cs.CS_MODE_32),
            'x64'    : (cs.CS_ARCH_X86, cs.CS_MODE_64),
        }.get(self.args.mode.lower()))

    def process(self, data):
        """
        Disassemble the input starting at address 0 and yield the raw bytes of every
        instruction, labelled with its address, mnemonic, the complete operand string and
        the individual operands. Operands that parse as integer literals are stored as
        integers. Stops after the first instruction whose mnemonic or operand string
        contains the `until` string (compared in lower case).
        """
        # NOTE(review): count is passed on as the third argument of disasm, which capstone
        # documents as a number of instructions, while the help text speaks of bytes - confirm.
        count = self.args.count or 0
        until = self.args.until
        nvar = self.args.nvar
        avar = self.args.avar
        ovar = self.args.ovar
        if isinstance(until, str):
            until = until.lower()
        for insn in self._capstone_engine.disasm(data, 0, count):
            kwargs = {
                avar: insn.address,
                nvar: insn.mnemonic,
            }
            try:
                ops = insn.op_str
                operands = [op.strip() for op in ops.split(',')]
            except Exception:
                # Without a usable operand string, fall back to an empty one. Leaving the
                # name unset would make the `until` check below fail with an unbound local
                # or match against the operands of the previous instruction.
                ops = ''
                operands = []
            else:
                kwargs[F'{ovar}s'] = ops
            for k, op in enumerate(operands, 1):
                if not op:
                    break
                try:
                    # Base 0 accepts decimal as well as prefixed (0x, 0o, 0b) literals.
                    op = int(op, 0)
                except ValueError:
                    # Not an integer literal, e.g. a register name: keep the text.
                    pass
                kwargs[F'{ovar}{k}'] = op
            yield self.labelled(insn.bytes, **kwargs)
            if until is None:
                continue
            if until in ops.lower() or until in insn.mnemonic.lower():
                break
class p1
-
This unit is implemented in
refinery.units.meta.pick
and has the following commandline Interface:usage: p1 [-h] [-L] [-Q] [-0] [-v] A shortcut for pick with the argument 0:1. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class p1(pick):
    """
    A shortcut for `refinery.pick` with the argument `0:1`.
    """

    def __init__(self):
        # Select only the first chunk.
        bounds = slice(0, 1)
        super().__init__(bounds)
class p2
-
This unit is implemented in
refinery.units.meta.pick
and has the following commandline Interface:usage: p2 [-h] [-L] [-Q] [-0] [-v] A shortcut for pick with the argument 0:2. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class p2(pick):
    """
    A shortcut for `refinery.pick` with the argument `0:2`.
    """

    def __init__(self):
        # Select the first two chunks.
        bounds = slice(0, 2)
        super().__init__(bounds)
class p3
-
This unit is implemented in
refinery.units.meta.pick
and has the following commandline Interface:usage: p3 [-h] [-L] [-Q] [-0] [-v] A shortcut for pick with the argument 0:3. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class p3(pick):
    """
    A shortcut for `refinery.pick` with the argument `0:3`.
    """

    def __init__(self):
        # Select the first three chunks.
        bounds = slice(0, 3)
        super().__init__(bounds)
class pack (base=0, prefix=False, strict=False, width=0, single_floats=False, double_floats=False, bigendian=False, blocksize=None)
-
This unit is implemented in
refinery.units.blockwise.pack
and has the following commandline Interface:usage: pack [-h] [-L] [-Q] [-0] [-v] [-R] [-r] [-s] [-w N] [-f] [-d] [-E] [-B N] [base] Scans the input data for numeric constants and packs them into a binary format. This is useful to convert the textual representation of an array of numbers into its binary form. For example, 123,34,256,12,1,234 would be transformed into the byte sequence 7B22000C01EA, where 256 was wrapped and packed as a null byte because the default block size is one byte. If the above sequence would be packed with options -EB2, the result would be equal to 007B00220100000C000100EA in hexadecimal. positional arguments: base Find only numbers in given base. Default of 0 means that common expressions for hexadecimal, octal and binary are accepted. optional arguments: -r, --prefix Add numeric prefixes like 0x, 0b, and 0o in reverse mode. -s, --strict Only parse integers that fit in one block of the given block size. -w, --width N Pad numbers with the specified amount of leading zeros. -f, --single-floats Pack single-precision floating-point numbers. Implies -B4. -d, --double-floats Pack double-precision floating-point numbers. Implies -B8. -E, --bigendian Read chunks in big endian. -B, --blocksize N The size of each block in bytes, default is 1. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class pack(BlockTransformationBase):
    """
    Scans the input data for numeric constants and packs them into a binary format. This is useful to convert the textual
    representation of an array of numbers into its binary form. For example, `123,34,256,12,1,234` would be transformed
    into the byte sequence `7B22000C01EA`, where `256` was wrapped and packed as a null byte because the default block size
    is one byte. If the above sequence would be packed with options -EB2, the result would be equal to
    `007B00220100000C000100EA` in hexadecimal.
    """

    def __init__(
        self,
        base: Arg(type=number[2:36], help=(
            'Find only numbers in given base. Default of 0 means that '
            'common expressions for hexadecimal, octal and binary are '
            'accepted.')) = 0,
        prefix       : Arg.Switch('-r', group='FLT', help='Add numeric prefixes like 0x, 0b, and 0o in reverse mode.') = False,
        strict       : Arg.Switch('-s', help='Only parse integers that fit in one block of the given block size.') = False,
        width        : Arg.Number('-w', help='Pad numbers with the specified amount of leading zeros.') = 0,
        single_floats: Arg.Switch('-f', group='FLT', help='Pack single-precision floating-point numbers. Implies -B4.') = False,
        double_floats: Arg.Switch('-d', group='FLT', help='Pack double-precision floating-point numbers. Implies -B8.') = False,
        bigendian=False, blocksize=None
    ):
        # The float switches force the block size that matches their binary representation.
        if single_floats and double_floats:
            raise ValueError('The floats and doubles option are mutually exclusive.')
        elif single_floats:
            fmode = FMode.SINGLE
            blocksize = 4
        elif double_floats:
            fmode = FMode.DOUBLE
            blocksize = 8
        else:
            fmode = FMode.TO_INT
        super().__init__(
            base=base,
            prefix=prefix,
            strict=strict,
            width=width,
            bigendian=bigendian,
            blocksize=blocksize,
            fmode=fmode,
            _truncate=2,
        )

    @property
    def bytestream(self):
        # never allow bytes to be left unchunked
        return False

    def reverse(self, data):
        """
        Convert binary blocks back to text: in integer mode each block is rendered in the
        configured base (10 when the base is 0), optionally zero-padded and prefixed; in
        float mode the data is unpacked as floats and each one is emitted as its string
        representation. Trailing bytes that do not fill a float are dropped with a warning.
        """
        base = self.args.base or 10
        width = self.args.width
        mode: FMode = self.args.fmode
        prefix = B''

        self.log_debug(F'using base {base:d}')

        if self.args.prefix:
            # Only binary, octal and hexadecimal have a conventional prefix.
            prefix = {
                0x02: b'0b',
                0x08: b'0o',
                0x10: b'0x'
            }.get(base, B'')

        if mode is FMode.TO_INT:
            converter = BaseUnit(
                base,
                little_endian=not self.args.bigendian,
                strip_padding=True,
            )
            for n in self.chunk_into_bytes(data):
                converted = converter.reverse(n)
                if width:
                    converted = converted.rjust(width, B'0')
                if prefix:
                    converted = prefix + converted
                yield converted
            return
        elif mode is FMode.SINGLE:
            float_format = 'f'
            float_size = 4
        elif mode is FMode.DOUBLE:
            float_format = 'd'
            float_size = 8

        count, rest = divmod(len(data), float_size)
        if rest:
            self.log_warn(F'data contained {rest} trailing bytes that were ignored')
            data = memoryview(data)[:-rest]
        # One format character per float, prefixed with the byte order.
        float_format *= count
        if self.args.bigendian:
            float_format = F'>{float_format}'
        else:
            float_format = F'<{float_format}'
        for n in struct.unpack(float_format, data):
            yield str(n).encode(self.codec)

    def process(self, data):
        """
        Find all numeric literals of the configured base in the input, convert each one to
        an integer (floats are converted to the integer value of their binary encoding),
        mask it to the block size and pack the results. With `strict`, numbers that do not
        fit into one block are skipped instead of being wrapped.
        """
        base: int = self.args.base
        strict: bool = self.args.strict
        mode: FMode = self.args.fmode
        ep = '>' if self.args.bigendian else '<'

        def evaluate_literals(literals: Iterable[bytes]):
            for literal in literals:
                if mode is FMode.TO_INT:
                    if base == 0 and literal[0] == 0x30 and literal[1:].isdigit():
                        # int(..., 0) rejects a plain leading zero; read it as octal.
                        literal = B'0o%s' % literal
                    N = int(literal, base)
                elif mode is FMode.SINGLE:
                    N, = struct.unpack(F'{ep}I', struct.pack(F'{ep}f', float(literal)))
                elif mode is FMode.DOUBLE:
                    N, = struct.unpack(F'{ep}Q', struct.pack(F'{ep}d', float(literal)))
                else:
                    raise TypeError('unexpected floating point mode')
                M = N & self.fmask
                if strict and M != N:
                    continue
                yield M

        if base == 0:
            pattern = formats.number
        elif base <= 10:
            pattern = re.compile(B'[-+]?[0-%d]{1,64}' % (base - 1))
        else:
            # The largest digit of base b has the value b - 1 and is the letter with code
            # 0x61 + (b - 1) - 10 = 0x56 + b. Using 0x57 + b admitted one letter too many
            # (e.g. "g" for base 16), which then made int() fail on the matched literal.
            pattern = re.compile(B'[-+]?[0-9a-%c]{1,20}' % (0x56 + base), re.IGNORECASE)

        return self.unchunk(evaluate_literals(m[0] for m in pattern.finditer(data)))
class pad (width, padding=b'\x00', left=False, absolute=False)
-
This unit is implemented in
refinery.units.meta.pad
and has the following commandline Interface:usage: pad [-h] [-L] [-Q] [-0] [-v] [-l] [-a] N [padding] Allows padding of the input data. positional arguments: N Input is padded to the nearest multiple of this size. padding This custom binary sequence is used (repeatedly, if necessary) to pad the input. The default is a zero byte. optional arguments: -l, --left Pad on the left instead of the right. -a, --absolute The width argument specifies an absolute size, not a block size. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class pad(Unit):
    """
    Allows padding of the input data.
    """

    def __init__(
        self,
        width: Arg.Number(help='Input is padded to the nearest multiple of this size.'),
        padding: Arg(help=(
            'This custom binary sequence is used (repeatedly, if necessary) to pad the '
            'input. The default is a zero byte.')) = B'\0',
        left: Arg.Switch('-l', help='Pad on the left instead of the right.') = False,
        absolute: Arg.Switch('-a', help=(
            'The width argument specifies an absolute size, not a block size.')) = False
    ):
        super().__init__(width=width, padding=padding, left=left, absolute=absolute)

    def process(self, data):
        """
        Pad the input with repetitions of the padding sequence, either up to the next
        multiple of `width` or, in absolute mode, up to exactly `width` bytes. Input that
        already has the required size (or is longer than the absolute width) is returned
        unchanged.

        Raises:
            ValueError: if padding is required but the padding sequence is empty.
        """
        width = self.args.width
        have = len(data)
        if self.args.absolute:
            if have >= width:
                return data
            # In absolute mode the target is the width itself. Deriving it from the block
            # count left empty input unpadded because zero is a multiple of every width.
            size = width
        else:
            q, r = divmod(have, width)
            size = (q + bool(r)) * width
        missing = size - have
        if missing <= 0:
            return data
        pad = self.args.padding
        if not pad:
            raise ValueError('The padding sequence must not be empty.')
        # Round the repetition count up so that multi-byte padding sequences always cover
        # the gap, and build a new object instead of growing the argument in place, which
        # would change a mutable padding argument for all subsequent chunks.
        repeats = -(-missing // len(pad))
        fill = (pad * repeats)[:missing]
        if self.args.left:
            return fill + data
        else:
            data += fill
            return data
class pbkdf1 (size, salt=b'\x00\x00\x00\x00\x00\x00\x00\x00', iter=1000, hash='SHA1')
-
This unit is implemented in
refinery.units.crypto.keyderive.pbkdf1
and has the following commandline Interface:usage: pbkdf1 [-h] [-L] [-Q] [-0] [-v] size [salt] [iter] [hash] PBKDF1 Key derivation positional arguments: size The number of bytes to generate. salt Salt for the derivation; default is 8 null bytes. iter Number of iterations; default is 1000. hash Specify one of these algorithms (default is SHA1): md2, md4, md5, sha1, sha256, sha512, sha224, sha384 generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class pbkdf1(KeyDerivation):
    """PBKDF1 Key derivation"""

    @Arg('salt', help='Salt for the derivation; default are 8 null bytes.')
    def __init__(self, size, salt=bytes(8), iter=1000, hash='SHA1'):
        # All local names are forwarded verbatim to the parent initializer.
        self.superinit(super(), **vars())

    def process(self, data):
        """
        Derive key material from the input, which is used as the password.
        """
        from Cryptodome.Protocol.KDF import PBKDF1

        def derive(password):
            # The unit arguments are looked up on every invocation of the callback.
            return PBKDF1(
                password,
                self.args.salt,
                dkLen=self.args.size,
                count=self.args.iter,
                hashAlgo=self.hash,
            )

        return multidecode(data, derive)
class pbkdf2 (size, salt, iter=1000, hash='SHA1')
-
This unit is implemented in
refinery.units.crypto.keyderive.pbkdf2
and has the following commandline Interface:usage: pbkdf2 [-h] [-L] [-Q] [-0] [-v] size salt [iter] [hash] PBKDF2 Key derivation. This is implemented as Rfc2898DeriveBytes in .NET binaries. positional arguments: size The number of bytes to generate. salt Salt for the derivation. iter Number of iterations; default is 1000. hash Specify one of these algorithms (default is SHA1): md2, md4, md5, sha1, sha256, sha512, sha224, sha384 generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class pbkdf2(KeyDerivation):
    """
    PBKDF2 Key derivation. This is implemented as Rfc2898DeriveBytes in .NET binaries.
    """

    def __init__(self, size, salt, iter=1000, hash='SHA1'):
        # All local names are forwarded verbatim to the parent initializer.
        self.superinit(super(), **vars())

    def process(self, data: ByteStr):
        """
        Derive key material from the input, which is used as the password.
        """
        from Cryptodome.Protocol.KDF import PBKDF2
        # The keyword arguments are evaluated once, before the derivation is invoked.
        options = dict(
            salt=self.args.salt,
            dkLen=self.args.size,
            hmac_hash_module=self.hash,
            count=self.args.iter,
        )
        derive = partial(PBKDF2, **options)
        return multidecode(data, derive)
class pcap (merge=False, client=False, server=False)
-
This unit is implemented in
refinery.units.formats.pcap
and has the following commandline Interface: usage: pcap [-h] [-L] [-Q] [-0] [-v] [-m] [-c | -s] Performs TCP stream reassembly from packet capture (PCAP) files. By default, the unit emits the parts of each TCP conversation, attaching several pieces of metadata to each such output: Included are the source and destination socket address as well as the variable stream which identifies the conversation which it was part of. The chunks are returned in the order that the bytes were exchanged between source and destination. When the --merge parameter is specified, the unit instead collects all bytes going forward and backwards, respectively, and emits these as two chunks, for each TCP conversation that took place. optional arguments: -m, --merge Merge both parts of each TCP conversation into one chunk. -c, --client Show only the client part of each conversation. -s, --server Show only the server part of each conversation. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class pcap(Unit):
    """
    Performs TCP stream reassembly from packet capture (PCAP) files. By default, the unit emits the parts
    of each TCP conversation, attaching several pieces of metadata to each such output: Included are the
    source and destination socket address as well as the variable `stream` which identifies the
    conversation which it was part of. The chunks are returned in the order that the bytes were exchanged
    between source and destination. When the `--merge` parameter is specified, the unit instead collects
    all bytes going forward and backwards, respectively, and emits these as two chunks, for each TCP
    conversation that took place.
    """

    def __init__(
        self,
        merge: Arg.Switch('-m', help='Merge both parts of each TCP conversation into one chunk.') = False,
        client: Arg.Switch('-c', group='D', help='Show only the client part of each conversation.') = False,
        server: Arg.Switch('-s', group='D', help='Show only the server part of each conversation.') = False,
    ):
        super().__init__(merge=merge, client=client, server=server)

    @Unit.Requires('pypcapkit[scapy]>=1.3', 'all')
    def _pcapkit():
        # Both imports happen while logging is suppressed.
        with NoLogging():
            import scapy.layers.tls.session # noqa
            import pcapkit
            return pcapkit

    @Unit.Requires('scapy', 'all')
    def _scapy():
        import scapy
        import scapy.packet
        return scapy

    def process(self, data):
        """
        Extract the packets from the input capture with pcapkit (using its Scapy engine), and
        yield the reassembled TCP payloads as labelled chunks.
        """
        pcapkit = self._pcapkit
        merge = self.args.merge

        # pcapkit.extract receives a path, so the input is exposed via a virtual file.
        with NoLogging(), VirtualFileSystem() as fs:
            vf = VirtualFile(fs, data, 'pcap')
            pcap = pcapkit.extract(
                fin=vf.path,
                engine=pcapkit.Scapy,
                store=True,
                nofile=True,
                extension=False,
                ip=True,
                tcp=True,
                reassembly=True,
                reasm_strict=True,
            )
            tcp: List[Datagram] = list(pcap.reassembly.tcp)
            # Order the datagrams by the smallest frame index they contain.
            tcp.sort(key=lambda p: min(p.index, default=0))

        # count is the running conversation number, convo is the conversation being processed.
        count, convo = 0, None
        # These buffers are only written to in merge mode; one per direction.
        src_buffer = MemoryFile()
        dst_buffer = MemoryFile()

        self.log_debug(F'extracted {len(pcap.frame)} packets, assembled {len(tcp)} datagrams')

        PT = self._scapy.packet

        def payload(packet: Packet):
            """
            Follow the chain of `payload` attributes starting at the given packet. The first
            object on that chain which is an instance of one of the types in `ok` and not of
            one in `no` has its `original` attribute returned. An empty byte string is returned
            if the chain ends or if a payload object is encountered for the second time.
            """
            ok = (bytes, bytearray, PT.Raw)
            no = (PT.NoPayload, PT.Padding)
            # ids of the payload objects visited so far; prevents endless loops.
            circle = set()
            while True:
                try:
                    inner = packet.payload
                except AttributeError:
                    break
                if isinstance(packet, ok) and not isinstance(packet, no):
                    return packet.original
                if id(inner) in circle:
                    break
                packet = inner
                circle.add(id(inner))
            return B''

        def sequence(i: int):
            """
            Sort key for frame indices: Returns the `seq` attribute of the outermost layer of
            frame number i (frame indices are 1-based) that has one, and 0 if there is none.
            """
            packet = pcap.frame[i - 1]
            while len(packet):
                try:
                    return packet.seq
                except AttributeError:
                    pass
                try:
                    packet = packet.payload
                except AttributeError:
                    break
            return 0

        # NOTE(review): these two switches are only consulted in commit(), whose buffers are
        # only filled in merge mode - confirm that they are meant to have no effect otherwise.
        client = self.args.client
        server = self.args.server

        def commit():
            """
            Emit the buffered data for both directions of the current conversation, unless the
            respective direction is excluded by a switch, and reset the buffers.
            """
            if src_buffer.tell():
                if not server:
                    yield self.labelled(src_buffer.getvalue(), **convo.src_to_dst())
                src_buffer.truncate(0)
            if dst_buffer.tell():
                if not client:
                    yield self.labelled(dst_buffer.getvalue(), **convo.dst_to_src())
                dst_buffer.truncate(0)

        for datagram in tcp:
            this_convo = Conversation.FromID(datagram.id)
            if this_convo != convo:
                # A new conversation begins; in merge mode, flush the previous one first.
                if count and merge:
                    yield from commit()
                count = count + 1
                convo = this_convo
            # From here on, the name data refers to the payload of the current datagram.
            data = bytearray()
            for index in sorted(datagram.index, key=sequence):
                data.extend(payload(pcap.frame[index - 1]))
            if not data:
                continue
            if not merge:
                yield self.labelled(data, **this_convo.src_to_dst(), stream=count)
            elif this_convo.src == convo.src:
                src_buffer.write(data)
            elif this_convo.dst == convo.src:
                dst_buffer.write(data)
            else:
                raise RuntimeError(F'direction of packet {convo!s} in conversation {count} is unknown')

        # Flush the buffers of the final conversation.
        yield from commit()
class pcap_http
-
This unit is implemented in
refinery.units.formats.pcap_http
and has the following commandline Interface:usage: pcap-http [-h] [-L] [-Q] [-0] [-v] Extracts HTTP payloads from packet capture (PCAP) files. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class pcap_http(Unit):
    """
    Extracts HTTP payloads from packet capture (PCAP) files.
    """

    def process(self, data):
        """
        Run the input through the pcap unit and attempt to parse each resulting stream as an
        HTTP response. Streams that fail to parse as a response are tried as HTTP requests
        and remembered. Each response body is emitted, labelled with the URL of a remembered
        request that has the mirrored source and destination, if there is one.
        """
        http_parser = httpresponse()
        requests: List[_HTTP_Request] = []
        # Responses without a matching request so far, stored as (src, dst, payload).
        responses: List[tuple] = []

        def lookup(src, dst, payload):
            """
            Find and consume the first remembered request that travelled in the opposite
            direction; return the payload labelled with its URL, or None without a match.
            """
            for k, request in enumerate(requests):
                if request.src == dst and request.dst == src:
                    requests.pop(k)
                    return self.labelled(payload, url=request.url)
            return None

        for stream in data | pcap():
            try:
                payload = http_parser.process(stream)
            except Exception:
                # Not a response: check whether the stream is a request instead.
                try:
                    rq = _parse_http_request(stream)
                    requests.append(rq)
                except _HTTPParseError as E:
                    self.log_info(F'error parsing http request: {E!s}')
                except Exception as E:
                    # Best effort: the stream is skipped, but the reason is no longer lost.
                    self.log_debug(F'ignoring stream that could not be parsed: {E!s}')
                continue
            if not payload:
                continue
            src, dst = stream['src'], stream['dst']
            item = lookup(src, dst, payload)
            if item is None:
                # The request may only be seen later; retry after all streams were read.
                responses.append((src, dst, payload))
                continue
            yield item

        # Deferred responses are taken from the end of the list, i.e. most recent first.
        while responses:
            src, dst, payload = responses.pop()
            item = lookup(src, dst, payload)
            yield payload if item is None else item
class pedebloat (*names, certificate=False, directories=False, memdump=False, resources=False, sections=False, trim_code=False, trim_rsrc=False, threshold=0.05, size_limit=10.0 MB, keep_limit=False, aggressive=False)
-
This unit is implemented in
refinery.units.formats.pe.pedebloat
and has the following commandline Interface:usage: pedebloat [-h] [-L] [-Q] [-0] [-v] [-c] [-d] [-m] [-r] [-s] [-X] [-Y] [-t T] [-l N] [-k] [-a] [names [names ...]] Removes junk or excess data from PE files and returns the stripped executable. By default, only the PE overlay is considered; use the flags -r and -s to also consider resources and entire sections. Any buffer is only considered for removal if it exceeds a certain size. If this condition is met, a binary search is performed to determine the offset inside the buffer up to which the compression ratio is above a certain threshold; everything beyond that point is then removed. By setting the threshold compression ratio to 1, each large buffer is removed entirely. positional arguments: names optional arguments: -c, --certificate Include digital signatures for the size computation. -d, --directories Include data directories for size computation. -m, --memdump Assume that the file data was a memory-mapped PE file. -r, --resources Strip large resources. -s, --sections Strip large sections. -X, --trim-code Lift the exception on code sections for stripping. -Y, --trim-rsrc Lift the exception on rsrc sections for stripping. -t, --threshold T Trailing data from resources and sections is stripped until the compression ratio of the remaining data rises above this threshold. The default value is 0.05. Set this to 1 to ignore the limit entirely and trim every structure as much as possible without violating alignment. Setting this value to 0 will only strip repeated occurrences of the last byte. -l, --size-limit N Structures below this size are not stripped. Default is 10.0 MB. -k, --keep-limit Do not strip structures to below the above size limit. -a, --aggressive Equivalent to -srt1: Strip large sections and resources aggressively. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. 
-v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class pedebloat(OverlayUnit):
    """
    Removes junk or excess data from PE files and returns the stripped executable. By default, only
    the PE overlay is considered; use the flags `-r` and `-s` to also consider resources and entire
    sections. Any buffer is only considered for removal if it exceeds a certain size. If this
    condition is met, a binary search is performed to determine the offset inside the buffer up to
    which the compression ratio is above a certain threshold; everything beyond that point is then
    removed. By setting the threshold compression ratio to 1, each large buffer is removed entirely.
    """

    def __init__(
        self,
        *names: Arg(type=str),
        certificate=False,
        directories=False,
        memdump=False,
        resources: Arg.Switch('-r', help='Strip large resources.') = False,
        sections : Arg.Switch('-s', help='Strip large sections.') = False,
        trim_code: Arg.Switch('-X', help='Lift the exception on code sections for stripping.') = False,
        trim_rsrc: Arg.Switch('-Y', help='Lift the exception on rsrc sections for stripping.') = False,
        threshold: Arg('-t', metavar='T', type=percent, help=(
            'Trailing data from resources and sections is stripped until the compression ratio '
            'of the remaining data rises above this threshold. The default value is {default}. '
            'Set this to 1 to ignore the limit entirely and trim every structure as much as '
            'possible without violating alignment. Setting this value to 0 will only strip repeated '
            'occurrences of the last byte.')) = 0.05,
        size_limit: Arg.Number('-l', help=(
            'Structures below this size are not stripped. Default is {default!r}.')) = _STRIP,
        keep_limit: Arg.Switch('-k', help=(
            'Do not strip structures to below the above size limit.')) = False,
        aggressive: Arg.Switch('-a', help=(
            'Equivalent to -srt1: Strip large sections and resources aggressively.')) = False,
    ):
        if aggressive:
            sections = True
            resources = True
            threshold = 1
        super().__init__(
            certificate,
            directories,
            memdump,
            sections=sections,
            resources=resources,
            size_limit=size_limit,
            keep_limit=keep_limit,
            threshold=threshold,
            trim_rsrc=trim_rsrc,
            trim_code=trim_code,
            names=names,
        )

    def _right_strip_data(self, data: memoryview, alignment=1, block_size=_MB) -> int:
        """
        Compute how many leading bytes of the given buffer should be kept. The returned
        size is congruent to the length of the buffer modulo the given alignment, so that
        the number of removed bytes is a multiple of the alignment.
        """
        if not data:
            return 0
        threshold = self.args.threshold
        data_overhang = len(data) % alignment
        result = data_overhang

        if 0 < threshold < 1:
            def compression_ratio(offset: int):
                ratio = len(zlib.compress(data[:offset], level=1)) / offset
                self.log_debug(F'compressing {SizeInt(offset)!r} ratio={ratio:6.4f}')
                return ratio
            upper = len(data)
            lower = result
            # The search only runs when the entire buffer compresses to below the threshold;
            # otherwise, upper remains the buffer length and nothing is cut at this stage.
            if compression_ratio(upper) <= threshold:
                # Bisect until the search window is no larger than one block.
                while block_size < upper - lower:
                    pivot = (lower + upper) // 2
                    ratio = compression_ratio(pivot)
                    if ratio > threshold:
                        lower = pivot + 1
                        continue
                    upper = pivot
                    if abs(ratio - threshold) < 1e-10:
                        break
            result = upper
        elif threshold == 0:
            result = len(data)
        elif threshold == 1:
            result = 0

        # Remove the trailing run of identical bytes, keeping one of them.
        while result > 1 and data[result - 2] == data[result - 1]:
            result -= 1

        result = max(result, data_overhang)

        if self.args.keep_limit:
            result = max(result, self.args.size_limit)

        # Round up so that the result is congruent to the buffer length modulo the alignment.
        result = result + (data_overhang - result) % alignment

        if result > len(data):
            # Step back by whole alignment units until the result fits into the buffer.
            excess = result - len(data)
            excess = excess + (-excess % alignment)
            result = result - excess

        return result

    def _adjust_offsets(self, pe: PE, gap_offset: int, gap_size: int):
        """
        Update the parsed structures of the PE file to reflect the removal of gap_size bytes at
        the file offset gap_offset. Structures that start inside the removed region or that
        would point into it are dropped from the list of structures.

        Raises a RuntimeError if the removal is incompatible with the file alignment.
        """
        base = pe.OPTIONAL_HEADER.ImageBase
        alignment = pe.OPTIONAL_HEADER.FileAlignment
        rva_offset = pe.get_rva_from_offset(gap_offset)
        tva_offset = rva_offset + base

        section = pe.get_section_by_offset(gap_offset)
        new_section_size = section.SizeOfRawData - gap_size
        if new_section_size % alignment != 0:
            raise RuntimeError(
                F'trimming 0x{gap_size:X} bytes from section {_ASCII(section.Name)} of size 0x{section.SizeOfRawData:X} '
                F'violates required section alignment of 0x{alignment:X} bytes')
        inside_section_offset = gap_offset - section.PointerToRawData
        if inside_section_offset > new_section_size:
            overlap = inside_section_offset - new_section_size
            raise RuntimeError(F'trimming from section {_ASCII(section.Name)}; data extends {overlap} beyond section')

        # Only values that fall into the affected section are adjusted; the bounds are
        # computed both as relative (rva) and as absolute (tva) virtual addresses.
        rva_lbound = section.VirtualAddress
        rva_ubound = section.VirtualAddress + section.Misc_VirtualSize - 1
        tva_lbound = rva_lbound + base
        tva_ubound = rva_ubound + base

        def adjust_attributes_of_structure(
            structure: Structure,
            gap_offset: int,
            valid_values_lower_bound: Optional[int],
            valid_values_upper_bound: Optional[int],
            attributes: Iterable[str]
        ):
            """
            Subtract the gap size from each of the given attributes whose value lies behind the
            gap offset and within the optional bounds. Raises BrokenLink if an adjusted value
            would end up before the gap offset.
            """
            for attribute in attributes:
                old_value = getattr(structure, attribute, 0)
                if old_value <= gap_offset:
                    continue
                if valid_values_lower_bound is not None and old_value < valid_values_lower_bound:
                    continue
                if valid_values_upper_bound is not None and old_value > valid_values_upper_bound:
                    continue
                new_value = old_value - gap_size
                if new_value < gap_offset:
                    raise BrokenLink(F'attribute {attribute} points into removed region')
                self.log_debug(F'adjusting field in {structure.name}: {attribute}')
                setattr(structure, attribute, new_value)

        it: Iterable[Structure] = iter(pe.__structures__)
        remove = []

        for index, structure in enumerate(it):
            old_offset = structure.get_file_offset()
            # Everything behind the gap moves towards the start of the file by the number of
            # removed bytes. This used to subtract gap_offset, which is not a distance.
            new_offset = old_offset - gap_size
            if old_offset > gap_offset:
                if old_offset < gap_offset + gap_size:
                    self.log_debug(F'removing structure {structure.name}; starts inside removed region')
                    remove.append(index)
                    continue
                if isinstance(structure, SectionStructure) and new_offset % alignment != 0:
                    raise RuntimeError(
                        F'structure {structure.name} would be moved to offset 0x{new_offset:X}, '
                        F'violating section alignment value 0x{alignment:X}.')
                structure.set_file_offset(new_offset)
            try:
                # fields that are compared against relative virtual addresses
                adjust_attributes_of_structure(structure, rva_offset, rva_lbound, rva_ubound, (
                    'OffsetToData',
                    'AddressOfData',
                    'VirtualAddress',
                    'AddressOfNames',
                    'AddressOfNameOrdinals',
                    'AddressOfFunctions',
                    'AddressOfEntryPoint',
                    'AddressOfRawData',
                    'BaseOfCode',
                    'BaseOfData',
                ))
                # fields that are compared against absolute virtual addresses
                adjust_attributes_of_structure(structure, tva_offset, tva_lbound, tva_ubound, (
                    'StartAddressOfRawData',
                    'EndAddressOfRawData',
                    'AddressOfIndex',
                    'AddressOfCallBacks',
                ))
                # fields that are compared against raw file offsets
                adjust_attributes_of_structure(structure, gap_offset, None, None, (
                    'OffsetModuleName',
                    'PointerToRawData',
                ))
            except BrokenLink as error:
                self.log_debug(F'removing structure {structure.name}; {error!s}')
                remove.append(index)
                continue
            for attribute in (
                'CvHeaderOffset',
                'OffsetIn2Qwords',
                'OffsetInQwords',
                'Offset',
                'OffsetLow',
                'OffsetHigh'
            ):
                if not hasattr(structure, attribute):
                    continue
                self.log_warn(F'potential offset in structure {structure.name} ignored: {attribute}')

        # Indices were collected in ascending order; deleting from the back keeps them valid.
        while remove:
            index = remove.pop()
            pe.__structures__[index:index + 1] = []

        section.SizeOfRawData = new_section_size

    def _trim_sections(self, pe: PE, data: bytearray) -> int:
        """
        Strip trailing data from all sections that exceed the size limit or whose name matches
        one of the given patterns. The data is modified in place and the total number of
        removed bytes is returned.
        """
        S = self.args.size_limit
        P = self.args.names
        trimmed = 0
        for section in pe.sections:
            section: SectionStructure
            offset = section.PointerToRawData
            name = _ASCII(section.Name)
            if not self.args.trim_code and name.lower() in ('.text', '.code'):
                self.log_debug(F'skipping code section {name}; specify --trim-code to override.')
                continue
            if not self.args.trim_rsrc and name.lower() == '.rsrc':
                self.log_debug(F'skipping rsrc section {name}; specify --trim-rsrc to override.')
                continue
            old_size = section.SizeOfRawData
            if old_size <= S and not any(fnmatch(name, p) for p in P):
                self.log_debug(F'criteria not satisfied for section: {SizeInt(old_size)!r} {name}')
                continue
            new_size = self._right_strip_data(
                memoryview(data)[offset:offset + old_size],
                pe.OPTIONAL_HEADER.FileAlignment)
            if new_size == old_size:
                continue
            self.log_info(F'stripping section {name} from {TI(old_size)!r} to {TI(new_size)!r}')
            gap_size = old_size - new_size
            gap_offset = offset + new_size
            if gap_size <= 0:
                continue
            self._adjust_offsets(pe, gap_offset, gap_size)
            trimmed += gap_size
            data[gap_offset:gap_offset + gap_size] = []
        return trimmed

    def _trim_pe_resources(self, pe: PE, data: bytearray) -> int:
        """
        Strip trailing data from all resources that exceed the size limit or whose path matches
        one of the given patterns. The data is modified in place and the total number of
        removed bytes is returned; the result is 0 if the PE file has no resource directory.
        """
        S = self.args.size_limit
        P = self.args.names
        trimmed = 0

        def find_bloated_resources(pe: PE, directory, level: int = 0, *path) -> Generator[Structure, None, None]:
            """
            Recursively walk the resource directory and generate pairs of the resource path
            and the data structure for each resource that qualifies for stripping.
            """
            for entry in directory.entries:
                name = getattr(entry, 'name')
                numeric = getattr(entry, 'id')
                if not name:
                    # Unnamed entries are identified by their numeric id; at the top level,
                    # known ids are translated via the RSRC enumeration.
                    if level == 0 and numeric in iter(RSRC):
                        name = RSRC(entry.id)
                    elif numeric is not None:
                        name = str(numeric)
                name = name and str(name) or '?'
                if entry.struct.DataIsDirectory:
                    yield from find_bloated_resources(pe, entry.directory, level + 1, *path, name)
                    continue
                struct: Structure = entry.data.struct
                name = '/'.join((*path, name))
                if struct.Size <= S and not any(fnmatch(name, p) for p in P):
                    self.log_debug(F'criteria not satisfied for resource: {SizeInt(struct.Size)!r} {name}')
                    continue
                yield name, struct

        RSRC_INDEX = DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_RESOURCE']
        pe.parse_data_directories(directories=[RSRC_INDEX])

        try:
            resources = pe.DIRECTORY_ENTRY_RESOURCE
        except AttributeError:
            return 0
        for name, resource in find_bloated_resources(pe, resources):
            offset = pe.get_offset_from_rva(resource.OffsetToData)
            old_size = resource.Size
            new_size = self._right_strip_data(
                memoryview(data)[offset:offset + old_size],
                pe.OPTIONAL_HEADER.FileAlignment)
            self.log_info(F'stripping resource {name} from {old_size} to {new_size}')
            gap_size = old_size - new_size
            gap_offset = offset + new_size
            if gap_size <= 0:
                continue
            resource.Size = new_size
            self._adjust_offsets(pe, gap_offset, gap_size)
            trimmed += gap_size
            data[gap_offset:gap_offset + gap_size] = []

        pe.OPTIONAL_HEADER.DATA_DIRECTORY[RSRC_INDEX].Size -= trimmed
        self.log_info(F'trimming size of resource data directory by {TI(trimmed)!r}')
        return trimmed

    def process(self, data: bytearray) -> bytearray:
        """
        Strip the overlay if it reaches the size limit; then, if requested, strip resources
        and sections and return the rebuilt executable.
        """
        overlay_offset = self._get_size(data)
        if len(data) - overlay_offset >= self.args.size_limit:
            view = memoryview(data)
            overlay_length = self._right_strip_data(view[overlay_offset:])
            body_size = overlay_offset + overlay_length
            try:
                data[body_size:] = []
            except Exception:
                # Truncating in place is not always possible; fall back to slicing.
                data = data[:body_size]
        if not self.args.resources and not self.args.sections:
            return data
        pe = PE(data=data, fast_load=True)
        total = len(data)
        trimmed = 0
        view = pe.__data__
        copy = False
        if not isinstance(view, bytearray):
            # Probe whether the buffer is writable by writing to its first byte; if this
            # fails, continue with a mutable copy that is assigned back to the PE object.
            view = memoryview(view)
            try:
                view[0] = 0x4D
            except Exception:
                copy = True
                view = bytearray(pe.__data__)
        if self.args.resources:
            trimmed += self._trim_pe_resources(pe, view)
        if self.args.sections:
            trimmed += self._trim_sections(pe, view)
        if copy:
            pe.__data__ = view
        data = pe.write()
        end = total - trimmed
        if end < len(data):
            self.log_warn(F'output contains {len(data) - end} trailing bytes')
        return data
class peek (lines=10, all=False, brief=False, decode=0, escape=False, bare=False, meta=0, gray=False, index=False, stdout=False, narrow=False, blocks=1, dense=False, expand=False, width=0)
-
This unit is implemented in
refinery.units.sinks.peek
and has the following commandline Interface:usage: peek [-h] [-L] [-Q] [-0] [-v] [-l N | -a | -b] [-d | -e] [-r | -m] [-g] [-i] [-2] [-N] [-B N] [-D] [-E] [-W N] The unit extracts preview information of the input data and displays it on the standard error stream. If the standard output of this unit is connected by a pipe, the incoming data is forwarded. However, if the unit outputs to a terminal, the data is discarded instead. optional arguments: -l, --lines N Specify number N of lines in the preview, default is 10. -a, --all Output all possible preview lines without restriction -b, --brief One line peek, implies --lines=1. -d, --decode Attempt to decode and display printable data. Specify twice to enable line wrapping. -e, --escape Always peek data as string, escape characters if necessary. -r, --bare Only peek the data itself, do not show a metadata preview. -m, --meta Show more auto-derivable metadata. Specify multiple times to populate more variables. -g, --gray Do not colorize the output. -i, --index Display the index of each chunk within the current frame. -2, --stdout Print the peek to STDOUT rather than STDERR; the input data is lost. -N, --narrow Do not show addresses in hexdump -B, --blocks N Group hexadecimal bytes in blocks of the given size; default is 1. -D, --dense Do not insert spaces in hexdump. -E, --expand Do not compress sequences of identical lines in hexdump -W, --width N Specify the number of hexadecimal characters to use in preview. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class peek(HexViewer):
    """
    The unit extracts preview information of the input data and displays it on the standard error
    stream. If the standard output of this unit is connected by a pipe, the incoming data is
    forwarded. However, if the unit outputs to a terminal, the data is discarded instead.
    """

    def __init__(
        self,
        lines  : Arg.Number('-l', group='SIZE', help='Specify number N of lines in the preview, default is 10.') = 10,
        all    : Arg.Switch('-a', group='SIZE', help='Output all possible preview lines without restriction') = False,
        brief  : Arg.Switch('-b', group='SIZE', help='One line peek, implies --lines=1.') = False,
        decode : Arg.Counts('-d', group='MODE', help=(
            'Attempt to decode and display printable data. Specify twice to enable line wrapping.')) = 0,
        escape : Arg.Switch('-e', group='MODE', help='Always peek data as string, escape characters if necessary.') = False,
        bare   : Arg.Switch('-r', group='META', help='Only peek the data itself, do not show a metadata preview.') = False,
        meta   : Arg.Counts('-m', group='META', help=(
            'Show more auto-derivable metadata. Specify multiple times to populate more variables.')) = 0,
        gray   : Arg.Switch('-g', help='Do not colorize the output.') = False,
        index  : Arg.Switch('-i', help='Display the index of each chunk within the current frame.') = False,
        stdout : Arg.Switch('-2', help='Print the peek to STDOUT rather than STDERR; the input data is lost.') = False,
        narrow=False, blocks=1, dense=False, expand=False, width=0
    ):
        if decode and escape:
            raise ValueError('The decode and esc options are exclusive.')
        if brief:
            narrow = True
        if environment.colorless.value:
            gray = True
        lines = 1 if brief else INF if all else lines
        super(peek, self).__init__(
            brief=brief,
            gray=gray,
            blocks=blocks,
            decode=decode,
            dense=dense,
            index=index,
            escape=escape,
            expand=expand,
            narrow=narrow,
            lines=lines,
            meta=meta,
            bare=bare,
            width=width,
            stdout=stdout,
        )

    @HexViewer.Requires('colorama', 'display', 'default', 'extended')
    def _colorama():
        import colorama
        return colorama

    def process(self, data):
        """
        Print the preview of the given chunk. With the stdout option, the preview lines are
        the output of the unit. Otherwise, they are printed to the standard error stream and
        the input is forwarded unless the unit writes to a terminal.
        """
        colorize = not self.args.gray and not self.args.stdout
        lines = self._peeklines(data, colorize)

        if self.args.stdout:
            for line in lines:
                yield line.encode(self.codec)
            return

        stderr = sys.stderr

        if colorize:
            colorama = self._colorama
            if os.name == 'nt':
                stderr = colorama.AnsiToWin32(stderr).stream
            # Written when printing is interrupted: resets the style and blanks the line.
            _erase = ' ' * get_terminal_size()
            _reset = F'\r{colorama.Style.RESET_ALL}{_erase}\r'
        else:
            _reset = ''

        try:
            for line in lines:
                print(line, file=stderr)
        except BaseException:
            stderr.write(_reset)
            raise
        if not self.isatty:
            self.log_info('forwarding input to next unit')
            yield data

    def _peekmeta(self, linewidth, sep, meta: dict, peek=None) -> Generator[str, None, None]:
        """
        Generate the lines of the metadata preview: The optional peek summary, followed by one
        line for each variable in meta that is not None. Lines that exceed linewidth are
        clipped. The separator sep is generated once, before the first line of output.
        """
        if not meta and not peek:
            return
        width = max((len(name) for name in meta), default=0)
        # This iterator is exhausted after the first use, so the separator appears only once.
        separators = iter([sep])
        if peek is not None:
            if len(peek) > linewidth:
                peek = peek[:linewidth - 3] + '...'
            yield from separators
            yield peek
        # Names with more than 3 characters are listed first, then alphabetically.
        for name in sorted(meta, key=lambda s: (len(s) <= 3, s)):
            value = meta[name]
            if value is None:
                continue
            if isinstance(value, CustomStringRepresentation):
                value = repr(value).strip()
            elif isbuffer(value):
                value = repr(ByteStringWrapper(value))
            elif isinstance(value, int):
                # Integers with up to three digits are shown in decimal, others in hex.
                if value in range(-999, 1000):
                    value = str(value)
                elif value > 0:
                    value = F'0x{value:X}'
                else:
                    value = F'-0x{-value:X}'
            elif isinstance(value, float):
                value = F'{value:.4f}'
            metavar = F'{name:>{width + 2}} = {value!s}'
            if len(metavar) > linewidth:
                metavar = metavar[:linewidth - 3] + '...'
            yield from separators
            yield metavar

    def _trydecode(self, data, codec: Optional[str], width: int, linecount: int) -> Optional[list]:
        """
        Attempt to convert the data to a list of at most linecount text lines. When codec is
        None, the data is escaped using the esc unit and split into pieces of the given width.
        Otherwise, the data is decoded using the codec; the result is None if this fails, if
        nothing was decoded, or if less than 80% of the decoded characters are printable.
        """
        remaining = linecount
        result = []
        wrap = self.args.decode > 1
        if codec is None:
            from refinery.units.encoding.esc import esc
            decoded = data[:abs(width * linecount)]
            decoded = str(decoded | -esc(bare=True))
            limit = abs(min(linecount * width, len(decoded)))
            for k in range(0, limit, width):
                result.append(decoded[k:k + width])
            return result
        try:
            import unicodedata
            unprintable = {'Cc', 'Cf', 'Co', 'Cs'}
            self.log_info(F'trying to decode as {codec}.')
            decoded = codecs.decode(data, codec, errors='strict')
            if not decoded:
                # Input that consists only of a byte order mark decodes to an empty string;
                # without this check, computing the ratio below would divide by zero.
                raise ValueError('the decoded string is empty')
            count = sum(unicodedata.category(c) not in unprintable for c in decoded)
            ratio = count / len(decoded)
        except UnicodeDecodeError as DE:
            self.log_info('decoding failed:', DE.reason)
            return None
        except ValueError as V:
            self.log_info('decoding failed:', V)
            return None
        if ratio < 0.8:
            self.log_info(F'data contains {ratio * 100:.2f}% printable characters, this is too low.')
            return None
        decoded = decoded.splitlines(False)
        if not wrap:
            # Without wrapping, lines that are too long are clipped and marked with an ellipsis.
            for k, line in enumerate(decoded):
                line = line.replace('\t', '\x20' * 4)
                if len(line) <= width:
                    continue
                clipped = line[:width - 3]
                if self.args.gray:
                    color = ''
                    reset = ''
                else:
                    colorama = self._colorama
                    color = colorama.Fore.LIGHTRED_EX
                    reset = colorama.Style.RESET_ALL
                decoded[k] = F'{clipped}{color}...{reset}'
            return decoded[:abs(linecount)]
        for paragraph in decoded:
            if not remaining:
                break
            wrapped = [
                line for chunk in textwrap.wrap(
                    paragraph,
                    width,
                    break_long_words=True,
                    break_on_hyphens=False,
                    drop_whitespace=False,
                    expand_tabs=True,
                    max_lines=abs(remaining + 1),
                    replace_whitespace=False,
                    tabsize=4,
                )
                for line in chunk.splitlines(keepends=False)
            ]
            remaining -= len(wrapped)
            result.extend(wrapped)
        return result[:abs(linecount)]

    def _peeklines(self, data: bytearray, colorize: bool) -> Generator[str, None, None]:
        """
        Generate all lines of the preview for the given chunk: The metadata section, followed
        by the decoded, escaped, or hex-dumped data, and a closing separator for the chunk that
        was marked as the last one by the filter method.
        """
        meta = metavars(data)
        codec = None
        lines = None
        final = data.temp or False
        empty = True

        if not self.args.index:
            meta.discard('index')
            index = None
        else:
            index = meta.get('index', None)

        # In brief mode, room is reserved for the size (and index) prefix of the line.
        if not self.args.brief:
            padding = 0
        else:
            padding = SizeInt.width + 2
            if index is not None:
                padding += 6

        metrics = self._get_metrics(len(data), self.args.lines, padding)

        if self.args.brief:
            metrics.address_width = 0
            metrics.fit_to_width(allow_increase=True)

        sepsize = metrics.hexdump_width
        txtsize = self.args.width or sepsize

        if self.args.lines and data:
            if self.args.escape:
                lines = self._trydecode(data, None, txtsize, metrics.line_count)
            if self.args.decode > 0:
                # The first codec that produces output wins.
                for codec in ('utf8', 'utf-16le', 'utf-16', 'utf-16be'):
                    lines = self._trydecode(data, codec, txtsize, metrics.line_count)
                    if lines:
                        codec = codec
                        break
                else:
                    codec = None
            if lines is None:
                # Neither escaping nor decoding was requested or successful: use a hex dump.
                lines = list(self.hexdump(data, metrics, colorize))
            else:
                sepsize = txtsize

        def separator(title=None):
            """
            Return a horizontal rule of width sepsize which includes the title, if it fits.
            """
            if title is None or sepsize <= len(title) + 8:
                return sepsize * '-'
            return '-' * (sepsize - len(title) - 5) + F'[{title}]---'

        if self.args.brief:
            final = False
        elif not self.args.bare:
            peek = repr(meta.size)
            line = separator()
            if len(data) <= 5_000_000:
                peek = F'{peek}; {meta.entropy!r} entropy'
            peek = F'{peek}; {meta.magic!s}'
            if self.args.lines == 0:
                peek = None
            elif not data:
                peek = None
                line = separator('empty chunk')
            if self.args.meta > 0:
                # The derived variables are listed individually, replacing the summary line.
                meta.derive('size')
                meta.derive('magic')
                meta.derive('entropy')
                peek = None
            if self.args.meta > 1:
                meta.derive('crc32')
                meta.derive('sha256')
            if self.args.meta > 2:
                for name in meta.derivations:
                    meta[name]
            for line in self._peekmeta(metrics.hexdump_width, line, meta, peek=peek):
                empty = False
                yield line

        if lines:
            empty = False
            if not self.args.brief:
                yield separator(codec or None)
                yield from lines
            else:
                brief = next(iter(lines))
                brief = F'{SizeInt(len(data))!r}: {brief}'
                if index is not None:
                    brief = F'#{index:03d}: {brief}'
                yield brief

        if final and (self.args.bare or not empty):
            yield separator()

    def filter(self, chunks):
        """
        Forward the chunks while marking the last one by setting its temp attribute, which
        requires a look-ahead of one chunk. When writing to a terminal, invisible chunks are
        discarded and a warning with their total count is logged.
        """
        try:
            self._colorama.init(wrap=False)
        except ImportError:
            pass
        discarded = 0
        it = iter(chunks)
        # The buffer holds up to two chunks; the oldest one is at the right end.
        buffer = collections.deque(itertools.islice(it, 0, 2))
        buffer.reverse()
        while buffer:
            if self.isatty and not buffer[0].visible:
                buffer.popleft()
                discarded += 1
            else:
                item = buffer.pop()
                last = not bool(buffer)
                item.temp = last
                if not item.visible and self.isatty:
                    discarded += 1
                else:
                    yield item
            try:
                buffer.appendleft(next(it))
            except StopIteration:
                pass
        if discarded:
            self.log_warn(F'discarded {discarded} invisible chunks to prevent them from leaking into the terminal.')
class pemeta (custom=False, debug=False, dotnet=False, signatures=False, timestamps=0, version=False, header=False, exports=0, imports=0, tabular=False, timeraw=False)
-
This unit is implemented in
refinery.units.formats.pe.pemeta
and has the following commandline Interface:usage: pemeta [-h] [-L] [-Q] [-0] [-v] [-c] [-D] [-N] [-S] [-T] [-V] [-H] [-E] [-I] [-t] [-r] Extract metadata from PE files. By default, all information except for imports and exports are extracted. optional arguments: -c, --custom Unless enabled, all default categories will be extracted. -D, --debug Parse the PDB path from the debug directory. -N, --dotnet Parse the .NET header. -S, --signatures Parse digital signatures. -T, --timestamps Extract time stamps. Specify twice for more detail. -V, --version Parse the VERSION resource. -H, --header Parse base data from the PE header. -E, --exports List all exported functions. Specify twice to include addresses. -I, --imports List all imported functions. Specify twice to include addresses. -t, --tabular Print information in a table rather than as JSON -r, --timeraw Extract time stamps as numbers instead of human-readable format. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class pemeta(Unit):
    """
    Extract metadata from PE files. By default, all information except for imports and exports are
    extracted.
    """
    def __init__(
        self,
        custom : Arg('-c', '--custom', help='Unless enabled, all default categories will be extracted.') = False,
        debug : Arg.Switch('-D', help='Parse the PDB path from the debug directory.') = False,
        dotnet : Arg.Switch('-N', help='Parse the .NET header.') = False,
        signatures : Arg.Switch('-S', help='Parse digital signatures.') = False,
        timestamps : Arg.Counts('-T', help='Extract time stamps. Specify twice for more detail.') = 0,
        version : Arg.Switch('-V', help='Parse the VERSION resource.') = False,
        header : Arg.Switch('-H', help='Parse base data from the PE header.') = False,
        exports : Arg.Counts('-E', help='List all exported functions. Specify twice to include addresses.') = 0,
        imports : Arg.Counts('-I', help='List all imported functions. Specify twice to include addresses.') = 0,
        tabular : Arg.Switch('-t', help='Print information in a table rather than as JSON') = False,
        timeraw : Arg.Switch('-r', help='Extract time stamps as numbers instead of human-readable format.') = False,
    ):
        # Without --custom and without any explicitly selected default category, all default
        # categories are enabled. Imports and exports are never part of this default set.
        if not custom and not any((debug, dotnet, signatures, timestamps, version, header)):
            debug = dotnet = signatures = timestamps = version = header = True
        super().__init__(
            debug=debug,
            dotnet=dotnet,
            signatures=signatures,
            timestamps=timestamps,
            version=version,
            header=header,
            imports=imports,
            exports=exports,
            timeraw=timeraw,
            tabular=tabular,
        )

    @classmethod
    def _ensure_string(cls, x):
        """
        Return `x` as a string: strings pass through, bytes are decoded with the codec of the unit
        (undecodable bytes are backslash-escaped), and anything else is converted via `repr`.
        """
        if not isinstance(x, str):
            x = repr(x) if not isinstance(x, bytes) else x.decode(cls.codec, 'backslashreplace')
        return x

    @classmethod
    def _parse_pedict(cls, bin):
        """
        Convert a mapping into a dictionary of strings, dropping all items with a falsy value.
        """
        return dict((
            cls._ensure_string(key),
            cls._ensure_string(val)
        ) for key, val in bin.items() if val)

    @classmethod
    def parse_signature(cls, data: bytearray) -> dict:
        """
        Extracts a JSON-serializable and human-readable dictionary with information about
        time stamp and code signing certificates that are attached to the input PE file.

        Raises a `ValueError` when the PKCS7 parser fails or when the serial number of the
        selected certificate does not occur in the input data.
        """
        from refinery.units.formats.pkcs7 import pkcs7

        try:
            signature = data | pkcs7 | json.loads
        except Exception as E:
            raise ValueError(F'PKCS7 parser failed with error: {E!s}') from E

        info = {}

        def find_timestamps(entry):
            # Depth-first search for the first {'type': 'signing_time', 'value': ...} record.
            # While unwinding, the first enclosing dictionary that has a signer id contributes
            # the name of the time stamp issuer.
            if isinstance(entry, dict):
                if set(entry.keys()) == {'type', 'value'}:
                    if entry['type'] == 'signing_time':
                        return {'Timestamp': entry['value']}
                for value in entry.values():
                    result = find_timestamps(value)
                    if result is None:
                        continue
                    with suppress(KeyError):
                        result.setdefault('TimestampIssuer', entry['sid']['issuer']['common_name'])
                    return result
            elif isinstance(entry, list):
                for value in entry:
                    result = find_timestamps(value)
                    if result is None:
                        continue
                    return result

        timestamp_info = find_timestamps(signature)
        if timestamp_info is not None:
            info.update(timestamp_info)

        try:
            certificates = signature['content']['certificates']
        except KeyError:
            return info

        if len(certificates) == 1:
            main_certificate = certificates[0]
        else:
            certificates_with_extended_use = []
            main_certificate = None
            for certificate in certificates:
                # Certificates that lack the expected fields are skipped silently.
                with suppress(Exception):
                    crt = certificate['tbs_certificate']
                    ext = [e for e in crt['extensions'] if e['extn_id'] == 'extended_key_usage' and e['extn_value'] != ['time_stamping']]
                    key = [e for e in crt['extensions'] if e['extn_id'] == 'key_usage']
                    if ext:
                        certificates_with_extended_use.append(certificate)
                    if any('key_cert_sign' in e['extn_value'] for e in key):
                        continue
                    if any('code_signing' in e['extn_value'] for e in ext):
                        main_certificate = certificate
                        break
            if main_certificate is None and len(certificates_with_extended_use) == 1:
                main_certificate = certificates_with_extended_use[0]

        if main_certificate:
            crt = main_certificate['tbs_certificate']
            serial = crt['serial_number']
            if isinstance(serial, int):
                serial = F'{serial:x}'
            if len(serial) % 2 != 0:
                serial = F'0{serial}'
            # This used to be an assert statement, which is removed when Python runs with -O.
            if bytes.fromhex(serial) not in data:
                raise ValueError(F'certificate serial number {serial} was not found in the signature data')
            subject = crt['subject']
            location = [subject.get(t, '') for t in ('locality_name', 'state_or_province_name', 'country_name')]
            info.update(Subject=subject['common_name'])
            if any(location):
                info.update(SubjectLocation=', '.join(filter(None, location)))
            for signer_info in signature['content'].get('signer_infos', ()):
                try:
                    if signer_info['sid']['serial_number'] != crt['serial_number']:
                        continue
                    for attr in signer_info['signed_attrs']:
                        if attr['type'] == 'authenticode_info':
                            info.update(ProgramName=attr['value']['programName'])
                            info.update(MoreInfo=attr['value']['moreInfo'])
                except KeyError:
                    continue
            try:
                valid_from = crt['validity']['not_before']
                valid_until = crt['validity']['not_after']
            except KeyError:
                pass
            else:
                info.update(ValidFrom=valid_from, ValidUntil=valid_until)
            info.update(
                Issuer=crt['issuer']['common_name'],
                Fingerprint=main_certificate['fingerprint'],
                Serial=serial
            )

        return info

    def _pe_characteristics(self, pe: PE):
        """
        Return the set of characteristic names whose mask is set in the PE file header.
        """
        return {name for name, mask in image_characteristics
            if pe.FILE_HEADER.Characteristics & mask}

    def _pe_address_width(self, pe: PE, default=16) -> int:
        """
        Return the number of hex digits used to format an address: 4 for 16-bit images, 8 for
        I386, 16 for AMD64 and IA64, and `default` for every other machine type.
        """
        if 'IMAGE_FILE_16BIT_MACHINE' in self._pe_characteristics(pe):
            return 4
        elif MACHINE_TYPE[pe.FILE_HEADER.Machine] in ['IMAGE_FILE_MACHINE_I386']:
            return 8
        elif MACHINE_TYPE[pe.FILE_HEADER.Machine] in [
            'IMAGE_FILE_MACHINE_AMD64',
            'IMAGE_FILE_MACHINE_IA64',
        ]:
            return 16
        else:
            return default

    def _vint(self, pe: PE, value: int):
        """
        Return `value` unchanged unless tabular output is enabled; in that case, format it as a
        zero-padded upper case hex string of the address width of the PE.
        """
        if not self.args.tabular:
            return value
        aw = self._pe_address_width(pe)
        return F'0x{value:0{aw}X}'

    def parse_version(self, pe: PE, data=None) -> dict:
        """
        Extracts a JSON-serializable and human-readable dictionary with information about
        the version resource of an input PE file, if available. The return value is `None` if no
        string table was found, a single dictionary if there is exactly one string table, and a
        list of dictionaries otherwise.
        """
        pe.parse_data_directories(directories=[DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_RESOURCE']])
        string_table_entries = []
        separator = ', '
        # Matches version strings like "1, 0, 0, 1" which are rewritten as "1.0.0.1" below.
        version_pattern = re.compile(F'\\d+({re.escape(separator)}\\d+){{3}}')
        # The FileInfo attribute may be missing entirely; this is treated as "no version info".
        for FileInfo in getattr(pe, 'FileInfo', None) or ():
            for FileInfoEntry in FileInfo:
                with suppress(AttributeError):
                    for StringTableEntry in FileInfoEntry.StringTable:
                        StringTableEntryParsed = self._parse_pedict(StringTableEntry.entries)
                        # A missing or malformed language identifier only costs the language
                        # annotation; the string table itself is still reported.
                        with suppress(AttributeError, ValueError):
                            LangID = StringTableEntry.entries.get('LangID', None) or StringTableEntry.LangID
                            LangID = int(LangID, 0x10) if not isinstance(LangID, int) else LangID
                            LangHi = LangID >> 0x10
                            LangLo = LangID & 0xFFFF
                            Language = self._LCID.get(LangHi, 'Language Neutral')
                            Charset = self._CHARSET.get(LangLo, 'Unknown Charset')
                            StringTableEntryParsed.update(
                                LangID=F'{LangID:08X}',
                                Charset=Charset,
                                Language=Language
                            )
                        for key in StringTableEntryParsed:
                            if key.endswith('Version'):
                                value = StringTableEntryParsed[key]
                                if version_pattern.match(value):
                                    StringTableEntryParsed[key] = '.'.join(value.split(separator))
                        string_table_entries.append(StringTableEntryParsed)
        if not string_table_entries:
            return None
        elif len(string_table_entries) == 1:
            return string_table_entries[0]
        else:
            return string_table_entries

    def parse_exports(self, pe: PE, data=None, include_addresses=False) -> list:
        """
        Return the list of exported symbols. Each item is the symbol name, or a dictionary with
        the keys Name and Address if `include_addresses` is set. Symbols without a name are
        listed as `@k` where `k` is their position in the export symbol list.
        """
        pe.parse_data_directories(directories=[DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_EXPORT']])
        base = pe.OPTIONAL_HEADER.ImageBase
        info = []
        for k, exp in enumerate(pe.DIRECTORY_ENTRY_EXPORT.symbols):
            if not exp.name:
                name = F'@{k}'
            else:
                name = exp.name.decode('ascii')
            item = {
                'Name': name, 'Address': self._vint(pe, exp.address + base)
            } if include_addresses else name
            info.append(item)
        return info

    def parse_imports(self, pe: PE, data=None, include_addresses=False) -> dict:
        """
        Return a dictionary that maps the name of each imported library (without a trailing .dll
        extension) to the list of symbols imported from it. Both the regular and the delay import
        directory are evaluated. Each symbol is its name, or a dictionary with the keys Name and
        Address if `include_addresses` is set. Symbols without a name are listed as `@ordinal`.
        """
        info = {}
        dirs = []
        for name in [
            'DIRECTORY_ENTRY_IMPORT',
            'DIRECTORY_ENTRY_DELAY_IMPORT',
        ]:
            pe.parse_data_directories(directories=[DIRECTORY_ENTRY[F'IMAGE_{name}']])
            with suppress(AttributeError):
                dirs.append(getattr(pe, name))
        for idd in itertools.chain(*dirs):
            dll = idd.dll.decode('ascii')
            if dll.lower().endswith('.dll'):
                dll = dll[:-4]
            imports = info.setdefault(dll, [])
            # The symbol list is resolved anew for each descriptor so that a descriptor without
            # symbols can never inherit the symbols of the previous one. When both attributes
            # exist, `entries` takes precedence over `imports`.
            symbols = getattr(idd, 'entries', getattr(idd, 'imports', ()))
            try:
                for imp in symbols:
                    name = imp.name
                    name = name.decode('ascii') if name else F'@{imp.ordinal}'
                    if not include_addresses:
                        imports.append(name)
                    else:
                        imports.append(dict(Name=name, Address=self._vint(pe, imp.address)))
            except Exception as e:
                self.log_warn(F'error parsing imports of {dll}: {e!s}')
        return info

    def parse_header(self, pe: PE, data=None) -> dict:
        """
        Return a dictionary with base information from the PE header: Machine, Subsystem,
        MinimumOS, the export name if it is printable, the decoded rich header (RICH) if one is
        present, the file Type, ImageBase, ImageSize, Bits, and EntryPoint.
        """
        def format_macro_name(name: str, prefix, convert=True):
            # Drops the first `prefix` underscore-separated parts of a constant name; with
            # `convert`, parts of at most 3 characters become upper case and all others are
            # capitalized.
            name = name.split('_')[prefix:]
            if convert:
                for k, part in enumerate(name):
                    name[k] = part.upper() if len(part) <= 3 else part.capitalize()
            return ' '.join(name)

        major = pe.OPTIONAL_HEADER.MajorOperatingSystemVersion
        minor = pe.OPTIONAL_HEADER.MinorOperatingSystemVersion
        version = self._WINVER.get(major, {0: 'Unknown'})

        try:
            MinimumOS = version[minor]
        except LookupError:
            # Unknown minor versions fall back to the entry for minor version 0.
            MinimumOS = version[0]
        header_information = {
            'Machine': format_macro_name(MACHINE_TYPE[pe.FILE_HEADER.Machine], 3, False),
            'Subsystem': format_macro_name(SUBSYSTEM_TYPE[pe.OPTIONAL_HEADER.Subsystem], 2),
            'MinimumOS': MinimumOS,
        }

        pe.parse_data_directories(directories=[
            DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_EXPORT'],
        ])

        try:
            export_name = pe.DIRECTORY_ENTRY_EXPORT.name
            if isinstance(export_name, bytes):
                export_name = export_name.decode('utf8')
            if not export_name.isprintable():
                export_name = None
        except Exception:
            # Best effort: there may be no export directory or no decodable name.
            export_name = None
        if export_name:
            header_information['ExportName'] = export_name

        rich_header = pe.parse_rich_header()
        rich = []
        if rich_header:
            # Values alternate between an encoded id (even index) and a counter (odd index).
            it = rich_header.get('values', [])
            if self.args.tabular:
                # The default guards against an empty value list, for which max() would raise.
                cw = max((len(F'{c:d}') for c in it[1::2]), default=0)
            for idv, count in zip(it[0::2], it[1::2]):
                info = get_rich_info(idv)
                if not info:
                    continue
                pid = info.pid.upper()
                if self.args.tabular:
                    short_pid = get_rich_short_pid(pid)
                    rich.append(F'[{idv:08x}] {count:>0{cw}d} {short_pid!s} {info.ver}')
                else:
                    rich.append({
                        'Counter': count,
                        'Encoded': F'{idv:08x}',
                        'Library': pid,
                        'Product': info.ver,
                    })
            header_information['RICH'] = rich

        characteristics = self._pe_characteristics(pe)
        # When several flags are set, the last matching entry wins.
        for typespec, flag in {
            'EXE': 'IMAGE_FILE_EXECUTABLE_IMAGE',
            'DLL': 'IMAGE_FILE_DLL',
            'SYS': 'IMAGE_FILE_SYSTEM'
        }.items():
            if flag in characteristics:
                header_information['Type'] = typespec

        base = pe.OPTIONAL_HEADER.ImageBase
        header_information['ImageBase'] = self._vint(pe, base)
        header_information['ImageSize'] = get_pe_size(pe)
        header_information['Bits'] = 4 * self._pe_address_width(pe, 16)
        header_information['EntryPoint'] = self._vint(pe, pe.OPTIONAL_HEADER.AddressOfEntryPoint + base)
        return header_information

    def parse_time_stamps(self, pe: PE, raw_time_stamps: bool, more_detail: bool) -> dict:
        """
        Extracts time stamps from the PE header (link time), as well as from the import, delay
        import, bound import, export, and resource directory. The resource time stamp is also
        parsed as a DOS time stamp and returned as the "Delphi" time stamp. With
        `raw_time_stamps`, the numeric values are returned rather than converted dates. Unless
        `more_detail` is set, per-library time stamps that lie within two hours of each other are
        collapsed into the earliest one.
        """
        if raw_time_stamps:
            def dt(ts): return ts
        else:
            dt = date_from_timestamp

        pe.parse_data_directories(directories=[
            DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_IMPORT'],
            DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_EXPORT'],
            DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_BOUND_IMPORT'],
            DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_DELAY_IMPORT'],
            DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_DEBUG'],
            DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_RESOURCE']
        ])

        info = {}

        with suppress(AttributeError):
            info.update(Linker=dt(pe.FILE_HEADER.TimeDateStamp))

        for dir_name, _dll, info_key in [
            ('DIRECTORY_ENTRY_IMPORT',       'dll',  'Import'), # noqa
            ('DIRECTORY_ENTRY_DELAY_IMPORT', 'dll',  'Symbol'), # noqa
            ('DIRECTORY_ENTRY_BOUND_IMPORT', 'name', 'Module'), # noqa
        ]:
            impts = {}
            for entry in getattr(pe, dir_name, []):
                ts = 0
                # The time stamp field has one of two names; TimeDateStamp wins if both exist.
                with suppress(AttributeError):
                    ts = entry.struct.dwTimeDateStamp
                with suppress(AttributeError):
                    ts = entry.struct.TimeDateStamp
                if ts == 0 or ts == 0xFFFFFFFF:
                    continue
                name = getattr(entry, _dll, B'').decode()
                if name.lower().endswith('.dll'):
                    name = name[:-4]
                impts[name] = dt(ts)
            if not impts:
                continue
            if not more_detail:
                dmin = min(impts.values())
                dmax = max(impts.values())
                small_delta = 2 * 60 * 60
                if not raw_time_stamps:
                    small_delta = timedelta(seconds=small_delta)
                if dmax - dmin < small_delta:
                    impts = dmin
            info[info_key] = impts

        with suppress(AttributeError):
            Export = pe.DIRECTORY_ENTRY_EXPORT.struct.TimeDateStamp
            if Export:
                info.update(Export=dt(Export))

        with suppress(AttributeError):
            res_timestamp = pe.DIRECTORY_ENTRY_RESOURCE.struct.TimeDateStamp
            if res_timestamp:
                with suppress(ValueError):
                    from refinery.units.misc.datefix import datefix
                    dos = datefix.dostime(res_timestamp)
                    info.update(Delphi=dos)
                    info.update(RsrcTS=dt(res_timestamp))

        def norm(value):
            # Integers are kept as they are, containers are normalized recursively, and
            # everything else is converted to a string.
            if isinstance(value, list):
                return [norm(v) for v in value]
            if isinstance(value, dict):
                return {k: norm(v) for k, v in value.items()}
            if isinstance(value, int):
                return value
            return str(value)

        return {key: norm(value) for key, value in info.items()}

    def parse_dotnet(self, pe: PE, data):
        """
        Extracts a JSON-serializable and human-readable dictionary with information about
        the .NET metadata of an input PE file.
        """
        header = DotNetHeader(data, pe=pe)
        tables = header.meta.Streams.Tables
        info = dict(
            RuntimeVersion=F'{header.head.MajorRuntimeVersion}.{header.head.MinorRuntimeVersion}',
            Version=F'{header.meta.MajorVersion}.{header.meta.MinorVersion}',
            VersionString=header.meta.VersionString
        )

        info['Flags'] = [name for name, check in header.head.KnownFlags.items() if check]

        if len(tables.Assembly) == 1:
            assembly = tables.Assembly[0]
            info.update(
                AssemblyName=assembly.Name,
                Release='{}.{}.{}.{}'.format(
                    assembly.MajorVersion,
                    assembly.MinorVersion,
                    assembly.BuildNumber,
                    assembly.RevisionNumber
                )
            )

        try:
            entry = self._vint(pe, header.head.EntryPointToken + pe.OPTIONAL_HEADER.ImageBase)
            info.update(EntryPoint=entry)
        except AttributeError:
            pass

        if len(tables.Module) == 1:
            module = tables.Module[0]
            info.update(ModuleName=module.Name)

        return info

    def parse_debug(self, pe: PE, data=None):
        """
        Return a dictionary with the keys PdbPath and PdbAge, taken from the CodeView entries of
        the debug directory. Entries that cannot be parsed are skipped silently.
        """
        result = {}
        pe.parse_data_directories(directories=[
            DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_DEBUG']])
        for dbg in pe.DIRECTORY_ENTRY_DEBUG:
            if DEBUG_TYPE.get(dbg.struct.Type, None) != 'IMAGE_DEBUG_TYPE_CODEVIEW':
                continue
            with suppress(Exception):
                pdb = dbg.entry.PdbFileName
                # Truncate the path at the first null byte.
                if 0 in pdb:
                    pdb = pdb[:pdb.index(0)]
                result.update(
                    PdbPath=pdb.decode(self.codec),
                    PdbAge=dbg.entry.Age
                )
        return result

    def process(self, data):
        """
        Parse the input as a PE file, run every enabled parser, and yield the combined result as
        formatted by `ppjson`. A parser that raises is logged and skipped.
        """
        result = {}
        pe = PE(data=data, fast_load=True)

        for switch, resolver, name in [
            (self.args.debug,   self.parse_debug,   'Debug'),   # noqa
            (self.args.dotnet,  self.parse_dotnet,  'DotNet'),  # noqa
            (self.args.header,  self.parse_header,  'Header'),  # noqa
            (self.args.version, self.parse_version, 'Version'), # noqa
            (self.args.imports, self.parse_imports, 'Imports'), # noqa
            (self.args.exports, self.parse_exports, 'Exports'), # noqa
        ]:
            if not switch:
                continue
            self.log_debug(F'parsing: {name}')
            args = pe, data
            # A switch given more than once requests the detailed variant (include_addresses).
            if switch > 1:
                args = *args, True
            try:
                info = resolver(*args)
            except Exception as E:
                self.log_info(F'failed to obtain {name}: {E!s}')
                continue
            if info:
                result[name] = info

        signature = {}

        # The signature is also required for the time stamps, which report the signing time.
        if self.args.timestamps or self.args.signatures:
            with suppress(Exception):
                from refinery.units.formats.pe.pesig import pesig
                signature = self.parse_signature(next(data | pesig))

        if self.args.timestamps:
            ts = self.parse_time_stamps(pe, self.args.timeraw, self.args.timestamps > 1)
            with suppress(KeyError):
                ts.update(Signed=signature['Timestamp'])
            result.update(TimeStamp=ts)

        if signature and self.args.signatures:
            result['Signature'] = signature

        if result:
            yield from ppjson(tabular=self.args.tabular)._pretty_output(result, indent=4, ensure_ascii=False)

    # Display names for the upper 16 bits of a version resource language identifier.
    _LCID = {
        0x0C00: 'Default Custom Locale Language',
        0x1400: 'Default Custom MUI Locale Language',
        0x007F: 'Invariant Locale Language',
        0x0000: 'Neutral Locale Language',
        0x0800: 'System Default Locale Language',
        0x1000: 'Unspecified Custom Locale Language',
        0x0400: 'User Default Locale Language',
        0x0436: 'Afrikaans-South Africa',
        0x041c: 'Albanian-Albania',
        0x045e: 'Amharic-Ethiopia',
        0x0401: 'Arabic (Saudi Arabia)',
        0x1401: 'Arabic (Algeria)',
        0x3c01: 'Arabic (Bahrain)',
        0x0c01: 'Arabic (Egypt)',
        0x0801: 'Arabic (Iraq)',
        0x2c01: 'Arabic (Jordan)',
        0x3401: 'Arabic (Kuwait)',
        0x3001: 'Arabic (Lebanon)',
        0x1001: 'Arabic (Libya)',
        0x1801: 'Arabic (Morocco)',
        0x2001: 'Arabic (Oman)',
        0x4001: 'Arabic (Qatar)',
        0x2801: 'Arabic (Syria)',
        0x1c01: 'Arabic (Tunisia)',
        0x3801: 'Arabic (U.A.E.)',
        0x2401: 'Arabic (Yemen)',
        0x042b: 'Armenian-Armenia',
        0x044d: 'Assamese',
        0x082c: 'Azeri (Cyrillic)',
        0x042c: 'Azeri (Latin)',
        0x042d: 'Basque',
        0x0423: 'Belarusian',
        0x0445: 'Bengali (India)',
        0x0845: 'Bengali (Bangladesh)',
        0x141A: 'Bosnian (Bosnia/Herzegovina)',
        0x0402: 'Bulgarian',
        0x0455: 'Burmese',
        0x0403: 'Catalan',
        0x045c: 'Cherokee-United States',
        0x0804: 'Chinese (People\'s Republic of China)',
        0x1004: 'Chinese (Singapore)',
        0x0404: 'Chinese (Taiwan)',
        0x0c04: 'Chinese (Hong Kong SAR)',
        0x1404: 'Chinese (Macao SAR)',
        0x041a: 'Croatian',
        0x101a: 'Croatian (Bosnia/Herzegovina)',
        0x0405: 'Czech',
        0x0406: 'Danish',
        0x0465: 'Divehi',
        0x0413: 'Dutch-Netherlands',
        0x0813: 'Dutch-Belgium',
        0x0466: 'Edo',
        0x0409: 'English (United States)',
        0x0809: 'English (United Kingdom)',
        0x0c09: 'English (Australia)',
        0x2809: 'English (Belize)',
        0x1009: 'English (Canada)',
        0x2409: 'English (Caribbean)',
        0x3c09: 'English (Hong Kong SAR)',
        0x4009: 'English (India)',
        0x3809: 'English (Indonesia)',
        0x1809: 'English (Ireland)',
        0x2009: 'English (Jamaica)',
        0x4409: 'English (Malaysia)',
        0x1409: 'English (New Zealand)',
        0x3409: 'English (Philippines)',
        0x4809: 'English (Singapore)',
        0x1c09: 'English (South Africa)',
        0x2c09: 'English (Trinidad)',
        0x3009: 'English (Zimbabwe)',
        0x0425: 'Estonian',
        0x0438: 'Faroese',
        0x0429: 'Farsi',
        0x0464: 'Filipino',
        0x040b: 'Finnish',
        0x040c: 'French (France)',
        0x080c: 'French (Belgium)',
        0x2c0c: 'French (Cameroon)',
        0x0c0c: 'French (Canada)',
        0x240c: 'French (Democratic Rep. of Congo)',
        0x300c: 'French (Cote d\'Ivoire)',
        0x3c0c: 'French (Haiti)',
        0x140c: 'French (Luxembourg)',
        0x340c: 'French (Mali)',
        0x180c: 'French (Monaco)',
        0x380c: 'French (Morocco)',
        0xe40c: 'French (North Africa)',
        0x200c: 'French (Reunion)',
        0x280c: 'French (Senegal)',
        0x100c: 'French (Switzerland)',
        0x1c0c: 'French (West Indies)',
        0x0462: 'Frisian-Netherlands',
        0x0467: 'Fulfulde-Nigeria',
        0x042f: 'FYRO Macedonian',
        0x083c: 'Gaelic (Ireland)',
        0x043c: 'Gaelic (Scotland)',
        0x0456: 'Galician',
        0x0437: 'Georgian',
        0x0407: 'German (Germany)',
        0x0c07: 'German (Austria)',
        0x1407: 'German (Liechtenstein)',
        0x1007: 'German (Luxembourg)',
        0x0807: 'German (Switzerland)',
        0x0408: 'Greek',
        0x0474: 'Guarani-Paraguay',
        0x0447: 'Gujarati',
        0x0468: 'Hausa-Nigeria',
        0x0475: 'Hawaiian (United States)',
        0x040d: 'Hebrew',
        0x0439: 'Hindi',
        0x040e: 'Hungarian',
        0x0469: 'Ibibio-Nigeria',
        0x040f: 'Icelandic',
        0x0470: 'Igbo-Nigeria',
        0x0421: 'Indonesian',
        0x045d: 'Inuktitut',
        0x0410: 'Italian (Italy)',
        0x0810: 'Italian (Switzerland)',
        0x0411: 'Japanese',
        0x044b: 'Kannada',
        0x0471: 'Kanuri-Nigeria',
        0x0860: 'Kashmiri',
        0x0460: 'Kashmiri (Arabic)',
        0x043f: 'Kazakh',
        0x0453: 'Khmer',
        0x0457: 'Konkani',
        0x0412: 'Korean',
        0x0440: 'Kyrgyz (Cyrillic)',
        0x0454: 'Lao',
        0x0476: 'Latin',
        0x0426: 'Latvian',
        0x0427: 'Lithuanian',
        0x043e: 'Malay-Malaysia',
        0x083e: 'Malay-Brunei Darussalam',
        0x044c: 'Malayalam',
        0x043a: 'Maltese',
        0x0458: 'Manipuri',
        0x0481: 'Maori-New Zealand',
        0x044e: 'Marathi',
        0x0450: 'Mongolian (Cyrillic)',
        0x0850: 'Mongolian (Mongolian)',
        0x0461: 'Nepali',
        0x0861: 'Nepali-India',
        0x0414: 'Norwegian (Bokmål)',
        0x0814: 'Norwegian (Nynorsk)',
        0x0448: 'Oriya',
        0x0472: 'Oromo',
        0x0479: 'Papiamentu',
        0x0463: 'Pashto',
        0x0415: 'Polish',
        0x0416: 'Portuguese-Brazil',
        0x0816: 'Portuguese-Portugal',
        0x0446: 'Punjabi',
        0x0846: 'Punjabi (Pakistan)',
        0x046B: 'Quecha (Bolivia)',
        0x086B: 'Quecha (Ecuador)',
        0x0C6B: 'Quecha (Peru)',
        0x0417: 'Rhaeto-Romanic',
        0x0418: 'Romanian',
        0x0818: 'Romanian (Moldava)',
        0x0419: 'Russian',
        0x0819: 'Russian (Moldava)',
        0x043b: 'Sami (Lappish)',
        0x044f: 'Sanskrit',
        0x046c: 'Sepedi',
        0x0c1a: 'Serbian (Cyrillic)',
        0x081a: 'Serbian (Latin)',
        0x0459: 'Sindhi (India)',
        0x0859: 'Sindhi (Pakistan)',
        0x045b: 'Sinhalese-Sri Lanka',
        0x041b: 'Slovak',
        0x0424: 'Slovenian',
        0x0477: 'Somali',
        0x042e: 'Sorbian',
        0x0c0a: 'Spanish (Modern Sort)',
        0x040a: 'Spanish (Traditional Sort)',
        0x2c0a: 'Spanish (Argentina)',
        0x400a: 'Spanish (Bolivia)',
        0x340a: 'Spanish (Chile)',
        0x240a: 'Spanish (Colombia)',
        0x140a: 'Spanish (Costa Rica)',
        0x1c0a: 'Spanish (Dominican Republic)',
        0x300a: 'Spanish (Ecuador)',
        0x440a: 'Spanish (El Salvador)',
        0x100a: 'Spanish (Guatemala)',
        0x480a: 'Spanish (Honduras)',
        0x580a: 'Spanish (Latin America)',
        0x080a: 'Spanish (Mexico)',
        0x4c0a: 'Spanish (Nicaragua)',
        0x180a: 'Spanish (Panama)',
        0x3c0a: 'Spanish (Paraguay)',
        0x280a: 'Spanish (Peru)',
        0x500a: 'Spanish (Puerto Rico)',
        0x540a: 'Spanish (United States)',
        0x380a: 'Spanish (Uruguay)',
        0x200a: 'Spanish (Venezuela)',
        0x0430: 'Sutu',
        0x0441: 'Swahili',
        0x041d: 'Swedish',
        0x081d: 'Swedish-Finland',
        0x045a: 'Syriac',
        0x0428: 'Tajik',
        0x045f: 'Tamazight (Arabic)',
        0x085f: 'Tamazight (Latin)',
        0x0449: 'Tamil',
        0x0444: 'Tatar',
        0x044a: 'Telugu',
        0x041e: 'Thai',
        0x0851: 'Tibetan (Bhutan)',
        0x0451: 'Tibetan (People\'s Republic of China)',
        0x0873: 'Tigrigna (Eritrea)',
        0x0473: 'Tigrigna (Ethiopia)',
        0x0431: 'Tsonga',
        0x0432: 'Tswana',
        0x041f: 'Turkish',
        0x0442: 'Turkmen',
        0x0480: 'Uighur-China',
        0x0422: 'Ukrainian',
        0x0420: 'Urdu',
        0x0820: 'Urdu-India',
        0x0843: 'Uzbek (Cyrillic)',
        0x0443: 'Uzbek (Latin)',
        0x0433: 'Venda',
        0x042a: 'Vietnamese',
        0x0452: 'Welsh',
        0x0434: 'Xhosa',
        0x0478: 'Yi',
        0x043d: 'Yiddish',
        0x046a: 'Yoruba',
        0x0435: 'Zulu',
        0x04ff: 'HID (Human Interface Device)'
    }

    # Display names for the lower 16 bits of a version resource language identifier.
    _CHARSET = {
        0x0000: '7-bit ASCII',
        0x03A4: 'Japan (Shift ? JIS X-0208)',
        0x03B5: 'Korea (Shift ? KSC 5601)',
        0x03B6: 'Taiwan (Big5)',
        0x04B0: 'Unicode',
        0x04E2: 'Latin-2 (Eastern European)',
        0x04E3: 'Cyrillic',
        0x04E4: 'Multilingual',
        0x04E5: 'Greek',
        0x04E6: 'Turkish',
        0x04E7: 'Hebrew',
        0x04E8: 'Arabic',
    }

    # Maps the major and then the minor operating system version from the optional header to a
    # Windows release name.
    _WINVER = {
        3: {
            0x00: 'Windows NT 3',
            0x0A: 'Windows NT 3.1',
            0x32: 'Windows NT 3.5',
            0x33: 'Windows NT 3.51',
        },
        4: {
            0x00: 'Windows 95',
            0x0A: 'Windows 98',
        },
        5: {
            0x00: 'Windows 2000',
            0x5A: 'Windows Me',
            0x01: 'Windows XP',
            0x02: 'Windows Server 2003',
        },
        6: {
            0x00: 'Windows Vista',
            0x01: 'Windows 7',
            0x02: 'Windows 8',
            0x03: 'Windows 8.1',
        },
        10: {
            0x00: 'Windows 10',
        }
    }
Static methods
def parse_signature(data)
-
Extracts a JSON-serializable and human-readable dictionary with information about time stamp and code signing certificates that are attached to the input PE file.
Expand source code Browse git
@classmethod
def parse_signature(cls, data: bytearray) -> dict:
    """
    Extracts a JSON-serializable and human-readable dictionary with information about
    time stamp and code signing certificates that are attached to the input PE file.

    Raises a `ValueError` when the PKCS7 parser fails or when the serial number of the
    selected certificate does not occur in the input data.
    """
    from refinery.units.formats.pkcs7 import pkcs7

    try:
        signature = data | pkcs7 | json.loads
    except Exception as E:
        raise ValueError(F'PKCS7 parser failed with error: {E!s}') from E

    info = {}

    def find_timestamps(entry):
        # Depth-first search for the first {'type': 'signing_time', 'value': ...} record.
        # While unwinding, the first enclosing dictionary that has a signer id contributes
        # the name of the time stamp issuer.
        if isinstance(entry, dict):
            if set(entry.keys()) == {'type', 'value'}:
                if entry['type'] == 'signing_time':
                    return {'Timestamp': entry['value']}
            for value in entry.values():
                result = find_timestamps(value)
                if result is None:
                    continue
                with suppress(KeyError):
                    result.setdefault('TimestampIssuer', entry['sid']['issuer']['common_name'])
                return result
        elif isinstance(entry, list):
            for value in entry:
                result = find_timestamps(value)
                if result is None:
                    continue
                return result

    timestamp_info = find_timestamps(signature)
    if timestamp_info is not None:
        info.update(timestamp_info)

    try:
        certificates = signature['content']['certificates']
    except KeyError:
        return info

    if len(certificates) == 1:
        main_certificate = certificates[0]
    else:
        certificates_with_extended_use = []
        main_certificate = None
        for certificate in certificates:
            # Certificates that lack the expected fields are skipped silently.
            with suppress(Exception):
                crt = certificate['tbs_certificate']
                ext = [e for e in crt['extensions'] if e['extn_id'] == 'extended_key_usage' and e['extn_value'] != ['time_stamping']]
                key = [e for e in crt['extensions'] if e['extn_id'] == 'key_usage']
                if ext:
                    certificates_with_extended_use.append(certificate)
                if any('key_cert_sign' in e['extn_value'] for e in key):
                    continue
                if any('code_signing' in e['extn_value'] for e in ext):
                    main_certificate = certificate
                    break
        if main_certificate is None and len(certificates_with_extended_use) == 1:
            main_certificate = certificates_with_extended_use[0]

    if main_certificate:
        crt = main_certificate['tbs_certificate']
        serial = crt['serial_number']
        if isinstance(serial, int):
            serial = F'{serial:x}'
        if len(serial) % 2 != 0:
            serial = F'0{serial}'
        # This used to be an assert statement, which is removed when Python runs with -O.
        if bytes.fromhex(serial) not in data:
            raise ValueError(F'certificate serial number {serial} was not found in the signature data')
        subject = crt['subject']
        location = [subject.get(t, '') for t in ('locality_name', 'state_or_province_name', 'country_name')]
        info.update(Subject=subject['common_name'])
        if any(location):
            info.update(SubjectLocation=', '.join(filter(None, location)))
        for signer_info in signature['content'].get('signer_infos', ()):
            try:
                if signer_info['sid']['serial_number'] != crt['serial_number']:
                    continue
                for attr in signer_info['signed_attrs']:
                    if attr['type'] == 'authenticode_info':
                        info.update(ProgramName=attr['value']['programName'])
                        info.update(MoreInfo=attr['value']['moreInfo'])
            except KeyError:
                continue
        try:
            valid_from = crt['validity']['not_before']
            valid_until = crt['validity']['not_after']
        except KeyError:
            pass
        else:
            info.update(ValidFrom=valid_from, ValidUntil=valid_until)
        info.update(
            Issuer=crt['issuer']['common_name'],
            Fingerprint=main_certificate['fingerprint'],
            Serial=serial
        )

    return info
Methods
def parse_version(self, pe, data=None)
-
Extracts a JSON-serializable and human-readable dictionary with information about the version resource of an input PE file, if available.
Expand source code Browse git
def parse_version(self, pe: PE, data=None) -> dict:
    """
    Extracts a JSON-serializable and human-readable dictionary with information about
    the version resource of an input PE file, if available. The return value is `None` if no
    string table was found, a single dictionary if there is exactly one string table, and a
    list of dictionaries otherwise.
    """
    pe.parse_data_directories(directories=[DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_RESOURCE']])
    string_table_entries = []
    separator = ', '
    # Matches version strings like "1, 0, 0, 1" which are rewritten as "1.0.0.1" below.
    version_pattern = re.compile(F'\\d+({re.escape(separator)}\\d+){{3}}')
    # The FileInfo attribute may be missing entirely; this is treated as "no version info".
    for FileInfo in getattr(pe, 'FileInfo', None) or ():
        for FileInfoEntry in FileInfo:
            with suppress(AttributeError):
                for StringTableEntry in FileInfoEntry.StringTable:
                    StringTableEntryParsed = self._parse_pedict(StringTableEntry.entries)
                    # A missing or malformed language identifier only costs the language
                    # annotation; the string table itself is still reported.
                    with suppress(AttributeError, ValueError):
                        LangID = StringTableEntry.entries.get('LangID', None) or StringTableEntry.LangID
                        LangID = int(LangID, 0x10) if not isinstance(LangID, int) else LangID
                        LangHi = LangID >> 0x10
                        LangLo = LangID & 0xFFFF
                        Language = self._LCID.get(LangHi, 'Language Neutral')
                        Charset = self._CHARSET.get(LangLo, 'Unknown Charset')
                        StringTableEntryParsed.update(
                            LangID=F'{LangID:08X}',
                            Charset=Charset,
                            Language=Language
                        )
                    for key in StringTableEntryParsed:
                        if key.endswith('Version'):
                            value = StringTableEntryParsed[key]
                            if version_pattern.match(value):
                                StringTableEntryParsed[key] = '.'.join(value.split(separator))
                    string_table_entries.append(StringTableEntryParsed)
    if not string_table_entries:
        return None
    elif len(string_table_entries) == 1:
        return string_table_entries[0]
    else:
        return string_table_entries
def parse_exports(self, pe, data=None, include_addresses=False)
-
Expand source code Browse git
def parse_exports(self, pe: PE, data=None, include_addresses=False) -> list:
    """
    List the symbols exported by the PE file. Every item is the name of the symbol, or a
    dictionary with the keys Name and Address when `include_addresses` is set; the address is
    the symbol address plus the image base. A symbol without a name is listed as `@k` where
    `k` is its position in the export symbol list.
    """
    export_directory = DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_EXPORT']
    pe.parse_data_directories(directories=[export_directory])
    image_base = pe.OPTIONAL_HEADER.ImageBase

    def describe(position, symbol):
        # Unnamed symbols are identified by their position in the symbol list.
        label = symbol.name.decode('ascii') if symbol.name else F'@{position}'
        if not include_addresses:
            return label
        return {'Name': label, 'Address': self._vint(pe, symbol.address + image_base)}

    return [
        describe(position, symbol)
        for position, symbol in enumerate(pe.DIRECTORY_ENTRY_EXPORT.symbols)
    ]
def parse_imports(self, pe, data=None, include_addresses=False)
-
Expand source code Browse git
def parse_imports(self, pe: PE, data=None, include_addresses=False) -> dict:
    """
    Return a dictionary that maps the name of each imported library (without a trailing .dll
    extension) to the list of symbols imported from it. Both the regular and the delay import
    directory are evaluated. Each symbol is its name, or a dictionary with the keys Name and
    Address if `include_addresses` is set. Symbols without a name are listed as `@ordinal`.
    A failure while reading the symbols of one library is logged as a warning and does not
    affect the other libraries.
    """
    info = {}
    dirs = []
    for name in [
        'DIRECTORY_ENTRY_IMPORT',
        'DIRECTORY_ENTRY_DELAY_IMPORT',
    ]:
        pe.parse_data_directories(directories=[DIRECTORY_ENTRY[F'IMAGE_{name}']])
        with suppress(AttributeError):
            dirs.append(getattr(pe, name))
    for idd in itertools.chain(*dirs):
        dll = idd.dll.decode('ascii')
        if dll.lower().endswith('.dll'):
            dll = dll[:-4]
        imports = info.setdefault(dll, [])
        # The symbol list is resolved anew for each descriptor so that a descriptor without
        # symbols can never inherit the symbols of the previous one. When both attributes
        # exist, `entries` takes precedence over `imports`.
        symbols = getattr(idd, 'entries', getattr(idd, 'imports', ()))
        try:
            for imp in symbols:
                name = imp.name
                name = name.decode('ascii') if name else F'@{imp.ordinal}'
                if not include_addresses:
                    imports.append(name)
                else:
                    imports.append(dict(Name=name, Address=self._vint(pe, imp.address)))
        except Exception as e:
            self.log_warn(F'error parsing imports of {dll}: {e!s}')
    return info
def parse_header(self, pe, data=None)
-
Expand source code Browse git
def parse_header(self, pe: PE, data=None) -> dict:
    """
    Return a dictionary with base information from the PE header: Machine, Subsystem,
    MinimumOS, the export name if it is printable, the decoded rich header (RICH) if one is
    present, the file Type, ImageBase, ImageSize, Bits, and EntryPoint.
    """
    def format_macro_name(name: str, prefix, convert=True):
        # Drops the first `prefix` underscore-separated parts of a constant name; with
        # `convert`, parts of at most 3 characters become upper case and all others are
        # capitalized.
        name = name.split('_')[prefix:]
        if convert:
            for k, part in enumerate(name):
                name[k] = part.upper() if len(part) <= 3 else part.capitalize()
        return ' '.join(name)

    major = pe.OPTIONAL_HEADER.MajorOperatingSystemVersion
    minor = pe.OPTIONAL_HEADER.MinorOperatingSystemVersion
    version = self._WINVER.get(major, {0: 'Unknown'})

    try:
        MinimumOS = version[minor]
    except LookupError:
        # Unknown minor versions fall back to the entry for minor version 0.
        MinimumOS = version[0]
    header_information = {
        'Machine': format_macro_name(MACHINE_TYPE[pe.FILE_HEADER.Machine], 3, False),
        'Subsystem': format_macro_name(SUBSYSTEM_TYPE[pe.OPTIONAL_HEADER.Subsystem], 2),
        'MinimumOS': MinimumOS,
    }

    pe.parse_data_directories(directories=[
        DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_EXPORT'],
    ])

    try:
        export_name = pe.DIRECTORY_ENTRY_EXPORT.name
        if isinstance(export_name, bytes):
            export_name = export_name.decode('utf8')
        if not export_name.isprintable():
            export_name = None
    except Exception:
        # Best effort: there may be no export directory or no decodable name.
        export_name = None
    if export_name:
        header_information['ExportName'] = export_name

    rich_header = pe.parse_rich_header()
    rich = []
    if rich_header:
        # Values alternate between an encoded id (even index) and a counter (odd index).
        it = rich_header.get('values', [])
        if self.args.tabular:
            # The default guards against an empty value list, for which max() would raise.
            cw = max((len(F'{c:d}') for c in it[1::2]), default=0)
        for idv, count in zip(it[0::2], it[1::2]):
            info = get_rich_info(idv)
            if not info:
                continue
            pid = info.pid.upper()
            if self.args.tabular:
                short_pid = get_rich_short_pid(pid)
                rich.append(F'[{idv:08x}] {count:>0{cw}d} {short_pid!s} {info.ver}')
            else:
                rich.append({
                    'Counter': count,
                    'Encoded': F'{idv:08x}',
                    'Library': pid,
                    'Product': info.ver,
                })
        header_information['RICH'] = rich

    characteristics = self._pe_characteristics(pe)
    # When several flags are set, the last matching entry wins.
    for typespec, flag in {
        'EXE': 'IMAGE_FILE_EXECUTABLE_IMAGE',
        'DLL': 'IMAGE_FILE_DLL',
        'SYS': 'IMAGE_FILE_SYSTEM'
    }.items():
        if flag in characteristics:
            header_information['Type'] = typespec

    base = pe.OPTIONAL_HEADER.ImageBase
    header_information['ImageBase'] = self._vint(pe, base)
    header_information['ImageSize'] = get_pe_size(pe)
    header_information['Bits'] = 4 * self._pe_address_width(pe, 16)
    header_information['EntryPoint'] = self._vint(pe, pe.OPTIONAL_HEADER.AddressOfEntryPoint + base)
    return header_information
def parse_time_stamps(self, pe, raw_time_stamps, more_detail)
-
Extracts time stamps from the PE header (link time), as well as from the imports, exports, debug, and resource directory. The resource time stamp is also parsed as a DOS time stamp and returned as the "Delphi" time stamp.
Expand source code Browse git
def parse_time_stamps(self, pe: PE, raw_time_stamps: bool, more_detail: bool) -> dict:
    """
    Extracts time stamps from the PE header (link time), as well as from the imports, exports,
    debug, and resource directory. The resource time stamp is also parsed as a DOS time stamp
    and returned as the "Delphi" time stamp.

    When `raw_time_stamps` is set, the integer values are reported as they are; otherwise they
    are converted using `date_from_timestamp`. Unless `more_detail` is set, the per-library
    time stamps of an import directory are collapsed into their minimum when they all lie
    within two hours of each other. All values that are not integers, lists or dictionaries
    are converted to strings in the result.
    """
    if raw_time_stamps:
        def dt(ts): return ts
    else:
        dt = date_from_timestamp

    pe.parse_data_directories(directories=[
        DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_IMPORT'],
        DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_EXPORT'],
        DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_BOUND_IMPORT'],
        DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_DELAY_IMPORT'],
        DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_DEBUG'],
        DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_RESOURCE']
    ])

    info = {}

    with suppress(AttributeError):
        info.update(Linker=dt(pe.FILE_HEADER.TimeDateStamp))

    # Each row is: attribute name on the PE object, attribute of an entry that holds the
    # library name, and the key under which the result is stored.
    for dir_name, _dll, info_key in [
        ('DIRECTORY_ENTRY_IMPORT',       'dll',  'Import'), # noqa
        ('DIRECTORY_ENTRY_DELAY_IMPORT', 'dll',  'Symbol'), # noqa
        ('DIRECTORY_ENTRY_BOUND_IMPORT', 'name', 'Module'), # noqa
    ]:
        impts = {}
        # A directory that was not parsed has no attribute on the PE object and is skipped.
        for entry in getattr(pe, dir_name, []):
            ts = 0
            # The time stamp field is looked up under two names; if both exist, the second
            # one wins.
            with suppress(AttributeError):
                ts = entry.struct.dwTimeDateStamp
            with suppress(AttributeError):
                ts = entry.struct.TimeDateStamp
            # All-zero and all-one values are treated as "no time stamp".
            if ts == 0 or ts == 0xFFFFFFFF:
                continue
            name = getattr(entry, _dll, B'').decode()
            if name.lower().endswith('.dll'):
                name = name[:-4]
            impts[name] = dt(ts)
        if not impts:
            continue
        if not more_detail:
            dmin = min(impts.values())
            dmax = max(impts.values())
            # Two hours, expressed in seconds for raw values and as a timedelta otherwise so
            # that it can be compared with the difference computed below.
            small_delta = 2 * 60 * 60
            if not raw_time_stamps:
                small_delta = timedelta(seconds=small_delta)
            if dmax - dmin < small_delta:
                impts = dmin
        info[info_key] = impts

    with suppress(AttributeError):
        Export = pe.DIRECTORY_ENTRY_EXPORT.struct.TimeDateStamp
        if Export:
            info.update(Export=dt(Export))

    with suppress(AttributeError):
        res_timestamp = pe.DIRECTORY_ENTRY_RESOURCE.struct.TimeDateStamp
        if res_timestamp:
            # If the DOS time interpretation raises a ValueError, neither of the two
            # resource time stamps is reported.
            with suppress(ValueError):
                from refinery.units.misc.datefix import datefix
                dos = datefix.dostime(res_timestamp)
                info.update(Delphi=dos)
                info.update(RsrcTS=dt(res_timestamp))

    def norm(value):
        # Recursively make the value JSON-friendly: containers are rebuilt, integers are
        # kept, and everything else is converted to a string.
        if isinstance(value, list):
            return [norm(v) for v in value]
        if isinstance(value, dict):
            return {k: norm(v) for k, v in value.items()}
        if isinstance(value, int):
            return value
        return str(value)

    return {key: norm(value) for key, value in info.items()}
def parse_dotnet(self, pe, data)
-
Extracts a JSON-serializable and human-readable dictionary with information about the .NET metadata of an input PE file.
Expand source code Browse git
def parse_dotnet(self, pe: PE, data):
    """
    Extracts a JSON-serializable and human-readable dictionary with information about the .NET
    metadata of an input PE file.
    """
    dotnet = DotNetHeader(data, pe=pe)
    meta = dotnet.meta
    tables = meta.Streams.Tables
    head = dotnet.head

    info = {
        'RuntimeVersion': F'{head.MajorRuntimeVersion}.{head.MinorRuntimeVersion}',
        'Version': F'{meta.MajorVersion}.{meta.MinorVersion}',
        'VersionString': meta.VersionString,
        'Flags': [name for name, check in head.KnownFlags.items() if check],
    }

    # Assembly details are only reported when the table has exactly one row.
    assemblies = tables.Assembly
    if len(assemblies) == 1:
        asm = assemblies[0]
        info['AssemblyName'] = asm.Name
        info['Release'] = F'{asm.MajorVersion}.{asm.MinorVersion}.{asm.BuildNumber}.{asm.RevisionNumber}'

    # The entry point is optional: a missing attribute simply leaves the key out.
    with suppress(AttributeError):
        info['EntryPoint'] = self._vint(pe, head.EntryPointToken + pe.OPTIONAL_HEADER.ImageBase)

    # The module name is likewise only reported for a single-row table.
    modules = tables.Module
    if len(modules) == 1:
        info['ModuleName'] = modules[0].Name

    return info
def parse_debug(self, pe, data=None)
-
Expand source code Browse git
def parse_debug(self, pe: PE, data=None):
    """
    Extracts the PDB path and age from the CodeView entries of the PE debug directory. The
    result is a dictionary with the keys `PdbPath` and `PdbAge`; it is empty when the file has
    no debug directory or no CodeView entry could be read. When several CodeView entries can
    be read, the last one wins. The `data` argument is not used here.
    """
    result = {}
    pe.parse_data_directories(directories=[
        DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_DEBUG']])
    # The attribute only exists on the PE object when a debug directory was parsed, so a file
    # without one yields an empty result instead of raising an AttributeError.
    for dbg in getattr(pe, 'DIRECTORY_ENTRY_DEBUG', ()):
        if DEBUG_TYPE.get(dbg.struct.Type, None) != 'IMAGE_DEBUG_TYPE_CODEVIEW':
            continue
        # Best effort: an entry that cannot be read or decoded is skipped.
        with suppress(Exception):
            pdb = dbg.entry.PdbFileName
            # Cut the file name at the first null byte.
            if 0 in pdb:
                pdb = pdb[:pdb.index(0)]
            result.update(
                PdbPath=pdb.decode(self.codec),
                PdbAge=dbg.entry.Age
            )
    return result
class peoverlay (certificate=False, directories=False, memdump=False)
-
This unit is implemented in
refinery.units.formats.pe.peoverlay
and has the following commandline Interface:usage: peoverlay [-h] [-L] [-Q] [-0] [-v] [-c] [-d] [-m] Returns the overlay of a PE file, i.e. anything that may have been appended to the file. This does not include digital signatures. Use pestrip to obtain only the body of the PE file after removing the overlay. optional arguments: -c, --cert Include digital signatures for the size computation. -d, --dirs Include data directories for size computation. -m, --memdump Assume that the file data was a memory-mapped PE file. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class peoverlay(OverlayUnit):
    """
    Returns the overlay of a PE file, i.e. anything that may have been appended to the file.
    This does not include digital signatures. Use `refinery.pestrip` to obtain only the body
    of the PE file after removing the overlay.
    """
    def process(self, data: bytearray) -> bytearray:
        # Everything before this offset belongs to the PE file proper.
        body_size = self._get_size(data)
        try:
            # Remove the body in place when the buffer supports it.
            del data[:body_size]
        except Exception:
            # Otherwise return the overlay as a sliced copy.
            return data[body_size:]
        return data
class perc (*paths, pretty=False, path=b'path', regex=False, exact=False, fuzzy=0, drop_path=False, join_path=False, list=False)
-
This unit is implemented in
refinery.units.formats.pe.perc
and has the following commandline Interface:usage: perc [-h] [-L] [-Q] [-0] [-v] [-p] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]] Extract PE file resources. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; the default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -p, --pretty Add missing headers to bitmap and icon resources. -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each pattern, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class perc(PathExtractorUnit):
    """
    Extract PE file resources.
    """
    def __init__(
        self, *paths,
        pretty: Arg.Switch('-p', help='Add missing headers to bitmap and icon resources.') = False,
        **kwargs
    ):
        super().__init__(*paths, pretty=pretty, **kwargs)

    def _get_icon_dir(self, pe: pefile.PE):
        # Returns the parsed icon group directory, or None when the file does not have exactly
        # one icon group resource or when it cannot be read.
        try:
            group, = (e for e in pe.DIRECTORY_ENTRY_RESOURCE.entries if e.id == RSRC.ICON_GROUP.value)
            group = group.directory.entries[0].directory.entries[0].data.struct
            return GRPICONDIR(pe.get_data(group.OffsetToData, group.Size))
        except Exception:
            return None

    def _search(self, pe: pefile.PE, directory, level=0, *parts):
        # Recursively walks the resource tree and yields one UnpackResult per leaf. The path of
        # a leaf consists of the identifiers of all its ancestors followed by its own.
        if level >= 3:
            self.log_warn(F'unexpected resource tree level {level + 1:d}')
        for entry in directory.entries:
            if entry.name:
                identifier = str(entry.name)
            elif level == 0 and entry.id in iter(RSRC):
                # Known resource types at the top level are represented by their enum member.
                identifier = RSRC(entry.id)
            elif entry.id is not None:
                identifier = entry.id
            else:
                self.log_warn(F'resource entry has name {entry.name} and id {entry.id} at level {level + 1:d}')
                continue
            if entry.struct.DataIsDirectory:
                yield from self._search(pe, entry.directory, level + 1, *parts, identifier)
            else:
                rva = entry.data.struct.OffsetToData
                size = entry.data.struct.Size
                path = '/'.join(str(p) for p in (*parts, identifier))
                extract = None
                # A leaf directly below the root has no ancestors and hence no resource type
                # to base the pretty conversion on; it is extracted as raw data.
                if self.args.pretty and parts:
                    kind = parts[0]
                    if kind is RSRC.BITMAP:
                        extract = self._handle_bitmap(pe, rva, size)
                    elif kind is RSRC.ICON:
                        extract = self._handle_icon(pe, parts, rva, size)
                    elif kind is RSRC.STRING:
                        extract = self._handle_strings(pe, parts, rva, size)
                if extract is None:
                    # Default: read the raw resource data only when it is requested. The loop
                    # variables are bound as defaults so that each result keeps its own values.
                    def extract(pe=pe, rva=rva, size=size):
                        return pe.get_data(rva, size)
                yield UnpackResult(
                    path,
                    extract,
                    offset=pe.get_offset_from_rva(rva),
                    lcid=self._get_lcid(entry.data),
                )

    def _get_lcid(self, node_data) -> Optional[str]:
        # Translates the language and sublanguage identifier of a resource data node into the
        # corresponding entry of pemeta._LCID; returns None when no translation is known.
        try:
            pid = node_data.lang or 0
            sid = node_data.sublang or 0
        except AttributeError:
            return None
        try:
            pid = self._LANG_ID_TO_LCID[pid]
        except KeyError:
            return None
        lcid = pid.get(sid, 0)
        return pemeta._LCID.get(lcid)

    def _handle_strings(self, pe: pefile.PE, parts: Tuple[RSRC, int, int], rva: int, size: int):
        # Returns a function that converts a string table resource into a JSON document which
        # maps hexadecimal string identifiers to the decoded strings.
        def extract(pe=pe):
            self.log_debug(parts)
            # Every table covers 16 consecutive identifiers; table numbers start at one.
            base = (parts[1] - 1) << 4
            reader = StructReader(pe.get_data(rva, size))
            table = {}
            for index in range(16):
                if reader.eof:
                    break
                # Each entry is a 16-bit character count followed by that many UTF-16 units.
                string = reader.read_exactly(reader.u16() * 2)
                if not string:
                    # A zero-length entry is an unused identifier, not the end of the table:
                    # skip it, but keep counting so that later strings get the right key.
                    continue
                table[F'{base + index:04X}'] = string.decode('utf-16le')
            return json.dumps(table, indent=4).encode(self.codec)
        return extract

    def _handle_bitmap(self, pe: pefile.PE, rva: int, size: int):
        # Returns a function that prepends a 14-byte bitmap file header to the resource data.
        def extract(pe=pe):
            bitmap = pe.get_data(rva, size)
            total = (len(bitmap) + 14).to_bytes(4, 'little')
            return B'BM' + total + B'\0\0\0\0\x36\0\0\0' + bitmap
        return extract

    def _handle_icon(self, pe: pefile.PE, parts: Tuple[RSRC, int, int], rva: int, size: int):
        # Returns the icon data, prefixed with an icon file header that is built from the
        # matching icon group entry when the data starts with a 40-byte bitmap info header.
        # Returns None when the icon group entry or the data cannot be obtained.
        try:
            icondir = self._get_icon_dir(pe)
            index = int(parts[1]) - 1
            info = icondir.entries[index]
            icon = pe.get_data(rva, size)
        except Exception:
            return None
        if icon.startswith(B'(\0\0\0'):
            header = struct.pack('<HHHBBBBHHII',
                0,
                1,
                1,
                info.width,
                info.height,
                info.color_count,
                0,
                info.planes,
                info.bit_count,
                len(icon),
                0x16
            )
            icon = header + icon
        return icon

    def unpack(self, data):
        pe = pefile.PE(data=data, fast_load=True)
        pe.parse_data_directories(
            directories=pefile.DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_RESOURCE'])
        try:
            rsrc = pe.DIRECTORY_ENTRY_RESOURCE
        except AttributeError:
            # No resource directory: there is nothing to extract.
            pass
        else:
            yield from self._search(pe, rsrc)

    def _mktbl(ids: List[Tuple[int, int, int]]) -> Dict[int, Dict[int, int]]:
        # Builds a two-level lookup from (language, sublanguage) to LCID. The first LCID seen
        # for a language also becomes its default, stored under the sublanguage 0. A later
        # row with the same language and sublanguage overwrites an earlier one.
        table = {}
        for pid, sid, lcid in ids:
            if pid not in table:
                table[pid] = {0: lcid}
            table[pid][sid] = lcid
        return table

    # Rows are (language id, sublanguage id, LCID); the order is significant, see _mktbl.
    _LANG_ID_TO_LCID = _mktbl([
        (0x00, 0x03, 0x0C00), (0x00, 0x05, 0x1400), (0x7F, 0x00, 0x007F), (0x00, 0x00, 0x0000),
        (0x02, 0x02, 0x0800), (0x00, 0x04, 0x1000), (0x00, 0x01, 0x0400), (0x36, 0x01, 0x0436),
        (0x1c, 0x01, 0x041C), (0x84, 0x01, 0x0484), (0x5E, 0x01, 0x045E), (0x01, 0x05, 0x1401),
        (0x01, 0x0f, 0x3C01), (0x01, 0x03, 0x0C01), (0x01, 0x02, 0x0801), (0x01, 0x0B, 0x2C01),
        (0x01, 0x0D, 0x3401), (0x01, 0x0C, 0x3001), (0x01, 0x04, 0x1001), (0x01, 0x06, 0x1801),
        (0x01, 0x08, 0x2001), (0x01, 0x10, 0x4001), (0x01, 0x01, 0x0401), (0x01, 0x0A, 0x2801),
        (0x01, 0x07, 0x1C01), (0x01, 0x0E, 0x3801), (0x01, 0x09, 0x2401), (0x2B, 0x01, 0x042B),
        (0x4D, 0x01, 0x044D), (0x2C, 0x02, 0x082C), (0x2C, 0x01, 0x042C), (0x45, 0x02, 0x0445),
        (0x6D, 0x01, 0x046D), (0x2d, 0x01, 0x042D), (0x23, 0x01, 0x0423), (0x1A, 0x08, 0x201A),
        (0x1A, 0x05, 0x141A), (0x7E, 0x01, 0x047E), (0x02, 0x01, 0x0402), (0x92, 0x01, 0x0492),
        (0x5C, 0x01, 0x045C), (0x03, 0x01, 0x0403), (0x04, 0x03, 0x0C04), (0x04, 0x05, 0x1404),
        (0x04, 0x04, 0x1004), (0x04, 0x02, 0x0004), (0x04, 0x01, 0x7C04), (0x83, 0x01, 0x0483),
        (0x1A, None, 0x001A), (0x1a, 0x04, 0x101A), (0x1a, 0x01, 0x041A), (0x05, 0x01, 0x0405),
        (0x06, 0x01, 0x0406), (0x8C, 0x01, 0x048C), (0x65, 0x01, 0x0465), (0x13, 0x02, 0x0813),
        (0x13, 0x01, 0x0413), (0x09, 0x03, 0x0C09), (0x09, 0x0A, 0x2809), (0x09, 0x04, 0x1009),
        (0x09, 0x09, 0x2409), (0x09, 0x10, 0x4009), (0x09, 0x06, 0x1809), (0x09, 0x08, 0x2009),
        (0x09, 0x11, 0x4409), (0x09, 0x05, 0x1409), (0x09, 0x0D, 0x3409), (0x09, 0x12, 0x4809),
        (0x09, 0x07, 0x1c09), (0x09, 0x0B, 0x2C09), (0x09, 0x02, 0x0809), (0x09, 0x01, 0x0409),
        (0x09, 0x0C, 0x3009), (0x25, 0x01, 0x0425), (0x38, 0x01, 0x0438), (0x64, 0x01, 0x0464),
        (0x0B, 0x01, 0x040B), (0x0C, 0x02, 0x080c), (0x0C, 0x03, 0x0C0C), (0x0C, 0x01, 0x040c),
        (0x0C, 0x05, 0x140C), (0x0C, 0x06, 0x180C), (0x0C, 0x04, 0x100C), (0x62, 0x01, 0x0462),
        (0x56, 0x01, 0x0456), (0x37, 0x01, 0x0437), (0x07, 0x03, 0x0C07), (0x07, 0x01, 0x0407),
        (0x07, 0x05, 0x1407), (0x07, 0x04, 0x1007), (0x07, 0x02, 0x0807), (0x08, 0x01, 0x0408),
        (0x6F, 0x01, 0x046F), (0x47, 0x01, 0x0447), (0x68, 0x01, 0x0468), (0x75, 0x01, 0x0475),
        (0x0D, 0x01, 0x040D), (0x39, 0x01, 0x0439), (0x0E, 0x01, 0x040E), (0x0F, 0x01, 0x040F),
        (0x70, 0x01, 0x0470), (0x21, 0x01, 0x0421), (0x5D, 0x02, 0x085D), (0x5D, 0x01, 0x045D),
        (0x3C, 0x02, 0x083C), (0x34, 0x01, 0x0434), (0x35, 0x01, 0x0435), (0x10, 0x01, 0x0410),
        (0x10, 0x02, 0x0810), (0x11, 0x01, 0x0411), (0x4B, 0x01, 0x044B), (0x3F, 0x01, 0x043F),
        (0x53, 0x01, 0x0453), (0x86, 0x01, 0x0486), (0x87, 0x01, 0x0487), (0x57, 0x01, 0x0457),
        (0x12, 0x01, 0x0412), (0x40, 0x01, 0x0440), (0x54, 0x01, 0x0454), (0x26, 0x01, 0x0426),
        (0x27, 0x01, 0x0427), (0x2E, 0x02, 0x082E), (0x6E, 0x01, 0x046E), (0x2F, 0x01, 0x042F),
        (0x3E, 0x02, 0x083E), (0x3E, 0x01, 0x043e), (0x4C, 0x01, 0x044C), (0x3A, 0x01, 0x043A),
        (0x81, 0x01, 0x0481), (0x7A, 0x01, 0x047A), (0x4E, 0x01, 0x044E), (0x7C, 0x01, 0x047C),
        (0x50, 0x01, 0x0450), (0x50, 0x02, 0x0850), (0x61, 0x01, 0x0461), (0x14, 0x01, 0x0414),
        (0x14, 0x02, 0x0814), (0x82, 0x01, 0x0482), (0x48, 0x01, 0x0448), (0x63, 0x01, 0x0463),
        (0x29, 0x01, 0x0429), (0x15, 0x01, 0x0415), (0x16, 0x01, 0x0416), (0x16, 0x02, 0x0816),
        (0x67, 0x02, 0x0867), (0x46, 0x01, 0x0446), (0x46, 0x02, 0x0846), (0x6B, 0x01, 0x046B),
        (0x6B, 0x02, 0x086B), (0x6B, 0x03, 0x0C6B), (0x18, 0x01, 0x0418), (0x17, 0x01, 0x0417),
        (0x19, 0x01, 0x0419), (0x85, 0x01, 0x0485), (0x3B, 0x09, 0x243B), (0x3B, 0x04, 0x103B),
        (0x3B, 0x05, 0x143B), (0x3B, 0x03, 0x0C3B), (0x3B, 0x01, 0x043B), (0x3B, 0x02, 0x083B),
        (0x3B, 0x08, 0x203B), (0x3B, 0x06, 0x183B), (0x3B, 0x07, 0x1C3B), (0x4F, 0x01, 0x044F),
        (0x1a, 0x07, 0x1C1A), (0x1a, 0x06, 0x181A), (0x1a, 0x03, 0x0C1A), (0x1a, 0x02, 0x081A),
        (0x6C, 0x01, 0x046C), (0x32, 0x02, 0x0832), (0x32, 0x01, 0x0432), (0x32, 0x01, 0x0459),
        (0x32, 0x02, 0x0859), (0x5B, 0x01, 0x045B), (0x1b, 0x01, 0x041B), (0x24, 0x01, 0x0424),
        (0x0A, 0x0b, 0x2C0A), (0x0A, 0x10, 0x400A), (0x0A, 0x0D, 0x340A), (0x0A, 0x09, 0x240A),
        (0x0A, 0x05, 0x140A), (0x0A, 0x07, 0x1C0A), (0x0A, 0x0C, 0x300A), (0x0A, 0x11, 0x440A),
        (0x0A, 0x04, 0x100A), (0x0A, 0x12, 0x480A), (0x0A, 0x02, 0x080A), (0x0A, 0x13, 0x4C0A),
        (0x0A, 0x06, 0x180A), (0x0A, 0x0F, 0x3C0A), (0x0A, 0x0A, 0x280A), (0x0A, 0x14, 0x500A),
        (0x0A, 0x03, 0x0C0A), (0x0A, 0x01, 0x040A), (0x0A, 0x15, 0x540A), (0x0A, 0x0E, 0x380A),
        (0x0A, 0x08, 0x200A), (0x41, 0x01, 0x0441), (0x1D, 0x02, 0x081D), (0x1D, 0x01, 0x041D),
        (0x5A, 0x01, 0x045A), (0x28, 0x01, 0x0428), (0x5F, 0x02, 0x085F), (0x49, 0x01, 0x0449),
        (0x49, 0x02, 0x0849), (0x44, 0x01, 0x0444), (0x4A, 0x01, 0x044A), (0x1E, 0x01, 0x041E),
        (0x51, 0x01, 0x0451), (0x73, 0x02, 0x0873), (0x73, 0x01, 0x0473), (0x1F, 0x01, 0x041F),
        (0x42, 0x01, 0x0442), (0x22, 0x01, 0x0422), (0x2E, 0x01, 0x042E), (0x20, 0x02, 0x0820),
        (0x20, 0x01, 0x0420), (0x80, 0x01, 0x0480), (0x43, 0x02, 0x0843), (0x43, 0x01, 0x0443),
        (0x03, 0x02, 0x0803), (0x2A, 0x01, 0x042A), (0x52, 0x01, 0x0452), (0x88, 0x01, 0x0488),
        (0x78, 0x01, 0x0478), (0x6A, 0x01, 0x046A),
    ])
class pesig
-
This unit is implemented in
refinery.units.formats.pe.pesig
and has the following commandline Interface:usage: pesig [-h] [-L] [-Q] [-0] [-v] Extracts the contents of the IMAGE_DIRECTORY_ENTRY_SECURITY entry of a PE file, i.e. the digital signatures in DER format. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class pesig(Unit):
    """
    Extracts the contents of the IMAGE_DIRECTORY_ENTRY_SECURITY entry of a PE file,
    i.e. the digital signatures in DER format.
    """
    _SECDIRID = DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_SECURITY']

    def __init__(self): pass

    def process(self, data: bytearray) -> bytearray:
        # Returns the certificate data that follows the 8-byte certificate header. Raises a
        # ValueError when the security directory is empty, and a RefineryPartialResult when
        # the amount of available data disagrees with the length stored in the header.
        pe = PE(data=data, fast_load=True)
        pe.parse_data_directories(directories=[self._SECDIRID])
        security = pe.OPTIONAL_HEADER.DATA_DIRECTORY[self._SECDIRID]
        self.log_info(F'signature offset: 0x{security.VirtualAddress:08X}')
        self.log_info(F'signature length: 0x{security.Size:08X}')
        if security.VirtualAddress == 0 or security.Size == 0:
            raise ValueError(F'IMAGE_DIRECTORY_ENTRY_SECURITY ({self._SECDIRID}) is corrupt.')
        # The directory size covers the 8-byte header as well as the certificate data, so the
        # data ends at offset + size and not 8 bytes later. Reading past that point would pull
        # in unrelated bytes whenever something follows the certificate table in the file.
        sgnoff = security.VirtualAddress + 8
        sgnend = security.VirtualAddress + security.Size
        # Header layout: 32-bit length, 16-bit revision, 16-bit certificate type; only the
        # length is used.
        length, _revision, _certtype = unpack('<IHH', data[sgnoff - 8:sgnoff])
        signature = data[sgnoff:sgnend]
        if len(signature) + 8 != length:
            raise RefineryPartialResult(
                F'Found {len(signature) + 8} bytes of signature, but length should be {length}.',
                partial=signature)
        return signature
class pestrip (certificate=False, directories=False, memdump=False)
-
This unit is implemented in
refinery.units.formats.pe.pestrip
and has the following commandline Interface:usage: pestrip [-h] [-L] [-Q] [-0] [-v] [-c] [-d] [-m] Removes the overlay of a PE file and returns the main executable. Use peoverlay to extract the overlay. optional arguments: -c, --cert Include digital signatures for the size computation. -d, --dirs Include data directories for size computation. -m, --memdump Assume that the file data was a memory-mapped PE file. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class pestrip(OverlayUnit):
    """
    Removes the overlay of a PE file and returns the main executable. Use `refinery.peoverlay`
    to extract the overlay.
    """
    def process(self, data: bytearray) -> bytearray:
        # Everything from this offset onwards is overlay data.
        size = self._get_size(data)
        try:
            # Truncate in place when the buffer supports it.
            data[size:] = []
        except Exception:
            # Otherwise return a truncated copy. Returning here is essential: without it, the
            # method would produce None for input that cannot be modified in place.
            return data[:size]
        else:
            return data
class pick (*bounds)
-
This unit is implemented in
refinery.units.meta.pick
and has the following commandline Interface:usage: pick [-h] [-L] [-Q] [-0] [-v] [start:end:step [start:end:step ...]] Picks sequences from the array of multiple inputs. For example, pick 0 2: will return all but the second ingested input (which has index 1). positional arguments: start:end:step Specify start:end:step in Python slice syntax. The default is 0. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class pick(Unit):
    """
    Picks sequences from the array of multiple inputs. For example, `pick 0 2:`
    will return all but the second ingested input (which has index `1`).
    """
    def __init__(self, *bounds: Arg.Bounds(nargs='*', default=[0])):
        super().__init__(bounds=[sliceobj(s) for s in bounds])

    def process(self, data: Chunk):
        # Invoked once per requested slice with a container chunk whose `temp` attribute holds
        # the shared picking state; yields the chunks selected by the current slice.
        if not data.visible:
            yield data
            return
        state: _PickState = data.temp
        a = state.accessor
        lower = a.start
        upper = a.stop
        # Bounds refer to positions in the original input, so they are shifted by the number
        # of visible chunks that were already taken from the stream.
        if lower is not None:
            lower -= state.discarded
        if upper is not None:
            upper -= state.discarded
        if state.consumed:
            # The rest of the input was buffered, so ordinary slicing applies.
            yield from state.remaining[slice(lower, upper, a.step)]
            return
        # Streaming mode, part one: skip visible chunks up to the lower bound.
        while lower:
            try:
                chunk = next(state.chunks)
            except StopIteration:
                upper = None
                break
            if chunk.visible:
                lower -= 1
                # An open-ended slice has no upper bound to adjust; decrementing None would
                # raise a TypeError.
                if upper is not None:
                    upper -= 1
                state.discarded += 1
            else:
                yield chunk
        if upper is None:
            yield from state.chunks
            return
        # Streaming mode, part two: emit chunks until the upper bound is reached.
        while upper:
            try:
                chunk = next(state.chunks)
            except StopIteration:
                break
            if chunk.visible:
                upper -= 1
                state.discarded += 1
            yield chunk

    def filter(self, chunks: Iterable[Chunk]):
        # Replaces the input stream by one container chunk per requested slice. The remaining
        # input is buffered as soon as a slice is encountered that cannot be served while
        # streaming; invisible chunks are passed through during buffering.
        chunks = begin(chunks)
        if chunks is None:
            return
        container, chunks = chunks
        if container.scope < 1:
            raise RuntimeError(F'{self.__class__.__name__} cannot be used outside a frame; maybe you meant to use snip?')
        container = container.copy()
        container.visible = True
        state = _PickState(deque(self.args.bounds), chunks)
        while state.next():
            if not state.consumed:
                if not state.discardable():
                    self.log_debug(F'consumed input into buffer after {state.discarded} skips')
                    for chunk in state.chunks:
                        if not chunk.visible:
                            yield chunk
                            continue
                        state.remaining.append(chunk)
                    state.consumed = True
            container.temp = state
            yield container
class pkcs7
-
This unit is implemented in
refinery.units.formats.pkcs7
and has the following commandline Interface:usage: pkcs7 [-h] [-L] [-Q] [-0] [-v] Converts PKCS7 encoded data to a JSON representation. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class pkcs7(Unit):
    """
    Converts PKCS7 encoded data to a JSON representation.
    """
    @Unit.Requires('asn1crypto', 'default', 'extended')
    def _asn1crypto():
        # Import the submodules explicitly so that they are available as attributes of the
        # returned package.
        import asn1crypto
        import asn1crypto.cms
        import asn1crypto.core
        import asn1crypto.x509
        return asn1crypto

    def process(self, data: bytes):
        asn1 = self._asn1crypto.core
        cms = self._asn1crypto.cms
        signature = cms.ContentInfo.load(data)

        def unsign(data):
            # Post-processes the decoded JSON document recursively. Negative integers are
            # shifted into the positive range by adding a power of two, and integers that do
            # not fit into 64 bits are replaced by the hexadecimal representation of their
            # big endian byte encoding. All other values are returned unchanged.
            if isinstance(data, int):
                size = data.bit_length()
                if data < 0:
                    # Since -~data - 1 equals data, this adds 2 ** (size + 1) to the value.
                    data = (1 << (size + 1)) - ~data - 1
                if data > 0xFFFFFFFF_FFFFFFFF:
                    # Round the bit length up to a whole number of bytes.
                    size, r = divmod(size, 8)
                    size += bool(r)
                    data = data.to_bytes(size, 'big').hex()
                return data
            elif isinstance(data, dict):
                return {key: unsign(value) for key, value in data.items()}
            elif isinstance(data, list):
                return [unsign(x) for x in data]
            else:
                return data

        # The following ASN.1 structures are registered with asn1crypto below so that the
        # attribute with the object identifier 1.3.6.1.4.1.311.2.1.12 is parsed as a set of
        # SpcSpOpusInfo sequences.
        class SpcString(asn1.Choice):
            _alternatives = [
                ('unicode', asn1.BMPString, {'implicit': 0}),
                ('ascii', asn1.IA5String, {'implicit': 1})
            ]

        SpcUuid = asn1.OctetString

        class SpcSerializedObject(asn1.Sequence):
            _fields = [
                ('classId', SpcUuid),
                ('serializedData', asn1.OctetString),
            ]

        class SpcLink(asn1.Choice):
            _alternatives = [
                ('url', asn1.IA5String, {'implicit': 0}),
                ('monikier', SpcSerializedObject, {'implicit': 1}),
                ('file', SpcString, {'explicit': 2})
            ]

        class SpcSpOpusInfo(asn1.Sequence):
            _fields = [
                ('programName', SpcString, {'optional': True, 'explicit': 0}),
                ('moreInfo', SpcLink, {'optional': True, 'explicit': 1}),
            ]

        class SetOfInfos(asn1.SetOf):
            _child_spec = SpcSpOpusInfo

        # NOTE(review): these assignments modify class-level tables of the asn1crypto module
        # and are repeated on every call - presumably harmless, but confirm.
        cms.CMSAttributeType._map['1.3.6.1.4.1.311.2.1.12'] = 'authenticode_info'
        cms.CMSAttribute._oid_specs['authenticode_info'] = SetOfInfos

        class ParsedASN1ToJSON(BytesAsStringEncoder):
            # JSON encoder for parsed ASN.1 objects; `unit` gives the encoder access to the
            # asn1crypto package of the enclosing unit.
            unit = self

            @classmethod
            def _is_keyval(cls, obj):
                # True for dictionaries with exactly the keys 'type' and 'values' where the
                # latter contains a single item.
                return (
                    isinstance(obj, dict)
                    and set(obj.keys()) == {'type', 'values'}
                    and len(obj['values']) == 1
                )

            @classmethod
            def handled(cls, obj) -> bool:
                return BytesAsStringEncoder.handled(obj) or cls._is_keyval(obj)

            def encode_bytes(self, obj: bytes):
                # Printable byte strings are emitted as text; everything else is left to the
                # parent class.
                with suppress(Exception):
                    string = obj.decode('latin1')
                    if string.isprintable():
                        return string
                return super().encode_bytes(obj)

            def default(self, obj):
                # The checks below are ordered from most to least specific; the first one that
                # produces a result determines the encoding of the object.
                if self._is_keyval(obj):
                    # Collapse a single-valued attribute into a type/value pair.
                    return dict(type=obj['type'], value=obj['values'][0])
                with suppress(TypeError):
                    return super().default(obj)
                if isinstance(obj, (set, tuple)):
                    return list(obj)
                if isinstance(obj, datetime):
                    return str(obj)
                dict_result = {}
                list_result = None
                if isinstance(obj, self.unit._asn1crypto.x509.Certificate):
                    # Certificates additionally receive their SHA1 fingerprint.
                    dict_result.update(fingerprint=obj.sha1.hex())
                if isinstance(obj, asn1.BitString):
                    return {'bit_string': obj.native}
                # Try to iterate the object: if all items are strings, they are treated as
                # field names and the object is encoded as a dictionary, otherwise as a list.
                with suppress(Exception):
                    list_result = list(obj)
                    if all(isinstance(k, str) for k in list_result):
                        dict_result.update((key, obj[key]) for key in list_result)
                if dict_result:
                    return dict_result
                if list_result is not None:
                    return list_result
                if isinstance(obj, self.unit._asn1crypto.cms.CertificateChoices):
                    return obj.chosen
                if isinstance(obj, asn1.Sequence):
                    children = obj.children
                    if children:
                        return children
                    return obj.dump()
                with suppress(Exception):
                    return obj.native
                if isinstance(obj, asn1.Any):
                    parsed = None
                    with suppress(Exception):
                        parsed = obj.parse()
                    if parsed:
                        return parsed
                    return obj.dump()
                if isinstance(obj, asn1.Asn1Value):
                    # Last resort for ASN.1 values: emit their binary encoding.
                    return obj.dump()
                raise ValueError(F'Unable to determine JSON encoding of {obj.__class__.__name__} object.')

        # Encode to JSON once with the custom encoder, then decode it again so that the
        # integer conversion can operate on plain Python objects.
        with ParsedASN1ToJSON as encoder:
            encoded = encoder.dumps(signature)
            converted = unsign(json.loads(encoded))
            return json.dumps(converted, indent=4).encode(self.codec)
class pkcs7sig (tabular=False)
-
This unit is implemented in
refinery.units.formats.pkcs7sig
and has the following commandline Interface:usage: pkcs7sig [-h] [-L] [-Q] [-0] [-v] [-t] Converts PKCS7 encoded signatures into a human-readable JSON representation. This can be used to parse authenticode signatures appended to files that are not PE files to get the same output that is produced by the pemeta unit. optional arguments: -t, --tabular Print information in a table rather than as JSON generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class pkcs7sig(Unit):
    """
    Converts PKCS7 encoded signatures into a human-readable JSON representation. This can be used
    to parse authenticode signatures appended to files that are not PE files to get the same output
    that is produced by the pemeta unit.
    """
    def __init__(self, tabular: Arg('-t', help='Print information in a table rather than as JSON') = False):
        super().__init__(tabular=tabular)

    def process(self, data: bytes):
        # Parse first, then delegate the formatting to a ppjson unit with the same table
        # setting. The local name avoids shadowing the json module.
        parsed = pemeta.parse_signature(data)
        printer = ppjson(tabular=self.args.tabular)
        yield from printer._pretty_output(parsed, indent=4, ensure_ascii=False)
class pop (*names)
-
This unit is implemented in
refinery.units.meta.pop
and has the following commandline Interface:usage: pop [-h] [-L] [-Q] [-0] [-v] [[name[:conversion]|count|@] [[name[:conversion]|count|@] ...]] In processing order, remove visible chunks from the current frame and store their contents in the given meta variables. All chunks in the input stream are consequently made visible again. If pop is used at the end of a frame, then variables will be local to the parent frame. positional arguments: [name[:conversion]|count|@] Specify either the name of a single variable to receive the contents of an input chunk, or an integer expression that specifies a number of values to be removed from the input without storing them. Additionally, it is possible to specify the symbol "@" to remove a single chunk from the input and merge its meta data into the following ones. By default, a single merge is performed. When a variable name is specified, a sequence of transformations can be appended to be applied before storing it. For example, the argument k:le:b64 would first decode the chunk using base64, then convert it to an integer in little endian format, and store the integer result in the variable k. The visual aid is that the content is passed from right to left through all conversions, into the variable k. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class pop(Unit):
    """
    In processing order, remove visible chunks from the current frame and store their contents in the given
    meta variables. All chunks in the input stream are consequently made visible again. If pop is used at
    the end of a frame, then variables will be local to the parent frame.
    """
    def __init__(
        self,
        *names: Arg(type=str, metavar=F'[name[:conversion]|count|{_popcount._MERGE_SYMBOL}]', help=(
            R'Specify either the name of a single variable to receive the contents of an input chunk, or '
            R'an integer expression that specifies a number of values to be removed from the input without '
            F'storing them. Additionally, it is possible to specify the symbol "{_popcount._MERGE_SYMBOL}" '
            R'to remove a single chunk from the input and merge its meta data into the following ones. By '
            R'default, a single merge is performed. When a variable name is specified, a sequence of '
            R'transformations can be appended to be applied before storing it. For example, the argument '
            R'k:le:b64 would first decode the chunk using base64, then convert it to an integer in little '
            R'endian format, and store the integer result in the variable `k`. The visual aid is that the '
            R'content is passed from right to left through all conversions, into the variable `k`.'
        ))
    ):
        # Without arguments, a single merge is performed.
        if not names:
            names = _popcount._MERGE_SYMBOL,
        super().__init__(names=[_popcount(n) for n in names])

    def process(self, data):
        # All work happens in the filter; chunks pass through unchanged.
        return data

    def filter(self, chunks: Iterable[Chunk]):
        # Invisible chunks encountered while popping are kept here and emitted afterwards.
        invisible = []
        # Collects the meta variables that are attached to all emitted chunks.
        variables = {}
        remaining: Iterator[_popcount] = iter(self.args.names)

        # The input iterator is shared between both loops: the second loop continues exactly
        # where the first one stopped.
        it = iter(chunks)
        pop = next(remaining).reset()
        done = False

        for chunk in it:
            if not chunk.visible:
                self.log_debug('buffering invisible chunk')
                invisible.append(chunk)
                continue
            # NOTE(review): presumably `into` returns a false value when the current argument
            # does not accept the chunk, in which case the next argument is tried - confirm
            # against _popcount.
            try:
                while not pop.into(variables, chunk):
                    pop = next(remaining).reset()
            except StopIteration:
                # All arguments are used up; this chunk was not popped and is emitted below
                # together with the buffered ones.
                done = True
                invisible.append(chunk)
                break

        # The input ended first: this is only acceptable if the current argument is done and
        # no further arguments are left.
        if not done and pop.done:
            try:
                next(remaining)
            except StopIteration:
                done = True

        if not done:
            raise ValueError('Not all variables could be assigned.')

        nesting = self.args.nesting

        for chunk in chain(invisible, it):
            meta = chunk.meta
            meta.update(variables)
            # With negative nesting, the scope of each variable is set relative to the scope
            # of the chunk.
            if nesting < 0:
                for name in variables:
                    meta.set_scope(name, chunk.scope + nesting)
            chunk.visible = True
            yield chunk
class ppjscript (indent=4)
-
This unit is implemented in
refinery.units.sinks.ppjscript
and has the following commandline Interface:usage: ppjscript [-h] [-L] [-Q] [-0] [-v] [-i N] Pretty-prints JavaScript without any reflection or evaluation. optional arguments: -i, --indent N Controls the amount of space characters used for indentation in the output. Default is 4. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class ppjscript(Unit):
    """
    Pretty-prints JavaScript without any reflection or evaluation.
    """
    def __init__(self, indent: Arg.Number('-i', help=(
        'Controls the amount of space characters used for indentation in the output. Default is 4.')) = 4
    ):
        super().__init__(indent=indent)

    @Unit.Requires('jsbeautifier', 'display', 'extended')
    def _jsb():
        import jsbeautifier
        import jsbeautifier.unpackers.javascriptobfuscator as obfuscator

        def never_detected(*_):
            return False

        # TODO: This is a workaround for the following bug:
        # https://github.com/beautify-web/js-beautify/issues/1350
        obfuscator.detect = never_detected
        return jsbeautifier

    @unicoded
    def process(self, data: str) -> str:
        # Code evaluation is switched off explicitly; only the layout is changed.
        options = {'eval_code': False, 'indent_size': self.args.indent}
        return self._jsb.beautify(data, options)
class ppjson (tabular=False, indent=4)
-
This unit is implemented in
refinery.units.sinks.ppjson
and has the following commandline Interface:usage: ppjson [-h] [-L] [-Q] [-0] [-v] [-t | -i N] Expects JSON input data and outputs it in a neatly formatted manner. If the indentation is set to zero, the output is minified. optional arguments: -t, --tabular Convert JSON input into a flattened table. -i, --indent N Number of spaces used for indentation. Default is 4. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class ppjson(Unit):
    """
    Expects JSON input data and outputs it in a neatly formatted manner.
    If the indentation is set to zero, the output is minified.
    """
    # A comma that is followed only by whitespace and a closing brace or bracket.
    # The closing delimiter is captured so that the comma can be dropped.
    _TRAILING_COMMA = re.compile(BR',\s*(}|])')

    def __init__(
        self,
        tabular: Arg.Switch('-t', group='OUT', help='Convert JSON input into a flattened table.') = False,
        indent : Arg.Number('-i', group='OUT', help='Number of spaces used for indentation. Default is {default}.') = 4
    ):
        return super().__init__(indent=indent, tabular=tabular)

    def _pretty_output(self, parsed, **kwargs):
        """
        Generate the encoded output for an already parsed JSON document. In tabular mode,
        one or more lines are produced per flattened key; values that do not fit on the
        remaining line width are wrapped and continued below the value column. Otherwise,
        a single chunk containing the `json.dumps` output for the given keyword arguments
        is produced.
        """
        if not self.args.tabular:
            yield json.dumps(parsed, **kwargs).encode(self.codec)
            return
        table = list(flattened(parsed))
        # An empty document flattens to an empty table; without a default, max() would
        # raise a ValueError instead of producing no output.
        width = max((len(key) for key, _ in table), default=0)
        # textwrap rejects widths below 1, which would otherwise happen whenever the key
        # column is wider than the terminal.
        tsize = max(get_terminal_size(80) - width - 4, 1)
        for key, value in table:
            lines = textwrap.wrap(str(value).rstrip(), tsize)
            if not lines:
                # values that are empty after stripping produce no table row
                continue
            first, *rest = lines
            yield F'{key:<{width}} : {first}'.encode(self.codec)
            for wrap in rest:
                yield F'{"":<{width + 3}}{wrap}'.encode(self.codec)

    def process(self, data):
        """
        Parse the input as JSON and emit the formatted result. Trailing commas before a
        closing brace or bracket are removed before parsing, except for those located
        inside of string literals.
        """
        if self._TRAILING_COMMA.search(data):
            def smartfix(match):
                k = match.start()
                # Group 0 keeps the match unchanged (comma within a string literal),
                # group 1 keeps only the closing delimiter and drops the comma.
                return match.group(0 if any(k in s for s in strings) else 1)
            from refinery.lib.patterns import formats
            strings = {range(*m.span()) for m in formats.string.finditer(data)}
            data = self._TRAILING_COMMA.sub(smartfix, data)
        if self.args.indent:
            kwargs = {'indent': self.args.indent}
        else:
            # zero indentation requests minified output
            kwargs = {'separators': (',', ':')}
        yield from self._pretty_output(json.loads(data), **kwargs)
class ppxml (indent=4, header=False)
-
This unit is implemented in
refinery.units.sinks.ppxml
and has the following commandline Interface:usage: ppxml [-h] [-L] [-Q] [-0] [-v] [-i N] [-x] Expects XML input data and outputs it in a neatly formatted manner. optional arguments: -i, --indent N Controls the amount of space characters used for indentation in the output. Default is 4. -x, --header Add an XML header to the formatted output. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class ppxml(Unit):
    """
    Expects XML input data and outputs it in a neatly formatted manner.
    """

    def __init__(self,
        indent: Arg.Number('-i', help=(
            'Controls the amount of space characters used for indentation in the output. Default is 4.')) = 4,
        header: Arg.Switch('-x', help='Add an XML header to the formatted output.') = False
    ):
        super().__init__(indent=indent, header=header)

    def process(self, data):
        """
        Parse the input as XML, rewrite the whitespace between elements so that the tree
        is indented, and return the serialized document. If parsing fails, a warning is
        logged and the input is returned unchanged.
        """
        step = self.args.indent * ' '
        # Filled by the parser; maps placeholder strings to the entity names they replaced.
        entities = {}

        try:
            tree = ForgivingParse(data, entities)
        except Exception:
            from refinery.lib.meta import metavars
            message = 'error parsing as XML, returning original content'
            origin = metavars(data).get('path')
            if origin:
                message = F'{message}: {origin}'
            self.log_warn(message)
            return data

        def reflow(node, depth=0, has_next=False):
            """
            The credit for this one goes to:
            https://stackoverflow.com/a/12940014
            """
            newline = '\n'
            if depth:
                newline += (depth - 1) * step
            total = len(node)
            if total:
                if not (node.text and node.text.strip()):
                    node.text = newline + step
                    if depth:
                        node.text += step
                for position, child in enumerate(node, 1):
                    reflow(child, depth + 1, position < total)
            # The tail is treated the same way regardless of whether the node has children.
            if depth and (not node.tail or node.tail.isspace()):
                node.tail = newline + step if has_next else newline

        reflow(tree.getroot())

        with io.BytesIO() as stream:
            tree.write(stream, encoding=self.codec, xml_declaration=self.args.header)
            result = stream.getvalue()

        # Substitute the parser's placeholders with the original entity references.
        for placeholder, name in entities.items():
            reference = F'&{name};'.encode(self.codec)
            result = result.replace(placeholder.encode(self.codec), reference)

        return result
class ps1str
-
This unit is implemented in
refinery.units.encoding.ps1str
and has the following commandline Interface:usage: ps1str [-h] [-L] [-Q] [-0] [-v] [-R] Escapes and unescapes PowerShell strings. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class ps1str(Unit):
    """
    Escapes and unescapes PowerShell strings.
    """
    # Backtick escape sequences and the characters they represent.
    UNESCAPE = {
        '`0': '\0',
        '`a': '\a',
        '`b': '\b',
        '`f': '\f',
        '`n': '\n',
        '`r': '\r',
        '`t': '\t',
        '`v': '\v',
        '``': '`',
        "`'": '\'',
        '`"': '\"',
    }
    # Characters that are rewritten when producing a double quoted string.
    ESCAPE = {
        '`' : '``',
        '$' : '`$',
        '\0': '`0',
        '\a': '`a',
        '\b': '`b',
        '\f': '`f',
        '\n': '`n',
        '\r': '`r',
        '\t': '`t',
        '\v': '`v',
        '\'': "`'",
        '\"': '""',
    }

    def __init__(self): pass

    @unicoded
    def process(self, data):
        """
        Unescape a PowerShell string literal. Here-strings are returned with their
        delimiters removed and no further processing. Otherwise, the input has to be
        enclosed in matching single or double quotes; backtick escapes are only resolved
        for double quoted strings, and doubled quote characters are collapsed.

        Raises a ValueError when the input is not enclosed in matching quotes.
        """
        match = re.fullmatch(R'''@(['"])\s*?[\r\n](.*?)[\r\n]\1@''', data, flags=re.DOTALL)
        if match:
            return match.group(2)
        # A quoted string consists of at least the two quote characters: the length check
        # rejects empty input (which would fail on indexing) as well as a lone quote,
        # which would otherwise count as both the opening and the closing delimiter.
        if len(data) < 2 or data[0] not in '\'\"' or data[-1] != data[0]:
            raise ValueError(
                'No quotes found at beginning of input. To escape a PowerShell string, the '
                'quotes must be included because quote escaping depends on whether a single '
                'or double quote was used.')
        quote, data = data[0], data[1:-1]

        def unescape(match):
            string = match[0]
            # unknown escape sequences resolve to the escaped character itself
            return self.UNESCAPE.get(string, string[1:])

        if quote == '"':
            if re.search(R'(?<!`)\$(?=[\w\(\{\$\?\^:])', data):
                self.log_warn('Loss of information: double quoted string contains variable substitutions.')
            data = re.sub('`.', unescape, data)
        return data.replace(quote + quote, quote)

    @unicoded
    def reverse(self, data):
        """
        Return the input as a double quoted PowerShell string literal with all
        characters from the escape table rewritten.
        """
        def escaper(match):
            char = match[0]
            return ps1str.ESCAPE.get(char, char)
        return '"{}"'.format(re.sub(R'''[\x00\x07-\x0D`$'"]''', escaper, data))
class push (data=b'')
-
This unit is implemented in
refinery.units.meta.push
and has the following commandline Interface:usage: push [-h] [-L] [-Q] [-0] [-v] [data] The unit inserts an additional chunk before each input chunk and moves the original data out of scope. This chunk is considered the "original" data, while the one inserted in front of it is used as an intermediate result. By default, this intermediate data is a copy of the input data. For example: emit key=value | push [[| rex =(.*)$ {1} | pop v ]| repl var:v censored ] will output key=censored. The application of rex turns the (duplicated) data into just the value, which is then stored in the variable v. The application of repl replaces this value with the hard-coded string censored. positional arguments: data The data to be pushed, by default a copy of the input. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class push(Unit):
    """
    The unit inserts an additional chunk before each input chunk and moves the original
    data out of scope. This chunk is considered the "original" data, while the one inserted
    in front of it is used as an intermediate result. By default, this intermediate data is
    a copy of the input data. For example:

        emit key=value | push [[| rex =(.*)$ {1} | pop v ]| repl var:v censored ]

    will output `key=censored`. The application of `refinery.rex` turns the (duplicated)
    data into just the value, which is then stored in the variable `v`. The application
    of `refinery.repl` replaces this value with the hard-coded string `censored`.
    """

    def __init__(self, data: Arg(help='The data to be pushed, by default a copy of the input.') = B''):
        super().__init__(data=data)

    def process(self, data: Chunk):
        """
        Emit the input chunk after hiding it from the current scope, followed by a second
        chunk that shares its metadata and holds either the configured data or, if none
        was configured, the contents of the input chunk.
        """
        payload = self.args.data
        if not payload:
            payload = data
        scratch = data.copy(meta=True, data=False)
        scratch[:] = payload
        if self.args.nesting <= 0:
            try:
                data.visible = False
            except AttributeError:
                self.log_warn('application has no effect outside frame.')
        else:
            data.set_next_scope(False)
        yield from (data, scratch)
class put (name, value=<object object>)
-
This unit is implemented in
refinery.units.meta.put
and has the following commandline Interface:usage: put [-h] [-L] [-Q] [-0] [-v] name [value] Can be used to add a meta variable to the processed chunk. Note that meta variables cease to exist outside a frame. positional arguments: name The name of the variable to be used. value The value for the variable. If no value is given, the entire current chunk is stored. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class put(Unit):
    """
    Can be used to add a meta variable to the processed chunk. Note that meta variables
    cease to exist outside a frame.
    """

    def __init__(
        self,
        name : Arg(help='The name of the variable to be used.', type=str),
        value: Arg(help='The value for the variable. If no value is given, the entire current chunk is stored.',
            type=functools.partial(numseq, typecheck=False)) = _EMPTY
    ):
        super().__init__(name=check_variable_name(name), value=value)

    def process(self, data: Chunk):
        """
        Store the configured value, or the chunk itself when no value was given, under the
        configured name in the metadata of the chunk and return the chunk.
        """
        stored = self.args.value
        if stored is _EMPTY:
            stored = data
        numeric = (int, float)
        if not (isinstance(stored, numeric) or isbuffer(stored)):
            try:
                len(stored)
            except TypeError:
                # Objects without a length are only supported when they are an endlessly
                # repeated number, in which case that number is stored.
                if isinstance(stored, itertools.repeat):
                    stored = next(stored)
                if not isinstance(stored, numeric):
                    raise NotImplementedError(F'put does not support {stored.__class__.__name__} values.')
            else:
                # Sized objects other than dictionaries and lists are converted to lists.
                if not isinstance(stored, (dict, list)):
                    stored = list(stored)
        self.log_debug(F'storing {typename(stored)}:', stored, clip=True)
        data.meta[self.args.name] = stored
        return data
class pyc (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
This unit is implemented in
refinery.units.formats.pyc
and has the following commandline Interface:usage: pyc [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD] [path [path ...]] Decompiles Python bytecode (PYC) files back to source code. A known limitation is that it does not work on recent Python versions, but anything below 3.9 should work. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each pattern, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". -D, --date NAME Name of the meta variable to receive the extracted file date. The default value is "date". -p, --pwd PWD Optionally specify an extraction password. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class pyc(ArchiveUnit):
    """
    Decompiles Python bytecode (PYC) files back to source code. A known limitation is that it does not
    work on recent Python versions, but anything below 3.9 should work.
    """

    def unpack(self, data):
        """
        Yield one packed item for every code object extracted from the input buffer. The
        item path is the file name recorded in the code object, or a numbered placeholder
        if that name is empty.
        """
        variable = self.args.path.decode(self.codec)
        origin = metavars(data).get(variable)
        modules = extract_code_from_buffer(bytes(data), origin)
        for index, module in enumerate(modules):
            name = module.container.co_filename or F'__unknown_name_{index:02d}.py'
            stamp = datetime.fromtimestamp(module.timestamp)
            source = decompile_buffer(module)
            yield self._pack(name, stamp, source)
class pym
-
This unit is implemented in
refinery.units.formats.pym
and has the following commandline Interface:usage: pym [-h] [-L] [-Q] [-0] [-v] [-R] Converts Python-Marshaled code objects to the PYC (Python Bytecode) format. If it is an older Python version, you can use the pyc unit to then decompile the code, but for more recent versions a separate Python decompiler will be required. WARNING: This unit will invoke the marshal.loads function, which may be unsafe. Please refer to the official Python documentation for more details. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class pym(Unit):
    """
    Converts Python-Marshaled code objects to the PYC (Python Bytecode) format. If it is an older Python version, you
    can use the `refinery.pyc` unit to then decompile the code, but for more recent versions a separate Python
    decompiler will be required.

    WARNING: This unit will invoke the `marshal.loads` function, which may be unsafe. Please refer to the official
    Python documentation for more details.
    """

    def reverse(self, data):
        """
        Return the marshalled representation of the input.
        """
        return marshal.dumps(data)

    def process(self, data):
        """
        Unmarshal the input and yield a binary representation of the result. Lists are
        converted item by item, every other object yields a single output.

        Raises NotImplementedError for objects that have no supported serialization.
        """
        # NOTE: marshal.loads must not be used on untrusted data, see the class docstring.
        data = marshal.loads(data)
        code = (lambda: 0).__code__.__class__

        def toblob(data):
            if isinstance(data, (bytes, bytearray)):
                self.log_info(U'unmarshalled a byte string, returning as is')
                return data
            if isinstance(data, str):
                self.log_info(F'unmarshalled a string object, encoding as {self.codec}')
                return data.encode(self.codec)
            if isinstance(data, code):
                self.log_info(U'unmarshalled a code object, converting to pyc')
                import importlib
                return importlib._bootstrap_external._code_to_timestamp_pyc(data)
            if isinstance(data, int):
                self.log_info(U'unmarshalled an integer, returning big endian encoding')
                negative = data < 0
                if negative:
                    # An unsigned conversion raises OverflowError for negative values,
                    # so these are encoded in two's complement, which requires one
                    # additional bit for the sign.
                    bits = (~data).bit_length() + 1
                else:
                    bits = data.bit_length()
                size = (bits + 7) // 8
                return data.to_bytes(size, 'big', signed=negative)
            if isinstance(data, dict):
                with BytesAsStringEncoder as encoder:
                    return encoder.dumps(data).encode(self.codec)
            raise NotImplementedError(
                F'No serialization implemented for object of type {data.__class__.__name__}')

        if isinstance(data, list):
            self.log_info('object is a list, converting each item individually')
            for item in data:
                yield toblob(item)
        else:
            yield toblob(data)
class qb (*data)
-
This unit is implemented in
refinery.units.meta.queue
and has the following commandline Interface:usage: qb [-h] [-L] [-Q] [-0] [-v] [data [data ...]] Short for "queue back": Insert new chunks at the end of the current frame. positional arguments: data The arguments are inserted into the current frame in the given order. These arguments are multibin expressions; If the expression depends on the input data, it will always refer to the first chunk in the current frame. If no argument is given, a single empty chunk is inserted. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class qb(QueueUnit):
    """
    Short for "queue back": Insert new chunks at the end of the current frame.
    """

    def filter(self, chunks: Iterable[Chunk]):
        """
        Delegate to the queueing helper of the parent class, passing the given chunks
        along with a false flag.
        """
        flag = False
        yield from self._queue(chunks, flag)
class qf (*data)
-
This unit is implemented in
refinery.units.meta.queue
and has the following commandline Interface:usage: qf [-h] [-L] [-Q] [-0] [-v] [data [data ...]] Short for "queue front": Insert new chunks at the beginning of the current frame. positional arguments: data The arguments are inserted into the current frame in the given order. These arguments are multibin expressions; If the expression depends on the input data, it will always refer to the first chunk in the current frame. If no argument is given, a single empty chunk is inserted. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class qf(QueueUnit):
    """
    Short for "queue front": Insert new chunks at the beginning of the current frame.
    """

    def filter(self, chunks: Iterable[Chunk]):
        """
        Delegate to the queueing helper of the parent class, passing the given chunks
        along with a true flag.
        """
        flag = True
        yield from self._queue(chunks, flag)
class qlz
-
This unit is implemented in
refinery.units.compression.qlz
and has the following commandline Interface:usage: qlz [-h] [-L] [-Q] [-0] [-v] This unit implements QuickLZ decompression levels 1 and 3. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class qlz(Unit):
    """
    This unit implements QuickLZ decompression levels 1 and 3.
    """

    def process(self, data):
        """
        Decompress a single QuickLZ packet. The first byte contains the flags: bit 0 is
        set for compressed data, bit 1 selects the long header, and bits 2 and 3 hold the
        compression level. When the data is flagged as uncompressed, everything after the
        header is returned as is.

        Raises ValueError for compression level 2, and RefineryPartialResult when the
        size of the output does not match the size given in the header.
        """
        source = memoryview(data)
        head = source[0]
        clvl = (head >> 2) & 0x3
        if head & 2:
            # long header: flags, 4 bytes compressed size, 4 bytes decompressed size
            self.log_info('long header detected')
            size = int.from_bytes(source[5:9], 'little')
            source = source[9:]
        else:
            # short header: flags, 1 byte compressed size, 1 byte decompressed size;
            # the decompressed size is therefore at offset 2, offset 3 is already the
            # first byte of the payload.
            self.log_info('short header detected')
            size = source[2]
            source = source[3:]
        if (head & 1) != 1:
            self.log_warn('header indicates that data is uncompressed, returning remaining data')
            return source
        else:
            self.log_info(F'compression level {clvl}, decompressed size {SizeInt(size)!r}')

        def fetchhash():
            # the three output bytes following the last hashed position
            return int.from_bytes(destination[hashvalue + 1:hashvalue + 4], byteorder='little')

        codeword = 1
        destination = bytearray()
        hashtable = [0] * _HASH_VALUES
        hashvalue = -1
        last_matchstart = size - _UNCONDITIONAL_MATCHLEN - _UNCOMPRESSED_END - 1
        fetch = 0

        if clvl == 2:
            raise ValueError("This version only supports level 1 and 3")

        while source:
            if codeword == 1:
                # All control bits are consumed; the next four bytes are a new control word.
                codeword = int.from_bytes(source[:4], byteorder='little')
                source = source[4:]
                if len(destination) <= last_matchstart:
                    c = 3 if clvl == 1 else 4
                    fetch = int.from_bytes(source[:c], byteorder='little')
            if codeword & 1:
                # A set control bit announces a back reference.
                codeword = codeword >> 1
                if clvl == 1:
                    # level 1 addresses the match source through the hash table
                    slot = (fetch >> 4) & 0xFFF
                    offset = hashtable[slot]
                    if fetch & 0xF:
                        matchlen = (fetch & 0xF) + 2
                        source = source[2:]
                    else:
                        matchlen = source[2]
                        source = source[3:]
                else:
                    # level 3 encodes distance and length in one of five layouts
                    if (fetch & 3) == 0:
                        delta = (fetch & 0xFF) >> 2
                        matchlen = 3
                        source = source[1:]
                    elif (fetch & 2) == 0:
                        delta = (fetch & 0xFFFF) >> 2
                        matchlen = 3
                        source = source[2:]
                    elif (fetch & 1) == 0:
                        delta = (fetch & 0xFFFF) >> 6
                        matchlen = ((fetch >> 2) & 15) + 3
                        source = source[2:]
                    elif (fetch & 127) != 3:
                        delta = (fetch >> 7) & 0x1FFFF
                        matchlen = ((fetch >> 2) & 0x1F) + 2
                        source = source[3:]
                    else:
                        delta = fetch >> 15
                        matchlen = ((fetch >> 7) & 255) + 3
                        source = source[4:]
                    offset = (len(destination) - delta) & 0xFFFFFFFF
                # Copy byte by byte because the match may overlap with the bytes
                # that are being appended.
                for i in range(offset, offset + matchlen):
                    destination.append(destination[i])
                if clvl == 1:
                    # register all positions up to the start of the match in the hash table
                    fetch = fetchhash()
                    while hashvalue < len(destination) - matchlen:
                        hashvalue += 1
                        slot = ((fetch >> 12) ^ fetch) & _HASH_MASK
                        hashtable[slot] = hashvalue
                        fetch = (fetch >> 8) & 0xFFFF
                        try:
                            fetch |= destination[hashvalue + 3] << 16
                        except IndexError:
                            # best effort: the window extends past the output produced so far
                            pass
                    fetch = int.from_bytes(source[:3], byteorder='little')
                else:
                    fetch = int.from_bytes(source[:4], byteorder='little')
                hashvalue = len(destination) - 1
            else:
                if len(destination) <= last_matchstart:
                    # A cleared control bit announces a single literal byte.
                    destination.append(source[0])
                    source = source[1:]
                    codeword = codeword >> 1
                    if clvl == 1:
                        while hashvalue < len(destination) - 3:
                            fetch2 = fetchhash()
                            hashvalue += 1
                            slot = ((fetch2 >> 12) ^ fetch2) & _HASH_MASK
                            hashtable[slot] = hashvalue
                        fetch = ((fetch >> 8) & 0xFFFF) | (source[2] << 16)
                    else:
                        fetch = (fetch >> 8) & 0xFFFF
                        fetch |= source[2] << 16
                        fetch |= source[3] << 24
                else:
                    # No match can start this close to the end: the remainder consists of
                    # literals, interleaved with control words that are skipped.
                    while len(destination) <= size - 1:
                        if codeword == 1:
                            source = source[4:]
                            codeword = 0x80000000
                        destination.append(source[0])
                        source = source[1:]
                        codeword = codeword >> 1
                    break

        if len(destination) != size:
            raise RefineryPartialResult(
                F'Header indicates decompressed size 0x{size:X}, but 0x{len(destination):X} bytes '
                F'were decompressed.', destination)

        return destination
class rabbit (key, discard=0, stateful=False, iv=b'')
-
This unit is implemented in
refinery.units.crypto.cipher.rabbit
and has the following commandline Interface:usage: rabbit [-h] [-L] [-Q] [-0] [-v] [-R] [-d N] [-s] [-i IV] key RABBIT encryption and decryption. positional arguments: key The encryption key. optional arguments: -d, --discard N Discard the first N bytes of the keystream, 0 by default. -s, --stateful Do not reset the key stream while processing the chunks of one frame. -i, --iv IV Optional initialization vector. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class rabbit(StreamCipherUnit):
    """
    RABBIT encryption and decryption.
    """
    key_size = {16}

    def __init__(self, key, discard=0, stateful=False, iv: Arg('-i', '--iv', help='Optional initialization vector.') = B''):
        super().__init__(key=key, iv=iv, stateful=stateful, discard=discard)

    def keystream(self) -> Iterable[int]:
        """
        Return the cipher object constructed from the configured key and initialization
        vector. Raises a ValueError if the vector is neither empty nor 8 bytes long.
        """
        iv_length = len(self.args.iv)
        if iv_length != 0 and iv_length != 8:
            raise ValueError('The IV length must be exactly 8 bytes.')
        return RabbitCipher(self.args.key, self.args.iv)
class rc2 (key, iv=b'', *, eks=1024, derive_eks=False, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)
-
This unit is implemented in
refinery.units.crypto.cipher.rc2
and has the following commandline Interface:usage: rc2 [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-k N | -d] [-p P] [-m M] [-r] [-e] [-S N] [-M N] key RC2 encryption and decryption. positional arguments: key The encryption key. optional arguments: -i, --iv IV Specifies the initialization vector. If none is specified, then a block of zero bytes is used. -k, --eks N Set the effective key size. Default is 1024. -d, --dks Act as .NET and derive the effective key size from the key length. -p, --padding P Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does nothing. By default, all other algorithms are attempted. In most cases, the data was not correctly decrypted if none of these work. -m, --mode M Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, EAX, ECB, OFB. By default, the CBC mode is used when an IV is provided, and ECB otherwise. -r, --raw Set the padding to raw; ignored when a padding is specified. -e, --little-endian Only for CTR: Use a little endian counter instead of the default big endian. -S, --segment-size N Only for CFB: Number of bits into which data is segmented. It must be a multiple of 8. The default of 0 means that the block size will be used as the segment size. -M, --mac-len N Only for EAX, GCM, OCB, and CCM: Length of the authentication tag, in bytes. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class rc2(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(ARC2)):
    """
    RC2 encryption and decryption.
    """

    def __init__(
        self, key, iv=b'', *,
        eks: Arg.Number('-k', '--eks', group='EKS',
            help='Set the effective key size. Default is {default}.') = 1024,
        derive_eks: Arg.Switch('-d', '--dks', group='EKS',
            help='Act as .NET and derive the effective key size from the key length.') = False,
        padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0,
        **keywords
    ):
        super().__init__(
            key,
            iv,
            eks=eks,
            derive_eks=derive_eks,
            padding=padding,
            mode=mode,
            raw=raw,
            little_endian=little_endian,
            segment_size=segment_size,
            mac_len=mac_len,
            assoc_len=assoc_len,
            **keywords
        )

    def _new_cipher(self, **optionals) -> CipherInterface:
        """
        Create the cipher with the effective key length set: either the bit length of
        the key when it is to be derived, or the configured value otherwise.
        """
        if self.args.derive_eks:
            effective = len(self.args.key) * 8
        else:
            effective = self.args.eks
        optionals['effective_keylen'] = effective
        return super()._new_cipher(**optionals)
class rc4 (key, discard=0)
-
This unit is implemented in
refinery.units.crypto.cipher.rc4
and has the following commandline Interface:usage: rc4 [-h] [-L] [-Q] [-0] [-v] [-R] [-d N] key RC4 encryption and decryption. positional arguments: key The encryption key. optional arguments: -d, --discard N Discard the first N bytes of the keystream, 0 by default. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class rc4(StandardCipherUnit, cipher=PyCryptoFactoryWrapper(ARC4)):
    """
    RC4 encryption and decryption.
    """

    def __init__(
        self, key,
        discard: Arg.Number('-d', help='Discard the first {varname} bytes of the keystream, {default} by default.') = 0,
    ):
        super().__init__(key, discard=discard)

    def _new_cipher(self, **optionals):
        """
        Create the cipher, passing the configured discard count as its `drop` option.
        """
        skipped = self.args.discard
        return super()._new_cipher(drop=skipped, **optionals)
class rc4mod (key, stateful=False, discard=0, *, size=256)
-
This unit is implemented in
refinery.units.crypto.cipher.rc4mod
and has the following commandline Interface:usage: rc4mod [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-d N] [-t N] key Implements a modified version of the RC4 stream cipher where the size of the RC4 SBox can be altered. positional arguments: key The encryption key. optional arguments: -s, --stateful Do not reset the key stream while processing the chunks of one frame. -d, --discard N Discard the first N bytes of the keystream, 0 by default. -t, --size N Table size, 256 by default. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class rc4mod(StreamCipherUnit):
    """
    Implements a modified version of the RC4 stream cipher where the size of the RC4 SBox can be altered.
    """

    def __init__(
        self, key, stateful=False, discard=0, *,
        size: Arg.Number('-t', help='Table size, {default} by default.', bound=(1, None)) = 0x100
    ):
        super().__init__(key=key, stateful=stateful, discard=discard, size=size)

    def keystream(self):
        """
        Generate the key stream: the table is first permuted using the key bytes, then
        one table entry is produced per step. All index arithmetic is reduced modulo the
        configured table size.
        """
        modulus = self.args.size
        # The table always has at least 256 entries, each reduced to a byte value.
        positions = range(max(modulus, 0x100))
        sbox = bytearray(position & 0xFF for position in positions)
        j = 0
        for i, keybyte in zip(positions, cycle(self.args.key)):
            held = sbox[i]
            j = (j + keybyte + held) % modulus
            sbox[i], sbox[j] = sbox[j], held
        self.log_debug(lambda: F'SBOX = {sbox.hex(" ").upper()}', clip=True)
        i = j = 0
        while True:
            i = (i + 1) % modulus
            held = sbox[i]
            j = (j + held) % modulus
            sbox[i], sbox[j] = sbox[j], held
            yield sbox[(sbox[i] + held) % modulus]
class rc5 (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, rounds=12, word_size=32, assoc_len=0, mac_len=0)
-
This unit is implemented in
refinery.units.crypto.cipher.rc5
and has the following commandline Interface:usage: rc5 [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-k N] [-w N] key RC5 encryption and decryption. positional arguments: key The encryption key. optional arguments: -i, --iv IV Specifies the initialization vector. If none is specified, then a block of zero bytes is used. -p, --padding P Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does nothing. By default, all other algorithms are attempted. In most cases, the data was not correctly decrypted if none of these work. -m, --mode M Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB, OFB, PCBC. By default, the CBC mode is used when an IV is provided, and ECB otherwise. -r, --raw Set the padding to raw; ignored when a padding is specified. -e, --little-endian Only for CTR: Use a little endian counter instead of the default big endian. -S, --segment-size N Only for CFB: Number of bits into which data is segmented. It must be a multiple of 8. The default of 0 means that the block size will be used as the segment size. -k, --rounds N Number of rounds to use, the default is 12 -w, --word-size N The word size in bits, 32 by default. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class rc5(StandardBlockCipherUnit, cipher=BlockCipherFactory(RC5)):
    """
    RC5 encryption and decryption.
    """

    def __init__(
        self, key, iv=b'', *,
        padding=None, mode=None, raw=False, little_endian=False, segment_size=0,
        rounds    : Arg.Number('-k', help='Number of rounds to use, the default is {default}') = _R,
        word_size : Arg.Number('-w', help='The word size in bits, {default} by default.') = _W,
        **more
    ):
        super().__init__(
            key,
            iv,
            padding=padding,
            mode=mode,
            raw=raw,
            little_endian=little_endian,
            segment_size=segment_size,
            rounds=rounds,
            word_size=word_size,
            **more
        )

    @property
    def block_size(self):
        """
        The configured word size, integer-divided by four.
        """
        bits_per_word = self.args.word_size
        return bits_per_word // 4

    def _new_cipher(self, **optionals) -> CipherInterface:
        """
        Create the cipher, passing on the configured round count and word size.
        """
        rounds = self.args.rounds
        word_size = self.args.word_size
        return super()._new_cipher(rounds=rounds, word_size=word_size, **optionals)
class rc6 (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, rounds=20, word_size=32)
-
This unit is implemented in
refinery.units.crypto.cipher.rc6
and has the following commandline Interface:usage: rc6 [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] [-k N] [-w N] key RC6 encryption and decryption. The parameter defaults are the RC6 parameters that were chosen for the AES candidacy. Only key sizes of 128, 192, and 256 bits are used for AES candidates, but the unit will allow any key size up to 256 bits. positional arguments: key The encryption key. optional arguments: -i, --iv IV Specifies the initialization vector. If none is specified, then a block of zero bytes is used. -p, --padding P Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does nothing. By default, all other algorithms are attempted. In most cases, the data was not correctly decrypted if none of these work. -m, --mode M Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB, OFB, PCBC. By default, the CBC mode is used when an IV is provided, and ECB otherwise. -r, --raw Set the padding to raw; ignored when a padding is specified. -e, --little-endian Only for CTR: Use a little endian counter instead of the default big endian. -S, --segment-size N Only for CFB: Number of bits into which data is segmented. It must be a multiple of 8. The default of 0 means that the block size will be used as the segment size. -k, --rounds N Number of rounds to use, the default is 20 -w, --word-size N The word size in bits, 32 by default. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class rc6(StandardBlockCipherUnit, cipher=BlockCipherFactory(RC6)):
    """
    RC6 encryption and decryption. The parameter defaults are the RC6 parameters that were chosen for the
    AES candidacy. Only key sizes of 128, 192, and 256 bits are used for AES candidates, but the unit will
    allow any key size up to 256 bits.
    """

    def __init__(
        self, key, iv=b'', *,
        padding=None, mode=None, raw=False, little_endian=False, segment_size=0,
        rounds    : Arg.Number('-k', help='Number of rounds to use, the default is {default}') = _R,
        word_size : Arg.Number('-w', help='The word size in bits, {default} by default.') = _W,
    ):
        super().__init__(
            key,
            iv,
            padding=padding,
            mode=mode,
            raw=raw,
            little_endian=little_endian,
            segment_size=segment_size,
            rounds=rounds,
            word_size=word_size
        )

    @property
    def block_size(self):
        """
        The configured word size, integer-divided by two.
        """
        bits_per_word = self.args.word_size
        return bits_per_word // 2

    def _new_cipher(self, **optionals) -> CipherInterface:
        """
        Create the cipher, passing on the configured round count and word size.
        """
        rounds = self.args.rounds
        word_size = self.args.word_size
        return super()._new_cipher(rounds=rounds, word_size=word_size, **optionals)
class recode (decode=None, encode='UTF8', decerr=None, encerr=None, errors=None)
-
This unit is implemented in
refinery.units.encoding.recode
and has the following commandline Interface:usage: recode [-h] [-L] [-Q] [-0] [-v] [-R] [-d Handler] [-e Handler] [-E Handler] [decode-as] [encode-as] Expects input string data encoded in the from encoding and encodes it in the to encoding, then outputs the result. positional arguments: decode-as Input encoding; Guess encoding by default. encode-as Output encoding; The default is UTF8. optional arguments: -d, --decerr Handler Specify an error handler for decoding. -e, --encerr Handler Specify an error handler for encoding. -E, --errors Handler Specify an error handler for both encoding and decoding. The possible choices are the following: strict, ignore, replace, xmlref, backslash, surrogate generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class recode(Unit):
    """
    Expects input string data encoded in the `from` encoding and encodes it in
    the `to` encoding, then outputs the result.
    """

    def __init__(
        self,
        decode: Arg(metavar='decode-as', type=str, help='Input encoding; Guess encoding by default.') = None,
        encode: Arg(metavar='encode-as', type=str, help=F'Output encoding; The default is {Unit.codec}.') = Unit.codec,
        decerr: Arg.Option('-d', choices=Handler, help='Specify an error handler for decoding.') = None,
        encerr: Arg.Option('-e', choices=Handler, help='Specify an error handler for encoding.') = None,
        errors: Arg.Option('-E', choices=Handler, help=(
            'Specify an error handler for both encoding and decoding. '
            'The possible choices are the following: {choices}')) = None,
    ):
        # A direction-specific handler wins over the shared one; if neither is
        # given, the STRICT handler is used.
        decoding_handler = Arg.AsOption(decerr or errors or 'STRICT', Handler).value
        encoding_handler = Arg.AsOption(encerr or errors or 'STRICT', Handler).value
        super().__init__(
            decode=decode,
            encode=encode,
            decerr=decoding_handler,
            encerr=encoding_handler
        )

    @Unit.Requires('chardet', 'default', 'extended')
    def _chardet():
        import chardet
        return chardet

    def _detect(self, data):
        # Cheap UTF-16 heuristic first: if every second byte is zero, assume
        # UTF-16 with the matching byte order before asking chardet.
        view = memoryview(data)
        if not any(view[1::2]):
            return 'utf-16le'
        if not any(view[0::2]):
            return 'utf-16be'
        detection = self._chardet.detect(data)
        codec = detection['encoding']
        self.log_info(lambda: F'Using input encoding: {codec}, detected with {int(detection["confidence"] * 100)}% confidence.')
        return codec

    def _recode(self, enc, dec, encerr, decerr, data):
        # Decode with the given (or detected) input codec, then encode again
        # with the output codec.
        if not dec:
            dec = self._detect(data)
        text = codecs.decode(data, dec, errors=decerr)
        return codecs.encode(text, enc, errors=encerr)

    def reverse(self, data):
        # The reverse operation swaps the roles of the two encodings and of
        # their error handlers.
        args = self.args
        return self._recode(args.decode, args.encode, args.decerr, args.encerr, data)

    def process(self, data):
        args = self.args
        return self._recode(args.encode, args.decode, args.encerr, args.decerr, data)
class reduce (suffix, just=0, temp='t')
-
This unit is implemented in
refinery.units.meta.reduce
and has the following commandline Interface:usage: reduce [-h] [-L] [-Q] [-0] [-v] [-j N] [-t name] suffix The reduce unit applies an arbitrary multibin suffix repeatedly to reduce a complete frame to a single chunk. The first chunk in the frame serves as initialization. positional arguments: suffix The remaining command line is a multibin suffix. The reduction accumulator is initialized with the first chunk in the frame. Then, each remaining chunk is processed with the given suffix and the result is used to overwrite the accumulator. optional arguments: -j, --just N Optionally specify a maximum number of chunks to process beyond the first. -t, --temp name The name of the accumulator variable. The default is "t". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class reduce(Unit):
    """
    The reduce unit applies an arbitrary multibin suffix repeatedly to reduce a complete frame to a
    single chunk. The first chunk in the frame serves as initialization.
    """

    def __init__(
        self,
        suffix: Arg(type=str, help=(
            'The remaining command line is a multibin suffix. The reduction accumulator is initialized '
            'with the first chunk in the frame. Then, each remaining chunk is processed with the given '
            'suffix and the result is used to overwrite the accumulator.'
        )),
        just: Arg.Number('-j', help='Optionally specify a maximum number of chunks to process beyond the first.') = 0,
        temp: Arg.String('-t', metavar='name', help='The name of the accumulator variable. The default is "{default}".') = 't',
    ):
        super().__init__(suffix=suffix, temp=temp, just=just)

    def filter(self, chunks: Iterable[Chunk]):
        # Reduces the frame to a single accumulator chunk. The first chunk
        # seeds the accumulator; every chunk in scope is then processed with
        # the multibin suffix and the result overwrites the accumulator. Any
        # chunks beyond the --just limit are passed through unchanged.
        it = iter(chunks)
        just = self.args.just
        name = self.args.temp
        try:
            accu = next(it)
        except StopIteration:
            # An empty frame has nothing to reduce. Letting StopIteration
            # escape from a generator would be converted into a RuntimeError
            # (PEP 479), so finish cleanly without producing output.
            return
        if not just:
            scope = it
        else:
            import itertools
            self.log_info(F'reducing only the next {just} chunks')
            scope = itertools.islice(it, 0, just)
        for chunk in scope:
            # Expose the accumulator to the suffix under the chosen variable
            # name while the current chunk is being processed.
            chunk.meta[name] = accu
            accu[:] = DelayedBinaryArgument(self.args.suffix, reverse=True, seed=chunk)(chunk)
            self.log_debug('reduced:', accu, clip=True)
        # The temporary variable must not leak into the emitted result.
        accu.meta.discard(name)
        yield accu
        # Only non-empty when --just stopped the reduction early.
        yield from it
class rep (count=2, label=None)
-
This unit is implemented in
refinery.units.strings.rep
and has the following commandline Interface:usage: rep [-h] [-L] [-Q] [-0] [-v] [count] [label] Duplicates the given input a given number of times. It is also possible to specify an iterable instead of a number, in which case the input will be replicated once for each item in this iterable. positional arguments: count Defines the number of outputs to generate for each input. The default is 2. You can specify any multibin expression that defines an integer iterable here: Each input chunk will be replicated once for each element of that sequence. label If specified, the meta variable with this name will be populated with the index of the replicated chunk. When the count parameter is an integer, this label will be equivalent to the index meta variable. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class rep(Unit):
    """
    Duplicates the given input a given number of times. It is also possible to specify
    an iterable instead of a number, in which case the input will be replicated once for
    each item in this iterable.
    """

    def __init__(
        self,
        count: Arg.NumSeq(help=(
            'Defines the number of outputs to generate for each input. The default is {default}. '
            'You can specify any multibin expression that defines an integer iterable here: Each '
            'input chunk will be replicated once for each element of that sequence.')) = 2,
        label: Arg(type=str, help=(
            'If specified, the meta variable with this name will be populated with the index of '
            'the replicated chunk. When the count parameter is an integer, this label will be '
            'equivalent to the index meta variable.')) = None
    ):
        super().__init__(count=count, label=label)

    def process(self, data: bytes):
        def count():
            # Number of repetitions: either the integer itself or the number
            # of elements in the given sequence.
            count = self.args.count
            if isinstance(count, int):
                return count
            return sum(1 for _ in count)

        if self.args.squeeze or not self._framed:
            self.log_debug('compressing all repeated items into a single chunk')
            yield data * count()
            return

        self.log_debug('emitting each repeated item as an individual chunk')
        label = self.args.label

        if label is None:
            yield from repeat(data, count())
            return

        counters = self.args.count
        if isinstance(counters, int):
            # A plain integer (such as the default of 2) is not iterable; per
            # the help text the label then carries the index of each copy.
            counters = range(counters)
        for counter in counters:
            yield self.labelled(data, **{label: counter})
class repl (search, replace=b'', count=-1)
-
This unit is implemented in
refinery.units.strings.repl
and has the following commandline Interface:usage: repl [-h] [-L] [-Q] [-0] [-v] [-n N] search [replace] Performs a simple binary string replacement on the input data. positional arguments: search This is the search term. replace The substitution string. Leave this empty to remove all occurrences of the search term. optional arguments: -n, --count N Only replace the given number of occurrences generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class repl(Unit):
    """
    Performs a simple binary string replacement on the input data.
    """

    def __init__(
        self,
        search : Arg(help='This is the search term.'),
        replace: Arg(help='The substitution string. Leave this empty to remove all occurrences of the search term.') = B'',
        count : Arg.Number('-n', help='Only replace the given number of occurrences') = -1
    ):
        super().__init__(search=search, replace=replace, count=count)

    def process(self, data: bytes):
        # Delegates to the built-in replace method of the input buffer; the
        # default count of -1 is passed through as-is.
        args = self.args
        needle = args.search
        substitute = args.replace
        limit = args.count
        return data.replace(needle, substitute, limit)
class resplit (regex=b'\\r?\\n', multiline=False, ignorecase=False, count=0)
-
This unit is implemented in
refinery.units.pattern.resplit
and has the following commandline Interface:usage: resplit [-h] [-L] [-Q] [-0] [-v] [-M] [-I] [-c N] [regex] Splits the data at the given regular expression and returns the sequence of chunks between the separators. By default, the input is split along line breaks. positional arguments: regex Regular expression to match. optional arguments: -M, --multiline Caret and dollar in regular expressions match the beginning and end of a line and a dot does not match line breaks. -I, --ignorecase Ignore capitalization for alphabetic characters in regular expressions. -c, --count N Specify the maximum number of operations to perform. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class resplit(SingleRegexUnit):
    """
    Splits the data at the given regular expression and returns the sequence of
    chunks between the separators. By default, the input is split along line breaks.
    """

    def __init__(
        self,
        regex=RB'\r?\n',
        multiline=False,
        ignorecase=False,
        count=0
    ):
        super().__init__(regex=regex, multiline=multiline, ignorecase=ignorecase, count=count)

    def process(self, data):
        buffer = memoryview(data)
        position = 0
        limit = self.args.count
        # The enumeration starts at 2 so that the loop stops right after the
        # limit-th separator has been handled when a positive limit is set.
        for index, separator in enumerate(self.regex.finditer(buffer), 2):
            start, end = separator.span()
            # Emit the data between the previous separator and this one.
            yield buffer[position:start]
            position = end
            # Capture groups of the separator are emitted as chunks as well.
            for group in separator.groups():
                yield group
            if limit > 0 and index > limit:
                break
        # Whatever follows the last handled separator is the final chunk.
        yield buffer[position:]
class resub (regex='\\s+', subst=b'', multiline=False, ignorecase=False, count=0)
-
This unit is implemented in
refinery.units.pattern.resub
and has the following commandline Interface:usage: resub [-h] [-L] [-Q] [-0] [-v] [-M] [-I] [-c N] [regex] [subst] A unit for performing substitutions based on a binary regular expression pattern. Besides the syntax {k} to insert the k-th match group, the unit supports processing the contents of match groups with arbitrary refinery units. To do so, use the following F-string-like syntax: {match-group:handlers} where :handlers is an optional reverse multibin expression that is used to post-process the binary data from the match. For example, {2:hex:b64} represents the base64-decoding of the hex- decoding of the second match group. positional arguments: regex Regular expression to be searched and replaced. The default is "\s+". subst Substitution value: use {1} for group 1, {0} for entire match. Matches are removed (replaced by an empty string) by default. optional arguments: -M, --multiline Caret and dollar in regular expressions match the beginning and end of a line and a dot does not match line breaks. -I, --ignorecase Ignore capitalization for alphabetic characters in regular expressions. -c, --count N Specify the maximum number of operations to perform. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class resub(SingleRegexUnit):
    """
    A unit for performing substitutions based on a binary regular expression pattern. Besides the
    syntax `{k}` to insert the `k`-th match group, the unit supports processing the contents of
    match groups with arbitrary refinery units. To do so, use the following F-string-like syntax:

        {match-group:handlers}

    where `:handlers` is an optional reverse multibin expression that is used to post-process the
    binary data from the match. For example, `{2:hex:b64}` represents the base64-decoding of the
    hex-decoding of the second match group.
    """

    def __init__(
        self,
        regex: Arg(help='Regular expression to be searched and replaced. The default is "{default}".') = '\\s+',
        subst: Arg('subst', help=(
            'Substitution value: use {1} for group 1, {0} for entire match. Matches are removed '
            '(replaced by an empty string) by default.'
        )) = B'',
        multiline=False,
        ignorecase=False,
        count=0
    ):
        super().__init__(regex=regex, subst=subst, multiline=multiline, ignorecase=ignorecase, count=count)

    def process(self, data):
        def expand(match: Match):
            # Positional fields are the entire match followed by its groups;
            # named groups are available as keyword fields.
            positional = [match[0], *match.groups()]
            return meta.format_bin(spec, self.codec, positional, match.groupdict())

        self.log_info('pattern:', getattr(self.regex, 'pattern', self.regex))
        self.log_info('replace:', self.args.subst)
        meta = metavars(data)
        # The substitution is given as bytes but used as a format string.
        spec = self.args.subst.decode('ascii', 'backslashreplace')
        if self.args.count:
            # Only a nonzero count is passed on to limit the substitutions.
            return self.regex.sub(expand, data, count=self.args.count)
        return self.regex.sub(expand, data)
class rev (blocksize=None)
-
This unit is implemented in
refinery.units.blockwise.rev
and has the following commandline Interface:usage: rev [-h] [-L] [-Q] [-0] [-v] [-B N] The blocks of the input data are output in reverse order. If the length of the input data is not a multiple of the block size, the data is truncated. optional arguments: -B, --blocksize N The size of each block in bytes, default is 1. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class rev(UnaryOperation):
    """
    The blocks of the input data are output in reverse order. If the length of
    the input data is not a multiple of the block size, the data is truncated.
    """
    def __init__(self, blocksize=None):
        # NOTE(review): _truncate=2 presumably selects how incomplete trailing
        # blocks are handled by the base class; confirm against UnaryOperation.
        super().__init__(blocksize=blocksize, _truncate=2)

    def inplace(self, block: ndarray):
        # Array-based path: reversing the order of blocks is a plain flip.
        return self._numpy.flip(block)

    # No per-block operation is defined for this unit; process() below contains
    # its own fallback for when the fast block path is unavailable.
    operate = NotImplemented

    def process(self, data: bytearray):
        if self.bytestream:
            # Blocks are single bytes here, so reverse the buffer in place.
            data.reverse()
            return data
        try:
            return self._fastblock(data)
        except FastBlockError:
            # Manual fallback: swap blocks pairwise from both ends, in place.
            b = self.blocksize
            n = len(data)
            q = n // b  # number of complete blocks
            m = q * b   # number of bytes covered by complete blocks
            view = memoryview(data)
            temp = bytearray(b)
            for k in range(0, (q // 2) * b, b):
                lhs = slice(k, k + b)
                rhs = slice(m - k - b, m - k)
                # temp holds the right-hand block while it is overwritten.
                temp[:] = view[rhs]
                data[rhs] = view[lhs]
                data[lhs] = temp
            if m < n:
                # A bytearray cannot be resized while a memoryview of it is
                # still exported, so the view is dropped before truncating the
                # incomplete trailing block.
                del view
                del temp
                del data[m:]
            return data
class rex (regex, *transformation, unicode=False, unique=False, multiline=False, ignorecase=False, min=1, max=None, len=None, stripspace=False, longest=False, take=None)
-
This unit is implemented in
refinery.units.pattern.rex
and has the following commandline Interface:usage: rex [-h] [-L] [-Q] [-0] [-v] [-u] [-q] [-M] [-I] [-n N] [-m N] [-e N] [-x] [-l] [-t N] regex [transformation [transformation ...]] Short for Regular Expression eXtractor: A binary grep which can apply a transformation to each match. Each match is an individual output. Besides the syntax {k} to insert the k-th match group, the unit supports processing the contents of match groups with arbitrary refinery units. To do so, use the following F-string-like syntax: {match-group:pipeline} where :pipeline is an optional pipeline of refinery commands as it would be specified on the command line. The value of the corresponding match is post-processed with this command. positional arguments: regex Regular expression to match. transformation An optional sequence of transformations to be applied to each match. Each transformation produces one output in the order in which they are given. The default transformation is {0}, i.e. the entire match. optional arguments: -u, --unicode Also find unicode strings. -q, --unique Yield every (transformed) match only once. -M, --multiline Caret and dollar in regular expressions match the beginning and end of a line and a dot does not match line breaks. -I, --ignorecase Ignore capitalization for alphabetic characters in regular expressions. -n, --min N Matches must have length at least N. -m, --max N Matches must have length at most N. -e, --len N Matches must be of length N. -x, --stripspace Strip all whitespace from input data. -l, --longest Sort results by length. -t, --take N Return only the first N occurrences in order of appearance. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class rex(SingleRegexUnit, PatternExtractor):
    """
    Short for Regular Expression eXtractor: A binary grep which can apply a transformation to each
    match. Each match is an individual output. Besides the syntax `{k}` to insert the `k`-th match
    group, the unit supports processing the contents of match groups with arbitrary refinery units.
    To do so, use the following F-string-like syntax:

        {match-group:pipeline}

    where `:pipeline` is an optional pipeline of refinery commands as it would be specified on
    the command line. The value of the corresponding match is post-processed with this command.
    """
    def __init__(
        self, regex,
        # TODO: Use positional only in Python 3.8
        # /,
        *transformation: Arg(type=utf8, help=(
            'An optional sequence of transformations to be applied to each match. '
            'Each transformation produces one output in the order in which they '
            'are given. The default transformation is {0}, i.e. the entire match. '
        )),
        unicode: Arg.Switch('-u', help='Also find unicode strings.') = False,
        unique: Arg.Switch('-q', help='Yield every (transformed) match only once.') = False,
        multiline=False, ignorecase=False, min=1, max=None, len=None, stripspace=False,
        longest=False, take=None
    ):
        # The two switches are additionally forwarded under other keywords:
        # utf16 follows unicode and duplicates is the negation of unique.
        super().__init__(
            regex=regex,
            transformation=transformation,
            unicode=unicode,
            unique=unique,
            multiline=multiline,
            ignorecase=ignorecase,
            min=min,
            max=max,
            len=len,
            stripspace=stripspace,
            longest=longest,
            take=take,
            utf16=unicode,
            ascii=True,
            duplicates=not unique
        )

    def process(self, data):
        meta = metavars(data)
        self.log_debug('regular expression:', getattr(self.regex, 'pattern', self.regex))
        transformations = []
        specs: List[bytes] = list(self.args.transformation)
        if not specs:
            # Without any transformation, each match is emitted in its entirety.
            specs.append(B'{0}')
        for spec in specs:
            # The decoded spec is bound as a default argument so that every
            # closure keeps its own format string instead of the loop's last one.
            def transformation(match: Match, s=spec.decode(self.codec)):
                symb: dict = match.groupdict()
                args: list = [match.group(0), *match.groups()]
                used = set()
                # Named groups that did not participate in the match are
                # treated as empty strings for formatting.
                for key, value in symb.items():
                    if value is None:
                        symb[key] = B''
                # NOTE(review): `used` is passed in and read afterwards, so
                # format presumably records the variables it consumed in it;
                # confirm against the metavars implementation.
                item = meta.format(s, self.codec, args, symb, True, True, used)
                # Empty groups are dropped along with the used ones, so neither
                # ends up as a meta variable of the output chunk.
                used.update(key for key, value in symb.items() if not value)
                for variable in used:
                    symb.pop(variable, None)
                symb.update(offset=match.start())
                # The output carries the input's meta variables, the remaining
                # named groups, and the match offset.
                chunk = Chunk(item)
                chunk.meta.update(meta)
                chunk.meta.update(symb)
                return chunk
            transformations.append(transformation)
        yield from self.matches_filtered(memoryview(data), self.regex, *transformations)
class rijndael (key, iv=b'', block_size=16, *, assoc_len=0, mac_len=0, segment_size=0, little_endian=False, raw=False, mode=None, padding=None)
-
This unit is implemented in
refinery.units.crypto.cipher.rijndael
and has the following commandline Interface:usage: rijndael [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-b N] [-p P] [-m M] [-r] [-e] [-S N] key Rijndael encryption and decryption. Note that there is also an aes unit which has much better performance because it calls into the PyCryptodome library. You would have to use this specific Rijndael unit only if Rijndael is used with a block size that is different from 16 bytes, in which case it is equivalent to AES. positional arguments: key The encryption key. optional arguments: -i, --iv IV Specifies the initialization vector. If none is specified, then a block of zero bytes is used. -b, --block-size N Cipher block size, default is 16. Valid choices are 16, 24, and 32. -p, --padding P Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does nothing. By default, all other algorithms are attempted. In most cases, the data was not correctly decrypted if none of these work. -m, --mode M Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB, OFB, PCBC. By default, the CBC mode is used when an IV is provided, and ECB otherwise. -r, --raw Set the padding to raw; ignored when a padding is specified. -e, --little-endian Only for CTR: Use a little endian counter instead of the default big endian. -S, --segment-size N Only for CFB: Number of bits into which data is segmented. It must be a multiple of 8. The default of 0 means that the block size will be used as the segment size. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class rijndael(StandardBlockCipherUnit, cipher=BlockCipherFactory(Rijndael)):
    """
    Rijndael encryption and decryption. Note that there is also a `refinery.aes` unit which has much
    better performance because it calls into the PyCryptodome library. You would have to use this
    specific Rijndael unit only if Rijndael is used with a block size that is different from 16 bytes,
    in which case it is equivalent to AES.
    """

    def __init__(
        self,
        key,
        iv=b'',
        block_size: Arg.Number('-b', help='Cipher block size, default is {default}. Valid choices are 16, 24, and 32.') = 16,
        **more
    ):
        # All remaining keyword options are passed through to the base class.
        super().__init__(key, iv, block_size=block_size, **more)

    @property
    def block_size(self):
        # The block size is configurable for this unit and read from the arguments.
        configured = self.args.block_size
        return configured

    def _new_cipher(self, **optionals) -> CipherInterface:
        # The cipher factory needs to know the selected block size as well.
        size = self.args.block_size
        return super()._new_cipher(block_size=size, **optionals)
class ripemd128 (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.cryptographic
and has the following commandline Interface:usage: ripemd128 [-h] [-L] [-Q] [-0] [-v] [-t] Returns the RIPEMD-128 hash of the input data. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class ripemd128(HashUnit):
    """
    Returns the RIPEMD-128 hash of the input data.
    """

    def _algorithm(self, data):
        # The implementation is imported on demand from the refinery library.
        from refinery.lib.ripemd128 import ripemd128
        digest = ripemd128(data)
        return digest
class ripemd160 (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.cryptographic
and has the following commandline Interface:usage: ripemd160 [-h] [-L] [-Q] [-0] [-v] [-t] Returns the RIPEMD160 hash of the input data. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
class rmv (*names)
-
This unit is implemented in
refinery.units.meta.rmv
and has the following commandline Interface:usage: rmv [-h] [-L] [-Q] [-0] [-v] [name [name ...]] Short for "ReMove Variable": Removes meta variables that were created in the current frame. If no variable names are given, the unit removes all of them. Note that this can recover variables from outer frames that were previously shadowed. positional arguments: name Name of a variable to be removed. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class rmv(Unit):
    """
    Short for "ReMove Variable": Removes meta variables that were created in the current frame. If no
    variable names are given, the unit removes all of them. Note that this can recover variables from
    outer frames that were previously shadowed.
    """

    def __init__(self, *names: Arg(type=str, metavar='name', help='Name of a variable to be removed.')):
        super().__init__(names=names)

    def process(self, data: Chunk):
        variables = metavars(data)
        selected = self.args.names
        if not selected:
            # Without explicit names, every variable is discarded. The names
            # are copied into a list first because entries are removed while
            # walking over them.
            selected = list(variables.variable_names())
        for name in selected:
            variables.discard(name)
        return data
class rncrypt (password)
-
This unit is implemented in
refinery.units.crypto.cipher.rncrypt
and has the following commandline Interface:usage: rncrypt [-h] [-L] [-Q] [-0] [-v] [-R] password Implements encryption and decryption using the RNCryptor specification. See also: https://github.com/RNCryptor positional arguments: password generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class rncrypt(Unit):
    """
    Implements encryption and decryption using the RNCryptor specification.
    See also: https://github.com/RNCryptor
    """

    def __init__(self, password: bytearray):
        super().__init__(password=password)

    def process(self, data: bytes) -> bytes:
        # Message layout as read here: a 2-byte header, an 8-byte encryption
        # salt, an 8-byte HMAC salt, a 16-byte IV, the ciphertext, and a
        # trailing 32-byte HMAC over everything that precedes it.
        encryption_salt = data[2:10]
        hmac_salt = data[10:18]
        iv = data[18:34]
        cipher_text = data[34:-32]
        signed_part, hmac_signature = data[:-32], data[-32:]
        encryption_key = self._pbkdf2(self.args.password, encryption_salt)
        hmac_key = self._pbkdf2(self.args.password, hmac_salt)
        expected_signature = self._hmac(hmac_key, signed_part)
        if not hmac.compare_digest(expected_signature, hmac_signature):
            raise ValueError("Failed to verify signature.")
        plain_text = self._aes_decrypt(encryption_key, iv, cipher_text)
        return unpad(plain_text, block_size=AES.block_size)

    def reverse(self, data: bytes) -> bytes:
        prng = Random.new()
        data = pad(data, block_size=AES.block_size)
        # Fresh random salts and IV are generated for every message.
        encryption_salt = prng.read(8)
        encryption_key = self._pbkdf2(self.args.password, encryption_salt)
        hmac_salt = prng.read(8)
        hmac_key = self._pbkdf2(self.args.password, hmac_salt)
        iv = prng.read(AES.block_size)
        cipher_text = self._aes_encrypt(encryption_key, iv, data)
        # The message is prefixed with the fixed 2-byte header and signed as
        # a whole; the signature is appended to the end.
        message = b'\x03\x01' + encryption_salt + hmac_salt + iv + cipher_text
        signature = self._hmac(hmac_key, message)
        return message + signature

    def _aes_encrypt(self, key, iv, text):
        cipher = AES.new(key, AES.MODE_CBC, iv)
        return cipher.encrypt(text)

    def _aes_decrypt(self, key, iv, text):
        cipher = AES.new(key, AES.MODE_CBC, iv)
        return cipher.decrypt(text)

    def _hmac(self, key, data):
        # Message authentication uses HMAC with SHA-256.
        mac = hmac.new(key, data, hashlib.sha256)
        return mac.digest()

    def _prf(self, secret, salt):
        # Pseudo-random function handed to PBKDF2: HMAC with SHA-1.
        mac = hmac.new(secret, salt, hashlib.sha1)
        return mac.digest()

    def _pbkdf2(self, password, salt, iterations=10000, key_length=32):
        return KDF.PBKDF2(password, salt, dkLen=key_length, count=iterations, prf=self._prf)
class rot (amount=13)
-
This unit is implemented in
refinery.units.crypto.cipher.rot
and has the following commandline Interface:usage: rot [-h] [-L] [-Q] [-0] [-v] [N] Rotate the characters of the alphabet by the given amount. The default amount is 13, providing the common (and weak) string obfuscation method. positional arguments: N Number of letters to rotate by; Default is 13. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class rot(Unit):
    """
    Rotate the characters of the alphabet by the given amount. The default
    amount is 13, providing the common (and weak) string obfuscation method.
    """

    def __init__(self, amount: Arg.Number(help='Number of letters to rotate by; Default is 13.') = 13):
        super().__init__(amount=amount)

    def process(self, data: bytearray):
        shift = self.args.amount % 26
        for position, letter in enumerate(data):
            # Pick the alphabet that contains the letter; any byte outside of
            # both alphabets is left untouched.
            if letter in _LCASE:
                base = _LCASE[0]
            elif letter in _UCASE:
                base = _UCASE[0]
            else:
                continue
            # Rotate within the alphabet, wrapping around after 26 letters.
            data[position] = base + (letter - base + shift) % 26
        return data
class rotl (argument, bigendian=False, blocksize=None)
-
This unit is implemented in
refinery.units.blockwise.rotl
and has the following commandline Interface:usage: rotl [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] argument Rotate the bits of each block left. positional arguments: argument A single numeric expression which provides the right argument to the operation, where the left argument is each block in the input data. This argument can also contain a sequence of bytes which is then split into blocks of the same size as the input data and used cyclically. optional arguments: -E, --bigendian Read chunks in big endian. -B, --blocksize N The size of each block in bytes, default is 1. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class rotl(BinaryOperation):
    """
    Rotate the bits of each block left.
    """

    def operate(self, value, shift):
        width = self.fbits
        # Reduce the shift modulo the block width in bits.
        shift %= width
        upper = value << shift
        # The bits pushed out on the left re-enter on the right.
        wrapped = value >> (width - shift)
        return upper | wrapped

    def inplace(self, value, shift):
        # Same rotation, but value is updated through augmented assignments
        # and nothing is returned.
        width = self.fbits
        shift %= width
        wrapped = value >> (width - shift)
        value <<= shift
        value |= wrapped
class rotr (argument, bigendian=False, blocksize=None)
-
This unit is implemented in
refinery.units.blockwise.rotr
and has the following commandline Interface:usage: rotr [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] argument Rotate the bits of each block right. positional arguments: argument A single numeric expression which provides the right argument to the operation, where the left argument is each block in the input data. This argument can also contain a sequence of bytes which is then split into blocks of the same size as the input data and used cyclically. optional arguments: -E, --bigendian Read chunks in big endian. -B, --blocksize N The size of each block in bytes, default is 1. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class rotr(BinaryOperation):
    """
    Rotate the bits of each block right.
    """

    def operate(self, value, shift):
        width = self.fbits
        # Reduce the shift modulo the block width in bits.
        shift %= width
        lower = value >> shift
        # The bits pushed out on the right re-enter on the left.
        wrapped = value << (width - shift)
        return lower | wrapped

    def inplace(self, value, shift):
        # Same rotation, but value is updated through augmented assignments
        # and nothing is returned.
        width = self.fbits
        shift %= width
        remainder = value >> shift
        value <<= width - shift
        value |= remainder
class rsa (key, swapkeys=False, textbook=False, padding=PAD.AUTO, rsautl=False)
-
This unit is implemented in
refinery.units.crypto.cipher.rsa
and has the following commandline Interface:usage: rsa [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-t | -p PAD | -r] key Implements single block RSA encryption and decryption. This unit can be used to encrypt and decrypt blocks generated by openssl's rsautl tool when using the mode -verify. When it is executed with a public key for decryption or with a private key for encryption, it will perform a raw RSA operation. The result of these operations are (un)padded using EMSA-PKCS1-v1_5. positional arguments: key RSA key in PEM, DER, or Microsoft BLOB format. optional arguments: -s, --swapkeys Swap public and private exponent. -t, --textbook Equivalent to --padding=NONE. -p, --padding PAD Choose one of the following padding modes: auto, none, oaep, pkcs15, pkcs10. The default is AUTO. -r, --rsautl Act as rsautl from OpenSSL; This is equivalent to --swapkeys --padding=PKCS10 generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class rsa(Unit):
    """
    Implements single block RSA encryption and decryption. This unit can be used to encrypt and decrypt
    blocks generated by openssl's `rsautl` tool when using the mode `-verify`. When it is executed with
    a public key for decryption or with a private key for encryption, it will perform a raw RSA operation.
    The result of these operations are (un)padded using EMSA-PKCS1-v1_5.
    """
    def __init__(
        self,
        key: Arg(help='RSA key in PEM, DER, or Microsoft BLOB format.'),
        swapkeys: Arg.Switch('-s', help='Swap public and private exponent.') = False,
        textbook: Arg.Switch('-t', group='PAD', help='Equivalent to --padding=NONE.') = False,
        padding : Arg.Option('-p', group='PAD', choices=PAD,
            help='Choose one of the following padding modes: {choices}. The default is AUTO.') = PAD.AUTO,
        rsautl  : Arg.Switch('-r', group='PAD',
            help='Act as rsautl from OpenSSL; This is equivalent to --swapkeys --padding=PKCS10') = False,
    ):
        padding = Arg.AsOption(padding, PAD)
        if textbook:
            if padding != PAD.AUTO:
                raise ValueError('Conflicting padding options!')
            # Reference the member through the enumeration class rather than
            # through another member.
            padding = PAD.NONE
        if rsautl:
            # NOTE(review): the truthiness test assumes PAD.AUTO is the only
            # falsy member of PAD; confirm against the PAD definition.
            if padding and padding != PAD.PKCS10:
                raise ValueError('Conflicting padding options!')
            swapkeys = True
            padding = PAD.PKCS10
        super().__init__(key=key, textbook=textbook, padding=padding, swapkeys=swapkeys)
        # Cache for the parsed key; see the `key` property.
        self._key_hash = None
        self._key_data = None

    @property
    def blocksize(self) -> int:
        """Size of one cipher text block in bytes, as reported by the key."""
        return self.key.size_in_bytes()

    @property
    def _blocksize_plain(self) -> int:
        """Largest plain text chunk that is fed into one block when encrypting."""
        # PKCS#1 v1.5 padding is at least 11 bytes.
        return self.blocksize - 11

    @property
    def pub(self):
        """Exponent used by `_encrypt_raw`; the private one when keys are swapped."""
        return self.key.d if self.args.swapkeys else self.key.e

    @property
    def prv(self):
        """Exponent used by `_decrypt_raw`; the public one when keys are swapped."""
        return self.key.e if self.args.swapkeys else self.key.d

    def _get_msg(self, data):
        """
        Interpret the given block as a big endian integer. Raises a ValueError when the
        integer is not strictly smaller than the modulus.
        """
        msg = int.from_bytes(data, byteorder='big')
        # The RSA primitives are only defined for representatives in the range
        # 0 <= m < n; a value equal to the modulus has to be rejected as well.
        if msg >= self.key.n:
            raise ValueError(F'This key can only handle messages of size {self.blocksize}.')
        return msg

    def _encrypt_raw(self, data):
        """Modular exponentiation with `pub`; the result is one full-size block."""
        return pow(
            self._get_msg(data),
            self.pub,
            self.key.n
        ).to_bytes(self.blocksize, byteorder='big')

    def _decrypt_raw(self, data):
        """Modular exponentiation with `prv`; the result is one full-size block."""
        return pow(
            self._get_msg(data),
            self.prv,
            self.key.n
        ).to_bytes(self.blocksize, byteorder='big')

    def _unpad(self, data, head, padbyte=None):
        """
        Strip a padding of the shape `head || pad || 00` from the block. When `padbyte`
        is given, every padding byte has to equal it. Raises a ValueError when the block
        is too long or the padding does not match.
        """
        if len(data) > self.blocksize:
            raise ValueError(F'This key can only handle messages of size {self.blocksize}.')
        if data.startswith(head):
            # The first zero byte after the two header bytes terminates the padding.
            pos = data.find(B'\0', 2)
            if pos > 0:
                pad = data[2:pos]
                if padbyte is None or all(b == padbyte for b in pad):
                    return data[pos + 1:]
        raise ValueError('Incorrect padding')

    def _pad(self, data, head, padbyte=None):
        """
        Pad the data to a full block as `head || pad || 00 || data`. The padding consists
        of `padbyte` when given and of random nonzero bytes otherwise.
        """
        if len(data) > self._blocksize_plain:
            raise ValueError(F'This key can only encrypt messages of size at most {self._blocksize_plain}.')
        pad = self.blocksize - len(data) - len(head) - 1
        if padbyte is not None:
            padding = pad * bytes((padbyte,))
        else:
            # Start with a single zero byte to enter the loop, then keep dropping
            # zero bytes and refilling with random ones until none remain.
            padding = bytearray(1)
            while not all(padding):
                padding = bytearray(filter(None, padding))
                padding.extend(get_random_bytes(pad - len(padding)))
        return head + padding + B'\0' + data

    def _unpad_pkcs10(self, data):
        return self._unpad(data, B'\x00\x01', 0xFF)

    def _unpad_pkcs15(self, data):
        return self._unpad(data, B'\x00\x02', None)

    def _pad_pkcs10(self, data):
        return self._pad(data, B'\x00\x01', 0xFF)

    def _pad_pkcs15(self, data):
        return self._pad(data, B'\x00\x02', None)

    def _decrypt_block_OAEP(self, data):
        self.log_debug('Attempting decryption with PyCrypto PKCS1 OAEP.')
        return PKCS1_OAEP.new(self.key).decrypt(data)

    def _encrypt_block_OAEP(self, data):
        self.log_debug('Attempting encryption with PyCrypto PKCS1 OAEP.')
        return PKCS1_OAEP.new(self.key).encrypt(data)

    def _decrypt_block(self, data):
        """
        Decrypt one block. OAEP is attempted first when the padding mode permits it; on
        failure in automatic mode, OAEP is disabled for all following blocks and the raw
        operation followed by the configured unpadding is used instead.
        """
        if self._oaep and self._pads in {PAD.AUTO, PAD.OAEP}:
            try:
                return self._decrypt_block_OAEP(data)
            except ValueError as E:
                # A truthy padding value here means OAEP was requested
                # explicitly (assuming only PAD.AUTO is falsy), so the error
                # is propagated instead of falling back.
                if self._pads:
                    raise
                self.log_debug(F'{E!s} No longer attempting OAEP.')
                self._oaep = False
        data = self._decrypt_raw(data)
        return self._unpad_per_argument(data)

    def _unpad_per_argument(self, data):
        """
        Remove the padding selected by the current padding mode. In automatic mode, both
        PKCS paddings are attempted and the first one that works is remembered for the
        following blocks; if none works, a partial result carrying the raw data is raised.
        """
        if self._pads == PAD.NONE:
            return data
        elif self._pads == PAD.PKCS10:
            return self._unpad_pkcs10(data)
        elif self._pads == PAD.PKCS15:
            return self._unpad_pkcs15(data)
        elif self._pads == PAD.AUTO:
            with suppress(ValueError):
                data = self._unpad_pkcs10(data)
                self.log_info('Detected PKCS1.0 padding.')
                self._pads = PAD.PKCS10
                return data
            with suppress(ValueError):
                data = self._unpad_pkcs15(data)
                self.log_info('Detected PKCS1.5 padding.')
                self._pads = PAD.PKCS15
                return data
            raise RefineryPartialResult('No padding worked, returning raw decrypted blocks.', data)
        else:
            raise ValueError(F'Invalid padding value: {self._pads!r}')

    def _encrypt_block(self, data):
        """
        Encrypt one block. OAEP is attempted first when the padding mode permits it; on
        failure in automatic mode, PKCS1.5 padding is used for this and following blocks.
        """
        if self._pads in {PAD.AUTO, PAD.OAEP}:
            try:
                return self._encrypt_block_OAEP(data)
            except ValueError:
                if self._pads:
                    raise
                self.log_debug('PyCrypto primitives for OAEP failed, falling back to PKCS1.5.')
                self._pads = PAD.PKCS15
        if self._pads == PAD.PKCS15:
            data = self._pad_pkcs15(data)
        elif self._pads == PAD.PKCS10:
            data = self._pad_pkcs10(data)
        return self._encrypt_raw(data)

    @property
    def key(self) -> RSA.RsaKey:
        """The parsed RSA key; it is parsed again only when the key argument changes."""
        key_blob = self.args.key
        key_hash = hash(key_blob)
        if key_hash != self._key_hash:
            fmt, key_data = normalize_rsa_key(key_blob)
            self.log_info(F'successfully parsed RSA key as {fmt.value}')
            self._key_hash = key_hash
            self._key_data = key_data
        return self._key_data

    def process(self, data):
        self._oaep = True
        self._pads = self.args.padding
        if not self.key.has_private():
            # Only a public key is available: apply the raw operation with the
            # public exponent to the whole input and remove the padding.
            try:
                return self._unpad_per_argument(self._encrypt_raw(data))
            except Exception as E:
                raise ValueError(
                    F'A public key was given for decryption and rsautl mode resulted in an error: {E}'
                ) from E
        return B''.join(self._decrypt_block(block) for block in splitchunks(data, self.blocksize))

    def reverse(self, data):
        self._pads = self.args.padding
        return B''.join(self._encrypt_block(block) for block in splitchunks(data, self._blocksize_plain))
class rsakey (public=False, output=RSAFormat.PEM)
-
This unit is implemented in
refinery.units.crypto.cipher.rsakey
and has the following commandline Interface:usage: rsakey [-h] [-L] [-Q] [-0] [-v] [-p] [RSAFormat] Parse RSA keys in various formats; PEM, DER, Microsoft BLOB, and W3C-XKMS (XML) format are supported. The same formats are supported for the input format, but you can also specify a key in the following format, where both modulus and exponent have to be hex-encoded: [modulus]:[exponent] positional arguments: RSAFormat Select an output format (pem, der, xkms, text, json, blob), default is PEM. optional arguments: -p, --public Force public key output even if the input is private. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class rsakey(Unit):
    """
    Parse RSA keys in various formats; PEM, DER, Microsoft BLOB, and W3C-XKMS (XML) format are supported.
    The same formats are supported for the input format, but you can also specify a key in the following
    format, where both modulus and exponent have to be hex-encoded: `[modulus]:[exponent]`
    """
    def __init__(
        self,
        public: Arg.Switch('-p', help='Force public key output even if the input is private.') = False,
        output: Arg.Option(help='Select an output format ({choices}), default is {default}.', choices=RSAFormat) = RSAFormat.PEM
    ):
        super().__init__(public=public, output=Arg.AsOption(output, RSAFormat))

    def _xkms_wrap(self, number: int):
        # Number of bytes required to hold the integer, rounded up.
        size = (number.bit_length() + 7) // 8
        return base64.b64encode(number.to_bytes(size, 'big'))

    def process(self, data):
        from refinery.lib.mscrypto import TYPES, ALGORITHMS
        fmt, key = normalize_rsa_key(data, force_public=self.args.public)
        self.log_info(F'parsing input as {fmt.value} format')
        out = self.args.output

        # Formats that the key object can export by itself.
        if out is RSAFormat.PEM:
            yield key.export_key('PEM')
            return
        if out is RSAFormat.DER:
            yield key.export_key('DER')
            return

        if out is RSAFormat.BLOB:
            def little(value: int, width: int):
                return value.to_bytes(width, 'little')

            private = key.has_private()
            blob = bytearray()
            # Header: blob type, three further bytes encoding the value 2, then the algorithm id.
            blob.append(TYPES.PRIVATEKEYBLOB if private else TYPES.PUBLICKEYBLOB)
            blob.extend(little(2, 3))
            blob.extend(little(ALGORITHMS.CALG_RSA_KEYX, 4))
            blob.extend(B'RSA2' if private else B'RSA1')
            # The bit size is rounded up to the next power of two.
            size = 2
            while size < key.n.bit_length():
                size <<= 1
            self.log_info(F'using bit size {size}')
            blob.extend(little(size, 4))
            size //= 8
            blob.extend(little(key.e, 4))
            blob.extend(little(key.n, size))
            if private:
                exp_1 = key.d % (key.p - 1)
                exp_2 = key.d % (key.q - 1)
                coeff = pow(key.q, -1, key.p)
                half = size // 2
                for part in (key.p, key.q, exp_1, exp_2, coeff):
                    blob.extend(little(part, half))
                blob.extend(little(key.d, size))
            yield blob
            return

        components = {
            'Modulus' : key.n,
            'Exponent': key.e,
        }
        if key.has_private():
            decoded = DerSequence()
            decoded.decode(key.export_key('DER'))
            # The first three entries of the sequence are skipped; the private
            # components are assigned in order for as many as are present.
            names = ('D', 'P', 'Q', 'DP', 'DQ', 'InverseQ')
            components.update(zip(names, itertools.islice(decoded, 3, None)))

        if out is RSAFormat.XKMS:
            encoded = {
                tag: base64.b64encode(number.long_to_bytes(value)).decode('ascii')
                for tag, value in components.items()
            }
            tags = '\n'.join(F'\t<{tag}>{value}</{tag}>' for tag, value in encoded.items())
            yield F'<RSAKeyPair>\n{tags}\n</RSAKeyPair>'.encode(self.codec)
            return

        components['BitSize'] = key.n.bit_length()
        # Integers of more than 32 bits are represented as upper case hex strings.
        components = {
            tag: F'{value:X}' if value.bit_length() > 32 else value
            for tag, value in components.items()
        }

        if out is RSAFormat.JSON:
            yield json.dumps(components, indent=4).encode(self.codec)
            return

        if out is RSAFormat.TEXT:
            for label, value in list(flattened(components)):
                text = F'0x{value}' if isinstance(value, str) else str(value)
                text = '\n'.join(textwrap.wrap(text, 80))
                yield F'-- {label + " ":-<77}\n{text!s}'.encode(self.codec)
class salsa (key, stateful=False, discard=0, nonce=b'REFINERY', magic=b'', offset=0, rounds=20)
-
This unit is implemented in
refinery.units.crypto.cipher.salsa
and has the following commandline Interface:usage: salsa [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-d N] [-m MAGIC] [-x N] [-r N] key [nonce] Salsa encryption and decryption. The nonce must be 8 bytes long. When 64 bytes are provided as the key, this data is interpreted as the initial state box and all other parameters are ignored. positional arguments: key The encryption key. nonce The nonce. Default is the string REFINERY. optional arguments: -s, --stateful Do not reset the key stream while processing the chunks of one frame. -d, --discard N Discard the first N bytes of the keystream, 0 by default. -m, --magic MAGIC The magic constant; depends on the key size by default. -x, --offset N Optionally specify the stream index, default is 0. -r, --rounds N The number of rounds. Has to be an even number. Default is 20. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class salsa(LatinCipherUnit):
    """
    Salsa encryption and decryption. The nonce must be 8 bytes long. When 64 bytes are provided
    as the key, this data is interpreted as the initial state box and all other parameters are
    ignored.
    """
    def keystream(self) -> Iterable[int]:
        args = self.args
        key = args.key
        if len(key) != 64:
            # Regular key: the cipher is set up from the individual parameters.
            cipher = SalsaCipher(key, args.nonce, args.magic, args.rounds, args.offset)
        else:
            # A 64 byte key is handed over as the complete initial state.
            cipher = SalsaCipher.FromState(key)
        yield from cipher
class salsa20 (key, nonce=b'REFINERY')
-
This unit is implemented in
refinery.units.crypto.cipher.salsa
and has the following commandline Interface:usage: salsa20 [-h] [-L] [-Q] [-0] [-v] [-R] key [nonce] Salsa20 encryption and decryption. This unit is functionally equivalent to salsa with 20 rounds, but it uses the PyCryptodome library C implementation rather than the pure Python implementation used by salsa. positional arguments: key The encryption key. nonce The nonce. Default is the string REFINERY. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class salsa20(LatinCipherStandardUnit, cipher=PyCryptoFactoryWrapper(Salsa20)):
    """
    Salsa20 encryption and decryption. This unit is functionally equivalent to `refinery.salsa`
    with 20 rounds, but it uses the PyCryptodome library C implementation rather than the pure
    Python implementation used by `refinery.salsa`.
    """
    # No members of its own: the class body consists of the docstring only and
    # everything else comes from the base class and the `cipher` class keyword.
class scope (*slice, visible=True)
-
This unit is implemented in
refinery.units.meta.scope
and has the following commandline Interface:usage: scope [-h] [-L] [-Q] [-0] [-v] [-n] [start:end:step [start:end:step ...]] After using scope within a frame, all the following operations will be applied only to the selected indices. All remaining chunks still exist, they are just not operated on. When the frame closes or the frame is being rescoped by a second application of this unit, they become visible again. positional arguments: start:end:step Specify start:end:step in Python slice syntax. The default is :. optional arguments: -n, --not Hide the given chunks instead of making them the only ones visible. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class scope(FrameSlicer):
    """
    After using `refinery.scope` within a `refinery.lib.frame`, all the
    following operations will be applied only to the selected indices. All
    remaining chunks still exist, they are just not operated on. When the
    frame closes or the frame is being rescoped by a second application of
    this unit, they become visible again.
    """
    def __init__(self, *slice, visible: Arg.Switch('-n', '--not', off=True, help=(
        'Hide the given chunks instead of making them the only ones visible.')) = True
    ):
        super().__init__(*slice, visible=visible)
        # Sort any slices with negative arguments to the back so we check
        # them last. This delays potential consumption of the chunks iterator
        # as much as possible.
        self.args.slice.sort(
            key=lambda s: (s.start or 0, s.stop or 0), reverse=True)

    def filter(self, chunks):
        # Sets the `visible` attribute of every chunk depending on whether its
        # index is covered by one of the slices, and yields all chunks in order.
        it = iter(chunks)
        # Holds the remainder of the input once it had to be read completely;
        # this only happens when a negative slice bound is encountered.
        consumed = None
        # Total number of chunks; only known after the input was consumed.
        size = None

        def buffered() -> Generator[Chunk, None, None]:
            # Yields from the live iterator first. When shift() drains that
            # iterator into the deque, the `yield from` ends and the buffered
            # chunks are replayed from the deque instead.
            yield from it
            while consumed:
                yield consumed.popleft()

        def shift(offset, default):
            # Converts a slice bound into a non-negative index: None becomes
            # the given default and negative values are counted from the end.
            nonlocal consumed, it, size
            if offset is None:
                return default
            if offset >= 0:
                return offset
            if consumed is None:
                from collections import deque
                self.log_info(F'consuming iterator to compute negative offset {offset}.')
                consumed = deque(it)
                # The chunks with indices 0 to k were already taken from the
                # iterator, the remaining ones are now in the deque.
                size = len(consumed) + k + 1
            return max(0, offset + size)

        for k, chunk in enumerate(buffered()):
            for s in self.args.slice:
                # An open upper bound defaults to k + 1, which is just large
                # enough for the range to be able to contain the index k.
                if k in range(shift(s.start, 0), shift(s.stop, k + 1), s.step or 1):
                    chunk.visible = self.args.visible
                    break
            else:
                # No slice matched this index.
                chunk.visible = not self.args.visible
            self.log_debug(chunk)
            yield chunk
class seal (key, discard=0, stateful=False)
-
This unit is implemented in
refinery.units.crypto.cipher.seal
and has the following commandline Interface:usage: seal [-h] [-L] [-Q] [-0] [-v] [-R] [-d N] [-s] key SEAL encryption and decryption. positional arguments: key The encryption key. optional arguments: -d, --discard N Discard the first N bytes of the keystream, 0 by default. -s, --stateful Do not reset the key stream while processing the chunks of one frame. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class seal(StreamCipherUnit):
    """
    SEAL encryption and decryption.
    """
    # The only entry of this set is 20.
    # NOTE(review): presumably the permitted key lengths in bytes, evaluated by the base class; confirm.
    key_size = {20}

    def keystream(self) -> Iterable[bytes]:
        # The cipher object itself serves as the key stream.
        key = self.args.key
        return SEAL_Cipher(key)
class secstr (key=b'\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10', iv=None)
-
This unit is implemented in
refinery.units.crypto.cipher.secstr
and has the following commandline Interface:usage: secstr [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [key] Implements the AES-based encryption scheme used by the PowerShell commands ConvertFrom-SecureString and ConvertTo-SecureString. positional arguments: key Secure string encryption 16-byte AES key; the default are the bytes from 1 to 16. optional arguments: -i, --iv IV Optionally specify an IV to use for encryption. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class secstr(Unit):
    """
    Implements the AES-based encryption scheme used by the PowerShell commands
    `ConvertFrom-SecureString` and `ConvertTo-SecureString`.
    """

    # This is a magic header value used for PowerShell secure strings.
    _MAGIC = bytes((
        0xEF, 0xAE, 0x3D, 0xD9, 0xDD, 0x75, 0xD7, 0xAE, 0xF8, 0xDD, 0xFD, 0x38,
        0xDB, 0x7E, 0x35, 0xDD, 0xBD, 0x7A, 0xD3, 0x9D, 0x1A, 0xE7, 0x7E, 0x39))

    # Secure strings include a decimal number formatted as a string directly
    # following the header. Presumably, this is the PowerShell version.
    _PSVER = 2

    def __init__(
        self, key: Arg(
            help='Secure string encryption 16-byte AES key; the default are the bytes from 1 to 16.'
        ) = bytes(range(1, 17)),
        iv: Arg('-i', help='Optionally specify an IV to use for encryption.') = None
    ):
        super().__init__(key=key, iv=iv)

    @property
    def key(self):
        """The AES key argument; raises a ValueError unless it is 16, 24, or 32 bytes long."""
        key = self.args.key
        if len(key) not in (0x10, 0x18, 0x20):
            # The message names every length that the check above accepts.
            raise ValueError('The encryption key has to be 16, 24, or 32 bytes long.')
        return key

    @property
    def iv(self):
        """The IV argument, or None; raises a ValueError if it is given but not 16 bytes long."""
        iv = self.args.iv
        if iv is not None and len(iv) != 0x10:
            raise ValueError('The IV has to be 16 bytes long.')
        return iv

    def reverse(self, data):
        # The `iv` property has already validated the length of a user-supplied
        # IV and the random fallback always has 16 bytes, so no further length
        # check is required here.
        ivec = self.iv or urandom(0x10)
        cipher = AES.new(self.key, AES.MODE_CBC, ivec)
        data = data.decode('latin-1').encode('utf-16LE')
        data = cipher.encrypt(pad(data, block_size=0x10))
        data = base64.b16encode(data).lower().decode('ascii')
        ivec = base64.b64encode(ivec).decode('ascii')
        # Layout: version|base64(iv)|hex(ciphertext), encoded as UTF-16LE and
        # prefixed with the magic header before the final base64 encoding.
        data = '|'.join(('%d' % self._PSVER, ivec, data)).encode('utf-16LE')
        return base64.b64encode(self._MAGIC + data)

    def process(self, data):
        # The three fields are separated by a UTF-16LE encoded pipe character.
        head, ivec, data = base64.b64decode(data).split(b'|\0')
        self.log_info('head:', head.hex())
        ivec = base64.b64decode(ivec.decode('utf-16LE'))
        self.log_info('ivec:', ivec.hex())
        data = base64.b16decode(data.decode('utf-16LE'), casefold=True)
        if len(data) % 0x10 != 0:
            self.log_info('data not block-aligned, padding with zeros')
            data += B'\0' * (0x10 - len(data) % 0x10)
        cipher = AES.new(self.key, AES.MODE_CBC, ivec)
        data = cipher.decrypt(data)
        try:
            data = unpad(data, block_size=0x10)
        except Exception:
            self.log_warn('decrypted data does not have PKCS7 padding')
        # Try to interpret the result as a UTF-16LE string, dropping up to 15
        # trailing bytes. The end index is computed explicitly because a slice
        # like data[:-p] would yield an empty result for p == 0.
        for p in range(0x10):
            try:
                return data[:len(data) - p].decode('utf-16LE').encode('latin-1')
            except (UnicodeDecodeError, UnicodeEncodeError):
                continue
        self.log_warn('result is not a padded unicode string, key is likely wrong')
        return data
class sep (separator=b'\n', scoped=False)
-
This unit is implemented in
refinery.units.meta.sep
and has the following commandline Interface:usage: sep [-h] [-L] [-Q] [-0] [-v] [-s] [separator] Multiple inputs are joined along a specified separator. If any of the input Chunks is currently out of scope, sep makes them visible by default. This can be prevented by using the -s flag. positional arguments: separator Separator; the default is a line break. optional arguments: -s, --scoped Maintain chunk scope; i.e. do not turn all input chunks visible. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class sep(Unit):
    """
    Multiple inputs are joined along a specified separator. If any of the input
    `refinery.lib.frame.Chunk`s is currently out of scope, `refinery.sep` makes
    them visible by default. This can be prevented by using the `-s` flag.
    """

    def __init__(
        self, separator: Arg(help='Separator; the default is a line break.') = B'\n',
        scoped: Arg.Switch('-s', help=(
            'Maintain chunk scope; i.e. do not turn all input chunks visible.')) = False
    ):
        super().__init__(separator=separator, scoped=scoped)
        # Tells process() whether a separator has to follow the current chunk.
        self.separate = False

    def filter(self, chunks):
        """
        Passes all chunks through with a look-ahead of one, so that the final chunk
        can be recognized. Unless the scoped option is set, every chunk, including
        the final one, is made visible.
        """
        it = iter(chunks)
        try:
            chunk = next(it)
        except StopIteration:
            return
        reveal = not self.args.scoped
        self.separate = True
        for upcoming in it:
            if reveal:
                chunk.visible = True
            yield chunk
            chunk = upcoming
        # The final chunk is not followed by a separator.
        # NOTE(review): this relies on process() running for a chunk while this
        # generator is suspended at the corresponding yield; confirm.
        self.separate = False
        if reveal:
            # The final chunk has to be made visible as well; otherwise it
            # would be the only one to remain out of scope.
            chunk.visible = True
        yield chunk

    def process(self, data):
        yield data
        if self.separate:
            yield self.args.separator
class serpent (key, iv=b'', padding=None, mode=None, raw=False, swap=False)
-
This unit is implemented in
refinery.units.crypto.cipher.serpent
and has the following commandline Interface:usage: serpent [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-s] key Serpent encryption and decryption. Some Serpent implementations read the bytes of each block in one direction, some in the other. When decryption results with this unit do not yield the expected result, try using the --swap (or -s) option to swap the bytes in each block. Furthermore, it is sometimes necessary to swap the bytes of the input key, which can be done by prefixing the input key by the multibin handler snip[::-1]. positional arguments: key The encryption key. optional arguments: -i, --iv IV Specifies the initialization vector. If none is specified, then a block of zero bytes is used. -p, --padding P Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does nothing. By default, all other algorithms are attempted. In most cases, the data was not correctly decrypted if none of these work. -m, --mode M Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB, OFB, PCBC. By default, the CBC mode is used when an IV is provided, and ECB otherwise. -r, --raw Set the padding to raw; ignored when a padding is specified. -s, --swap Read the bytes in each block in reverse order. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class serpent(StandardBlockCipherUnit, cipher=BlockCipherFactory(Serpent)):
    """
    Serpent encryption and decryption. Some Serpent implementations read the bytes of each block
    in one direction, some in the other. When decryption results with this unit do not yield the
    expected result, try using the `--swap` (or `-s`) option to swap the bytes in each block.
    Furthermore, it is sometimes necessary to swap the bytes of the input key, which can be done
    by prefixing the input key by the multibin handler `snip[::-1]`.
    """
    def __init__(
        self, key, iv=b'', padding=None, mode=None, raw=False,
        swap: Arg.Switch('-s', help='Read the bytes in each block in reverse order.') = False
    ):
        super().__init__(key, iv, padding=padding, mode=mode, raw=raw, swap=swap)

    def _new_cipher(self, **optionals) -> CipherInterface:
        """
        Create the cipher object through the base class and set its `swap` attribute
        according to the swap option of this unit.
        """
        # Keyword arguments given by the caller are forwarded to the base class
        # implementation rather than being dropped silently.
        # NOTE(review): assumes the base class method accepts the same keyword
        # arguments as this override; confirm.
        instance: Serpent = super()._new_cipher(**optionals)
        instance.swap = self.args.swap
        return instance
class sha1 (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.cryptographic
and has the following commandline Interface:usage: sha1 [-h] [-L] [-Q] [-0] [-v] [-t] Returns the SHA1 hash of the input data. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
class sha224 (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.cryptographic
and has the following commandline Interface:usage: sha224 [-h] [-L] [-Q] [-0] [-v] [-t] Returns the SHA224 hash of the input data. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
class sha256 (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.cryptographic
and has the following commandline Interface:usage: sha256 [-h] [-L] [-Q] [-0] [-v] [-t] Returns the SHA256 hash of the input data. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
class sha384 (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.cryptographic
and has the following commandline Interface:usage: sha384 [-h] [-L] [-Q] [-0] [-v] [-t] Returns the SHA384 hash of the input data. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
class sha3_224 (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.cryptographic
and has the following commandline Interface:usage: sha3-224 [-h] [-L] [-Q] [-0] [-v] [-t] Returns the SHA3-224 hash of the input data. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
class sha3_256 (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.cryptographic
and has the following commandline Interface:usage: sha3-256 [-h] [-L] [-Q] [-0] [-v] [-t] Returns the SHA3-256 hash of the input data. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
class sha3_384 (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.cryptographic
and has the following commandline Interface:usage: sha3-384 [-h] [-L] [-Q] [-0] [-v] [-t] Returns the SHA3-384 hash of the input data. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
class sha3_512 (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.cryptographic
and has the following commandline Interface:usage: sha3-512 [-h] [-L] [-Q] [-0] [-v] [-t] Returns the SHA3-512 hash of the input data. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
class sha512 (text=False)
-
This unit is implemented in
refinery.units.crypto.hash.cryptographic
and has the following commandline Interface:usage: sha512 [-h] [-L] [-Q] [-0] [-v] [-t] Returns the SHA512 hash of the input data. optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
class shl (argument, bigendian=False, blocksize=None)
-
This unit is implemented in
refinery.units.blockwise.shl
and has the following commandline Interface:usage: shl [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] argument Shift the bits of each block left, filling with zero bits. positional arguments: argument A single numeric expression which provides the right argument to the operation, where the left argument is each block in the input data. This argument can also contain a sequence of bytes which is then split into blocks of the same size as the input data and used cyclically. optional arguments: -E, --bigendian Read chunks in big endian. -B, --blocksize N The size of each block in bytes, default is 1. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class shl(BinaryOperation):
    """
    Shift the bits of each block left, filling with zero bits.
    """
    @staticmethod
    def operate(a, b):
        # Returns the shifted value as a new object; the operands are not modified.
        return a << b

    @staticmethod
    def inplace(a, b):
        # Augmented assignment: the operand is only modified in place if its
        # type implements the in-place shift; nothing is returned.
        a <<= b
class shr (argument, bigendian=False, blocksize=None)
-
This unit is implemented in
refinery.units.blockwise.shr
and has the following commandline Interface:usage: shr [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] argument Shift the bits of each block right, filling with zero bits. positional arguments: argument A single numeric expression which provides the right argument to the operation, where the left argument is each block in the input data. This argument can also contain a sequence of bytes which is then split into blocks of the same size as the input data and used cyclically. optional arguments: -E, --bigendian Read chunks in big endian. -B, --blocksize N The size of each block in bytes, default is 1. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class shr(BinaryOperation):
    """
    Shift the bits of each block right, filling with zero bits.
    """
    @staticmethod
    def operate(a, b):
        # Returns the shifted value as a new object; the operands are not modified.
        return a >> b

    @staticmethod
    def inplace(a, b):
        # Augmented assignment: the operand is only modified in place if its
        # type implements the in-place shift; nothing is returned.
        a >>= b
class sm4 (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)
-
This unit is implemented in
refinery.units.crypto.cipher.sm4
and has the following commandline Interface:usage: sm4 [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-e] [-S N] key The SM4 symmetric blockcipher algorithm published as GB/T 32907-2016 by the State Cryptography Administration of China (SCA). positional arguments: key The encryption key. optional arguments: -i, --iv IV Specifies the initialization vector. If none is specified, then a block of zero bytes is used. -p, --padding P Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does nothing. By default, all other algorithms are attempted. In most cases, the data was not correctly decrypted if none of these work. -m, --mode M Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB, OFB, PCBC. By default, the CBC mode is used when an IV is provided, and ECB otherwise. -r, --raw Set the padding to raw; ignored when a padding is specified. -e, --little-endian Only for CTR: Use a little endian counter instead of the default big endian. -S, --segment-size N Only for CFB: Number of bits into which data is segmented. It must be a multiple of 8. The default of 0 means that the block size will be used as the segment size. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class sm4(StandardBlockCipherUnit, cipher=BlockCipherFactory(SM4)):
    """
    The SM4 symmetric block cipher, published as GB/T 32907-2016 by the State Cryptography
    Administration of China (SCA).
    """
class snip (slices=[slice(None, None, None)], length=False, stream=False, remove=False)
-
This unit is implemented in
refinery.units.strings.snip
and has the following commandline Interface:usage: snip [-h] [-L] [-Q] [-0] [-v] [-l] [-s] [-r] [slices [slices ...]] Snips the input data based on a Python slice expression. For example, the initialization snip 0::2 1::2 would yield a unit that first extracts every byte at an even position and then, every byte at an odd position. In this case, multiple outputs are produced. The unit can be used in reverse mode, in which case the specified ranges are deleted sequentially from the input. positional arguments: slices Specify start:stop:step in Python slice syntax. optional arguments: -l, --length Interpret the end of a slice as a length rather than as an offset. -s, --stream After each slice, consider only the data that follows after it for subsequent slicing. -r, --remove Remove the slices from the input rather than selecting them. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class snip(Unit):
    """
    Snips the input data based on Python slice expressions. For example, the initialization
    `snip 0::2 1::2` would yield a unit that first extracts every byte at an even position and
    then every byte at an odd position; one output is produced per slice. With the remove
    option, each slice is deleted from the input instead of being selected, and the remaining
    data is emitted.
    """

    def __init__(
        self,
        slices: Arg(help='Specify start:stop:step in Python slice syntax.') = [slice(None, None)],
        length: Arg.Switch('-l', help=(
            'Interpret the end of a slice as a length rather than as an offset.')) = False,
        stream: Arg.Switch('-s', help=(
            'After each slice, consider only the data that follows after it for subsequent '
            'slicing.')) = False,
        remove: Arg.Switch('-r', help=(
            'Remove the slices from the input rather than selecting them.')) = False,
    ):
        super().__init__(slices=slices, length=length, stream=stream, remove=remove)

    def process(self, data: bytearray):
        specs: list[slice] = list(self.args.slices)
        streaming = self.args.stream
        removing = self.args.remove
        as_length = self.args.length
        offset = 0
        view = memoryview(data)
        final = len(specs) - 1

        for position, spec in enumerate(specs):
            start = spec.start or 0
            stop = spec.stop
            if stop is None:
                stop = len(data)
            else:
                # Explicit ends are relative to the stream cursor, and in
                # length mode additionally relative to the slice start.
                stop += offset
                if as_length:
                    stop += start
            window = slice(start + offset, stop, spec.step)
            if streaming:
                offset = stop

            if not removing:
                yield view[window]
                continue

            if position >= final:
                # Last slice: the input buffer itself is reused, which requires
                # releasing the memoryview before the buffer can be resized.
                view.release()
                del view
                remainder = data
            else:
                # Earlier slices are cut from a copy so that the input stays intact.
                remainder = bytearray(data)
            del remainder[window]
            yield remainder
class sorted (key=None, ascending=False)
-
This unit is implemented in
refinery.units.meta.sorted
and has the following commandline Interface:usage: sorted [-h] [-L] [-Q] [-0] [-v] [-a] [key] Sorts all elements of the input frame lexicographically. This unit is a nop on single inputs. positional arguments: key A meta variable expression to sort by instead of sorting the content. optional arguments: -a, --ascending Sort in ascending order, the default is descending. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class sorted(Unit):
    """
    Sorts all elements of the input `refinery.lib.frame` lexicographically.
    This unit is a `refinery.nop` on single inputs.
    """

    def __init__(
        self,
        key: Arg('key', type=str, help='A meta variable expression to sort by instead of sorting the content.') = None,
        ascending: Arg.Switch('-a', help='Sort in ascending order, the default is descending.') = False
    ):
        super().__init__(key=key, ascending=ascending)

    def filter(self, chunks):
        pending = []
        hidden = []
        descending = not self.args.ascending
        sort_key = None
        spec = self.args.key

        if spec is not None:
            evaluate = PythonExpression(spec, all_variables_allowed=True)

            def sort_key(chunk):
                # Order by the evaluated expression first; the chunk itself breaks ties.
                return evaluate(metavars(chunk)), chunk

        def flush():
            # Emit the buffered visible chunks in sorted order, then reset the buffer.
            if pending:
                pending.sort(key=sort_key, reverse=descending)
                yield from pending
                pending.clear()

        for chunk in chunks:
            if not chunk.visible:
                # An invisible chunk ends the current run of visible chunks.
                yield from flush()
                hidden.append(chunk)
                continue
            yield from hidden
            hidden.clear()
            pending.append(chunk)

        yield from hidden
        yield from flush()
class sosemanuk (key, stateful=False, discard=0, nonce=b'')
-
This unit is implemented in
refinery.units.crypto.cipher.sosemanuk
and has the following commandline Interface:usage: sosemanuk [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-d N] key [nonce] positional arguments: key The encryption key. nonce The nonce. Default is empty, which is equivalent to 16 null bytes. optional arguments: -s, --stateful Do not reset the key stream while processing the chunks of one frame. -d, --discard N Discard the first N bytes of the keystream, 0 by default. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class sosemanuk(StreamCipherUnit):
    """
    Sosemanuk stream cipher encryption and decryption.
    """

    def __init__(
        self, key, stateful=False, discard=0,
        nonce: Arg(help='The nonce. Default is empty, which is equivalent to 16 null bytes.') = B'',
    ):
        super().__init__(key=key, nonce=nonce, stateful=stateful, discard=discard)

    def keystream(self):
        # The key stream is produced by the Sosemanuk generator for the configured key and nonce.
        yield from Sosemanuk(self.args.key, self.args.nonce)
class stego (transpose, split=False, parts='RGB')
-
This unit is implemented in
refinery.units.formats.stego
and has the following commandline Interface:usage: stego [-h] [-L] [-Q] [-0] [-v] [-t] [-m] [parts] Decodes the RGBA (red/green/blue/alpha) values of the pixels of a given image file and outputs these values as bytes. By default, the pixels are converted left to right, top to bottom. positional arguments: parts A string containing any ordering of the letters R, G, B, and A (case- insensitive). These pixel components will be extracted from every pixel in the given order. The default value is RGB. optional arguments: -t, --transpose Return the columns of the image rather than the rows. -m, --split Emit the individual rows or columns as separate outputs. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class stego(Unit):
    """
    Decodes the RGBA (red/green/blue/alpha) values of the pixels of a given image file and outputs
    these values as bytes. By default, the pixels are converted left to right, top to bottom.
    """

    def __init__(
        self,
        # The switch defaults to False like the others, so that the unit can be
        # constructed from Python without passing it explicitly.
        transpose: Arg.Switch('-t', help='Return the columns of the image rather than the rows.') = False,
        split: Arg.Switch('-m', help='Emit the individual rows or columns as separate outputs.') = False,
        parts: Arg('parts', nargs='?', type=str, help=(
            'A string containing any ordering of the letters R, G, B, and A (case-insensitive). '
            'These pixel components will be extracted from every pixel in the given order. The '
            'default value is {default}.'
        )) = 'RGB'
    ):
        super().__init__(
            transpose=transpose,
            split=split,
            parts=tuple(Arg.AsOption(p, PIXEL_PART) for p in parts)
        )

    @Unit.Requires('Pillow', 'formats')
    def _image():
        from PIL import Image
        return Image

    def process(self, data):
        split = self.args.split
        parts = self.args.parts
        image = self._image.open(MemoryFile(data))
        if self.args.transpose:
            image = image.transpose(self._image.Transpose.ROTATE_90)
        width, height = image.size
        chunk_size = len(parts)
        output = MemoryFile()
        # One reusable buffer holding the selected components of a single pixel row.
        buffer = bytearray(chunk_size * width)
        for y in range(height):
            offset = 0
            for x in range(width):
                pixel = image.getpixel((x, y))
                next_offset = offset + chunk_size
                buffer[offset:next_offset] = (pixel[p] for p in parts)
                offset = next_offset
            if split:
                # Yield a copy: the row buffer is overwritten on the next iteration, so
                # handing out the same object would corrupt rows the consumer still holds.
                yield bytearray(buffer)
            else:
                output.write(buffer)
        if not split:
            yield output.getvalue()
class stretch (*count)
-
This unit is implemented in
refinery.units.strings.stretch
and has the following commandline Interface:usage: stretch [-h] [-L] [-Q] [-0] [-v] [-R] [count [count ...]] Stretch the input data by repeating every byte a number of times. positional arguments: count The number of times every byte should be repeated. By default, every byte is repeated once. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class stretch(Unit):
    """
    Stretch the input data by repeating every byte a number of times.
    """

    def __init__(self, *count: Arg.Number(metavar='count', help=(
        'The number of times every byte should be repeated. By default, '
        'every byte is repeated once.'
    ))):
        # Without any explicit factor, every byte is emitted twice.
        count = count or (2,)
        if any(k <= 0 for k in count):
            raise ValueError('You can not use a stretching factor of less than 1.')
        super().__init__(count=count)

    def process(self, data):
        # The factors are applied cyclically, one factor per input byte.
        factors = cycle(self.args.count)
        return bytearray(
            byte
            for byte, factor in zip(data, factors)
            for _ in range(factor)
        )

    def reverse(self, data):
        # One-sided inverse: keep the first byte of every group and drop the rest.
        def clinched(it):
            for factor in cycle(self.args.count):
                group = islice(it, factor)
                try:
                    head = next(group)
                except StopIteration:
                    return
                yield head
                for _ in group:
                    pass
        return bytearray(clinched(iter(data)))
class struct (spec, *outputs, multi=False, count=∞, until=None, field=None, more=False)
-
This unit is implemented in
refinery.units.pattern.struct_parser
and has the following commandline Interface:usage: struct [-h] [-L] [-Q] [-0] [-v] [-m] [-n N] [-u E] [-f STR] [-M] spec [output [output ...]] Read structured data from the beginning of a chunk and store the extracted fields in chunk meta variables. The structure format is specified in extended Python struct format, and all remaining arguments to this unit are the names of the variables that receive the values from this struct. The extended struct format supports all field types supported by Python, as well as the following: - a for null-terminated ASCII strings, - u to read encoded, null-terminated UTF16 strings, - w to read decoded, null-terminated UTF16 strings, - g to read Microsoft GUID values, - E to read 7-bit encoded integers. For example, the string LLxxHaa will read two unsigned 32bit integers, then skip two bytes, then read one unsigned 16bit integer, then two null-terminated ASCII strings. The unit defaults to using native byte order with no alignment. The spec parameter may additionally contain format expressions of the following form: {name[!alignment]:format} The alignment parameter is optional. It must be an expression that evaluates to an integer value. The current data pointer is aligned to a multiple of this value before reading the field. The format can either be an integer expression specifying a number of bytes to read, or any format string. If name is specified for an extracted field, its value is made available as a meta variable under the given name. For example, the expression LLxxH{foo:a}{bar:a} would be parsed in the same way as the previous example, but the two ASCII strings would also be stored in meta variables under the names foo and bar, respectively. The format string of a named field is itself parsed as a format string expression, where all the previously parsed fields are already available. For example, I{:{}} reads a single 32-bit integer length prefix and then reads as many bytes as that prefix specifies. 
A second format string expression is used to specify the output format. For example, the format string LLxxH{foo:a}{bar:a} together with the output format {foo}/{bar} would parse data as before, but the output body would be the concatenation of the field foo, a forward slash, and the field bar. Variables used in the output expression are not included as meta variables. As format fields in the output expression, one can also use {1}, {2} or {-1} to access extracted fields by index. The value {0} represents the entire chunk of structured data. By default, the output format {#} is used, which represents either the last byte string field that was extracted, or the entire chunk of structured data if none of the fields were extracted. Reverse multibin expressions can be used to post-process the fields included in any output format. For example, {F:b64:zl} will be the base64-decoded and inflate-decompressed contents of the data that was read as field F. Finally, it is possible to specify a byte alignment by using the syntax {field!T:a:b:c} where the letter T is either a single digit specifying the alignment, or a single letter variable that holds the byte alignment value in the current metadata. positional arguments: spec Structure format as explained above. output Output format as explained above. optional arguments: -m, --multi Read as many pieces of structured data as possible instead of just one. -n, --count N A limit on the number of chunks to read in multi mode; default is ∞. -u, --until E An expression evaluated on each chunk in multi mode. New chunks will be parsed only if the result is nonzero. -f, --field STR Optionally specify a format string expression to auto-name extracted fields without a given name based on their position. -M, --more After parsing the struct, emit one chunk that contains the data that was left over in the buffer. If no data was left over, this chunk will be empty. generic options: -h, --help Show this help message and exit. 
-L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class struct(Unit):
    """
    Read structured data from the beginning of a chunk and store the extracted fields in chunk meta
    variables. The structure format is specified in extended Python struct format, and all
    remaining arguments to this unit are the names of the variables that receive the values from
    this struct. The extended struct format supports all field types supported by Python, as well
    as the following:

    - `a` for null-terminated ASCII strings,
    - `u` to read encoded, null-terminated UTF16 strings,
    - `w` to read decoded, null-terminated UTF16 strings,
    - `g` to read Microsoft GUID values,
    - `E` to read 7-bit encoded integers.

    For example, the string `LLxxHaa` will read two unsigned 32bit integers, then skip two bytes,
    then read one unsigned 16bit integer, then two null-terminated ASCII strings. The unit defaults
    to using native byte order with no alignment. The `spec` parameter may additionally contain
    format expressions of the following form:

        {name[!alignment]:format}

    The `alignment` parameter is optional. It must be an expression that evaluates to an integer
    value. The current data pointer is aligned to a multiple of this value before reading the
    field. The `format` can either be an integer expression specifying a number of bytes to read,
    or any format string. If `name` is specified for an extracted field, its value is made
    available as a meta variable under the given name. For example, the expression
    `LLxxH{foo:a}{bar:a}` would be parsed in the same way as the previous example, but the two
    ASCII strings would also be stored in meta variables under the names `foo` and `bar`,
    respectively. The `format` string of a named field is itself parsed as a format string
    expression, where all the previously parsed fields are already available. For example,
    `I{:{}}` reads a single 32-bit integer length prefix and then reads as many bytes as that
    prefix specifies.

    A second format string expression is used to specify the output format. For example, the
    format string `LLxxH{foo:a}{bar:a}` together with the output format `{foo}/{bar}` would parse
    data as before, but the output body would be the concatenation of the field `foo`, a forward
    slash, and the field `bar`. Variables used in the output expression are not included as meta
    variables. As format fields in the output expression, one can also use `{1}`, `{2}` or `{-1}`
    to access extracted fields by index. The value `{0}` represents the entire chunk of structured
    data. By default, the output format `{#}` is used, which represents either the last byte
    string field that was extracted, or the entire chunk of structured data if none of the fields
    were extracted.

    Reverse `refinery.lib.argformats.multibin` expressions can be used to post-process the fields
    included in any output format. For example, `{F:b64:zl}` will be the base64-decoded and
    inflate-decompressed contents of the data that was read as field `F`.

    Finally, it is possible to specify a byte alignment by using the syntax `{field!T:a:b:c}`
    where the letter `T` is either a single digit specifying the alignment, or a single letter
    variable that holds the byte alignment value in the current metadata.
    """

    def __init__(
        self,
        spec: Arg(type=str, help='Structure format as explained above.'),
        *outputs: Arg(metavar='output', type=str, help='Output format as explained above.'),
        multi: Arg.Switch('-m', help=(
            'Read as many pieces of structured data as possible instead of just one.')) = False,
        count: Arg.Number('-n', help=(
            'A limit on the number of chunks to read in multi mode; default is {default}.')) = INF,
        until: Arg('-u', metavar='E', type=str, help=(
            'An expression evaluated on each chunk in multi mode. New chunks will be parsed '
            'only if the result is nonzero.')) = None,
        field: Arg.String('-f', help=(
            'Optionally specify a format string expression to auto-name extracted fields without a '
            'given name based on their position.')) = None,
        more : Arg.Switch('-M', help=(
            'After parsing the struct, emit one chunk that contains the data that was left '
            'over in the buffer. If no data was left over, this chunk will be empty.')) = False
    ):
        # Without an explicit output template, the default output format {#} is used.
        outputs = outputs or [F'{{{_SHARP}}}']
        super().__init__(spec=spec, outputs=outputs, until=until, field=field, count=count, multi=multi, more=more)

    def process(self, data: Chunk):
        formatter = string.Formatter()
        until = self.args.until
        until = until and PythonExpression(until, all_variables_allowed=True)
        reader = StructReader(memoryview(data))
        checkpoint = 0
        mainspec = self.args.spec
        # A leading byte order character applies to the whole spec; it is stripped
        # here and re-applied to every partial format by fixorder.
        byteorder = mainspec[:1]
        if byteorder in '<@=!>':
            mainspec = mainspec[1:]
        else:
            byteorder = '='

        def fixorder(spec):
            if spec[0] not in '<@=!>':
                spec = byteorder + spec
            return spec

        previously_existing_variables = set(metavars(data).variable_names())

        it = itertools.count() if self.args.multi else (0,)
        for index in it:

            # Remember where this struct starts; everything from here on counts
            # as leftover data if parsing stops.
            checkpoint = reader.tell()

            if reader.eof:
                break
            if index >= self.args.count:
                break

            meta = metavars(data)
            meta.ghost = True
            meta.update_index(index)

            args = []
            last = None

            self.log_debug(F'starting new read at: 0x{checkpoint:08X}')

            try:
                for prefix, name, spec, conversion in formatter.parse(mainspec):
                    name: str
                    spec: str = spec and spec.strip()
                    if prefix:
                        # Literal text between format fields is a plain struct format.
                        fields = reader.read_struct(fixorder(prefix))
                        if fmt := self.args.field:
                            for k, field in enumerate(fields, len(args)):
                                meta[fmt.format(k)] = field
                        args.extend(fields)
                    if name is None:
                        continue
                    if name and not name.isdecimal():
                        check_variable_name(name)
                    if conversion:
                        # The conversion slot of the format field carries the alignment.
                        _aa = reader.tell()
                        reader.byte_align(PythonExpression.Evaluate(conversion, meta))
                        _ab = reader.tell()
                        if _aa != _ab:
                            self.log_info(F'aligned from 0x{_aa:X} to 0x{_ab:X}')
                    spec, _, pipeline = spec.partition(':')
                    if spec:
                        spec = meta.format_str(spec, self.codec, args)
                    if spec:
                        # The spec is either an integer expression (byte count) or,
                        # if it does not parse as an expression, a struct format.
                        try:
                            _exp = PythonExpression.Evaluate(spec, meta)
                        except ParserError:
                            pass
                        else:
                            spec = _exp
                    if spec == '':
                        last = value = reader.read()
                    elif isinstance(spec, int):
                        if spec < 0:
                            spec += reader.remaining_bytes
                        if spec < 0:
                            raise ValueError(F'The specified negative read offset is {-spec} beyond the cursor.')
                        last = value = reader.read_bytes(spec)
                    else:
                        value = reader.read_struct(fixorder(spec))
                        if not value:
                            self.log_debug(F'field {name} was empty, ignoring.')
                            continue
                        if len(value) > 1:
                            self.log_info(F'parsing field {name} produced {len(value)} items reading a tuple')
                        else:
                            value = value[0]

                    if pipeline:
                        value = numseq(pipeline, reverse=True, seed=value)
                    args.append(value)

                    if name == _SHARP:
                        raise ValueError('Extracting a field with name # is forbidden.')
                    elif name.isdecimal():
                        # A separate name is used here on purpose: reusing `index` would
                        # overwrite the loop counter that set_next_batch receives below.
                        slot = int(name)
                        limit = len(args) - 1
                        if slot > limit:
                            self.log_warn(F'cannot assign index field {name}, the highest index is {limit}')
                        else:
                            args[slot] = value
                        continue
                    elif name:
                        meta[name] = value

                if until and until(meta):
                    self.log_info(F'the expression ({until}) evaluated to true; aborting.')
                    break

                # Re-read everything consumed since the checkpoint as the full struct data.
                with StreamDetour(reader, checkpoint) as detour:
                    full = reader.read(detour.cursor - checkpoint)
                if last is None:
                    last = full

                outputs = []
                symbols = dict(meta)
                symbols[_SHARP] = last

                for template in self.args.outputs:
                    used = set()
                    outputs.append(meta.format(template, self.codec, [full, *args], symbols, True, used=used))
                    # Variables consumed by the output template are not exported,
                    # unless they already existed on the input chunk.
                    for key in used:
                        if key in previously_existing_variables:
                            continue
                        meta.discard(key)

                for output in outputs:
                    chunk = Chunk(output)
                    chunk.meta.update(meta)
                    chunk.set_next_batch(index)
                    yield chunk

            except EOFError:
                break

        leftover = len(reader) - checkpoint

        if not leftover:
            return
        elif self.args.more:
            reader.seekset(checkpoint)
            yield reader.read()
        else:
            leftover = repr(SizeInt(leftover)).strip()
            self.log_info(F'discarding {leftover} left in buffer')
class sub (argument, bigendian=False, blocksize=None)
-
This unit is implemented in
refinery.units.blockwise.sub
and has the following commandline Interface:usage: sub [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] argument Subtract the given argument from each block. positional arguments: argument A single numeric expression which provides the right argument to the operation, where the left argument is each block in the input data. This argument can also contain a sequence of bytes which is then split into blocks of the same size as the input data and used cyclically. optional arguments: -E, --bigendian Read chunks in big endian. -B, --blocksize N The size of each block in bytes, default is 1. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class sub(BinaryOperationWithAutoBlockAdjustment):
    """
    Subtract the given argument from every block.
    """

    @staticmethod
    def operate(a, b):
        # Out-of-place variant: the difference is returned to the caller.
        return a - b

    @staticmethod
    def inplace(a, b):
        # Augmented assignment: mutates `a` when its type supports in-place
        # subtraction; nothing is returned.
        a -= b
class subfiles (memdump=False, recursive=False)
-
This unit is implemented in
refinery.units.pattern.subfiles
and has the following commandline Interface:usage: subfiles [-h] [-L] [-Q] [-0] [-v] [-m] [-r] Deploys carvers for ZIP, 7-Zip, PE-File, Windows Shortcuts (LNK files), JSON and XML documents against the input data and generates one output chunk for each successfully carved subfile. optional arguments: -m, --memdump Assume that the input is a memdump for PE file carving. -r, --recursive Extract files that are subfiles of other extracted files as separate chunks. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class subfiles(Unit):
    """
    Deploys carvers for ZIP, 7-Zip, PE-File, Windows Shortcuts (LNK files), JSON, XML and RTF
    documents against the input data and generates one output chunk for each successfully carved
    subfile.
    """

    # Minimum size in bytes that a carved chunk must have to be emitted, per
    # carver; carvers not listed here use a minimum of 1.
    _MINLENGTH = {
        'json': 300,
        'xml' : 300,
        'rtf' : 100,
    }

    def __init__(
        self,
        memdump  : Unit.Arg.Switch('-m', help='Assume that the input is a memdump for PE file carving.') = False,
        recursive: Unit.Arg.Switch('-r', help='Extract files that are subfiles of other extracted files as separate chunks.') = False,
    ):
        super().__init__(memdump=memdump, recursive=recursive)

    def process(self, data: bytearray):
        carvers = {
            'zip'  : carve_zip(),
            '7z'   : carve_7z(),
            'pe'   : carve_pe(memdump=self.args.memdump, fileinfo=True, recursive=True, keep_root=True),
            'lnk'  : carve_lnk(),
            'json' : carve_json(dictonly=True),
            'xml'  : carve_xml(),
            'rtf'  : carve_rtf(),
        }

        # Byte ranges of chunks that were already emitted; only tracked when
        # nested subfiles are not requested as separate outputs.
        covered = []

        for extension, unit in carvers.items():
            self.log_info(F'carving {extension} files')
            minimum = self._MINLENGTH.get(extension, 1)
            for chunk in data | unit:
                size = len(chunk)
                if size < minimum:
                    continue
                start = chunk['offset']
                end = start + size
                # Skip anything that lies strictly inside an already emitted range.
                nested = any(left < start and end < right for left, right in covered)
                if nested:
                    continue
                if not self.args.recursive:
                    covered.append((start, end))
                yield chunk
class swap (src, dst=None)
-
This unit is implemented in
refinery.units.meta.swap
and has the following commandline Interface:usage: swap [-h] [-L] [-Q] [-0] [-v] src [dst] Swap the contents of an existing variable with the contents of the chunk or with another meta variable. When swapping with the chunk, the variable has to contain a binary string. When swapping with a variable that does not exist, the original variable is cleared, essentially renaming the variable. positional arguments: src The meta variable name. dst Optional name of the second meta variable. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class swap(Unit):
    """
    Swap the contents of an existing variable with the contents of the chunk or with another meta
    variable. When swapping with the chunk, the variable has to contain a binary string. When
    swapping with a variable that does not exist, the original variable is cleared, essentially
    renaming the variable.
    """

    def __init__(
        self,
        src: Arg(type=str, help='The meta variable name.'),
        dst: Arg(type=str, help='Optional name of the second meta variable.') = None
    ):
        super().__init__(
            src=check_variable_name(src),
            dst=check_variable_name(dst)
        )

    def filter(self, chunks: Iterable[Chunk]):
        src = self.args.src
        dst = self.args.dst

        for chunk in chunks:
            # Invisible chunks pass through untouched.
            if not chunk.visible:
                yield chunk
                continue

            if dst is not None:
                # Variable-to-variable swap; the source has to exist.
                try:
                    value = chunk.meta.pop(src)
                except KeyError:
                    raise KeyError(F'The variable {src} does not exist.')
                try:
                    other = chunk.meta.pop(dst)
                except KeyError:
                    # No destination variable: this amounts to renaming src to dst.
                    chunk.meta[dst] = value
                else:
                    chunk.meta[src] = other
                    chunk.meta[dst] = value
                yield chunk
                continue

            # Variable-to-body swap; a missing variable counts as empty data.
            try:
                value = chunk.meta[src]
            except KeyError:
                value = bytearray()
            if isinstance(value, str):
                value = value.encode(self.codec)
            elif not isbuffer(value):
                raise ValueError(F'Unable to swap data with variable {src} because it has type {type(value).__name__}.')
            if chunk:
                chunk.meta[src] = bytes(chunk)
            else:
                # An empty body leaves no variable behind.
                chunk.meta.discard(src)
            chunk[:] = value
            yield chunk
class szdd
-
This unit is implemented in
refinery.units.compression.szdd
and has the following commandline Interface:usage: szdd [-h] [-L] [-Q] [-0] [-v] [-F] Extract files from SZDD archives. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class szdd(Unit):
    """
    Extract files from SZDD archives.
    """

    def process(self, data):
        with StructReader(data) as archive:
            signature = archive.read(8)
            if signature != b'SZDD\x88\xF0\x27\x33':
                if not self.args.lenient:
                    raise ValueError('signature missing')
                self.log_warn('the header signature is invalid, this is likely not an SZDD archive')
            mode = archive.read_byte()
            if mode != 0x41:
                raise ValueError('Unsupported compression mode')
            # ignore the missing file extension letter:
            archive.seekrel(1)
            output_len = archive.u32()
            output = bytearray(output_len)
            # The sliding window has 0x1000 bytes and starts out filled with spaces.
            window = bytearray(B'\x20' * 0x1000)
            window_pos = 0x1000 - 0x10
            output_pos = 0
            while not archive.eof:
                control = archive.read_byte()
                # Each control bit, lowest first, selects a literal byte (1) or
                # a back reference into the window (0).
                for bit in range(8):
                    if archive.eof:
                        break
                    if (control >> bit) & 1:
                        literal = archive.read_byte()
                        output[output_pos] = literal
                        window[window_pos] = literal
                        output_pos += 1
                        window_pos = (window_pos + 1) & 0xFFF
                        continue
                    lower = archive.read_byte()
                    upper = archive.read_byte()
                    # 12-bit window offset: the low byte plus the high nibble of
                    # the second byte; its low nibble encodes the length minus 3.
                    match_pos = (lower | ((upper & 0xF0) << 4)) & 0xFFF
                    match_len = (upper & 0x0F) + 3
                    for _ in range(match_len):
                        copied = window[match_pos]
                        window[window_pos] = copied
                        output[output_pos] = copied
                        output_pos += 1
                        window_pos = (window_pos + 1) & 0xFFF
                        match_pos = (match_pos + 1) & 0xFFF
            return output

    @classmethod
    def handles(cls, data: bytearray):
        # Only the first four signature bytes are checked here.
        return data[:4] == B'SZDD'
class tea (key, iv=b'', padding=None, mode=None, raw=False, swap=False)
-
This unit is implemented in
refinery.units.crypto.cipher.tea
and has the following commandline Interface:usage: tea [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-s] key TEA encryption and decryption. positional arguments: key The encryption key. optional arguments: -i, --iv IV Specifies the initialization vector. If none is specified, then a block of zero bytes is used. -p, --padding P Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does nothing. By default, all other algorithms are attempted. In most cases, the data was not correctly decrypted if none of these work. -m, --mode M Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB, OFB, PCBC. By default, the CBC mode is used when an IV is provided, and ECB otherwise. -r, --raw Set the padding to raw; ignored when a padding is specified. -s, --swap Decode blocks as big endian rather than little endian. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class tea(TEAUnit, cipher=BlockCipherFactory(TEA)):
    """
    Encryption and decryption with the TEA block cipher.
    """
class termfit (width=0, delta=0, tight=False)
-
This unit is implemented in
refinery.units.strings.termfit
and has the following commandline Interface:usage: termfit [-h] [-L] [-Q] [-0] [-v] [-d N] [-t] [width] Reformat incoming text data to fit a certain width. positional arguments: width Optionally specify the width, by default the current terminal width is used. optional arguments: -d, --delta N Subtract this number from the calculated width (0 by default). -t, --tight Separate paragraphs by a single line break instead of two. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class termfit(Unit):
    """
    Reformat incoming text data so that it fits a certain width.
    """

    def __init__(
        self,
        # NOTE(review): unlike delta, width is declared with plain Arg rather than
        # Arg.Number - confirm that command-line values arrive here as integers.
        width: Arg('width', help='Optionally specify the width, by default the current terminal width is used.') = 0,
        delta: Arg.Number('-d', help='Subtract this number from the calculated width (0 by default).') = 0,
        tight: Arg.Switch('-t', help='Separate paragraphs by a single line break instead of two.') = False,
    ):
        super().__init__(width=width, delta=delta, tight=tight)

    @unicoded
    def process(self, data: str) -> str:
        # Paragraphs are joined by one line break in tight mode and by two otherwise.
        if self.args.tight:
            parsep = '\n'
        else:
            parsep = '\n\n'
        return terminalfit(data, self.args.delta, self.args.width, parsep)
class terminate (sentinel=b'\x00', blocksize=None, bigendian=False)
-
This unit is implemented in
refinery.units.blockwise.terminate
and has the following commandline Interface:usage: terminate [-h] [-L] [-Q] [-0] [-v] [-R] [-B N] [-E] [sentinel] The unit reads data from the incoming chunk in blocks of any given size until the sentinel value is encountered. The output of the unit is all data that was read, excluding the sentinel. The default block size is one and the default sentinel value is zero, which corresponds to reading a null-terminated string from the input. If the sentinel value is not found anywhere in the incoming data, the complete input is returned as output. positional arguments: sentinel sentinel value to look for; default is H:00 optional arguments: -B, --blocksize N The size of each block in bytes, default is 1. -E, --bigendian Read chunks in big endian. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class terminate(BlockTransformationBase):
    """
    The unit reads data from the incoming chunk in blocks of any given size until the sentinel value
    is encountered. The output of the unit is all data that was read, excluding the sentinel. The
    default block size is one and the default sentinel value is zero, which corresponds to reading a
    null-terminated string from the input. If the sentinel value is not found anywhere in the
    incoming data, the complete input is returned as output.
    """

    def __init__(
        self,
        sentinel: Arg(help='sentinel value to look for; default is {default}') = B'\0',
        blocksize=None,
        bigendian=False
    ):
        super().__init__(blocksize=blocksize, bigendian=bigendian, sentinel=sentinel)

    def process(self, data: bytearray):
        # Cut the input at the first sentinel that starts on a block boundary.
        sentinel = self.args.sentinel
        blocksize = self.blocksize
        self.log_info('blocksize:', blocksize)
        self.log_debug('separator:', sentinel)
        offset = 0
        while True:
            offset = data.find(sentinel, offset)
            if offset < 0:
                self.log_info(F'The sentinel value {sentinel} was not found.')
                return data
            misalignment = offset % blocksize
            if not misalignment:
                del data[offset:]
                return data
            # The hit is not block-aligned: resume the search at the next block boundary.
            offset += blocksize - misalignment

    def reverse(self, data: bytearray):
        # Append the sentinel unless a block-aligned occurrence is already present.
        sentinel = self.args.sentinel
        offset = data.find(sentinel)
        while offset >= 0:
            if offset % self.blocksize == 0:
                self.log_warn('input string already contains the termination character; returning unmodified input')
                return data
            offset = data.find(sentinel, offset + 1)
        data.extend(sentinel)
        return data
class tnetmtm (headers_as_meta_vars, list_header_names, header_filter)
-
This unit is implemented in
refinery.units.formats.tnetmtm
and has the following commandline Interface:usage: tnetmtm [-h] [-L] [-Q] [-0] [-v] [--populate-headers] [--list-header-names] [--header-filter HEADER_FILTER] Parses out payloads from tnetstring files generated by mitmproxy. The unit is also able to populate HTTP headers as meta variables or emitting header values instead of actual payloads. optional arguments: --populate-headers, -p --list-header-names, -l --header-filter, -f HEADER_FILTER generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class tnetmtm(Unit):
    """
    Parses out payloads from tnetstring files generated by mitmproxy. The unit is also able to
    populate HTTP headers as meta variables or emitting header values instead of actual payloads.
    """

    def __init__(
        self,
        headers_as_meta_vars: Arg.Switch('--populate-headers', '-p'),
        list_header_names: Arg.Switch('--list-header-names', '-l'),
        header_filter: Arg('--header-filter', '-f'),
    ):
        # NOTE(review): the body is empty and does not call super().__init__ explicitly;
        # presumably the Unit machinery forwards the arguments - confirm.
        ...

    @Unit.Requires('mitmproxy', 'all')
    def _tnetstring():
        # The import is deferred until the dependency is actually requested.
        from mitmproxy.io import tnetstring
        return tnetstring

    @staticmethod
    def _generate_errors(log_line: Dict) -> Iterator[str]:
        # Yields the error message stored under log_line['client_conn'], if any, followed by
        # the error message stored directly on the log line, if any.
        def _extract_error(d: Optional[Dict]) -> Optional[str]:
            # Returns d['error']['msg'], tolerating a missing or empty level at each step.
            return ((d or {}).get('error') or {}).get('msg')
        proxy_error = _extract_error(log_line.get('client_conn'))
        if proxy_error:
            yield proxy_error
        error = _extract_error(log_line)
        if error:
            yield error
        return error

    def _default_meta_vars(self, log_line, request: Dict, response: Dict) -> Dict[str, Union[str, int]]:
        # Builds the meta variables attached to payload chunks when headers are not populated.
        # NOTE(review): 'method', 'scheme' and 'path' are decoded unconditionally, so this
        # raises when one of them is absent from the request - confirm that cannot happen.
        ret = {
            'request_method': request.get('method').decode('utf-8'),
            'request_scheme': request.get('scheme').decode('utf-8'),
            'request_host': request.get('host'),
            'request_query_string': request.get('path').decode('utf-8'),
            'request_header_count': len(request.get('headers', [])),
            'response_status_code': response.get('status_code'),
            'response_header_count': len(response.get('headers', [])),
        }
        # Errors are stored under numbered keys: error_0, error_1, ...
        for num, error in enumerate(self._generate_errors(log_line)):
            ret[f'error_{num}'] = error
        request_http_version = request.get('http_version')
        if request_http_version:
            ret['request_http_version'] = request_http_version.decode('utf-8')
        response_http_version = response.get('http_version')
        if response_http_version:
            ret['response_http_version'] = response_http_version.decode('utf-8')
        return ret

    @staticmethod
    def _output_type(args) -> OutputType:
        # Listing header names takes precedence over filtering for a header value; payloads
        # are emitted when neither option is set.
        if args.list_header_names:
            return OutputType.header_names
        if args.header_filter:
            return OutputType.header_value
        return OutputType.payloads

    def process(self, data: bytearray):
        args = self.args
        tnetstring = self._tnetstring
        output_type = self._output_type(args)
        with io.BytesIO(data) as fp:
            # Entries are loaded one after another until a ValueError ends the loop; note
            # that the try block spans the whole loop body, not only the load call.
            while True:
                try:
                    log_line = tnetstring.load(fp)
                    request = log_line.get('request') or {}
                    response = log_line.get('response') or {}
                    # Default meta variables are only computed when headers are NOT
                    # populated as meta variables.
                    labels = {} if args.headers_as_meta_vars else self._default_meta_vars(log_line, request, response)
                    for header_name, header_value in request.get('headers', []) + response.get('headers', []):
                        if output_type == OutputType.header_names:
                            yield header_name
                        if output_type == OutputType.header_value:
                            if header_name == args.header_filter:
                                yield header_value
                        if args.headers_as_meta_vars:
                            # Hyphens are removed from the header name to form the variable name.
                            labels[header_name.decode('utf-8').replace('-', '')] = header_value
                    if output_type == OutputType.payloads:
                        yield self.labelled(response.get('content'), **labels)
                except ValueError:
                    break
class transpose (padding=b'')
-
This unit is implemented in
refinery.units.meta.transpose
and has the following commandline Interface:usage: transpose [-h] [-L] [-Q] [-0] [-v] [padding] Interprets the chunks in the current frame as rows of a matrix and yields the columns of that matrix. When chunks are not of even length, the matrix is considered to have empty entries in some positions. Optionally, a padding sequence can be provided to pad all rows to the same length. positional arguments: padding Optional byte sequence to use as padding for incomplete rows. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class transpose(Unit):
    """
    Interprets the chunks in the current frame as rows of a matrix and yields the columns of that
    matrix. When chunks are not of even length, the matrix is considered to have empty entries in
    some positions. Optionally, a padding sequence can be provided to pad all rows to the same
    length.
    """

    @Unit.Requires('numpy', 'speed', 'default', 'extended')
    def _numpy():
        import numpy
        return numpy

    def __init__(
        self,
        padding: Arg(help='Optional byte sequence to use as padding for incomplete rows.') = B'',
    ):
        super().__init__(bigendian=False, padding=padding)

    def filter(self, chunks: Iterable[Chunk]):
        # Invisible chunks pass through untouched; all visible chunks are collected and
        # attached to the first of them, which is the only visible chunk emitted.
        visible = []
        for chunk in chunks:
            if chunk.visible:
                visible.append(chunk)
            else:
                yield chunk
        if visible:
            carrier = visible[0]
            carrier.temp = visible
            yield carrier

    def process(self, data: Chunk):
        rows: List[Chunk] = data.temp
        if not rows:
            return
        sizes = [len(row) for row in rows]
        shortest = min(sizes)
        longest = max(sizes)
        padding = self.args.padding
        if padding:
            # Grow every row to at least the longest length, then cut off any overshoot.
            for row in rows:
                while len(row) < longest:
                    row.extend(padding)
                del row[longest:]
        if shortest > 0:
            try:
                np = self._numpy
            except ImportError:
                pass
            else:
                # The leading columns shared by all rows form a rectangle that numpy can
                # transpose; the remaining tails are handled by the generic loop below.
                tails = [row[shortest:] for row in rows if len(row) > shortest]
                for row in rows:
                    del row[shortest:]
                for column in np.array(rows, dtype=np.uint8).transpose():
                    yield column.tobytes('C')
                longest = longest - shortest
                rows = tails
        for index in range(longest):
            yield bytes(row[index] for row in rows if len(row) > index)
class trim (*junk, unpad=False, left=True, right=True, nocase=False)
-
This unit is implemented in
refinery.units.strings.trim
and has the following commandline Interface:usage: trim [-h] [-L] [-Q] [-0] [-v] [-u] [-r | -l] [-i] [junk [junk ...]] Removes byte sequences at beginning and end of input data. positional arguments: junk Binary strings to be removed, default are all whitespace characters. optional arguments: -u, --unpad Also trim partial occurrences of the junk string. -r, --right-only Do not trim left. -l, --left-only Do not trim right. -i, --nocase Ignore capitalization for alphabetic characters. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class trim(Unit):
    """
    Removes byte sequences at beginning and end of input data.
    """

    def __init__(
        self, *junk: Arg(help='Binary strings to be removed, default are all whitespace characters.'),
        unpad: Arg.Switch('-u', help='Also trim partial occurrences of the junk string.') = False,
        left: Arg.Switch('-r', '--right-only', group='SIDE', help='Do not trim left.') = True,
        right: Arg.Switch('-l', '--left-only', group='SIDE', help='Do not trim right.') = True,
        nocase: Arg.Switch('-i', help='Ignore capitalization for alphabetic characters.') = False,
    ):
        super().__init__(junk=junk, left=left, right=right, unpad=unpad, nocase=nocase)

    def _trimfast(self, view: memoryview, *junks: bytes, right=False) -> int:
        """
        Return the number of leading bytes of `view` that consist of the given junk strings.
        For trimming on the right, the caller passes the reversed data and reversed junk
        strings and sets `right`. Every junk string must be non-empty.
        """
        done = False
        pos = 0
        while not done:
            done = True
            for junk in junks:
                temp = junk
                size = len(junk)
                if right and self.args.unpad:
                    # On the (reversed) right side, a partial junk string is cut off at its
                    # other end, so suffixes of the junk string are tested here.
                    for k in range(size):
                        n = size - k
                        if view[pos:pos + n] == junk[k:]:
                            pos += n
                            done = False
                            break
                if view[pos:pos + size] == temp:
                    # Keep doubling the pattern while the doubled pattern still matches, then
                    # consume matching runs of decreasing size. This skips long junk runs
                    # without comparing one occurrence at a time.
                    m = len(temp)
                    while True:
                        mm = m << 1
                        if view[pos + m:pos + mm] != temp:
                            break
                        temp += temp
                        m = mm
                    temp = memoryview(temp)
                    while m >= size:
                        if view[pos:pos + m] == temp[:m]:
                            done = False
                            pos += m
                        m //= 2
                if right or not self.args.unpad:
                    continue
                # Left side with unpad: also remove the longest matching prefix of the junk.
                while size > 0:
                    if view[pos:pos + size] == temp[:size]:
                        done = False
                        pos += size
                        break
                    size -= 1
        return pos

    def process(self, data: bytearray):
        junk = list(self.args.junk)
        if not junk:
            import string
            # One single-byte junk string per whitespace character; this includes the last
            # character of string.whitespace, the form feed.
            junk = [bytes((char,)) for char in string.whitespace.encode('ascii')]
        else:
            # An empty junk string matches everywhere without consuming any input and would
            # keep the doubling loop in _trimfast spinning forever.
            junk = [j for j in junk if j]
        lpos = 0
        rpos = 0
        if self.args.nocase:
            # Matching is done on a lowercase copy; the offsets are applied to the original.
            work = data.lower()
            junk = [j.lower() for j in junk]
        else:
            work = data
        if self.args.left:
            lpos = self._trimfast(memoryview(work), *junk)
        if self.args.right:
            # The right side is trimmed by scanning the reversed buffer from the left; the
            # buffer is restored afterwards because it may be the input itself.
            work.reverse()
            junk = [bytes(reversed(j)) for j in junk]
            rpos = self._trimfast(memoryview(work), *junk, right=True)
            work.reverse()
        view = memoryview(data)
        if lpos:
            view = view[+lpos:]
        if rpos:
            view = view[:-rpos]
        return view
class u16
-
This unit is implemented in
refinery.units.encoding.u16
and has the following commandline Interface:usage: u16 [-h] [-L] [-Q] [-0] [-v] [-R] [-F] Encodes and decodes UTF-16 encoded string data. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class u16(Unit):
    """
    Encodes and decodes UTF-16 encoded string data.
    """

    def reverse(self, data):
        # Unit codec -> UTF-16 little endian.
        text = data.decode(self.codec)
        return text.encode('utf-16LE')

    def process(self, data):
        # UTF-16 -> unit codec.
        text = data.decode('utf-16')
        return text.encode(self.codec)

    @classmethod
    def handles(cls, data: bytearray):
        # The input is accepted when every byte at the even offsets, or every byte at the odd
        # offsets, is zero. Nothing is returned otherwise.
        octets = memoryview(data)
        for parity in (0, 1):
            if not any(octets[parity::2]):
                return True
class ucrypt (size=13, salt=b'AA')
-
This unit is implemented in
refinery.units.crypto.keyderive.unixcrypt
and has the following commandline Interface:usage: ucrypt [-h] [-L] [-Q] [-0] [-v] [size] [salt] Implements the classic Unix crypt algorithm. positional arguments: size The number of bytes to generate, default is 13. salt Salt for the derivation, the default is "AA". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class ucrypt(KeyDerivation):
    """
    Implements the classic Unix crypt algorithm.
    """

    def __init__(
        self,
        size: Arg(help='The number of bytes to generate, default is 13.') = 13,
        salt: Arg(help='Salt for the derivation, the default is "AA".') = B'AA'
    ):
        super().__init__(size=size, salt=salt)

    def process(self, data):
        crypted = bytes(UnixCrypt(data, salt=self.args.salt))
        produced = len(crypted)
        wanted = self.args.size
        if produced >= wanted:
            return crypted[:wanted]
        # Fewer bytes are available than requested: report what exists as a partial result.
        raise RefineryPartialResult(
            F'unix crypt only provided {produced} bytes, but {wanted} were requested.',
            partial=crypted
        )
class url (plus=False, hex=False)
-
This unit is implemented in
refinery.units.encoding.url
and has the following commandline Interface:usage: url [-h] [-L] [-Q] [-0] [-v] [-R] [-p] [-x] Decodes and encodes URL-encoding, which preserves only alphanumeric characters and the following symbols: _, ., -, ~, \, /. Every other character is escaped by hex-encoding it and prefixing it with a percent symbol. optional arguments: -p, --plus also replace plus signs by spaces -x, --hex hex encode every character in reverse mode generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class url(Unit):
    """
    Decodes and encodes URL-encoding, which preserves only alphanumeric characters and the following
    symbols: `_`, `.`, `-`, `~`, `\\`, `/`. Every other character is escaped by hex-encoding it and
    prefixing it with a percent symbol.
    """

    def __init__(
        self,
        plus: Arg.Switch('-p', help='also replace plus signs by spaces') = False,
        hex : Arg.Switch('-x', help='hex encode every character in reverse mode') = False
    ):
        super().__init__(plus=plus, hex=hex)

    def process(self, data):
        if self.args.plus:
            data = data.replace(B'+', B' ')
        data = unquote_to_bytes(bytes(data))
        # Also decode the non-standard %uXXXX escapes to two little endian bytes each.
        data = re.sub(
            B'%[uU]([0-9a-fA-F]{4})',
            lambda m: int(m[1], 16).to_bytes(2, 'little'),
            data
        )
        return data

    def reverse(self, data):
        if self.args.hex:
            # Every byte is escaped, regardless of its value.
            return bytearray(B''.join(B'%%%02X' % byte for byte in data))
        elif self.args.plus:
            def replace(m):
                c = m[0][0]
                return b'+' if c == 0x20 else B'%%%02X' % c
        else:
            def replace(m):
                return B'%%%02X' % m[0][0]
        # The hyphen and the backslash are escaped inside the character class. An unescaped
        # hyphen between "." and "~" forms a range that covers most ASCII punctuation and
        # excludes the hyphen itself, which contradicts the documented set of safe symbols.
        return re.sub(RB'[^a-zA-Z0-9_.\-~\\/]', replace, data)
class urlfix (meta=False, keep=0)
-
This unit is implemented in
refinery.units.misc.urlfix
and has the following commandline Interface:usage: urlfix [-h] [-L] [-Q] [-0] [-v] [-m] [-k] Removes fragments, query strings, and parameters from input URLs. It also correctly escapes all characters in the URL path component and normalizes the network location part to lowercase. Note that URLs without a scheme will not be recognized as valid URLs; chunks that do not look like a URL will be swallowed and not return any output. optional arguments: -m, --meta Extract the query string parameters as metadata. -k, --keep If specified once, it keeps the URL params and query string. If specified twice, it keeps the URL fragment as well. At this level, the unit still filters out anything that does not parse as a URL. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class urlfix(Unit):
    """
    Removes fragments, query strings, and parameters from input URLs. It also correctly escapes all
    characters in the URL path component and normalizes the network location part to lowercase. Note
    that URLs without a scheme will not be recognized as valid URLs; chunks that do not look like a
    URL will be swallowed and not return any output.
    """

    def __init__(
        self,
        meta: Arg.Switch('-m', help='Extract the query string parameters as metadata.') = False,
        keep: Arg.Counts('-k', help=(
            'If specified once, it keeps the URL params and query string. If specified '
            'twice, it keeps the URL fragment as well. At this level, the unit still filters out '
            'anything that does not parse as a URL.'
        )) = 0
    ):
        super().__init__(keep=keep, meta=meta)

    def process(self, data):
        def fix(string):
            # Decode and re-encode so that the component ends up escaped exactly once.
            return quote(unquote(string))
        keep = self.args.keep
        meta = self.args.meta
        parsed = urlparse(data.decode(self.codec))
        if not parsed.scheme or not parsed.netloc:
            # Not recognized as a URL: the chunk is swallowed.
            return None
        # parse_qsl already returns percent-decoded values. Decoding them a second time would
        # corrupt values that legitimately contain an encoded percent sign.
        query_dict = dict(parse_qsl(parsed.query))
        query_string = '&'.join(F'{key}={quote(value)}' for key, value in query_dict.items())
        replacements = dict(
            netloc=parsed.netloc.lower(),
            params=fix(parsed.params),
            path=fix(parsed.path),
            query=query_string,
            fragment=fix(parsed.fragment),
        )
        if keep < 2:
            replacements.update(fragment='')
        if keep < 1:
            replacements.update(params='', query='')
        url = urlunparse(parsed._replace(**replacements))
        url = url.encode(self.codec)
        if meta:
            url = self.labelled(url, **query_dict)
        return url
class urlguards
-
This unit is implemented in
refinery.units.pattern.urlguards
and has the following commandline Interface:usage: urlguards [-h] [-L] [-Q] [-0] [-v] Restores the original URLs from their 'protected' versions as generated by Outlook protection and ProofPoint. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class urlguards(Unit):
    """
    Restores the original URLs from their 'protected' versions as generated by Outlook protection
    and ProofPoint.
    """

    # Run-length alphabet of the ProofPoint v3 format: the first letter stands for a run of
    # two replacement characters, the next for three, and so on.
    _PP3RLENC = {
        letter: rl for rl, letter in enumerate(
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
            'abcdefghijklmnopqrstuvwxyz'
            '0123456789-_', 2
        )
    }

    @unguard(r'https?://urldefense(?:\.proofpoint)?\.com/v([12])/url\?([:;/_=!?#&.,\w\%\-\+|]+)')
    def _proofpointV2(self, match):
        version = int(match[1])
        self.log_info('proofpoint match:', version)
        argmatch = re.match(
            R'^u=(.+?)&(?:amp;)?{}='.format('k' if version == 1 else '[dc]'),
            match[2],
            flags=re.DOTALL
        )
        if not argmatch:
            self.log_warn('not able to translate unexpected proofpoint format:', match)
            return match[0]
        encoded = argmatch[1]
        if match[1] == '2':
            # Version 2 writes "-" for "%" and "_" for "/".
            encoded = encoded.translate(str.maketrans('-_', '%/'))
        return unescape(unquote(encoded))

    @unguard(r'https?://urldefense(?:\.proofpoint)?\.com/v3/__(.+?)__;(.*?)![-\w!?$]+')
    def _proofpointV3(self, match):
        data = unquote(match[1])
        cmap = match[2] + '=' * (-len(match[2]) % 4)
        cmap = urlsafe_b64decode(cmap).decode('UTF-8')
        cursor = 0  # read position within the rewritten URL
        mapped = 0  # index of the next unused replacement character
        result = ''
        while mapped < len(cmap):
            ast = data.find('*', cursor)
            if ast < 0:
                break
            result += data[cursor:ast]
            run = 1
            if data[ast + 1:ast + 2] == '*':
                # "**X" stands for a whole run of replacement characters whose length is
                # encoded by X; slicing keeps this safe at the very end of the input.
                token = data[ast + 2:ast + 3]
                if token in self._PP3RLENC:
                    run = self._PP3RLENC[token]
                    ast += 2
            # Replacement characters are consumed in order, so the position has to advance
            # by the full run length, not by one per token.
            result += cmap[mapped:mapped + run]
            mapped += run
            cursor = ast + 1
        self.log_debug(result)
        self.log_debug(data[cursor:])
        return result + data[cursor:]

    @unguard(r'https?://\w+\.safelinks\.protection\.outlook\.com/([:;/_=!?#&.,\w\%\-\+|]+)')
    def _outlook(self, match):
        result = match[0]
        self.log_info('outlook match:', result)
        parsed = urlparse(result)
        params = parse_qs(parsed.query)
        try:
            result = unquote(params['url'][0])
        except Exception:
            # Best effort: without a usable "url" parameter the match is left unchanged.
            pass
        return result

    @unguard(r'https?://outlook\.office\.com/actions/ei\?u=([:;/_=!?#&.,\w\%\-\+|]+)')
    def _outlook_image_proxy(self, match):
        return unquote(match[1])

    @unguard(r'https?://(?:[\w-]+\.)?trendmicro\.com(?::\d+)?/wis/clicktime/v[12]/(?:query|clickthrough)[:;/_=!?#&.,\w\%\-\+|]+')
    def _trendmicro(self, match):
        result = match[0]
        self.log_info('trendmicro match:', result)
        parsed = urlparse(result)
        params = parse_qs(parsed.query)
        try:
            result = unquote(params['url'][0])
        except Exception:
            # Best effort: without a usable "url" parameter the match is left unchanged.
            pass
        return result

    @unicoded
    def process(self, data: str) -> str:
        # All handlers are applied repeatedly until two consecutive rounds produce output of
        # the same length, which also unwraps nested guards.
        newsize, size = 0, len(data)
        while newsize != size:
            for handler in (
                self._proofpointV2,
                self._proofpointV3,
                self._outlook,
                self._outlook_image_proxy,
                self._trendmicro
            ):
                data = handler(data)
            size = newsize
            newsize = len(data)
        return data
class urn (size='N:N', keep=False, sort=False)
-
This unit is implemented in
refinery.units.meta.urn
and has the following commandline Interface:usage: urn [-h] [-L] [-Q] [-0] [-v] [-k] [-s] [a:b] Treat the chunks in the current frame as items in an urn and produce every possible sequence that could occur as a sequence of draws. For example, selecting both -k and -s is equivalent to generating all possible permutations of these chunks. positional arguments: a:b Generate sequences of length x, where x is in [a:b]. The default value is N:N, where N is the number of chunks in the current frame. optional arguments: -k, --keep Chunks are not returned back to the urn after being drawn. -s, --sort The order of items does not matter; for the output, chunks are sorted according to their original position in the frame. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class urn(Unit):
    """
    Treat the chunks in the current frame as items in an urn and produce every possible sequence
    that could occur as a sequence of draws. For example, selecting -k alone is equivalent to
    generating all possible permutations of these chunks.
    """

    def __init__(
        self,
        size: Arg.String(metavar='a:b', help=(
            'Generate sequences of length x, where x is in [a:b]. The default value is {default}, '
            'where N is the number of chunks in the current frame.')) = 'N:N',
        keep: Arg.Switch('-k', help=(
            'Chunks are not returned back to the urn after being drawn.')) = False,
        sort: Arg.Switch('-s', help=(
            'The order of items does not matter; for the output, chunks are sorted according to '
            'their original position in the frame.')) = False
    ):
        super().__init__(size=size, keep=keep, sort=sort)

    def process(self, data: Chunk):
        # The filter stores one generated sequence on the chunk; emit its items.
        yield from data.temp

    def filter(self, chunks: Iterable[Chunk]):
        it = iter(chunks)
        head = next(it, None)
        if head is None:
            # An empty frame has nothing to draw from. Letting StopIteration escape from a
            # generator would surface as a RuntimeError instead.
            return
        buffer = [bytes(head)]
        buffer.extend(bytes(c) for c in it)
        head = head.copy(meta=True, data=False)
        head.meta['N'] = len(buffer)
        size = sliceobj(self.args.size, head)
        a = size.start or 1
        b = size.stop or len(buffer)
        b = max(b, a + 1)
        c = size.step or 1
        self.log_debug(F'using size [{a}:{b}:{c}]')
        s = 1 if self.args.sort else 0
        k = 1 if self.args.keep else 0
        m = (s << 1) | k
        # Bit 0 (keep): chunks are not put back, so no chunk is drawn twice.
        # Bit 1 (sort): the order of the draws is irrelevant.
        method = {
            0b00: lambda i, r: product(i, repeat=r),
            0b01: permutations,
            0b10: combinations_with_replacement,
            0b11: combinations
        }[m]
        self.log_info(F'choosing {method.__name__}')
        for n in range(a, b, c):
            self.log_debug(F'generating sequences of length {n}')
            # The same carrier chunk is emitted once per sequence, with the sequence stored
            # in its temp attribute for process to expand.
            for head.temp in method(buffer, n):
                yield head
class uuenc
-
This unit is implemented in
refinery.units.encoding.uuenc
and has the following commandline Interface:usage: uuenc [-h] [-L] [-Q] [-0] [-v] [-R] [-F] Unit for uuencode. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class uuenc(Unit):
    """
    Unit for uuencode.
    """

    def process(self, data):
        # The header line has the form "begin <three octal digits> <name>".
        header = re.search(
            B'^begin ([0-7]{3}) (.*?)$', data, flags=re.M)
        if header is None:
            raise ValueError('invalid uu header')
        output = bytearray()
        view = memoryview(data)
        # Offsets of all line starts; the first line is skipped by the islice below.
        breaks = [m.end() for m in iter(re.finditer(B'^', data, flags=re.M))]
        eol = False
        for k, br in enumerate(itertools.islice(breaks, 1, None)):
            # The "end" marker is only honored after a line that announced zero bytes.
            if eol and view[br:br + 3] == b'end':
                path = header[2]
                if path != B'-':
                    output = self.labelled(output, path=path)
                return output
            # The first character of each line encodes the number of bytes on that line.
            count = view[br] - 0x20
            if count not in range(0x41):
                raise ValueError(F'Invalid length encoding 0x{view[br]:02X} in line {k}.')
            # A value of 0x40 is reduced to zero here.
            count %= 0x40
            cursor = len(output)
            # Number of four-character groups on the line, rounded up.
            q, r = divmod(count, 3)
            q += int(bool(r))
            end = br + 1 + q * 4
            for b in range(br + 1, end, 4):
                # Each group of four characters carries six bits per character, three bytes
                # in total.
                chunk = 0
                for j in range(4):
                    character = view[b + j]
                    if character not in range(0x21, 0x61):
                        raise ValueError(F'Invalid character 0x{character:02X} in line {k}.')
                    chunk = ((character - 0x20) % 0x40) | (chunk << 6)
                output.extend(chunk.to_bytes(3, 'big'))
            # Remove the padding bytes decoded from the last group of the line.
            del output[cursor + count:]
            eol = count == 0
            if len(output) < cursor + count:
                break
        # Reached when the loop ends without having seen the "end" marker.
        raise RefineryPartialResult(F'Data truncated in line {k}', output)

    def reverse(self, data):
        meta = metavars(data)
        path = meta.get('path', None)
        # Only the file name of the path variable goes into the header; "-" is used when
        # there is none.
        name = path and pathlib.Path(path).name or '-'
        view = memoryview(data)
        with MemoryFile() as stream:
            stream.write(B'begin 666 ')
            stream.write(name.encode(self.codec))
            # Each output line encodes up to 45 input bytes and starts with a length character.
            for k in range(0, len(view), 45):
                slice = view[k:k + 45]
                stream.write_byte(0x0A)
                stream.write_byte(0x20 + len(slice))
                # NOTE(review): presumably yields one padded big endian 24-bit integer per
                # three input bytes - confirm against chunks.unpack.
                for chunk in chunks.unpack(slice, 3, bigendian=True, pad=True):
                    for j in range(3, -1, -1):
                        # A six-bit value of zero is written as 0x60 rather than as a space.
                        stream.write_byte(0x20 + (((chunk >> j * 6) & 0x3F) or 0x40))
            stream.write(B'\n`\nend\n')
            return stream.getvalue()

    @classmethod
    def handles(self, data):
        # Inputs shorter than 16 bytes are rejected. Inputs starting with "begin " are
        # accepted when the next three bytes are octal digits. Nothing is returned otherwise.
        if len(data) < 16:
            return False
        if data[:6] == B'begin ':
            return set(data[6:9]) <= set(B'01234567')
class vaddr (*name, base=None)
-
This unit is implemented in
refinery.units.formats.exe.vaddr
and has the following commandline Interface:usage: vaddr [-h] [-L] [-Q] [-0] [-v] [-R] [-b ADDR] [name [name ...]] Converts a metadata variable holding a file offset to a virtual address. This unit only works when the chunk body contains a PE, ELF, or MachO executable. The variable will be substituted in place. If you would like to retain the original value, it is recommended to use the put unit first to create a copy of an already existing variable, and then convert the copy. positional arguments: name The name of a metadata variable holding an integer. optional arguments: -b, --base ADDR Optionally specify a custom base address B. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class vaddr(Unit):
    """
    Converts a metadata variable holding a file offset to a virtual address. This unit only works when the
    chunk body contains a PE, ELF, or MachO executable. The variable will be substituted in place. If you
    would like to retain the original value, it is recommended to use the `refinery.put` unit first to create
    a copy of an already existing variable, and then convert the copy.
    """

    def __init__(
        self, *name: Arg(type=str, help='The name of a metadata variable holding an integer.'),
        base : Arg.Number('-b', metavar='ADDR', help='Optionally specify a custom base address B.') = None
    ):
        return super().__init__(names=name, base=base)

    def process(self, data):
        # File offset -> virtual address, for every named variable.
        try:
            exe = Executable.Load(data, self.args.base)
        except Exception:
            self.log_warn('unable to parse input as executable; no variable conversion was performed')
        else:
            meta = metavars(data)
            for name in self.args.names:
                offset = meta[name]
                meta[name] = exe.location_from_offset(offset).virtual.position
        return data

    def reverse(self, data):
        # Virtual address -> file offset, for every named variable.
        try:
            exe = Executable.Load(data, self.args.base)
        except Exception:
            self.log_warn('unable to parse input as executable; no variable conversion was performed')
        else:
            meta = metavars(data)
            for name in self.args.names:
                address = meta[name]
                meta[name] = exe.location_from_address(address).physical.position
        return data
class vbapc (raw=False)
-
This unit is implemented in
refinery.units.formats.office.vbapc
and has the following commandline Interface:usage: vbapc [-h] [-L] [-Q] [-0] [-v] [-r] Extract VBA macro p-code from Office documents. By default, the unit also uses pcode2code to decompile the disassembled p-code. This unit is specifically useful for macro documents that use VBA code stomping, i.e. the embedded macro source code is stomped and does not represent the p-code functionality that the document will actually execute. optional arguments: -r, --raw Return disassembled p-code, do not try to decompile. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class vbapc(Unit):
    """
    Extract VBA macro p-code from Office documents. By default, the unit also uses pcode2code to
    decompile the disassembled p-code. This unit is specifically useful for macro documents that use
    VBA code stomping, i.e. the embedded macro source code is stomped and does not represent the
    p-code functionality that the document will actually execute.
    """

    def __init__(self, raw: Unit.Arg.Switch('-r', help='Return disassembled p-code, do not try to decompile.') = False):
        super().__init__(raw=raw)

    @Unit.Requires('oletools', 'formats', 'office', 'extended')
    def _pcodedmp():
        with NoLogging():
            import pcodedmp.pcodedmp
            return pcodedmp.pcodedmp

    def process(self, data):
        # Minimal stand-in for the option object that processFile receives.
        class args:
            disasmOnly = True
            verbose = False

        with io.StringIO() as sink:
            with VirtualFileSystem() as vfs:
                document = vfs.new(data)
                self._pcodedmp.processFile(document, args, sink)
            listing = sink.getvalue()

        if self.args.raw:
            return listing.encode(self.codec)

        from refinery.lib.thirdparty.pcode2code import Parser
        decompiler = Parser(listing)
        decompiler.parseInput()
        decompiler.processInput(False)
        source = decompiler.getOutput()
        # Insert a line break in front of Sub and Function lines.
        source = re.sub(R'(?m)^((?:Sub|Function).*?)$(?!\n[^\s])', r'\n\1', source)
        return source.encode(self.codec)
class vbastr (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
This unit is implemented in
refinery.units.formats.office.vbastr
and has the following commandline Interface:usage: vbastr [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]] Extract VBA macro variables from Office documents. The items are extracted in a directory hierarchy that specifies their corresponding OLE stream. The stem of their file name is the same as the variable's name. The variable can define a caption, a control tip text, and a value; the unit extracts these with the synthesized file extension "cap", "tip", and "val", respectively. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class vbastr(PathExtractorUnit):
    """
    Extract VBA macro variables from Office documents. The items are extracted in a directory
    hierarchy that specifies their corresponding OLE stream. The stem of their file name is the same
    as the variable's name. The variable can define a caption, a control tip text, and a value; the
    unit extracts these with the synthesized file extension "cap", "tip", and "val", respectively.
    """
    @PathExtractorUnit.Requires('oletools', 'formats', 'office')
    def _olevba():
        from oletools import olevba
        return olevba

    def unpack(self, value):
        # Pairs of (synthesized file extension, key in the form variable mapping).
        suffixes = (
            ('cap', 'caption'),
            ('tip', 'control_tip_text'),
            ('val', 'value'),
        )
        try:
            parser = self._olevba.VBA_Parser('.', data=bytes(value), relaxed=True)
        except self._olevba.FileOpenError:
            raise ValueError('Input data not recognized by VBA parser')
        try:
            for stream, _, fields in parser.extract_form_strings_extended():
                if not fields:
                    continue
                # The name stored with the variable replaces the one reported by the parser.
                label = _txt(fields['name'])
                for suffix, field in suffixes:
                    blob = _bin(fields.get(field))
                    if blob:
                        yield UnpackResult(F'{stream!s}/{label!s}/{label}.{suffix}', blob)
        except self._olevba.oleform.OleFormParsingError as error:
            from collections import Counter
            self.log_debug(str(error))
            self.log_info('extended form extraction failed with error; falling back to simple method')
            entries = list(parser.extract_form_strings())
            occurrences = Counter(label for _, label, _ in entries)
            serial = Counter()
            for stream, label, text in entries:
                if text is None:
                    continue
                if occurrences[label] > 1:
                    # Names that occur more than once receive a running ".vN" suffix.
                    serial[label] += 1
                    label = F'{label!s}.v{serial[label]}'
                yield UnpackResult(F'{stream!s}/{label!s}.val', _bin(text))
class vigenere (key, alphabet=b'abcdefghijklmnopqrstuvwxyz', operator='add', case_sensitive=False, ignore_unknown=False)
-
This unit is implemented in
refinery.units.crypto.cipher.vigenere
and has the following commandline Interface:usage: vigenere [-h] [-L] [-Q] [-0] [-v] [-R] [-: OP] [-c] [-i] key [alphabet] Encryption and decryption using the Vigenère-Bellaso polyalphabetic cipher. positional arguments: key The encryption key alphabet The alphabet, by default the Latin one is used: "abcdefghijklmnopqrstuvwxyz" optional arguments: -:, --operator OP Choose the vigenere block operation. The default is add, and the available options are: add, sub, xor -c, --case-sensitive Unless this option is set, the key will be case insensitive. Uppercase letters from the input are transformed using the same shift as would be the lowercase variant, but case is retained. -i, --ignore-unknown Unless this option is set, the key stream will be iterated even for letters that are not contained in the alphabet. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class vigenere(Unit):
    """
    Encryption and decryption using the Vigenère-Bellaso polyalphabetic cipher.
    """

    def __init__(
        self,
        key: Arg(help='The encryption key'),
        alphabet: Arg(
            help='The alphabet, by default the Latin one is used: "{default}"'
        ) = b'abcdefghijklmnopqrstuvwxyz',
        operator: Arg.Choice('-:', choices=['add', 'sub', 'xor'], metavar='OP', help=(
            'Choose the vigenere block operation. The default is {default}, and the available options are: {choices}')) = 'add',
        case_sensitive: Arg.Switch('-c', help=(
            'Unless this option is set, the key will be case insensitive. Uppercase letters from the input are transformed '
            'using the same shift as would be the lowercase variant, but case is retained.')) = False,
        ignore_unknown: Arg.Switch('-i', help=(
            'Unless this option is set, the key stream will be iterated even '
            'for letters that are not contained in the alphabet.'
        )) = False
    ):
        # NOTE: vars() is forwarded to superinit below, so this method must not
        # introduce additional local variables.
        if not callable(operator):
            try:
                operator = {
                    'add': __add__,
                    'sub': __sub__,
                    'xor': __xor__,
                }[operator.lower()]
            except KeyError:
                # The lookup failed before any rebinding happened, so the message
                # reports the value that was actually passed in rather than None.
                raise ValueError(F'The value {operator!r} is not valid as an operator.') from None
        self.superinit(super(), **vars())

    def _tabula_recta(self, data, reverse=True):
        """
        Generate the transformed letters of `data` one at a time. When `reverse` is true, the
        configured operator is replaced by its entry in the inverse operator table.

        Raises ValueError when the alphabet contains duplicates or when the key contains
        letters that are not part of the alphabet.
        """
        key: str = self.args.key.decode(self.codec)
        alphabet: str = self.args.alphabet.decode(self.codec)
        operator = self.args.operator
        case_sensitive: bool = self.args.case_sensitive
        ignore_unknown: bool = self.args.ignore_unknown
        if not case_sensitive:
            key = key.lower()
            alphabet = alphabet.lower()
            if len(set(alphabet)) != len(alphabet):
                raise ValueError('Duplicate entries detected in alphabet.')
        if not set(key) <= set(alphabet):
            diff = set(key) - set(alphabet)
            diff = ', '.join(diff)
            raise ValueError(F'key contains letters which are not from the given alphabet: {diff}')
        self.log_info(F'using key {key} and alphabet {alphabet}')
        keystream = cycle(key)
        alph_size = len(alphabet)
        if reverse:
            operator = _opeator_inverse[operator]
        for letter in data:
            # Remember the case so that it can be restored on the output letter.
            uppercase = not case_sensitive and letter.isupper()
            if uppercase:
                letter = letter.lower()
            try:
                position = alphabet.index(letter)
            except ValueError:
                # Letters outside the alphabet pass through unchanged; the key
                # stream still advances unless ignore_unknown is set.
                yield letter
                if not ignore_unknown:
                    next(keystream)
                continue
            shift = alphabet.index(next(keystream))
            result = alphabet[operator(position, shift) % alph_size]
            yield result.upper() if uppercase else result

    @unicoded
    def process(self, data):
        return ''.join(self._tabula_recta(data, True))

    @unicoded
    def reverse(self, data):
        return ''.join(self._tabula_recta(data, False))
class vmemref (*address, take=None, base=None)
-
This unit is implemented in
refinery.units.formats.exe.vmemref
and has the following commandline Interface:usage: vmemref [-h] [-L] [-Q] [-0] [-v] [-t SIZE] [-b ADDR] [ADDR [ADDR ...]] The unit expects an executable as input (PE/ELF/MachO) and scans a function at a given virtual address for memory references. For each memory reference, the unit looks up the corresponding section and file offset for the reference. It then returns all data from that section starting at the given offset. positional arguments: ADDR Specify the address of a function to scan. If no argument is given, the unit will scan all functions for memory references. optional arguments: -t, --take SIZE Optionally specify the number of bytes to read from each reference; by default, all data until the end of the section is returned. -b, --base ADDR Optionally specify a custom base address B. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class vmemref(Unit):
    """
    The unit expects an executable as input (PE/ELF/MachO) and scans a function at a given virtual
    address for memory references. For each memory reference, the unit looks up the corresponding
    section and file offset for the reference. It then returns all data from that section starting
    at the given offset.
    """

    @Unit.Requires('smda', 'all')
    def _smda():
        import smda
        import smda.Disassembler
        import smda.DisassemblyResult
        return smda

    def _memory_references(
        self,
        exe: Executable,
        function: SmdaFunction,
        codes: Container[Range],
        max_dereference: int = 1
    ):
        """
        Collect the set of data addresses referenced by the instructions of `function`. Each
        reference is followed as a pointer until it stops being a valid data address, was seen
        before, or more than `max_dereference` steps were taken (a falsy or non-positive value
        disables the limit). Addresses inside any range from `codes` are never reported.
        """
        def is_valid_data_address(address):
            if not isinstance(address, int):
                return False
            if address not in exe:
                return False
            if address in instructions:
                return False
            for code in codes:
                if address in code:
                    return False
            return True

        def dereference(address):
            return int.from_bytes(exe[address:address + pointer_size], exe.byte_order().value)

        pointer_size = exe.pointer_size // 8
        instructions = {op.offset for op in function.getInstructions()}
        references = set()
        for op in function.getInstructions():
            try:
                refs = list(op.getDataRefs())
            except Exception:
                # Best effort: instructions whose references cannot be computed are skipped.
                continue
            for address in refs:
                try:
                    address = int(address)
                except Exception:
                    continue
                times_dereferenced = 0
                while is_valid_data_address(address) and address not in references:
                    references.add(address)
                    times_dereferenced += 1
                    if max_dereference and max_dereference > 0 and times_dereferenced > max_dereference:
                        break
                    try:
                        address = dereference(address)
                    except Exception:
                        break
        return references

    def __init__(
        self,
        *address: Arg.Number(metavar='ADDR', help=(
            'Specify the address of a function to scan. If no argument is given, the unit will scan'
            ' all functions for memory references.')),
        take: Arg.Number('-t', metavar='SIZE', help=(
            'Optionally specify the number of bytes to read from each reference; by default, all '
            'data until the end of the section is returned.')) = None,
        base: Arg.Number('-b', metavar='ADDR', help='Optionally specify a custom base address B.') = None,
    ):
        super().__init__(address=address, take=take, base=base)

    def process(self, data):
        smda = self._smda
        take = self.args.take
        exe = Executable.Load(data, self.args.base)
        fmt = exe.pointer_size // 4
        addresses = self.args.address
        self.log_info(R'disassembling and exploring call graph using smda')
        with NoLogging():
            cfg = smda.Disassembler.SmdaConfig()
            cfg.CALCULATE_SCC = False
            cfg.CALCULATE_NESTING = False
            cfg.TIMEOUT = 600
            dsm = smda.Disassembler.Disassembler(cfg)
            _input = data
            if not isinstance(_input, bytes):
                _input = bytes(data)
            graph = dsm.disassembleUnmappedBuffer(_input)
        self.log_info('collecting code addresses for memory reference exclusion list')
        visits = set()
        avoid = set()
        for symbol in exe.symbols():
            if not symbol.code:
                continue
            avoid.add(exe.location_from_address(symbol.address).virtual.box)
        if addresses:
            # With explicit addresses, each function reports its references independently.
            reset = visits.clear
        else:
            def reset():
                pass
            self.log_info('scanning executable for functions')
            with NoLogging():
                addresses = [pfn.offset for pfn in graph.getFunctions()]
        # sorted() rather than list.sort(): the variadic argument may arrive as a tuple.
        addresses = sorted(addresses)
        for a in addresses:
            reset()
            # Select the function starting closest at or below the requested address. The
            # comparison has to be strict: with ">=", a function starting exactly at the
            # address would rank behind every function that starts before it.
            address, function = min(graph.xcfg.items(), key=lambda t: (t[0] > a, abs(t[0] - a)))
            self.log_debug(F'scanning function: 0x{address:0{fmt}X}')
            refs = list(self._memory_references(exe, function, avoid))
            refs.sort(reverse=True)
            last_start = None
            for ref in refs:
                if ref in visits:
                    continue
                visits.add(ref)
                try:
                    box = exe.location_from_address(ref)
                    start = box.physical.position
                    end = box.physical.box.upper
                    if take is not None:
                        # The limit is applied to the file offset; the reference itself is a
                        # virtual address and cannot be compared against a physical bound.
                        end = min(start + take, end)
                    if last_start is not None:
                        # Stop before the chunk emitted for the previously handled reference.
                        end = min(last_start, end)
                    last_start = start
                except CompartmentNotFound:
                    self.log_info(F'memory reference could not be resolved: 0x{ref:0{fmt}X}')
                else:
                    yield exe.data[last_start:end]
class vsect (*paths, meta=False, synthetic=False, path=b'path', regex=False, exact=False, fuzzy=0, drop_path=False, join_path=False, list=False)
-
This unit is implemented in
refinery.units.formats.exe.vsect
and has the following commandline Interface:usage: vsect [-h] [-L] [-Q] [-0] [-v] [-m] [-s] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]] Extract sections/segments from PE, ELF, and MachO executables. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -m, --meta Populates the metadata variables vaddr and vsize containing the virtual address and size of each section, respectively. -s, --synthetic Include synthesized sections: These represent data regions that are outside the sections as listed by the executable metadata, such as headers and overlays. -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class vsect(PathExtractorUnit):
    """
    Extract sections/segments from PE, ELF, and MachO executables.
    """

    def __init__(
        self,
        *paths,
        meta: Arg.Switch('-m', help=(
            'Populates the metadata variables vaddr and vsize containing the virtual address and size '
            'of each section, respectively.')) = False,
        synthetic: Arg.Switch('-s', help=(
            'Include synthesized sections: These represent data regions that are outside the sections '
            'as listed by the executable metadata, such as headers and overlays.')) = False,
        **keywords
    ):
        super().__init__(*paths, meta=meta, synthetic=synthetic, **keywords)

    def unpack(self, data):
        exe = Executable.Load(data)
        view = memoryview(data)
        for index, section in enumerate(exe.sections()):
            if section.synthetic and not self.args.synthetic:
                continue
            lower = section.physical.lower
            upper = section.physical.upper
            vaddr = section.virtual.lower
            vsize = len(section.virtual)
            # The physical offset is always attached; virtual layout only on request.
            extras = dict(offset=lower)
            if self.args.meta:
                for label, item in (('vaddr', vaddr), ('vsize', vsize)):
                    if item is not None:
                        extras[label] = item
            title = section.name
            if not title:
                # Unnamed sections are labelled by their zero-padded virtual address.
                addr = F'{section.virtual.lower:0{exe.pointer_size // 4}X}'
                self.log_warn(F'section {index} had no name, synthesizing name from virtual address 0x{addr}')
                title = F'.{addr}'
            yield UnpackResult(title, view[lower:upper], **extras)
class vsnip (*addresses, ascii=False, utf16=False, until=b'', base=None)
-
This unit is implemented in
refinery.units.formats.exe.vsnip
and has the following commandline Interface:usage: vsnip [-h] [-L] [-Q] [-0] [-v] [-a | -u | -t B] [-b ADDR] [start:count:align [start:count:align ...]] Extract data from PE, ELF, and MachO files based on virtual offsets. positional arguments: start:count:align Use Python slice syntax to describe an area of virtual memory to read. If a chunksize is specified, then the unit will always read a multiple of that number of bytes optional arguments: -a, --ascii Read ASCII strings; equivalent to -th:00 -u, --utf16 Read UTF16 strings; equivalent to -th:0000 (also sets chunksize to 2) -t, --until B Read until sequence B is read. -b, --base ADDR Optionally specify a custom base address B. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class vsnip(Unit):
    """
    Extract data from PE, ELF, and MachO files based on virtual offsets.
    """

    def __init__(
        self,
        *addresses: Arg.Bounds(metavar='start:count:align', help=(
            'Use Python slice syntax to describe an area of virtual memory to read. If a chunksize is '
            'specified, then the unit will always read a multiple of that number of bytes')),
        ascii: Arg.Switch('-a', group='END', help='Read ASCII strings; equivalent to -th:00') = False,
        utf16: Arg.Switch('-u', group='END', help='Read UTF16 strings; equivalent to -th:0000 (also sets chunksize to 2)') = False,
        until: Arg.Binary('-t', group='END', help='Read until sequence {varname} is read.') = B'',
        base: Arg.Number('-b', metavar='ADDR', help='Optionally specify a custom base address B.') = None,
    ):
        # The three terminator options are mutually exclusive.
        if sum(map(bool, (until, utf16, ascii))) > 1:
            raise ValueError('Only one of utf16, ascii, and until can be specified.')
        return super().__init__(addresses=addresses, utf16=utf16, ascii=ascii, until=until, base=base)

    def process(self, data: bytearray):
        terminator = self.args.until
        requests = self.args.addresses
        if self.args.ascii:
            terminator = B'\0'
        if self.args.utf16:
            terminator = B'\0\0'
            # Wide strings force an alignment of two on every requested area.
            requests = (slice(r.start, r.stop, 2) for r in requests)
        exe = Executable.Load(data, self.args.base)
        for request in requests:
            area = MemoryArea(request)
            location = exe.location_from_address(area.start)
            offset = location.physical.position
            limit = location.physical.box.upper
            if terminator:
                align = area.align
                window = range(offset, limit)
                end = data.find(terminator, offset)
                # Advance through terminator hits until one sits on an aligned position.
                while end in window and (end - offset) % align != 0:
                    end = data.find(terminator, end + 1)
                if end not in window:
                    raise EndOfStringNotFound
            else:
                end = limit
            if area.count:
                end = min(end, offset + area.count)
            yield self.labelled(data[offset:end], offset=offset)
class vstack (*address, stop=None, base=None, arch=Arch.X32, engine=_engine.unicorn, meta_registers=False, timeout=None, patch_range=slice(5, None, None), write_range=slice(1, None, None), wait=20, wait_calls=False, skip_calls=0, stack_size=65536, stack_push=None, block_size=4096, max_visits=65536, log_writes_in_calls=False, log_stack_addresses=False, log_other_addresses=False, log_zero_overwrites=False, log_stack_cookies=False)
-
This unit is implemented in
refinery.units.formats.exe.vstack
and has the following commandline Interface:usage: vstack [-h] [-L] [-Q] [-0] [-v] [-s stop] [-b Addr] [-a Arch] [-e _engine] [-r] [-t N] [-p MIN:MAX] [-n MIN:MAX] [-w N] [-c | -C] [-S N] [-u REG] [-B N] [-V N] [-W] [-X] [-Y] [-Z] [-E] [start [start ...]] The unit emulates instructions at a given address in the input executable (PE/ELF/MachO) and extracts data patches that are written to the stack during emulation. Emulation is halted as soon as a certain number of instructions has not performed any memory writes, or when an error occurs. By default, most registers are set to the current location in the emulated stack. However, if you want to initialize certain registers differently, you can set an environment variable to the desired value. positional arguments: start Specify the (virtual) addresses of a stack string instruction sequences. optional arguments: -s, --stop stop Optional: Stop when reaching this address. -b, --base Addr Optionally specify a custom base address B. -a, --arch Arch Specify for blob inputs: x32, x64, arm32, arm64, mips16, mips32, mips64, ppc32, ppc64, sparc32, sparc64 -e, --engine _engine The emulator engine. The default is unicorn, options are: speakeasy, icicle, unicorn -r, --meta-registers Consume register initialization values from the chunk's metadata. If the value is a byte string, the data will be mapped. -t, --timeout N Optionally stop emulating after a given number of instructions. -p, --patch-range MIN:MAX Extract only patches that are in the given range, default is 5:. -n, --write-range MIN:MAX Log only writes whose size is in the given range, default is 1:. -w, --wait N When this many instructions did not write to memory, emulation is halted. The default is 20. -c, --wait-calls Wait indefinitely when inside a function call. -C, --skip-calls Skip function calls entirely. Use twice to treat each call as allocating memory. -S, --stack-size N Optionally specify the stack size. The default is 0x10000. 
-u, --stack-push REG Push the value of a register to the stack before beginning emulation; implies -r. -B, --block-size N Standard memory block size for the emulator, 0x1000 by default. -V, --max-visits N Maximum number of times a code address is visited. Default is 65536. -W, --log-writes-in-calls Log writes of values that occur in functions calls. -X, --log-stack-addresses Log writes of values that are stack addresses. -Y, --log-other-addresses Log writes of values that are addresses to mapped segments. -Z, --log-zero-overwrites Log writes of zeros to memory that contained nonzero values. -E, --log-stack-cookies Log writes that look like stack cookies. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class vstack(Unit):
    """
    The unit emulates instructions at a given address in the input executable (PE/ELF/MachO) and
    extracts data patches that are written to the stack during emulation. Emulation is halted as
    soon as a certain number of instructions has not performed any memory writes, or when an error
    occurs. By default, most registers are set to the current location in the emulated stack.
    However, if you want to initialize certain registers differently, you can set an environment
    variable to the desired value.
    """
    # Each of the following accessors wraps the import of one optional dependency.

    @Unit.Requires('intervaltree', 'default', 'extended')
    def _intervaltree():
        import intervaltree
        return intervaltree

    @Unit.Requires('capstone', 'default', 'extended')
    def _capstone():
        import capstone
        return capstone

    @Unit.Requires('unicorn', 'default', 'extended')
    def _unicorn():
        with NoLogging():
            import unicorn
        return unicorn

    @Unit.Requires('speakeasy-emulator', 'extended')
    def _speakeasy():
        import speakeasy
        return speakeasy

    @Unit.Requires('icicle-emu', 'all')
    def _icicle():
        import icicle
        return icicle

    def __init__(
        self,
        *address: Arg.NumSeq(metavar='start', help='Specify the (virtual) addresses of a stack string instruction sequences.'),
        stop: Arg.Number('-s', metavar='stop', help='Optional: Stop when reaching this address.') = None,
        base: Arg.Number('-b', metavar='Addr', help='Optionally specify a custom base address B.') = None,
        arch: Arg.Option('-a', help='Specify for blob inputs: {choices}', choices=Arch) = Arch.X32,
        engine: Arg.Option('-e', choices=_engine,
            help='The emulator engine. The default is {default}, options are: {choices}') = _engine.unicorn,
        meta_registers: Arg.Switch('-r', help=(
            'Consume register initialization values from the chunk\'s metadata. If the value is a byte string, '
            'the data will be mapped.')) = False,
        timeout: Arg.Number('-t', help='Optionally stop emulating after a given number of instructions.') = None,
        patch_range: Arg.Bounds('-p', metavar='MIN:MAX',
            help='Extract only patches that are in the given range, default is {default}.') = slice(5, None),
        write_range: Arg.Bounds('-n', metavar='MIN:MAX',
            help='Log only writes whose size is in the given range, default is {default}.') = slice(1, None),
        wait: Arg.Number('-w', help=(
            'When this many instructions did not write to memory, emulation is halted. The default is {default}.')) = 20,
        wait_calls: Arg.Switch('-c', group='CALL', help='Wait indefinitely when inside a function call.') = False,
        skip_calls: Arg.Counts('-C', group='CALL',
            help='Skip function calls entirely. Use twice to treat each call as allocating memory.') = 0,
        stack_size: Arg.Number('-S', help='Optionally specify the stack size. The default is 0x{default:X}.') = 0x10000,
        stack_push: Arg('-u', action='append', type=str, metavar='REG',
            help='Push the value of a register to the stack before beginning emulation; implies -r.') = None,
        block_size: Arg.Number('-B', help='Standard memory block size for the emulator, 0x{default:X} by default.') = 0x1000,
        max_visits: Arg.Number('-V', help='Maximum number of times a code address is visited. Default is {default}.') = 0x10000,
        log_writes_in_calls: Arg.Switch('-W', help='Log writes of values that occur in functions calls.') = False,
        log_stack_addresses: Arg.Switch('-X', help='Log writes of values that are stack addresses.') = False,
        log_other_addresses: Arg.Switch('-Y', help='Log writes of values that are addresses to mapped segments.') = False,
        log_zero_overwrites: Arg.Switch('-Z', help='Log writes of zeros to memory that contained nonzero values.') = False,
        log_stack_cookies: Arg.Switch('-E', help='Log writes that look like stack cookies.') = False,
    ):
        # The arch and engine arguments are passed through Arg.AsOption together with their
        # option types; all other arguments are forwarded unchanged.
        super().__init__(
            address=address,
            stop=stop,
            base=base,
            arch=Arg.AsOption(arch, Arch),
            engine=Arg.AsOption(engine, _engine),
            meta_registers=meta_registers,
            timeout=timeout,
            patch_range=patch_range,
            write_range=write_range,
            wait=wait,
            stack_size=stack_size,
            stack_push=stack_push,
            wait_calls=wait_calls,
            skip_calls=skip_calls,
            block_size=block_size,
            max_visits=max_visits,
            log_writes_in_calls=log_writes_in_calls,
            log_stack_addresses=log_stack_addresses,
            log_other_addresses=log_other_addresses,
            log_zero_overwrites=log_zero_overwrites,
            log_stack_cookies=log_stack_cookies
        )

    def process(self, data):
        # For every start address: emulate, then yield the synthesized state output followed
        # by the memory patches recorded in the interval tree.
        meta = metavars(data)
        args = self.args
        engine: _engine = args.engine
        flags = Hook.Default
        self.log_debug(F'attempting to use {engine.name}')
        # The attribute value is discarded; presumably this access only serves to trigger the
        # dependency check for the selected engine - TODO confirm against Unit.Requires.
        getattr(self, F'_{engine.name}')
        if engine is _engine.speakeasy:
            flags |= Hook.ApiCall

        # Combine the emulator class stored as the engine's enum value with the vstack mixin.
        class Emu(engine.value, VStackEmulatorMixin):
            pass

        emu = Emu(
            data,
            args.base,
            args.arch,
            flags,
            args.block_size,
            args.stack_size,
        )
        cfg = EmuConfig(
            args.wait_calls,
            args.skip_calls,
            args.write_range,
            args.wait,
            args.block_size,
            args.stack_size,
            args.max_visits,
            args.log_stack_cookies,
            args.log_writes_in_calls,
            args.log_stack_addresses,
            args.log_other_addresses,
            args.log_zero_overwrites,
        )
        # Maps a register to the pair (metadata variable name, value) used to initialize it.
        register_values = {}
        emu.reset(None)
        if args.meta_registers or args.stack_push:
            # Iterate over a copy because matching variables are discarded from meta.
            for var, value in list(meta.items()):
                try:
                    register = emu.lookup_register(var)
                except LookupError:
                    continue
                meta.discard(var)
                register_values[register] = var, value

        def parse_address(a: Union[int, bytes]):
            # Resolution order: integer, hexadecimal literal with optional H suffix, Python
            # expression over the metadata, and finally a symbol lookup in the executable.
            if isinstance(a, int):
                return a
            a = a.decode(self.codec)
            if m := re.fullmatch('(?i)([A-F0-9]+)H?', a):
                return int(m[1], 16)
            try:
                return PythonExpression.Evaluate(a, meta)
            except ParserVariableMissing:
                pass
            symbols = list(emu.exe.symbols())
            # The filters are applied cumulatively; the first one that leaves exactly one
            # candidate decides the result.
            for filter in [
                lambda s: s.get_name().casefold() == a.casefold(),
                lambda s: s.name == a,
                lambda s: s.code,
                lambda s: s.exported
            ]:
                symbols = [s for s in symbols if filter(s)]
                if len(symbols) == 1:
                    return symbols[0].address
            if len(symbols) > 1:
                raise RuntimeError(F'there are {len(symbols)} exported function symbol named "{a}", please specify the address')
            if not symbols:
                raise LookupError(F'no symbol with name "{a}" was found')

        addresses = [parse_address(a) for a in args.address]
        if not addresses:
            # Without explicit addresses, fall back to the first symbol that has no name.
            for symbol in emu.exe.symbols():
                if symbol.name is None:
                    addresses.append(symbol.address)
                    break
        for address in addresses:
            tree = self._intervaltree.IntervalTree()
            state = EmuState(cfg, tree, address, emu.exe.pointer_size // 4, stop=args.stop)
            emu.reset(state)
            # General purpose registers without an explicit value are recorded on the state.
            for reg in emu.general_purpose_registers():
                if reg not in register_values:
                    state.init_registers.append(reg)
            for reg, (var, value) in register_values.items():
                if isinstance(value, int):
                    self.log_info(F'setting {var} to integer value 0x{value:X}')
                    emu.set_register(reg, value)
                    continue
                if isinstance(value, str):
                    value = value.encode()
                if isbuffer(value):
                    # Buffers are written to newly allocated memory; the register receives
                    # the address of that allocation.
                    base = emu.malloc(len(value))
                    emu.mem_write(base, bytes(value))
                    emu.set_register(reg, base)
                    self.log_info(F'setting {var} to mapped buffer of size 0x{len(value):X}')
                    continue
                _tn = value.__class__.__name__
                self.log_warn(F'canot interpret value of type {_tn} for register {var}')
            if push := args.stack_push:
                for reg in push:
                    emu.push_register(reg)
            timeout = args.timeout
            if timeout is not None:
                self.log_info(F'setting timeout of {timeout} steps')
                state.ticks = timeout
            try:
                emu.emulate(address, args.stop)
            except EmulationError:
                # Emulation errors are expected to end a run; results gathered so far are kept.
                pass
            yield from state.synthesized
            tree.merge_overlaps()
            it: Iterator[Interval] = iter(tree)
            for interval in it:
                size = interval.end - interval.begin - 1
                if size not in bounds[args.patch_range]:
                    continue
                try:
                    patch = emu.mem_read(interval.begin, size)
                except Exception as error:
                    width = emu.exe.pointer_size // 4
                    self.log_info(F'error reading 0x{interval.begin:0{width}X}:{size}: {error!s}')
                    continue
                # Patches that consist only of zero bytes are dropped.
                if not any(patch):
                    continue
                self.log_info(F'memory patch at {state.fmt(interval.begin)} of size {size}')
                yield patch
class winreg (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
This unit is implemented in
refinery.units.formats.winreg
and has the following commandline Interface:usage: winreg [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]] Extract values from a Windows registry hive or from a registry export (.reg file). positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class winreg(PathExtractorUnit):
    """
    Extract values from a Windows registry hive or from a registry export (.reg file).
    """

    @PathExtractorUnit.Requires('python-registry', 'formats')
    def _registry():
        import Registry
        import Registry.Registry
        import Registry.RegistryParse
        return Registry

    @staticmethod
    def _walk(patterns: List[PathPattern], key: RegistryKey, *path: str):
        """
        Recursively yield one result per value below `key`. Subtrees whose path cannot be
        reached by any of the given patterns are pruned.
        """
        here = '/'.join(path)
        if not any(p.reach(here) for p in patterns):
            winreg.log_debug(F'pruning search at {here}')
            return
        for value in key.values():
            # The value is bound as a default argument so that each closure keeps its own
            # registry value; the raw data is only read when the result is materialized.
            def raw(v: RegistryValue = value):
                return v.raw_data()
            vpath = F'{here}/{value.name()}'
            yield UnpackResult(vpath, raw)
        for subkey in key.subkeys():
            yield from winreg._walk(patterns, subkey, *path, subkey.name())

    def _unpack_hive(self, data: bytearray):
        """
        Parse the input as a binary registry hive. Parser failures of the registry library are
        translated to the ParseException that `unpack` suppresses.
        """
        try:
            with MemoryFile(data) as stream:
                root = self._registry.Registry.Registry(stream).root()
                yield from self._walk(self._patterns, root, root.name())
        except self._registry.RegistryParse.ParseException:
            raise ParseException

    def _decode_registry_export(self, data: str):
        """
        Decode the right-hand side of one assignment from a registry export and yield the
        resulting binary values. Strings of type REG_MULTI_SZ can produce several values.
        """
        def REG_BINARY(data: str) -> bytes:
            # Hex digits are accepted in either case; stripping the uppercase digits A-F
            # along with the separators would silently corrupt the decoded value.
            return bytes.fromhex(re.sub('[^a-fA-F0-9]+', '', data))

        def REG_SZ(data: str) -> bytes:
            return data.encode(self.codec) | esc(quoted=True) | bytes

        def REG_EXPAND_SZ(data: str):
            return REG_BINARY(data).decode('UTF-16LE').rstrip('\0').encode(self.codec)

        def REG_MULTI_SZ(data: str):
            data = REG_BINARY(data).decode('UTF-16LE').split('\0')
            for string in data:
                if string:
                    yield string.encode(self.codec)

        def REG_DWORD(data: str):
            value = int(data, 16)
            return F'0x{value:X}'.encode(self.codec)

        def REG_QWORD(data: str):
            value = int.from_bytes(REG_BINARY(data), 'little')
            return F'0x{value:X}'.encode(self.codec)

        class Missing:
            # Placeholder for registry types that have no decoder.
            def __init__(self, name: str):
                self.name = name

            def __str__(self):
                return self.name

        REG_NONE = REG_EXPAND_SZ
        REG_DWORD_BIG_ENDIAN = Missing('REG_DWORD_BIG_ENDIAN')
        REG_LINK = Missing('REG_LINK')
        REG_RESOURCE_LIST = Missing('REG_RESOURCE_LIST')
        REG_FULL_RESOURCE_DESCRIPTOR = Missing('REG_FULL_RESOURCE_DESCRIPTOR')
        REG_RESOURCE_REQUIREMENTS_LIST = Missing('REG_RESOURCE_REQUIREMENTS_LIST')

        prefix, _, encoded = data.partition(':')
        try:
            decoder = {
                'hex(0)' : REG_NONE,
                'hex(1)' : REG_SZ,
                'hex(2)' : REG_EXPAND_SZ,
                'hex(3)' : REG_BINARY,
                'hex'    : REG_BINARY,
                'hex(4)' : REG_DWORD,
                'dword'  : REG_DWORD,
                'hex(5)' : REG_DWORD_BIG_ENDIAN,
                'hex(6)' : REG_LINK,
                'hex(7)' : REG_MULTI_SZ,
                'hex(8)' : REG_RESOURCE_LIST,
                'hex(9)' : REG_FULL_RESOURCE_DESCRIPTOR,
                'hex(a)' : REG_RESOURCE_REQUIREMENTS_LIST,
                'hex(b)' : REG_QWORD,
            }[prefix]
        except KeyError:
            # Without a known type prefix, the entire text is treated as a quoted string.
            decoder = REG_SZ
            encoded = data
        if isinstance(decoder, Missing):
            self.log_warn(F'Found registry type {decoder!s}; no decoder implemented.')
            return
        self.log_debug(F'decoding as {decoder.__name__}: {encoded}')
        it = decoder(encoded)
        if not inspect.isgenerator(it):
            it = (it,)
        yield from it

    def _unpack_file(self, data: bytearray):
        """
        Parse the input as a textual registry export. Raises ParseException when the input
        does not begin with the registry editor header in any of the attempted encodings.
        """
        for codec in ('utf16', 'utf-16le', 'utf8'):
            try:
                reg = data.decode(codec).splitlines(keepends=True)
            except UnicodeError:
                continue
            # Empty input decodes to an empty list of lines; it has no header either and
            # must be rejected with ParseException rather than fail with an IndexError.
            if reg and reg[0].startswith('Windows Registry Editor'):
                break
        else:
            raise ParseException
        config = WinRegFileParser()
        config.read_string(''.join(reg[1:]))
        for key in config.sections():
            self.log_debug(key)
            for value in config[key]:
                name = next(iter(shlex.split(value)))
                path = Path(key) / Path(name)
                data = config[key][value]
                decoded = list(self._decode_registry_export(data))
                if len(decoded) == 1:
                    yield UnpackResult(str(path), decoded[0])
                    continue
                # Zero or several results: disambiguate them with a numeric suffix.
                for k, d in enumerate(decoded):
                    yield UnpackResult(F'{path!s}.{k}', d)

    def unpack(self, data):
        # Attempt to parse a binary hive first; when that raises ParseException, fall
        # through and treat the input as a textual export instead.
        with contextlib.suppress(ParseException):
            yield from self._unpack_hive(data)
            return
        yield from self._unpack_file(data)
class wshenc (marker=True)
-
This unit is implemented in
refinery.units.encoding.wshenc
and has the following commandline Interface:usage: wshenc [-h] [-L] [-Q] [-0] [-v] [-R] [-m] Windows Scripting Host encoding and decoding of VBScript (VBS/VBE) and JScript (JS/JSE). optional arguments: -m, --no-marker Do not require magic marker when encoding and do not search for marker when decoding. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class wshenc(Unit):
    """
    Windows Scripting Host encoding and decoding of VBScript (VBS/VBE) and JScript (JS/JSE).
    """

    _MARKER_INIT = RB'#@~^BINREF=='
    _MARKER_STOP = RB'BINREF==^#~@'

    # Decoder table: three candidate bytes for each encoded byte value, starting at 0x09.
    # The entry for byte B occupies the indices 3 * (B - 9) through 3 * (B - 9) + 2.
    _CHUNKS = (
        0x57, 0x6E, 0x7B, 0x4A, 0x4C, 0x41, 0x0B, 0x0B, 0x0B, 0x0C, 0x0C, 0x0C,
        0x4A, 0x4C, 0x41, 0x0E, 0x0E, 0x0E, 0x0F, 0x0F, 0x0F, 0x10, 0x10, 0x10,
        0x11, 0x11, 0x11, 0x12, 0x12, 0x12, 0x13, 0x13, 0x13, 0x14, 0x14, 0x14,
        0x15, 0x15, 0x15, 0x16, 0x16, 0x16, 0x17, 0x17, 0x17, 0x18, 0x18, 0x18,
        0x19, 0x19, 0x19, 0x1A, 0x1A, 0x1A, 0x1B, 0x1B, 0x1B, 0x1C, 0x1C, 0x1C,
        0x1D, 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F, 0x2E, 0x2D, 0x32,
        0x47, 0x75, 0x30, 0x7A, 0x52, 0x21, 0x56, 0x60, 0x29, 0x42, 0x71, 0x5B,
        0x6A, 0x5E, 0x38, 0x2F, 0x49, 0x33, 0x26, 0x5C, 0x3D, 0x49, 0x62, 0x58,
        0x41, 0x7D, 0x3A, 0x34, 0x29, 0x35, 0x32, 0x36, 0x65, 0x5B, 0x20, 0x39,
        0x76, 0x7C, 0x5C, 0x72, 0x7A, 0x56, 0x43, 0x7F, 0x73, 0x38, 0x6B, 0x66,
        0x39, 0x63, 0x4E, 0x70, 0x33, 0x45, 0x45, 0x2B, 0x6B, 0x68, 0x68, 0x62,
        0x71, 0x51, 0x59, 0x4F, 0x66, 0x78, 0x09, 0x76, 0x5E, 0x62, 0x31, 0x7D,
        0x44, 0x64, 0x4A, 0x23, 0x54, 0x6D, 0x75, 0x43, 0x71, 0x4A, 0x4C, 0x41,
        0x7E, 0x3A, 0x60, 0x4A, 0x4C, 0x41, 0x5E, 0x7E, 0x53, 0x40, 0x4C, 0x40,
        0x77, 0x45, 0x42, 0x4A, 0x2C, 0x27, 0x61, 0x2A, 0x48, 0x5D, 0x74, 0x72,
        0x22, 0x27, 0x75, 0x4B, 0x37, 0x31, 0x6F, 0x44, 0x37, 0x4E, 0x79, 0x4D,
        0x3B, 0x59, 0x52, 0x4C, 0x2F, 0x22, 0x50, 0x6F, 0x54, 0x67, 0x26, 0x6A,
        0x2A, 0x72, 0x47, 0x7D, 0x6A, 0x64, 0x74, 0x39, 0x2D, 0x54, 0x7B, 0x20,
        0x2B, 0x3F, 0x7F, 0x2D, 0x38, 0x2E, 0x2C, 0x77, 0x4C, 0x30, 0x67, 0x5D,
        0x6E, 0x53, 0x7E, 0x6B, 0x47, 0x6C, 0x66, 0x34, 0x6F, 0x35, 0x78, 0x79,
        0x25, 0x5D, 0x74, 0x21, 0x30, 0x43, 0x64, 0x23, 0x26, 0x4D, 0x5A, 0x76,
        0x52, 0x5B, 0x25, 0x63, 0x6C, 0x24, 0x3F, 0x48, 0x2B, 0x7B, 0x55, 0x28,
        0x78, 0x70, 0x23, 0x29, 0x69, 0x41, 0x28, 0x2E, 0x34, 0x73, 0x4C, 0x09,
        0x59, 0x21, 0x2A, 0x33, 0x24, 0x44, 0x7F, 0x4E, 0x3F, 0x6D, 0x50, 0x77,
        0x55, 0x09, 0x3B, 0x53, 0x56, 0x55, 0x7C, 0x73, 0x69, 0x3A, 0x35, 0x61,
        0x5F, 0x61, 0x63, 0x65, 0x4B, 0x50, 0x46, 0x58, 0x67, 0x58, 0x3B, 0x51,
        0x31, 0x57, 0x49, 0x69, 0x22, 0x4F, 0x6C, 0x6D, 0x46, 0x5A, 0x4D, 0x68,
        0x48, 0x25, 0x7C, 0x27, 0x28, 0x36, 0x5C, 0x46, 0x70, 0x3D, 0x4A, 0x6E,
        0x24, 0x32, 0x7A, 0x79, 0x41, 0x2F, 0x37, 0x3D, 0x5F, 0x60, 0x5F, 0x4B,
        0x51, 0x4F, 0x5A, 0x20, 0x42, 0x2C, 0x36, 0x65, 0x57)

    # For each position (modulo 64) in the stream, which of the three candidates applies.
    _OFFSETS = (
        0, 1, 2, 0, 1, 2, 1, 2, 2, 1, 2, 1, 0, 2, 1, 2,
        0, 2, 1, 2, 0, 0, 1, 2, 2, 1, 0, 2, 1, 2, 2, 1,
        0, 0, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 2, 0, 2, 1,
        0, 2, 1, 2, 0, 0, 1, 2, 2, 0, 0, 1, 2, 0, 2, 1)

    # Encoder table: maps a plain byte to its three possible encodings, one per offset.
    _ENCODER = {
        0x09 : [0x37, 0x69, 0x64],
        0x0B : [0x0B, 0x0B, 0x0B],
        0x0C : [0x0C, 0x0C, 0x0C],
        0x0E : [0x0E, 0x0E, 0x0E],
        0x0F : [0x0F, 0x0F, 0x0F],
        0x10 : [0x10, 0x10, 0x10],
        0x11 : [0x11, 0x11, 0x11],
        0x12 : [0x12, 0x12, 0x12],
        0x13 : [0x13, 0x13, 0x13],
        0x14 : [0x14, 0x14, 0x14],
        0x15 : [0x15, 0x15, 0x15],
        0x16 : [0x16, 0x16, 0x16],
        0x17 : [0x17, 0x17, 0x17],
        0x18 : [0x18, 0x18, 0x18],
        0x19 : [0x19, 0x19, 0x19],
        0x1A : [0x1A, 0x1A, 0x1A],
        0x1B : [0x1B, 0x1B, 0x1B],
        0x1C : [0x1C, 0x1C, 0x1C],
        0x1D : [0x1D, 0x1D, 0x1D],
        0x1E : [0x1E, 0x1E, 0x1E],
        0x1F : [0x1F, 0x1F, 0x1F],
        0x20 : [0x7E, 0x2C, 0x50],
        0x21 : [0x5A, 0x65, 0x22],
        0x22 : [0x45, 0x72, 0x4A],
        0x23 : [0x3A, 0x5B, 0x61],
        0x24 : [0x79, 0x66, 0x5E],
        0x25 : [0x59, 0x75, 0x5D],
        0x26 : [0x27, 0x4C, 0x5B],
        0x27 : [0x76, 0x45, 0x42],
        0x28 : [0x63, 0x76, 0x60],
        0x29 : [0x62, 0x2A, 0x23],
        0x2A : [0x4D, 0x43, 0x65],
        0x2B : [0x51, 0x33, 0x5F],
        0x2C : [0x53, 0x42, 0x7E],
        0x2D : [0x52, 0x20, 0x4F],
        0x2E : [0x20, 0x63, 0x52],
        0x2F : [0x26, 0x4A, 0x7A],
        0x30 : [0x54, 0x5A, 0x21],
        0x31 : [0x71, 0x38, 0x46],
        0x32 : [0x2B, 0x79, 0x20],
        0x33 : [0x66, 0x32, 0x26],
        0x34 : [0x2A, 0x57, 0x63],
        0x35 : [0x58, 0x6C, 0x2A],
        0x36 : [0x7F, 0x2B, 0x76],
        0x37 : [0x7B, 0x46, 0x47],
        0x38 : [0x30, 0x52, 0x25],
        0x39 : [0x31, 0x4F, 0x2C],
        0x3A : [0x6C, 0x3D, 0x29],
        0x3B : [0x49, 0x70, 0x69],
        0x3D : [0x78, 0x7B, 0x27],
        0x3F : [0x5F, 0x51, 0x67],
        0x40 : [0x40, None, 0x40],
        0x41 : [0x29, 0x7A, 0x62],
        0x42 : [0x24, 0x7E, 0x41],
        0x43 : [0x2F, 0x3B, 0x5A],
        0x44 : [0x39, 0x47, 0x66],
        0x45 : [0x33, 0x41, 0x32],
        0x46 : [0x6F, 0x77, 0x73],
        0x47 : [0x21, 0x56, 0x4D],
        0x48 : [0x75, 0x5F, 0x43],
        0x49 : [0x28, 0x26, 0x71],
        0x4A : [0x42, 0x78, 0x39],
        0x4B : [0x46, 0x6E, 0x7C],
        0x4C : [0x4A, 0x64, 0x53],
        0x4D : [0x5C, 0x74, 0x48],
        0x4E : [0x48, 0x67, 0x31],
        0x4F : [0x36, 0x7D, 0x72],
        0x50 : [0x4B, 0x68, 0x6E],
        0x51 : [0x7D, 0x35, 0x70],
        0x52 : [0x5D, 0x22, 0x49],
        0x53 : [0x6A, 0x55, 0x3F],
        0x54 : [0x50, 0x3A, 0x4B],
        0x55 : [0x69, 0x60, 0x6A],
        0x56 : [0x23, 0x6A, 0x2E],
        0x57 : [0x09, 0x71, 0x7F],
        0x58 : [0x70, 0x6F, 0x28],
        0x59 : [0x65, 0x49, 0x35],
        0x5A : [0x74, 0x5C, 0x7D],
        0x5B : [0x2C, 0x5D, 0x24],
        0x5C : [0x77, 0x27, 0x2D],
        0x5D : [0x44, 0x59, 0x54],
        0x5E : [0x3F, 0x25, 0x37],
        0x5F : [0x6D, 0x7C, 0x7B],
        0x60 : [0x7C, 0x23, 0x3D],
        0x61 : [0x43, 0x6D, 0x6C],
        0x62 : [0x38, 0x28, 0x34],
        0x63 : [0x5E, 0x31, 0x6D],
        0x64 : [0x5B, 0x39, 0x4E],
        0x65 : [0x6E, 0x7F, 0x2B],
        0x66 : [0x57, 0x36, 0x30],
        0x67 : [0x4C, 0x54, 0x6F],
        0x68 : [0x34, 0x34, 0x74],
        0x69 : [0x72, 0x62, 0x6B],
        0x6A : [0x25, 0x4E, 0x4C],
        0x6B : [0x56, 0x30, 0x33],
        0x6C : [0x73, 0x5E, 0x56],
        0x6D : [0x68, 0x73, 0x3A],
        0x6E : [0x55, 0x09, 0x78],
        0x6F : [0x47, 0x4B, 0x57],
        0x70 : [0x32, 0x61, 0x77],
        0x71 : [0x35, 0x24, 0x3B],
        0x72 : [0x2E, 0x4D, 0x44],
        0x73 : [0x64, 0x6B, 0x2F],
        0x74 : [0x4F, 0x44, 0x59],
        0x75 : [0x3B, 0x21, 0x45],
        0x76 : [0x2D, 0x37, 0x5C],
        0x77 : [0x41, 0x53, 0x68],
        0x78 : [0x61, 0x58, 0x36],
        0x79 : [0x7A, 0x48, 0x58],
        0x7A : [0x22, 0x2E, 0x79],
        0x7B : [0x60, 0x50, 0x09],
        0x7C : [0x6B, 0x2D, 0x75],
        0x7D : [0x4E, 0x29, 0x38],
        0x7E : [0x3D, 0x3F, 0x55],
        0x7F : [0x67, 0x2F, 0x51]
    }

    # Bytes that are replaced by a two-character escape sequence in the encoded stream.
    _ESCAPE = {
        0x40: B'@$',
        0x3C: B'@!',
        0x3E: B'@*',
        0x0D: B'@#',
        0x0A: B'@&',
    }

    _UNESCAPE = {
        B'@$': B'@',
        B'@!': B'<',
        B'@*': B'>',
        B'@#': B'\r',
        B'@&': B'\n',
    }

    def __init__(
        self,
        marker: Arg.Switch('-m', '--no-marker', off=True, help=(
            'Do not require magic marker when encoding and do not search for '
            'marker when decoding.')
        ) = True
    ):
        super().__init__(marker=marker)

    @classmethod
    def _chunk(cls, byte, index):
        """
        Return the decoded value of the encoded `byte` at stream position `index`.
        """
        k = byte - 9
        c = cls._CHUNKS[k * 3 : k * 3 + 3]
        return c[cls._OFFSETS[index % 64]]

    def _escape(self, iterable):
        """
        Yield the bytes from `iterable` with all special bytes replaced by their escape
        sequence. The output is wrapped in the start and stop markers unless disabled.
        """
        escapes = bytes(self._ESCAPE)
        if self.args.marker:
            yield from self._MARKER_INIT
        for byte in iterable:
            if byte in escapes:
                yield from self._ESCAPE[byte]
            else:
                yield byte
        if self.args.marker:
            yield from self._MARKER_STOP

    def _unescape(self, data):
        """
        Replace every two-character escape sequence in `data` by the byte it represents.
        """
        def unescaper(m):
            return self._UNESCAPE[m[0]]
        return re.sub(RB'@[$!*#&]', unescaper, data)

    @classmethod
    def _decoded(cls, data):
        """
        Yield the decoded bytes for the unescaped input `data`.
        """
        index = -1
        for byte in data:
            # Only bytes below 128 advance the position within the offset table.
            if byte < 128:
                index += 1
            # The bytes '<', '>' and '@' as well as most control characters are not substituted.
            if (byte == 9 or 31 < byte < 128) and byte != 60 and byte != 62 and byte != 64:
                byte = cls._chunk(byte, index)
            yield byte

    @classmethod
    def _encoded(cls, data):
        """
        Yield the substituted bytes for the plain input `data`. Escaping is not performed here.
        """
        for i, byte in enumerate(data):
            try:
                sequence = cls._ENCODER[byte]
            except KeyError:
                yield byte
            else:
                encoded = sequence[cls._OFFSETS[i % 0x40]]
                # The table has no substitution for '@' at offset 1. The decoder never
                # substitutes this byte, so it has to pass through unchanged; yielding the
                # missing entry would crash the subsequent escaping step.
                yield byte if encoded is None else encoded

    def reverse(self, data):
        return bytearray(self._escape(self._encoded(data)))

    def process(self, data):
        if self.args.marker:
            match = formats.wshenc.search(data)
            if not match:
                raise ValueError('Encoded script marker was not found.')
            # Strip the 12-byte start and stop markers.
            data = match[0][12:-12]
        return bytearray(self._decoded(self._unescape(data)))
class xchacha (key, stateful=False, discard=0, nonce=b'REFINERY', magic=b'', offset=0, rounds=20)
-
This unit is implemented in
refinery.units.crypto.cipher.chacha
and has the following commandline Interface:usage: xchacha [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-d N] [-m MAGIC] [-x N] [-r N] key [nonce] XChaCha encryption and decryption. The nonce must be 24 bytes long. positional arguments: key The encryption key. nonce The nonce. Default is the string REFINERY. optional arguments: -s, --stateful Do not reset the key stream while processing the chunks of one frame. -d, --discard N Discard the first N bytes of the keystream, 0 by default. -m, --magic MAGIC The magic constant; depends on the key size by default. -x, --offset N Optionally specify the stream index, default is 0. -r, --rounds N The number of rounds. Has to be an even number. Default is 20. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class xchacha(LatinCipherUnit):
    """
    XChaCha encryption and decryption. The nonce must be 24 bytes long.
    """
    def keystream(self) -> Iterable[int]:
        """
        Split the 24-byte nonce into its three parts and yield the key stream from `LatinX`.
        """
        args = self.args
        # Little endian layout: one unsigned 64-bit integer followed by two 8-byte strings.
        kdp, kdn, nonce = struct.unpack('<Q8s8s', args.nonce)
        # NOTE(review): presumably the state indices used by LatinX for key derivation;
        # confirm against the LatinX implementation.
        positions = (0, 1, 2, 3, 12, 13, 14, 15)
        stream = LatinX(
            ChaChaCipher,
            positions,
            args.key,
            kdn,
            kdp,
            nonce,
            args.magic,
            args.rounds,
            args.offset,
        )
        yield from stream
class xfcc (variable='count', relative=False)
-
This unit is implemented in
refinery.units.meta.xfcc
and has the following commandline Interface:usage: xfcc [-h] [-L] [-Q] [-0] [-v] [-r] [variable] The cross frame chunk count unit! It computes the number of times a chunk occurs across several frames of input. It consumes all frames in the current and counts the number of times each item occurs. It converts a frame tree of depth 2 into a new frame tree of depth 2 where the parent of every leaf has this leaf as its only child. The leaves of this tree have been enriched with a meta variable containing the number of times the corresponding chunk has occurred in the input frame tree. positional arguments: variable The variable which is used as the accumulator optional arguments: -r, --relative Normalize the accumulator to a number between 0 and 1. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class xfcc(Unit):
    """
    The cross frame chunk count unit! It computes the number of times a chunk occurs across several frames
    of input. It consumes all frames in the current and counts the number of times each item occurs. It
    converts a frame tree of depth 2 into a new frame tree of depth 2 where the parent of every leaf has
    this leaf as its only child. The leaves of this tree have been enriched with a meta variable containing
    the number of times the corresponding chunk has occurred in the input frame tree.
    """
    def __init__(
        self,
        variable: Arg(help='The variable which is used as the accumulator') = 'count',
        relative: Arg.Switch('-r', help='Normalize the accumulator to a number between 0 and 1.') = False
    ):
        super().__init__(variable=variable, relative=relative)
        # The path prefix (all but the last two components) of the chunks counted so far.
        self._trunk = None
        # Maps each distinct chunk to its accumulated count.
        self._store: Dict[Chunk, int] = defaultdict(int)

    def finish(self):
        """
        Emit every stored chunk once, annotated with its count, and reset the store.
        """
        name = self.args.variable
        normalize = self.args.relative
        tallies = self._store
        # The peak is only required, and only computable, for a non-empty store.
        peak = max(tallies.values()) if normalize and tallies else None
        for position, (chunk, tally) in enumerate(tallies.items()):
            if normalize:
                tally = tally / peak
            # Every emitted chunk becomes the single child of its own parent.
            chunk.path[-2] = 0
            chunk.path[-1] = position
            chunk.meta[name] = tally
            yield chunk
        tallies.clear()

    def _getcount(self, chunk):
        """
        Return the integer value of the accumulator variable of `chunk`, or 1 if the chunk does
        not carry a usable value.
        """
        try:
            return int(chunk.meta[self.args.variable])
        except (AttributeError, KeyError, TypeError):
            return 1

    def filter(self, chunks: Iterable[Chunk]):
        """
        Absorb all chunks into the store. Chunks are passed through unchanged when the frame is
        nested less than two layers deep. Previously stored chunks are emitted as soon as a
        chunk with a different path prefix arrives.
        """
        stream = iter(chunks)
        for head in stream:
            break
        else:
            # There are no chunks at all.
            return
        if len(head.path) < 2:
            self.log_warn(F'the current frame is nested {len(head.path)} layers deep, at least two layers are required.')
            yield head
            yield from stream
            return
        tallies = self._store
        trunk = head.path[:-2]
        if trunk != self._trunk:
            yield from self.finish()
            self._trunk = trunk
        tallies[head] += self._getcount(head)
        for chunk in stream:
            tallies[chunk] += self._getcount(chunk)
class xj0 (fmt='', all=False, one=False, raw=False)
-
This unit is implemented in
refinery.units.formats.json
and has the following commandline Interface:usage: xj0 [-h] [-L] [-Q] [-0] [-v] [-a | -x] [-r] [fmt] Extracts a single field from a JSON document at depth 0. By default, the unit applies a heuristic to extract remaining fields as metadata: String values are extracted only if they do not exceed 80 characters in length and do not contain any line breaks. Floating-point, integer, boolean values, and lists of the latter are also extracted. positional arguments: fmt Format expression for the output chunk; may use previously extracted JSON items. The default is , which represents the input data. optional arguments: -a, --all Extract all other fields as metadata regardless of length and type. -x, --one Do not extract any other fields as metadata. -r, --raw Disable conversion of JSON strings to binary strings in metadata generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class xj0(Unit):
    """
    Extracts a single field from a JSON document at depth 0. By default, the unit applies a heuristic to
    extract remaining fields as metadata: String values are extracted only if they do not exceed 80
    characters in length and do not contain any line breaks. Floating-point, integer, boolean values, and
    lists of the latter are also extracted.
    """
    def __init__(
        self,
        fmt: Unit.Arg.String(help=(
            'Format expression for the output chunk; may use previously extracted JSON items. The default '
            'is {default}, which represents the input data.')) = '',
        all: Unit.Arg.Switch('-a', group='META', help='Extract all other fields as metadata regardless of length and type.') = False,
        one: Unit.Arg.Switch('-x', group='META', help='Do not extract any other fields as metadata.') = False,
        raw: Unit.Arg.Switch('-r', help='Disable conversion of JSON strings to binary strings in metadata') = False,
    ):
        super().__init__(fmt=fmt, one=one, raw=raw, all=all)

    def process(self, data: Chunk):
        def convert(value, iskey=False):
            # Recursively encode all strings unless raw output was requested. Values of any
            # other type than the ones listed below result in None.
            if self.args.raw:
                return value
            if isinstance(value, (float, int, bool)):
                return value
            if isinstance(value, str):
                return value.encode(self.codec)
            if iskey:
                raise TypeError
            if isinstance(value, dict):
                return {convert(k): convert(v) for k, v in value.items()}
            if isinstance(value, list):
                return [convert(k) for k in value]

        def acceptable(key, value, nested=False):
            # Decide whether the given item is to be extracted as a meta variable. This has to
            # return a proper boolean: returning the value itself would reject the perfectly
            # valid scalars 0, 0.0, and False, as well as every list that contains them.
            if not is_valid_variable_name(key):
                self.log_info(F'rejecting item with invalid name {key}')
                return False
            if isinstance(value, (float, int, bool)):
                return True
            if isinstance(value, dict):
                if not self.args.all:
                    self.log_info(F'rejecting item {key} with dictionary value')
                    return False
                return True
            if isinstance(value, list):
                if nested:
                    self.log_info(F'rejecting item {key} containing a doubly nested list')
                    return False
                return all(acceptable(key, t, True) for t in value)
            if isinstance(value, str):
                if not self.args.all:
                    # Empty strings are rejected by this check as well.
                    if len(value) not in range(1, 80):
                        self.log_info(F'rejecting string item {key} because {len(value)} exceeds the length limit')
                        return False
                    if '\n' in value:
                        self.log_info(F'rejecting string item {key} because it contains line breaks')
                        return False
                return True
            return False

        jdoc: dict = json.loads(data)
        if not isinstance(jdoc, dict):
            raise ValueError('The input must be a JSON dictionary.')
        meta = metavars(data)
        args = {k: convert(v) for k, v in jdoc.items() if acceptable(k, v)}
        used = set()
        data[:] = meta.format_bin(self.args.fmt, self.codec, [data], args, used)
        # Items that were consumed by the format expression are not added as metadata.
        for u in used:
            args.pop(u, None)
        if not self.args.one:
            data.meta.update(args)
        return data
class xjl
-
This unit is implemented in
refinery.units.formats.json
and has the following commandline Interface:usage: xjl [-h] [-L] [-Q] [-0] [-v] [-R] Returns all JSON elements from a JSON iterable as individual outputs. When reversed, the unit collects all chunks in the frame and wraps them as a JSON list. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class xjl(Unit):
    """
    Returns all JSON elements from a JSON iterable as individual outputs. When reversed, the unit
    collects all chunks in the frame and wraps them as a JSON list.
    """
    def process(self, data):
        try:
            doc: Union[list, dict] = json.loads(data)
        except Exception:
            # The input is not a clean JSON document; attempt to carve one from the data.
            from refinery.units.pattern.carve_json import carve_json
            doc = data | carve_json | json.loads
        try:
            it = doc.values()
        except AttributeError:
            # Not a dictionary: iterate the document itself.
            it = doc
        for item in it:
            yield json.dumps(item, indent=4).encode(self.codec)

    def reverse(self, data):
        # The list of collected strings was attached to the chunk by the filter method.
        return json.dumps(data.temp).encode(self.codec)

    def filter(self, chunks: Iterable[Chunk]):
        """
        In forward mode, all chunks pass through unchanged. In reverse mode, all chunks of the
        frame are decoded and attached as a list to the first chunk, which is the only output.
        """
        if not self.args.reverse:
            yield from chunks
            # Forward mode is complete at this point. Without returning here, execution would
            # continue into the reverse mode logic below, which can emit the first chunk a
            # second time when the input can be iterated more than once.
            return
        from refinery.lib.tools import begin
        if it := begin(chunks):
            head, rest = it
            collected = [head.decode(self.codec)]
            collected.extend(chunk.decode(self.codec) for chunk in rest)
            head.temp = collected
            yield head
class xkey (range=slice(1, 32, None))
-
This unit is implemented in
refinery.units.misc.xkey
and has the following commandline Interface:usage: xkey [-h] [-L] [-Q] [-0] [-v] [start:end:step] The unit expects encrypted input which was encrypted byte-wise with a polyalphabetic key, and where the plaintext also has one letter that occurs with overwhelming frequency. This is often the case for the zero byte in binary formats such as PE files, and the space character in text files. Based on this assumption, the unit computes the most likely key. This can be useful to decrypt PE and uncompressed text files that were encrypted byte-wise using a short key. positional arguments: start:end:step range of length values to try in Python slice syntax, the default is 1:32. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class xkey(Unit):
    """
    The unit expects encrypted input which was encrypted byte-wise with a polyalphabetic key, and where the
    plaintext also has one letter that occurs with overwhelming frequency. This is often the case for the
    zero byte in binary formats such as PE files, and the space character in text files. Based on this
    assumption, the unit computes the most likely key. This can be useful to decrypt PE and uncompressed
    text files that were encrypted byte-wise using a short key.
    """
    def __init__(
        self,
        range: Arg.Bounds(help='range of length values to try in Python slice syntax, the default is {default}.') = slice(1, 32),
    ):
        super().__init__(range=range)

    def process(self, data: bytearray):
        bounds: slice = self.args.range
        view = memoryview(data)
        n = len(view)

        # Inputs of at most one byte are returned unchanged.
        if n <= 1:
            return view

        step = 1 if bounds.step is None else bounds.step
        start = bounds.start or 1
        # With an explicit step but no explicit start, the first tested length is the step.
        if bounds.step is not None and bounds.start is None:
            start *= step
        stop = min(bounds.stop or n, n)

        self.log_debug(F'received input range [{bounds.start}:{bounds.stop}:{bounds.step}], using [{start}:{stop}:{step}]')

        best_score = 0
        best_guess = None

        for length in range(start, stop + 1, step):
            # For each key position, determine the most frequent byte and how often it occurs.
            winners = [Counter(view[j::length]).most_common(1)[0] for j in range(length)]
            rating = sum(occurrences for _, occurrences in winners) / n
            # This scaling accounts for the smaller probability of larger keys. No proper statistical analysis has been
            # conducted to derive it; there might be plenty of room for improvement here.
            rating = rating * ((n - length) / (n - 1)) ** length
            message = F'got score {rating * 100:5.2f}% for length {length}'
            if rating <= best_score:
                self.log_debug(message)
                continue
            self.log_info(message)
            best_score = rating
            best_guess = bytearray(letter for letter, _ in winners)

        return best_guess
class xlmdeobf (extract_only=False, sort_formulas=False, with_ms_excel=False, day=-1, output_formula_format='CELL:[[CELL-ADDR]], [[STATUS]], [[INT-FORMULA]]', extract_formula_format='CELL:[[CELL-ADDR]], [[CELL-FORMULA]], [[CELL-VALUE]]', no_indent=False, start_point='', password='', output_level=0, timeout=0)
-
This unit is implemented in
refinery.units.formats.office.xlmdeobf
and has the following commandline Interface: usage: xlmdeobf [-h] [-L] [-Q] [-0] [-v] [-x] [-s] [-X] [-d N] [-O FMT] [-E FMT] [-I] [-c CELL] [-p PASSWORD] [-o N] [-t N] Wrapper around XLMMacroDeobfuscator to decode obfuscated Excel v4.0 (XLM) macros. optional arguments: -x, --extract-only Only extract cells without any emulation. -s, --sort-formulas Sort extracted formulas based on their cell address (implies -x). -X, --with-ms-excel Use MS Excel to process XLS files. -d, --day N Specify the day of month -O, --output-format FMT Specify the format for output formulas (using [[CELL-ADDR]], [[INT-FORMULA]], and [[STATUS]]) -E, --extract-format FMT Specify the format for extracted formulas (using [[CELL-ADDR]], [[CELL-FORMULA]], and [[CELL-VALUE]]) -I, --no-indent Do not show indent before formulas -c, --start-point CELL Start interpretation from a specific cell address -p, --password PASSWORD Password to decrypt the protected document -o, --output-level N Set the level of details to be shown (0:all commands, 1: commands no jump 2:important commands 3:strings in important commands). -t, --timeout N Stop emulation after N seconds (0: not interruption N>0: stop emulation after N seconds) generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class xlmdeobf(Unit):
    """
    Wrapper around XLMMacroDeobfuscator to decode obfuscated Excel v4.0 (XLM) macros.
    """

    def __init__(
        self,
        extract_only: Unit.Arg.Switch(
            '-x', help='Only extract cells without any emulation.'
        ) = False,
        sort_formulas: Unit.Arg.Switch(
            '-s', '--sort-formulas',
            help='Sort extracted formulas based on their cell address (implies -x).',
        ) = False,
        with_ms_excel: Unit.Arg.Switch(
            '-X', '--with-ms-excel', help='Use MS Excel to process XLS files.'
        ) = False,
        day: Unit.Arg.Number(
            '-d', '--day',
            help='Specify the day of month',
        ) = -1,
        output_formula_format: Unit.Arg(
            '-O', '--output-format',
            type=str,
            metavar='FMT',
            help='Specify the format for output formulas (using [[CELL-ADDR]], [[INT-FORMULA]], and [[STATUS]])',
        ) = 'CELL:[[CELL-ADDR]], [[STATUS]], [[INT-FORMULA]]',
        extract_formula_format: Unit.Arg(
            '-E', '--extract-format',
            metavar='FMT',
            type=str,
            help='Specify the format for extracted formulas (using [[CELL-ADDR]], [[CELL-FORMULA]], and [[CELL-VALUE]])',
        ) = 'CELL:[[CELL-ADDR]], [[CELL-FORMULA]], [[CELL-VALUE]]',
        no_indent: Unit.Arg.Switch(
            '-I', '--no-indent',
            help='Do not show indent before formulas',
        ) = False,
        start_point: Unit.Arg(
            '-c', '--start-point',
            type=str,
            help='Start interpretation from a specific cell address',
            metavar='CELL',
        ) = '',
        password: Unit.Arg(
            '-p', '--password',
            type=str,
            help='Password to decrypt the protected document',
        ) = '',
        output_level: Unit.Arg.Number(
            '-o', '--output-level',
            help=(
                'Set the level of details to be shown (0:all commands, 1: commands no jump 2:important '
                'commands 3:strings in important commands).'
            ),
        ) = 0,
        timeout: Unit.Arg.Number(
            '-t', '--timeout',
            help='Stop emulation after N seconds (0: not interruption N>0: stop emulation after N seconds)',
        ) = 0,
    ):
        # Sorting formulas implies that cells are only extracted.
        extract_only = sort_formulas or extract_only
        # The snapshot of all local variables is forwarded as the argument set. No other
        # local name may be introduced before this call.
        self.superinit(super(), **vars())

    @Unit.Requires('XLMMacroDeobfuscator', 'formats', 'office')
    def _process_file():
        # The library is silenced before its entry point is imported.
        from XLMMacroDeobfuscator.configs import settings
        settings.SILENT = True
        from XLMMacroDeobfuscator.deobfuscator import process_file
        return process_file

    def process(self, data: bytearray):
        """
        Run the deobfuscator on the input, which is exposed to the library as a virtual file,
        and return the resulting lines joined by line breaks.
        """
        args = self.args
        # All options that are forwarded from the unit arguments or fixed by this wrapper.
        options = dict(
            noninteractive=True,
            return_deobfuscated=True,
            extract_only=args.extract_only,
            silent=True,
            sort_formulas=args.sort_formulas,
            defined_names=False,
            with_ms_excel=args.with_ms_excel,
            start_with_shell=False,
            day=args.day,
            output_formula_format=args.output_formula_format,
            extract_formula_format=args.extract_formula_format,
            no_indent=args.no_indent,
            start_point=args.start_point,
            password=args.password,
            output_level=args.output_level,
            timeout=args.timeout,
        )
        with VirtualFileSystem() as vfs, NoLogging():
            deobfuscate = self._process_file
            lines = deobfuscate(file=vfs.new(data), **options)
        return '\n'.join(lines).encode(self.codec)
class xlxtr (*references)
-
This unit is implemented in
refinery.units.formats.office.xlxtr
and has the following commandline Interface: usage: xlxtr [-h] [-L] [-Q] [-0] [-v] [reference [reference ...]] Extract data from Microsoft Excel documents, both Legacy and new XML type documents. A sheet reference is of the form B1 or 1.2, both specifying the first cell of the second column. A cell range can be specified as B1:C12, or 1.2:C12, or 1.2:12.3. Finally, the unit will always refer to the first sheet in the document and to change this, specify the sheet name or index separated by a hashtag, i.e. sheet#B1:C12 or 1#B1:C12. Note that indices are 1-based. To get all elements of one sheet, use sheet#. If parsing a sheet reference fails, the unit will assume that the given reference specifies a sheet. positional arguments: reference A sheet reference to be extracted. If no sheet references are given, the unit lists all sheet names. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class xlxtr(_ExcelUnit):
    """
    Extract data from Microsoft Excel documents, both Legacy and new XML type documents. A sheet reference is of the form `B1` or `1.2`,
    both specifying the first cell of the second column. A cell range can be specified as `B1:C12`, or `1.2:C12`, or `1.2:12.3`. Finally,
    the unit will always refer to the first sheet in the document and to change this, specify the sheet name or index separated by a
    hashtag, i.e. `sheet#B1:C12` or `1#B1:C12`. Note that indices are 1-based. To get all elements of one sheet, use `sheet#`. If
    parsing a sheet reference fails, the unit will assume that the given reference specifies a sheet.
    """
    def __init__(self, *references: Arg(metavar='reference', type=SheetReference, help=(
        'A sheet reference to be extracted. '
        'If no sheet references are given, the unit lists all sheet names.'
    ))):
        # Fall back to the wildcard reference if none was given.
        if not references:
            references = [SheetReference('*')]
        super().__init__(references=references)

    def process(self, data):
        """
        Yield the contents of each non-empty cell matched by any of the references, labelled
        with its row, column, cell reference, and sheet name.
        """
        workbook = Workbook(data, self)

        def cells(reference: SheetReference, sheet):
            # Rows and columns are enumerated starting at 1.
            for r, row in enumerate(workbook.get_sheet_data(sheet), 1):
                for c, value in enumerate(row, 1):
                    if (r, c) in reference and value is not None:
                        yield self.labelled(
                            str(value).encode(self.codec),
                            row=r,
                            col=c,
                            ref=_rc2ref(r, c),
                            sheet=sheet
                        )

        for reference in self.args.references:
            for index, sheet in enumerate(workbook.sheets()):
                if reference.match(index, sheet):
                    yield from cells(reference, sheet)
class xor (argument, bigendian=False, blocksize=None)
-
This unit is implemented in
refinery.units.blockwise.xor
and has the following commandline Interface:usage: xor [-h] [-L] [-Q] [-0] [-v] [-E] [-B N] argument Form the exclusive or of the input data with the given argument. positional arguments: argument A single numeric expression which provides the right argument to the operation, where the left argument is each block in the input data. This argument can also contain a sequence of bytes which is then split into blocks of the same size as the input data and used cyclically. optional arguments: -E, --bigendian Read chunks in big endian. -B, --blocksize N The size of each block in bytes, default is 1. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class xor(BinaryOperationWithAutoBlockAdjustment):
    """
    Form the exclusive or of the input data with the given argument.
    """
    @staticmethod
    def operate(a, b):
        return a ^ b

    @staticmethod
    def inplace(a, b):
        a ^= b

    def _fastblock(self, data):
        """
        Attempt the fast path of the parent class first. If that fails, and the optional
        Cryptodome dependency is available, use its strxor function instead; otherwise, the
        original error is raised again.
        """
        try:
            return super()._fastblock(data)
        except FastBlockError as failure:
            try:
                from Cryptodome.Util.strxor import strxor
            except ModuleNotFoundError:
                raise failure
            from itertools import islice
            length = len(data)
            parsed = self._argument_parse_hook(self.args.argument[0])
            blocks = self._normalize_argument(*parsed)
            # Take one block more than required and truncate the result to the data size.
            count = len(data) // self.blocksize + 1
            key = self.unchunk(islice(blocks, count))
            del key[length:]
            return strxor(data, key)
class xsalsa (key, stateful=False, discard=0, nonce=b'REFINERY', magic=b'', offset=0, rounds=20)
-
This unit is implemented in
refinery.units.crypto.cipher.salsa
and has the following commandline Interface:usage: xsalsa [-h] [-L] [-Q] [-0] [-v] [-R] [-s] [-d N] [-m MAGIC] [-x N] [-r N] key [nonce] XSalsa encryption and decryption. The nonce must be 24 bytes long. positional arguments: key The encryption key. nonce The nonce. Default is the string REFINERY. optional arguments: -s, --stateful Do not reset the key stream while processing the chunks of one frame. -d, --discard N Discard the first N bytes of the keystream, 0 by default. -m, --magic MAGIC The magic constant; depends on the key size by default. -x, --offset N Optionally specify the stream index, default is 0. -r, --rounds N The number of rounds. Has to be an even number. Default is 20. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class xsalsa(LatinCipherUnit):
    """
    XSalsa encryption and decryption. The nonce must be 24 bytes long.
    """
    def keystream(self) -> Iterable[int]:
        """
        Split the 24-byte nonce into its three parts and yield the key stream from `LatinX`.
        """
        args = self.args
        # Little endian layout: an 8-byte string, one unsigned 64-bit integer, and another
        # 8-byte string.
        kdn, kdp, nonce = struct.unpack('<8sQ8s', args.nonce)
        # NOTE(review): presumably the state indices used by LatinX for key derivation;
        # confirm against the LatinX implementation.
        positions = (0, 5, 10, 15, 6, 7, 8, 9)
        stream = LatinX(
            SalsaCipher,
            positions,
            args.key,
            kdn,
            kdp,
            nonce,
            args.magic,
            args.rounds,
            args.offset,
        )
        yield from stream
class xt (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
This unit is implemented in
refinery.units.formats.archive.xt
and has the following commandline Interface:usage: xt [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD] [path [path ...]] Extract files from archives. The unit tries to identify the archive format and use the correct extractor. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". -D, --date NAME Name of the meta variable to receive the extracted file date. The default value is "date". -p, --pwd PWD Optionally specify an extraction password. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xt(ArchiveUnit):
    """
    Extract files from archives. The unit tries to identify the archive format and use the
    correct extractor.
    """
    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        # Aggregate the verdicts of all handlers:
        #  - True  as soon as any handler positively identifies the data,
        #  - None  if no handler said True but at least one was undecided (None),
        #  - False if every handler rejected the data.
        out = False
        for engine in cls.handlers():
            engine_verdict = engine.handles(data)
            if engine_verdict is True:
                return True
            if engine_verdict is None:
                out = None
        return out

    @staticmethod
    def handlers():
        """
        Returns all archive handlers supported by the unit.
        """
        # Each import sits directly in front of its yield, so a handler module is only
        # imported once iteration actually reaches it. The order of the yields is the
        # order in which handles() and unpack() consult the handlers.
        from refinery.units.formats.office.xtone import xtone
        yield xtone
        from refinery.units.formats.archive.xtgz import xtgz
        yield xtgz
        from refinery.units.formats.email import xtmail
        yield xtmail
        from refinery.units.formats.pdf import xtpdf
        yield xtpdf
        from refinery.units.formats.archive.xtasar import xtasar
        yield xtasar
        from refinery.units.formats.office.xtrtf import xtrtf
        yield xtrtf
        from refinery.units.formats.archive.xtzpaq import xtzpaq
        yield xtzpaq
        from refinery.units.formats.pe.dotnet.dnsfx import dnsfx
        yield dnsfx
        from refinery.units.formats.archive.xtnsis import xtnsis
        yield xtnsis
        from refinery.units.formats.archive.xtnode import xtnode
        yield xtnode
        from refinery.units.formats.archive.xtace import xtace
        yield xtace
        from refinery.units.formats.archive.xtcab import xtcab
        yield xtcab
        from refinery.units.formats.archive.xtcpio import xtcpio
        yield xtcpio
        from refinery.units.formats.archive.xtiso import xtiso
        yield xtiso
        from refinery.units.formats.archive.xtpyi import xtpyi
        yield xtpyi
        from refinery.units.formats.archive.xttar import xttar
        yield xttar
        from refinery.units.formats.archive.xtiss import xtiss
        yield xtiss
        from refinery.units.formats.archive.xtzip import xtzip
        yield xtzip
        from refinery.units.formats.archive.xt7z import xt7z
        yield xt7z
        from refinery.units.formats.msi import xtmsi
        yield xtmsi
        from refinery.units.formats.archive.xtmacho import xtmacho
        yield xtmacho
        from refinery.units.formats.archive.xtnuitka import xtnuitka
        yield xtnuitka
        from refinery.units.formats.office.xtdoc import xtdoc
        yield xtdoc
        from refinery.units.formats.json import xtjson
        yield xtjson
        from refinery.units.formats.exe.vsect import vsect
        yield vsect

    def unpack(self, data):
        # Two-pass strategy:
        #  1. Ask every handler whether it handles the data; run those that say True and
        #     remember those that are undecided (None) in `fallback`.
        #  2. If nothing succeeded, run the undecided handlers unconditionally.
        # The generator returns after the first handler that completes without an error.
        fallback: List[Type[ArchiveUnit]] = []
        # Maps handler name -> exception raised while unpacking in the first pass.
        errors = {}
        # Arguments forwarded to the constructor of each handler.
        pos_args = self.args.paths
        key_args = dict(
            list=self.args.list,
            path=self.args.path,
            date=self.args.date,
            join_path=self.args.join,
            drop_path=self.args.drop,
        )
        # Password and regex are only forwarded when they are set.
        if self.args.pwd:
            key_args.update(pwd=self.args.pwd)
        if self.args.regex:
            key_args.update(regex=self.args.regex)

        class unpacker:
            # Iterable wrapper around a single handler attempt. After iteration, the
            # attribute `success` tells whether the handler ran through without error.
            unit = self

            def __init__(self, handler: Type[ArchiveUnit], fallback: bool):
                self.success = False
                self.handler = handler
                self.fallback = fallback

            def __iter__(self):
                handler = self.handler
                if self.fallback:
                    # Fallback handlers were undecided before; they are simply tried.
                    verdict = True
                else:
                    verdict = handler.handles(data)
                if verdict is False:
                    self.unit.log_info(F'rejected: {handler.name}')
                elif verdict is True:
                    if not self.fallback:
                        self.unit.log_info(F'accepted: {handler.name}')
                    try:
                        unit = handler(*pos_args, **key_args)
                        unit.args.lenient = self.unit.args.lenient
                        unit.args.quiet = self.unit.args.quiet
                    except TypeError as error:
                        # NOTE(review): presumably raised when the handler's constructor
                        # does not accept one of the forwarded keywords - confirm.
                        self.unit.log_debug('handler construction failed:', error)
                        return
                    try:
                        # Unless only listing, get_data() is called on the first item
                        # before it is yielded; presumably so that an extraction failure
                        # surfaces before any output has been produced.
                        test_unpack = not self.unit.args.list
                        for item in unit.unpack(data):
                            if test_unpack:
                                item.get_data()
                                test_unpack = False
                            yield item
                    except Exception as error:
                        # Errors of fallback handlers are not recorded.
                        if not self.fallback:
                            errors[handler.name] = error
                        if isinstance(error, MultipleArchives):
                            self.unit.log_warn(error)
                        else:
                            self.unit.log_debug('handler unpacking failed:', error)
                    else:
                        self.success = True
                elif verdict is None:
                    fallback.append(handler)

        # First pass: handlers that positively identify the data.
        for handler in self.handlers():
            # NOTE(review): the path separator is only adopted in this first pass, not
            # in the fallback pass below - confirm that this is intended.
            self.CustomPathSeparator = handler.CustomPathSeparator
            it = unpacker(handler, fallback=False)
            yield from it
            if it.success:
                return

        # The message is passed as a callable; presumably it is only evaluated when
        # debug logging is active.
        self.log_debug('fallback order:', lambda: ', '.join(h.name for h in fallback))

        # Second pass: handlers that were undecided.
        for handler in fallback:
            it = unpacker(handler, fallback=True)
            yield from it
            if it.success:
                return

        # Nothing succeeded. Without recorded errors, no handler ever accepted the data.
        if not errors:
            raise ValueError('input data did not match any known archive format')
        for name, error in errors.items():
            self.log_info(F'error when trying to unpack with {name}:', error)
        raise RefineryException('none of the available unpackers could handle this data')
Static methods
def handlers()
-
Returns all archive handlers supported by the unit.
Expand source code Browse git
@staticmethod
def handlers():
    """
    Returns all archive handlers supported by the unit.
    """
    # This is a generator: each import sits directly in front of its yield, so a
    # handler module is only imported once iteration actually reaches it. Consumers
    # receive the handlers in exactly the order listed here.
    from refinery.units.formats.office.xtone import xtone
    yield xtone
    from refinery.units.formats.archive.xtgz import xtgz
    yield xtgz
    from refinery.units.formats.email import xtmail
    yield xtmail
    from refinery.units.formats.pdf import xtpdf
    yield xtpdf
    from refinery.units.formats.archive.xtasar import xtasar
    yield xtasar
    from refinery.units.formats.office.xtrtf import xtrtf
    yield xtrtf
    from refinery.units.formats.archive.xtzpaq import xtzpaq
    yield xtzpaq
    from refinery.units.formats.pe.dotnet.dnsfx import dnsfx
    yield dnsfx
    from refinery.units.formats.archive.xtnsis import xtnsis
    yield xtnsis
    from refinery.units.formats.archive.xtnode import xtnode
    yield xtnode
    from refinery.units.formats.archive.xtace import xtace
    yield xtace
    from refinery.units.formats.archive.xtcab import xtcab
    yield xtcab
    from refinery.units.formats.archive.xtcpio import xtcpio
    yield xtcpio
    from refinery.units.formats.archive.xtiso import xtiso
    yield xtiso
    from refinery.units.formats.archive.xtpyi import xtpyi
    yield xtpyi
    from refinery.units.formats.archive.xttar import xttar
    yield xttar
    from refinery.units.formats.archive.xtiss import xtiss
    yield xtiss
    from refinery.units.formats.archive.xtzip import xtzip
    yield xtzip
    from refinery.units.formats.archive.xt7z import xt7z
    yield xt7z
    from refinery.units.formats.msi import xtmsi
    yield xtmsi
    from refinery.units.formats.archive.xtmacho import xtmacho
    yield xtmacho
    from refinery.units.formats.archive.xtnuitka import xtnuitka
    yield xtnuitka
    from refinery.units.formats.office.xtdoc import xtdoc
    yield xtdoc
    from refinery.units.formats.json import xtjson
    yield xtjson
    from refinery.units.formats.exe.vsect import vsect
    yield vsect
class xt7z (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
This unit is implemented in
refinery.units.formats.archive.xt7z
and has the following commandline Interface:usage: xt7z [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD] [path [path ...]] Extract files from a 7zip archive. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". -D, --date NAME Name of the meta variable to receive the extracted file date. The default value is "date". -p, --pwd PWD Optionally specify an extraction password. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xt7z(ArchiveUnit):
    """
    Extract files from a 7zip archive.
    """
    @ArchiveUnit.Requires('py7zr', 'arc', 'default', 'extended')
    def _py7zr():
        # Importing the exceptions submodule here makes `self._py7zr.exceptions`
        # available to the error handling in _unpack_from.
        import py7zr
        import py7zr.exceptions
        return py7zr

    def unpack(self, data: bytearray):
        # Every occurrence of the 7z signature is a candidate archive start. Candidates
        # are tried in order of appearance; a candidate that raises Bad7zFile is skipped,
        # and the search stops after the first candidate that unpacks without error.
        for match in re.finditer(re.escape(B'7z\xBC\xAF\x27\x1C'), data):
            start = match.start()
            if start != 0:
                self.log_info(F'found a header at offset 0x{start:X}, trying to extract from there.')
            try:
                yield from self._unpack_from(data, start)
            except self._py7zr.Bad7zFile:
                continue
            else:
                break

    def _unpack_from(self, data: bytearray, zp: int = 0):
        # Unpacks the archive that begins at offset `zp` inside `data`.

        def mk7z(**keywords):
            # `mv` is assigned further down, before this helper is first called.
            return self._py7zr.SevenZipFile(MemoryFile(mv[zp:]), **keywords)

        pwd = self.args.pwd
        mv = memoryview(data)

        def test(archive: SevenZipFile):
            # Returns a truthy value when the archive has a problem. In list mode only
            # the listing is exercised and no problem is reported.
            if self.args.list:
                archive.list()
                return False
            return archive.testzip()

        if pwd:
            # An explicit password was given: use it and nothing else.
            try:
                archive = mk7z(password=pwd.decode(self.codec))
            except self._py7zr.Bad7zFile:
                raise ValueError('corrupt archive; the password is likely invalid.')
        else:
            def passwords():
                # First no password at all, then the list of common default passwords.
                yield None
                yield from self._COMMON_PASSWORDS
            for pwd in passwords():
                if pwd is None:
                    self.log_debug(U'trying empty password')
                else:
                    self.log_debug(F'trying password: {pwd}')
                try:
                    archive = mk7z(password=pwd)
                    problem = test(archive)
                except self._py7zr.PasswordRequired:
                    problem = True
                except self._py7zr.UnsupportedCompressionMethodError as E:
                    raise ValueError(E.message)
                except self._py7zr.exceptions.InternalError:
                    # ignore internal errors during testzip
                    break
                except SystemError:
                    problem = True
                except Exception:
                    # With no password, an unexpected error is a genuine failure and is
                    # propagated; with a candidate password it counts as a wrong guess.
                    if pwd is None:
                        raise
                    problem = True
                if not problem:
                    break
            else:
                # The loop ran to completion without a break: no candidate worked.
                raise ValueError('a password is required and none of the default passwords worked.')

        for info in archive.list():
            # The archive and the entry are bound as default arguments so that each
            # closure keeps its own entry instead of the last loop value.
            def extract(archive: SevenZipFile = archive, info: FileInfo = info):
                archive.reset()
                return archive.read([info.filename]).get(info.filename).read()
            if info.is_directory:
                continue
            yield self._pack(
                info.filename,
                info.creationtime,
                extract,
                crc32=info.crc32,
                uncompressed=info.uncompressed)

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        # The signature may occur anywhere in the data, not only at the start.
        return B'7z\xBC\xAF\x27\x1C' in data
class xtace (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
This unit is implemented in
refinery.units.formats.archive.xtace
and has the following commandline Interface:usage: xtace [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD] [path [path ...]] Extract files from an ACE archive. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". -D, --date NAME Name of the meta variable to receive the extracted file date. The default value is "date". -p, --pwd PWD Optionally specify an extraction password. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xtace(ArchiveUnit):
    """
    Extract files from an ACE archive.
    """
    def unpack(self, data):
        archive = acefile.open(MemoryFile(data, read_as_bytes=True))
        for entry in archive.getmembers():
            # Only attach a comment to the result when the member actually has one.
            extras = {'comment': entry.comment} if entry.comment else {}

            def extract(source=archive, target=entry):
                # The password is read when the extraction is performed, not here.
                return source.read(target, pwd=self.args.pwd)

            yield self._pack(entry.filename, entry.datetime, extract, **extras)

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        # The ACE magic has to appear within the first 0x100 bytes.
        head = data[:0x100]
        return b'**ACE**' in head
class xtasar (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
This unit is implemented in
refinery.units.formats.archive.xtasar
and has the following commandline Interface:usage: xtasar [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD] [path [path ...]] Extract files from a ASAR archive. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". -D, --date NAME Name of the meta variable to receive the extracted file date. The default value is "date". -p, --pwd PWD Optionally specify an extraction password. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xtasar(ArchiveUnit):
    """
    Extract files from a ASAR archive.
    """
    def unpack(self, data: bytearray):
        # Walks the directory tree of the ASAR header recursively and yields one result
        # per entry that carries both an offset and a size.

        def _unpack(node: JSONDict, *path):
            # Children first: every entry below 'files' is a nested listing.
            for name, listing in node.get('files', {}).items():
                yield from _unpack(listing, *path, name)
            # Entries without offset and size (e.g. pure directories) yield nothing.
            try:
                offset = node['offset']
                size = node['size']
            except KeyError:
                return
            # int() raises TypeError for unsupported types such as None, but ValueError
            # for strings that are not valid integers. Both mean the entry is malformed
            # and both are skipped with a warning. The converted values are kept in
            # separate names so that the warning always shows the raw header values.
            try:
                start = int(offset) + header.base
                end = int(size) + start
            except (TypeError, ValueError):
                self.log_warn(F'unable to convert offset "{offset}" and size "{size}" to integers')
                return
            if not path:
                self.log_warn(F'not processing item at root with offset {start} and size {size}')
                return
            yield UnpackResult(
                '/'.join(path),
                # Bind the bounds as defaults so each result slices its own range.
                lambda a=start, b=end: data[a:b],
                offset=start
            )

        header = AsarHeader(data)
        self.log_debug(F'header read successfully, base offset is {header.base}.')
        yield from _unpack(header.directory)

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        # Checks the fixed four-byte prefix and the start of the JSON header at 0x10.
        return data.startswith(b'\04\0\0\0') and data[0x10:0x18] == B'{"files"'
class xtcab (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
This unit is implemented in
refinery.units.formats.archive.xtcab
and has the following commandline Interface:usage: xtcab [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD] [path [path ...]] Extract files from CAB (cabinet) archives. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". -D, --date NAME Name of the meta variable to receive the extracted file date. The default value is "date". -p, --pwd PWD Optionally specify an extraction password. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xtcab(ArchiveUnit):
    """
    Extract files from CAB (cabinet) archives.
    """
    @ArchiveUnit.Requires('cabarchive', 'arc', 'default', 'extended')
    def _cabarchive():
        import cabarchive
        return cabarchive

    def unpack(self, data: bytearray):
        cabinet = self._cabarchive.CabArchive(data)
        # The wildcard selects every file in the cabinet.
        for entry in cabinet.find_files('*'):
            name = entry.filename
            # Date and time are stored separately and merged into one timestamp.
            timestamp = datetime.combine(entry.date, entry.time)
            yield self._pack(name, timestamp, entry.buf)

    @classmethod
    def handles(cls, data: bytearray):
        signature = B'MSCF'
        return data.startswith(signature)
class xtcpio (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
This unit is implemented in
refinery.units.formats.archive.xtcpio
and has the following commandline Interface:usage: xtcpio [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD] [path [path ...]] Extract files from a CPIO archive. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". -D, --date NAME Name of the meta variable to receive the extracted file date. The default value is "date". -p, --pwd PWD Optionally specify an extraction password. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xtcpio(ArchiveUnit):
    """
    Extract files from a CPIO archive.
    """
    def unpack(self, data):
        reader = StructReader(memoryview(data))
        while True:
            # Running out of input while parsing an entry ends the extraction quietly.
            try:
                entry = CPIOEntry(reader)
            except EOF:
                return
            # The trailer entry marks the end of the archive.
            if entry.name == 'TRAILER!!!':
                return
            yield self._pack(entry.name, entry.mtime, entry.data)

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        # Despite the annotation, the result is three-valued: False when none of the
        # known signatures match, True when a signature matches and a trailer marker is
        # present, and None when a signature matches but no trailer marker is found.
        signatures = (B'\x71\xC7', B'\xC7\x71', B'0707')
        if not data.startswith(signatures):
            return False
        if B'TRAILER!!' in data:
            return True
        return None
class xtdoc (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
This unit is implemented in
refinery.units.formats.office.xtdoc
and has the following commandline Interface:usage: xtdoc [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]] Extract files from an OLE document such as a Microsoft Word DOCX file. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xtdoc(PathExtractorUnit):
    """
    Extract files from an OLE document such as a Microsoft Word DOCX file.
    """
    @PathExtractorUnit.Requires('olefile', 'formats', 'office', 'extended')
    def _olefile():
        import olefile
        return olefile

    def unpack(self, data):
        with MemoryFile(data) as stream:
            try:
                document = self._olefile.OleFileIO(stream)
            except OSError as error:
                # Not an OLE container: hand the input to the zip extractor instead.
                self.log_info(F'error, {error}, treating input as zip file')
                yield from xtzip().unpack(data)
                return
            for parts in document.listdir():
                # Skip empty listings and listings whose final component is empty.
                if not parts or not parts[-1]:
                    continue
                joined = '/'.join(parts)
                # The stream is opened under its original name, before any renaming.
                handle = document.openstream(joined)
                leaf = parts[-1]
                lead = ord(leaf[:1])
                if lead < 20:
                    # A low leading character code is rendered as a bracketed number.
                    parts[-1] = F'[{lead:d}]{leaf[1:]}'
                    joined = '/'.join(parts)
                joined = convert_msi_name(joined)
                self.log_debug('exploring:', joined)
                yield UnpackResult(joined, handle.read())

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        # The OLE compound file magic is a definite match.
        if data.startswith(B'\xD0\xCF\x11\xE0'):
            return True
        # Anything that is not a zip file either leaves the verdict undecided.
        if not xtzip.handles(data):
            return None
        # A zip file counts when at least two of the known member names occur in it.
        markers = (
            B'[Content_Types].xml',
            B'word/document.xml',
            B'docProps/core.xml',
        )
        hits = [marker for marker in markers if marker in data]
        return len(hits) >= 2
class xtea (key, iv=b'', padding=None, mode=None, raw=False, swap=False)
-
This unit is implemented in
refinery.units.crypto.cipher.xtea
and has the following commandline Interface:usage: xtea [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-s] key XTEA encryption and decryption. positional arguments: key The encryption key. optional arguments: -i, --iv IV Specifies the initialization vector. If none is specified, then a block of zero bytes is used. -p, --padding P Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does nothing. By default, all other algorithms are attempted. In most cases, the data was not correctly decrypted if none of these work. -m, --mode M Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB, OFB, PCBC. By default, the CBC mode is used when an IV is provided, and ECB otherwise. -r, --raw Set the padding to raw; ignored when a padding is specified. -s, --swap Decode blocks as big endian rather than little endian. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class xtea(TEAUnit, cipher=BlockCipherFactory(XTEA)):
    """
    XTEA encryption and decryption.
    """
    # Nothing is defined here: the class consists of its base class TEAUnit and the
    # XTEA block cipher factory that is passed as the class keyword argument.
class xtgz (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
This unit is implemented in
refinery.units.formats.archive.xtgz
and has the following commandline Interface:usage: xtgz [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD] [path [path ...]] Extract a file from a GZip archive. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". -D, --date NAME Name of the meta variable to receive the extracted file date. The default value is "date". -p, --pwd PWD Optionally specify an extraction password. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xtgz(ArchiveUnit):
    """
    Extract a file from a GZip archive.
    """
    def unpack(self, data: bytearray):
        header = GzipHeader(data)
        name = header.name
        mtime = header.mtime
        # A falsy modification time (zero or missing) is reported as "no date".
        timestamp = datetime.fromtimestamp(mtime) if mtime else None
        if name is None:
            # The header carries no file name: derive one from the path meta variable
            # of the input, or fall back to a fixed name when there is none.
            try:
                meta = metavars(data)
                name = Path(meta['path'])
            except KeyError:
                name = 'ungz'
            else:
                # NOTE(review): this warning emits the bare path without any message;
                # it looks like a debugging leftover - confirm before changing it.
                self.log_warn(name)
                extension = name.suffix
                if extension.lower() == '.gz':
                    # "file.gz" becomes "file".
                    name = name.with_suffix('')
                else:
                    # Any other name gets ".ungz" appended to its extension.
                    name = name.with_suffix(F'{extension}.ungz')
                name = name.as_posix()
        yield self._pack(name, timestamp, header.data)

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        magic = B'\x1F\x8B'
        return data.startswith(magic)
class xthtml (*paths, outer=False, attributes=False, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
This unit is implemented in
refinery.units.formats.html
and has the following commandline Interface:usage: xthtml [-h] [-L] [-Q] [-0] [-v] [-F] [-o] [-a] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]] The unit processes an HTML document and extracts the contents of all elements in the DOM of the given tag. The main purpose is to extract scripts from HTML documents. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -o, --outer Include the HTML tags for an extracted element. -a, --attributes Populate chunk metadata with HTML tag attributes. -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each pattern, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xthtml(XMLToPathExtractorUnit):
    """
    The unit processes an HTML document and extracts the contents of all elements in the DOM of the
    given tag. The main purpose is to extract scripts from HTML documents.
    """
    def __init__(
        self, *paths,
        outer: Arg.Switch('-o', help='Include the HTML tags for an extracted element.') = False,
        attributes: Arg.Switch('-a', help='Populate chunk metadata with HTML tag attributes.') = False,
        list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False,
        path=b'path'
    ):
        # The path format is fixed to the tag name; everything else is passed through.
        super().__init__(
            *paths,
            outer=outer,
            attributes=attributes,
            format='{tag}',
            path=path,
            list=list,
            join_path=join_path,
            drop_path=drop_path,
            fuzzy=fuzzy,
            exact=exact,
            regex=regex,
        )

    def unpack(self, data):
        html = HTMLTreeParser()
        html.feed(data.decode(self.codec))
        root = html.tos
        root.reindex()
        meta = metavars(data)
        path = self._make_path_builder(meta, root)
        # The parser may stop inside unclosed tags: climb up to the topmost node and
        # report every tag that was left open on the way.
        while root.parent:
            self.log_info(F'tag was not closed: {root.tag}')
            root = root.parent
        # Descend through chains of single children that carry the same tag as their
        # parent, so that the extraction starts at the innermost such node.
        while len(root.children) == 1:
            child, = root.children
            if child.tag != root.tag:
                break
            root = child

        def tree(node: HTMLNode, *parts: str):
            # Yields the result for `node` and then, recursively, for all of its
            # non-textual children. `parts` are the path components down to `node`.

            def outer(node: HTMLNode = node):
                return node.recover(inner=False).encode(self.codec)

            def inner(node: HTMLNode = node):
                return node.recover().encode(self.codec)

            tagpath = '/'.join(parts)
            attrs = {}
            if self.args.attributes:
                attrs.update(node.attributes)
            # The enclosing tags are only included on request, and never for a node
            # that is flagged as root.
            extract = outer if self.args.outer and not node.root else inner
            yield UnpackResult(tagpath, extract, **attrs)
            for child in node.children:
                if child.textual:
                    continue
                yield from tree(child, *parts, path(child))

        yield from tree(root, path(root))

    @classmethod
    def handles(cls, data: bytearray):
        # Imported locally, as in the original implementation.
        from refinery.lib import mime
        info = mime.get_cached_file_magic_info(data)
        # Either the detected extension or the detected MIME type may indicate HTML.
        if info.extension == 'html':
            return True
        if info.mime.endswith('html'):
            return True
        return False
class xtiso (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', fs='auto')
-
This unit is implemented in
refinery.units.formats.archive.xtiso
and has the following commandline Interface:usage: xtiso [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-s TYPE] [path [path ...]] Extract files from an ISO archive. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". -D, --date NAME Name of the meta variable to receive the extracted file date. The default value is "date". -s, --fs TYPE Specify a file system (udf, joliet, rr, iso, auto) extension to use. The default setting auto will automatically detect the first of the other available options and use it. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xtiso(ArchiveUnit):
    """
    Extract files from an ISO archive.
    """
    def __init__(
        self, *paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False,
        path=b'path', date=b'date',
        fs: Arg.Choice('-s', metavar='TYPE', choices=_ISO_FILE_SYSTEMS, help=(
            'Specify a file system ({choices}) extension to use. The default setting {default} will automatically '
            'detect the first of the other available options and use it.')) = 'auto'
    ):
        # Validate here as well, so that invalid values are rejected when the unit is used from code.
        if fs not in _ISO_FILE_SYSTEMS:
            raise ValueError(F'invalid file system {fs}: must be udf, joliet, rr, iso, or auto.')
        super().__init__(
            *paths,
            list=list,
            join_path=join_path,
            drop_path=drop_path,
            fuzzy=fuzzy,
            exact=exact,
            regex=regex,
            path=path,
            date=date,
            fs=fs
        )

    @ArchiveUnit.Requires('pycdlib', 'arc', 'default', 'extended')
    def _pycdlib():
        import pycdlib
        import pycdlib.dates

        def fixed_parse(self, datestr):
            # Overwrite the last three bytes of the date string before handing it to pycdlib.
            datestr = datestr[:-3] + b'00\0'
            return original_parse(self, datestr)

        # Replace the volume descriptor date parser of pycdlib with the wrapper above.
        original_parse = pycdlib.dates.VolumeDescriptorDate.parse
        pycdlib.dates.VolumeDescriptorDate.parse = fixed_parse
        return pycdlib

    @staticmethod
    def _strip_revision(name: str):
        """
        Remove a trailing `;N` suffix from the name when `N` consists of digits only.
        """
        base, split, revision = name.partition(';')
        return base if split and revision.isdigit() else name

    def unpack(self, data):
        """
        Walk the selected file system of the ISO image and yield one packed item per file.
        """
        if not self.handles(data):
            self.log_warn('The data does not look like an ISO file.')

        with MemoryFile(data, read_as_bytes=True) as stream:
            iso = self._pycdlib.PyCdlib()
            iso.open_fp(stream)
            fs = self.args.fs
            if fs != 'auto':
                mkfacade = {
                    'iso'    : iso.get_iso9660_facade,
                    'udf'    : iso.get_udf_facade,
                    'joliet' : iso.get_joliet_facade,
                    'rr'     : iso.get_rock_ridge_facade,
                }
                facade = mkfacade[fs]()
            elif iso.has_udf():
                self.log_info('using format: udf')
                facade = iso.get_udf_facade()
            elif iso.has_joliet():
                self.log_info('using format: joliet')
                facade = iso.get_joliet_facade()
            elif iso.has_rock_ridge():
                self.log_info('using format: rr')
                facade = iso.get_rock_ridge_facade()
            else:
                self.log_info('using format: iso')
                facade = iso.get_iso9660_facade()

            for root, _, files in facade.walk('/'):
                root = root.rstrip('/')
                for name in files:
                    name = name.lstrip('/')
                    path = F'{root}/{name}'
                    try:
                        info = facade.get_record(path)
                        date = info.date
                    except Exception:
                        # Best effort: the file is still extracted, just without size and date.
                        info = None
                        date = None
                    else:
                        try:
                            date = datetime.datetime(
                                date.years_since_1900 + 1900,
                                date.month,
                                date.day_of_month,
                                date.hour,
                                date.minute,
                                date.second,
                                tzinfo=datetime.timezone(datetime.timedelta(minutes=15 * date.gmtoffset))
                            )
                        except ValueError:
                            # Out-of-range date fields or UTC offsets must not abort the
                            # extraction of the archive; the file is emitted without a date.
                            self.log_info(F'ignoring invalid timestamp for: {path}')
                            date = None

                    def extract(info=info, path=path):
                        # Bound as defaults because extraction happens lazily, after the loop
                        # variables have moved on. The buffer is pre-sized if the length is known.
                        if info:
                            buffer = MemoryFile(bytearray(info.data_length))
                        else:
                            buffer = MemoryFile(bytearray())
                        facade.get_file_from_iso_fp(buffer, path)
                        return buffer.getvalue()

                    yield self._pack(self._strip_revision(path), date, extract)

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        """
        Check for the marker `CD001` at any of the three offsets 0x8001, 0x8801, and 0x9001.
        """
        return any(data[k] == B'CD001' for k in (
            slice(0x8001, 0x8006),
            slice(0x8801, 0x8806),
            slice(0x9001, 0x9006),
        ))
class xtiss (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
This unit is implemented in
refinery.units.formats.archive.xtiss
and has the following commandline Interface:usage: xtiss [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD] [path [path ...]] Extracts files from Install Shield Setup files. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". -D, --date NAME Name of the meta variable to receive the extracted file date. The default value is "date". -p, --pwd PWD Optionally specify an extraction password. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xtiss(ArchiveUnit):
    """
    Extracts files from Install Shield Setup files.
    """
    def unpack(self, data: bytearray):
        # Locate the right-most position at which any of the known magic sequences occurs.
        start = max(data.rfind(signature) for signature in ISSReader.MAGIC)
        if start < 0:
            raise ValueError('ISS magic not found.')
        # Discard everything in front of the archive; this modifies the input buffer in place.
        del data[:start]
        reader = ISSReader(data)
        total = reader.iss_archive_header()
        self.log_info(F'archive contains {total} files according to header')
        for _ in range(total):
            name, content = reader.iss_file()
            yield self._pack(name, None, content)

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        # Only data starting with MZ that contains a magic sequence past offset zero qualifies.
        if not data.startswith(B'MZ'):
            return False
        return any(data.find(signature) > 0 for signature in ISSReader.MAGIC)
class xtjson (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
This unit is implemented in
refinery.units.formats.json
and has the following commandline Interface:usage: xtjson [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]] Extract values from a JSON document. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xtjson(PathExtractorUnit):
    """
    Extract values from a JSON document.
    """
    CustomPathSeparator = '.'

    def unpack(self, data):
        """
        Yield one `UnpackResult` for every node of the parsed document that has a non-empty path.
        The Python type name of each value is attached as the `type` metadata field.
        """
        sep = self.CustomPathSeparator

        def crawl(path, cursor):
            # Children are emitted before the container that holds them.
            if isinstance(cursor, dict):
                for key, value in cursor.items():
                    yield from crawl(F'{path}{sep}{key}', value)
            elif isinstance(cursor, list):
                for key, value in enumerate(cursor):
                    yield from crawl(F'{path}{sep}{key:d}', value)
            # The document root is crawled with an empty path and is not emitted itself.
            if path:
                yield path, cursor, cursor.__class__.__name__

        for path, item, typename in crawl('', json.loads(data)):
            def extract(item=item):
                # Containers are serialized back to JSON, scalars are converted via str().
                if isinstance(item, (list, dict)):
                    dumped = json.dumps(item, indent=4)
                else:
                    dumped = str(item)
                try:
                    return dumped.encode('latin1')
                except UnicodeEncodeError:
                    # Strings with code points above U+00FF have no latin1 representation;
                    # use the codec of the unit rather than failing on valid JSON input.
                    return dumped.encode(self.codec)
            yield UnpackResult(path, extract, type=typename)

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        """
        Return whether the entire input matches the JSON format check.
        """
        return bool(checks.json.fullmatch(data))
class xtmacho (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
This unit is implemented in
refinery.units.formats.archive.xtmacho
and has the following commandline Interface:usage: xtmacho [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD] [path [path ...]] Extract the individual executables from a MachO universal binary (sometimes called a MachO fat file). positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". -D, --date NAME Name of the meta variable to receive the extracted file date. The default value is "date". -p, --pwd PWD Optionally specify an extraction password. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xtmacho(ArchiveUnit):
    """
    Extract the individual executables from a MachO universal binary (sometimes called a MachO fat file).
    """
    _SIGNATURE_BE = B'\xCA\xFE\xBA\xBE'
    _SIGNATURE_LE = B'\xBE\xBA\xFE\xCA'

    def unpack(self, data: bytearray):
        """
        Read the number of embedded executables from the header and yield each one, using the
        name of its CPU type as the path.

        Raises:
            ValueError: if the first four bytes are neither of the two known signatures.
        """
        view = memoryview(data)
        signature = bytes(view[:4])
        # Only the signature lookup is guarded, so that a KeyError raised by any other code
        # is not misreported as an invalid header.
        try:
            bigendian = {
                self._SIGNATURE_BE: True,
                self._SIGNATURE_LE: False,
            }[signature]
        except KeyError as KE:
            raise ValueError('Not a MachO universal binary; invalid magic header bytes.') from KE
        reader = StructReader(view, bigendian=bigendian)
        # Skip the four signature bytes; the next 32-bit integer is the number of entries.
        reader.seekset(4)
        count = reader.u32()
        self.log_info(F'reading {count} embedded executables')
        for _ in range(count):
            fa = FatArch(reader)
            self.log_info(F'reading item of size 0x{len(fa.data):08X}, arch {fa.cputype.name}')
            yield self._pack(fa.cputype.name, None, fa.data)

    @classmethod
    def handles(cls, data: bytearray):
        """
        Return whether the data starts with one of the two universal binary signatures.
        """
        return data[:4] in (
            cls._SIGNATURE_BE,
            cls._SIGNATURE_LE,
        )
class xtmagtape
-
This unit is implemented in
refinery.units.formats.archive.xtmagtape
and has the following commandline Interface:usage: xtmagtape [-h] [-L] [-Q] [-0] [-v] Extract files from SIMH magtape files. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class xtmagtape(Unit):
    """
    Extract files from SIMH magtape files.
    """
    def process(self, data: bytearray):
        # Yields one buffer per file; each file is the concatenation of consecutive records.
        reader = StructReader(data)
        # r counts the files on the tape, k counts the records within the current file.
        for r in itertools.count():
            buffer = MemoryFile()
            for k in itertools.count():
                try:
                    # Remember the raw 4-byte header (without consuming it) for the footer
                    # comparison below, then consume it as a size value and a marker byte.
                    # NOTE(review): read_integer(24) presumably reads a 24-bit size - confirm.
                    head = reader.peek(4)
                    size = reader.read_integer(24)
                    mark = reader.read_byte()
                except EOFError:
                    self.log_info('end of file while reading chunk header, terminating')
                    return
                if not any(head):
                    # An all-zero header ends the current file. When it is the very first
                    # header of a file, processing stops without emitting an empty buffer.
                    if k == 0:
                        return
                    break
                if mark != 0:
                    # A nonzero marker byte is reported, but the record is still read.
                    self.log_warn(F'error code 0x{mark:02X} in record {r}.{k}')
                buffer.write(reader.read(size))
                # The record data must be followed by a footer that repeats the header.
                if reader.peek(4) != head:
                    # At an odd offset, accept a single padding byte in front of the footer.
                    if reader.tell() % 2 and reader.peek(5)[1:] == head:
                        padding = reader.read_byte()
                        if padding != 0:
                            self.log_info(F'nonzero padding byte in record {r}.{k}')
                    else:
                        raise ValueError('Invalid footer, data is corrupted.')
                # Skip the 4-byte footer.
                reader.seekrel(4)
            yield buffer.getbuffer()
class xtmail (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
This unit is implemented in
refinery.units.formats.email
and has the following commandline Interface:usage: xtmail [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]] Extract files and body from EMail messages. The unit supports both the Outlook message format and regular MIME documents. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xtmail(PathExtractorUnit):
    """
    Extract files and body from EMail messages. The unit supports both the Outlook message format
    and regular MIME documents.
    """
    def _get_headparts(self, head):
        """
        Yield the synthetic items `headers.txt` and `headers.json` for the given sequence of
        header (key, value) pairs.
        """
        mw = mimewords()
        mw = partial(mw.process.__wrapped__.__wrapped__, mw)
        jh = defaultdict(list)
        for key, value in head:
            # Join folded header lines into a single line before decoding the value.
            jh[key].append(mw(''.join(t.lstrip() for t in value.splitlines(False))))
        # Headers that occur once map to their value; repeated ones map to the non-empty values.
        jh = {k: v[0] if len(v) == 1 else [t for t in v if t] for k, v in jh.items()}
        yield UnpackResult('headers.txt',
            lambda h=head: '\n'.join(F'{k}: {v}' for k, v in h).encode(self.codec))
        yield UnpackResult('headers.json',
            lambda jsn=jh: json.dumps(jsn, indent=4).encode(self.codec))

    @PathExtractorUnit.Requires('extract-msg<=0.41.0', 'formats', 'office', 'default', 'extended')
    def _extract_msg():
        import extract_msg.message
        import extract_msg.enums
        return extract_msg

    def _get_parts_outlook(self, data):
        """
        Parse the input as an Outlook message and yield headers, bodies, and attachments.
        """
        def ensure_bytes(data):
            return data if isinstance(data, bytes) else data.encode(self.codec)

        def make_message(name, msg):
            # Either body may be unavailable; a failure to read one must not prevent the other.
            with NoLogging():
                try:
                    htm = msg.htmlBody
                except Exception:
                    htm = None
                try:
                    txt = msg.body
                except Exception:
                    txt = None
            if txt:
                yield UnpackResult(F'{name}.txt', ensure_bytes(txt))
            if htm:
                yield UnpackResult(F'{name}.htm', ensure_bytes(htm))

        msgcount = 0

        with NoLogging():
            class ForgivingMessage(self._extract_msg.message.Message):
                """
                If parsing the input bytes fails early, the "__open" private attribute may not
                yet exist. This hack prevents an exception to occur in the destructor.
                """
                def __getattr__(self, key: str):
                    if key.endswith('_open'):
                        return False
                    raise AttributeError(key)

            msg = ForgivingMessage(bytes(data))

        yield from self._get_headparts(msg.header.items())
        yield from make_message('body', msg)

        def attachments(msg):
            # Recurse into every attachment whose type is not 'data'.
            for attachment in getattr(msg, 'attachments', ()):
                yield attachment
                if attachment.type == 'data':
                    continue
                yield from attachments(attachment.data)

        for attachment in attachments(msg):
            at = attachment.type
            if at is self._extract_msg.enums.AttachmentType.MSG:
                msgcount += 1
                yield from make_message(F'attachments/msg_{msgcount:d}', attachment.data)
                continue
            if not isbuffer(attachment.data):
                self.log_warn(F'unknown attachment of type {at}, please report this!')
                continue
            path = attachment.longFilename or attachment.shortFilename
            yield UnpackResult(F'attachments/{path}', attachment.data)

    @PathExtractorUnit.Requires('chardet', 'default', 'extended')
    def _chardet():
        import chardet
        return chardet

    def _get_parts_regular(self, data: bytes):
        """
        Parse the input as a plaintext email message and yield headers, bodies, and attachments.

        Raises:
            ValueError: if the data cannot be decoded using the detected encoding.
        """
        try:
            info = self._chardet.detect(data)
            msg = data.decode(info['encoding'])
        except UnicodeDecodeError:
            raise ValueError('This is not a plaintext email message.')
        else:
            msg = Parser().parsestr(msg)

        yield from self._get_headparts(msg.items())

        for k, part in enumerate(msg.walk()):
            path = part.get_filename()
            elog = None
            if path is None:
                # Parts without a file name are message bodies, named after their content type.
                extension = file_extension(part.get_content_type(), 'txt')
                path = F'body.{extension}'
            else:
                path = path | mimewords | str
                path = F'attachments/{path}'
            try:
                data = part.get_payload(decode=True)
            except Exception as E:
                # Decoding failed: obtain the raw payload and carve base64 data from it.
                try:
                    data = part.get_payload(decode=False)
                except Exception as E:
                    elog = str(E)
                    data = None
                else:
                    from refinery import carve
                    self.log_warn(F'manually decoding part {k}, data might be corrupted: {path}')
                    if isinstance(data, str):
                        data = data.encode('latin1')
                    if isbuffer(data):
                        data = next(data | carve('b64', stripspace=True, single=True, decode=True))
                    else:
                        elog = str(E)
                        data = None
            if not data:
                if elog is not None:
                    self.log_warn(F'could not get content of message part {k}: {elog!s}')
                continue
            yield UnpackResult(path, data)

    def unpack(self, data):
        # The Outlook parser is attempted first; any failure falls back to plaintext parsing.
        try:
            yield from self._get_parts_outlook(data)
        except Exception:
            self.log_debug('failed parsing input as Outlook message')
            yield from self._get_parts_regular(data)

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        """
        Return whether at least three of the known header markers occur in the data. For data
        that starts with the compound file signature, the markers are searched as UTF-16LE.
        """
        # Each marker is a separate list entry: a missing comma between two adjacent literals
        # would silently merge them into a single marker that practically never matches.
        markers = [
            b'\nReceived:\x20from',
            b'\nSubject:\x20',
            b'\nTo:\x20',
            b'\nFrom:\x20',
            B'\nMessage-ID:\x20',
            b'\nBcc:\x20',
            b'\nContent-Transfer-Encoding:\x20',
            b'\nContent-Type:\x20',
            b'\nReturn-Path:\x20',
        ]
        if data.startswith(B'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'):
            markers = [marker.decode('latin1').encode('utf-16le') for marker in markers]
        counter = 0
        for marker in markers:
            if re.search(re.escape(marker), data, flags=re.IGNORECASE):
                counter += 1
            if counter >= 3:
                return True
        return False
class xtmsi (*paths, list=False, path=b'path', join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, nocab=False)
-
This unit is implemented in
refinery.units.formats.msi
and has the following commandline Interface:usage: xtmsi [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-P NAME] [-j | -d] [-z | -e] [-r] [-N] [path [path ...]] Extract files and metadata from Microsoft Installer (MSI) archives. The synthetic file MsiTables.json contains parsed MSI table information, similar to the output of the Orca tool. Binary streams are placed in a virtual folder called "Binary", and extracted scripts from custom actions are separately extracted in a virtual folder named "Action". positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -N, --nocab Do not list and extract embedded CAB archives. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. 
Specify twice to drop all other chunks.
Expand source code Browse git
class xtmsi(xtdoc):
    """
    Extract files and metadata from Microsoft Installer (MSI) archives. The synthetic file {FN} contains
    parsed MSI table information, similar to the output of the Orca tool. Binary streams are placed in a
    virtual folder called "Binary", and extracted scripts from custom actions are separately extracted in
    a virtual folder named "Action".
    """

    _SYNTHETIC_STREAMS_FILENAME = 'MsiTables.json'
    _SYNTHETIC_STREAMS_TOPLEVEL = 'MsiTables'

    # https://learn.microsoft.com/en-us/windows/win32/msi/summary-list-of-all-custom-action-types
    _CUSTOM_ACTION_TYPES = {
        0x01: 'DLL file stored in a Binary table stream.',
        0x02: 'EXE file stored in a Binary table stream.',
        0x05: 'JScript file stored in a Binary table stream.',
        0x06: 'VBScript file stored in a Binary table stream.',
        0x11: 'DLL file that is installed with a product.',
        0x12: 'EXE file that is installed with a product.',
        0x13: 'Displays a specified error message and returns failure, terminating the installation.',
        0x15: 'JScript file that is installed with a product.',
        0x16: 'VBScript file that is installed with a product.',
        0x22: 'EXE file having a path referencing a directory.',
        0x23: 'Directory set with formatted text.',
        0x25: 'JScript text stored in this sequence table.',
        0x26: 'VBScript text stored in this sequence table.',
        0x32: 'EXE file having a path specified by a property value.',
        0x33: 'Property set with formatted text.',
        0x35: 'JScript text specified by a property value.',
        0x36: 'VBScript text specified by a property value.',
    }

    def __init__(
        self, *paths, list=False, path=b'path', join_path=False, drop_path=False,
        fuzzy=0, exact=False, regex=False,
        nocab: Arg.Switch('-N', help='Do not list and extract embedded CAB archives.') = False,
        **kw,
    ):
        super().__init__(
            *paths,
            list=list,
            path=path,
            join_path=join_path,
            drop_path=drop_path,
            nocab=nocab,
            fuzzy=fuzzy,
            exact=exact,
            regex=regex,
            **kw,
        )

    def unpack(self, data):
        """
        Unpack the streams of the document via the parent class, parse the MSI tables from
        them, and yield the remaining streams together with synthetic items: the JSON and CSV
        table dumps, scripts from custom actions, and the contents of embedded CAB archives.
        """
        streams = {result.path: result for result in super().unpack(data)}

        def stream(name: str):
            # Streams that are parsed as tables are removed from the output.
            return streams.pop(name).get_data()

        def column_formats(table: Dict[str, MSITableColumnInfo]) -> str:
            return ''.join(v.struct_format for v in table.values())

        def stream_to_rows(data: ByteStr, row_format: str):
            # The stream is read column by column: all values of the first column come
            # first, then all values of the second column, and so on.
            row_size = struct.calcsize(F'<{row_format}')
            row_count = int(len(data) / row_size)
            reader = StructReader(data)
            columns = [reader.read_struct(F'<{sc * row_count}') for sc in row_format]
            for i in range(row_count):
                yield [c[i] for c in columns]

        tables: Dict[str, Dict[str, MSITableColumnInfo]] = collections.defaultdict(collections.OrderedDict)
        strings = MSIStringData(stream('!_StringData'), stream('!_StringPool'))

        for tbl_name_id, col_number, col_name_id, col_attributes in stream_to_rows(stream('!_Columns'), 'HHHH'):
            tbl_name = strings.ref(tbl_name_id)
            col_name = strings.ref(col_name_id)
            tables[tbl_name][col_name] = MSITableColumnInfo(col_number, col_attributes)

        # Cross-check the tables that have column definitions against the table list stream.
        table_names_given = {strings.ref(k) for k in chunks.unpack(stream('!_Tables'), 2, False)}
        table_names_known = set(tables)

        for name in table_names_known - table_names_given:
            self.log_warn(F'table name known but not given: {name}')
        for name in table_names_given - table_names_known:
            self.log_warn(F'table name given but not known: {name}')

        class ScriptItem(NamedTuple):
            row_index: int
            extension: Optional[str]

        processed_table_data: Dict[str, List[Dict[str, str]]] = {}
        tbl_properties: Dict[str, str] = {}
        tbl_files: Dict[str, str] = {}
        tbl_components: Dict[str, str] = {}
        postprocessing: List[ScriptItem] = []

        def format_string(string: str):
            # https://learn.microsoft.com/en-us/windows/win32/msi/formatted
            def _replace(match: re.Match[str]):
                # Any replacement may produce new references, so request another pass.
                _replace.done = False
                prefix, name = match.groups()
                if not prefix:
                    tbl = tbl_properties
                elif prefix in '%':
                    name = name.rstrip('%').upper()
                    return F'%{name}%'
                elif prefix in '!#':
                    tbl = tbl_files
                elif prefix in '$':
                    tbl = tbl_components
                else:
                    raise ValueError
                return tbl.get(name, '')
            # Substitute repeatedly until a pass performs no replacement.
            while True:
                _replace.done = True
                string = re.sub(R'''(?x)
                    \[             # open square bracket
                    (?![~\\])      # not followed by escapes
                    ([%$!#]?)      # any of the valid prefix characters
                    ([^[\]{}]+)    # no brackets or braces
                    \]''', _replace, string)
                if _replace.done:
                    break
            # Finally resolve escaped characters and the null character reference.
            string = re.sub(r'\[\\(.)\]', r'\1', string)
            string = string.replace('[~]', '\0')
            return string

        for table_name, table in tables.items():
            stream_name = F'!{table_name}'
            if stream_name not in streams:
                continue
            processed = []
            info = list(table.values())
            for r, row in enumerate(stream_to_rows(stream(stream_name), column_formats(table))):
                values = []
                for index, value in enumerate(row):
                    vt = info[index].type
                    if vt is MsiType.Long:
                        # Nonzero integers are stored with an offset; zero is left unchanged.
                        if value != 0:
                            value -= 0x80000000
                    elif vt is MsiType.Short:
                        if value != 0:
                            value -= 0x8000
                    elif value in strings:
                        value = strings.ref(value)
                    elif not info[index].is_integer:
                        value = ''
                    values.append(value)
                # Collect the lookup tables used by format_string. Each table feeds the
                # dictionary that is consulted for its reference prefix; storing File and
                # Component rows among the properties would leave those lookups empty.
                if table_name == 'Property':
                    tbl_properties[values[0]] = values[1]
                if table_name == 'File':
                    tbl_files[values[0]] = values[2]
                if table_name == 'Component':
                    tbl_components[values[0]] = F'%{values[2]}%'
                entry = dict(zip(table, values))
                einfo = {t: i for t, i in zip(table, info)}
                if table_name == 'MsiFileHash':
                    # Uses the raw row values, with the top bit flipped, for the hash parts.
                    entry['Hash'] = struct.pack(
                        '<IIII',
                        row[2] ^ 0x80000000,
                        row[3] ^ 0x80000000,
                        row[4] ^ 0x80000000,
                        row[5] ^ 0x80000000,
                    ).hex()
                if table_name == 'CustomAction':
                    # Only the low six bits of the raw type value select the action type.
                    code = row[1] & 0x3F
                    try:
                        entry['Comment'] = self._CUSTOM_ACTION_TYPES[code]
                    except LookupError:
                        pass
                    t = einfo.get('Target')
                    c = {0x25: 'js', 0x26: 'vbs', 0x33: None}
                    if code in c and t and not t.is_integer:
                        postprocessing.append(ScriptItem(r, c[code]))
                processed.append(entry)
            if processed:
                processed_table_data[table_name] = processed

        # Extract scripts from custom actions; this requires all tables to be parsed already.
        ca = processed_table_data.get('CustomAction', None)
        for item in postprocessing:
            entry = ca[item.row_index]
            try:
                path: str = entry['Action']
                data: str = entry['Target']
            except KeyError:
                continue
            root = F'Action/{path}'
            if item.extension:
                # The target is the script itself.
                path = F'{root}.{item.extension}'
                streams[path] = UnpackResult(path, data.encode(self.codec))
                continue
            # Otherwise the formatted target must consist of parts separated by \x01, each a
            # name and a value separated by \x02; parts named like scripts are extracted.
            data = format_string(data)
            parts = [part.partition('\x02') for part in data.split('\x01')]
            if not all(part[1] == '\x02' for part in parts):
                continue
            for name, _, script in parts:
                if not name.lower().startswith('script'):
                    continue
                if not script:
                    continue
                path = F'{root}.{name}'
                streams[path] = UnpackResult(path, script.encode(self.codec))

        for ignored_stream in [
            '[5]SummaryInformation',
            '[5]DocumentSummaryInformation',
            '[5]DigitalSignature',
            '[5]MsiDigitalSignatureEx'
        ]:
            streams.pop(ignored_stream, None)

        # Compare the computed string reference counts against the ones stored in the file.
        inconsistencies = 0
        w1 = len(str(len(strings)))
        w2 = len(str(max(max(strings.computed_ref_count), max(strings.provided_ref_count))))
        for k in range(len(strings)):
            c = strings.computed_ref_count[k]
            p = strings.provided_ref_count[k]
            if c != p and not self.log_debug(F'string {k:0{w1}d} reference count computed={c:0{w2}d} provided={p:0{w2}d}'):
                inconsistencies += 1
        if inconsistencies:
            self.log_info(F'found {inconsistencies} incorrect string reference counts')

        def fix_msi_path(path: str):
            # Streams named Table.Name are moved into a virtual folder named after the table.
            prefix, dot, name = path.partition('.')
            if dot == '.' and prefix in processed_table_data:
                path = F'{prefix}/{name}'
            return path

        if self.args.nocab:
            cabs = {}
        else:
            def _iscab(path):
                return media_info and any(item.get('Cabinet', '') == F'#{path}' for item in media_info)
            media_info: List[JSONDict] = processed_table_data.get('Media', [])
            cabs: Dict[str, UnpackResult] = {
                path: item for path, item in streams.items() if _iscab(path)}
            for cab in cabs:
                self.log_info(F'found cab file: {cab}')

        if cabs:
            from refinery.units.formats.archive.xtcab import xtcab
            # Map the file keys to the file names given in the File table; for names of the
            # form short|long, the part after the separator is used.
            file_names: Dict[str, JSONDict] = {}
            for file_info in processed_table_data.get('File', []):
                try:
                    src_name = file_info['File']
                    dst_name = file_info['FileName']
                except KeyError:
                    continue
                _, _, long = dst_name.partition('|')
                dst_name = long or dst_name
                file_names[src_name] = dst_name
            for path, cab in cabs.items():
                try:
                    unpacked: List[UnpackResult] = list(xtcab().unpack(cab.get_data()))
                except Exception as e:
                    self.log_info(F'unable to extract embedded cab file: {e!s}')
                    continue
                base, dot, ext = path.rpartition('.')
                if dot == '.' and ext.lower() == 'cab':
                    path = base
                else:
                    # Give the archive a .cab extension so that its name can serve as the
                    # folder for its contents.
                    del streams[path]
                    cab.path = F'{path}.cab'
                    streams[cab.path] = cab
                for result in unpacked:
                    sub_path = file_names.get(result.path, result.path)
                    sub_path = self._get_path_separator().join((path, sub_path))
                    streams[sub_path] = result

        streams = {fix_msi_path(path): item for path, item in streams.items()}
        ds = UnpackResult(self._SYNTHETIC_STREAMS_FILENAME,
            json.dumps(processed_table_data, indent=4).encode(self.codec))
        streams[ds.path] = ds

        # Additionally provide every table as CSV; tables that fail to convert are skipped.
        converter = csv()
        for key, data in processed_table_data.items():
            sk = key.strip('_')
            if sk not in processed_table_data:
                key = sk
            try:
                tbl = UnpackResult(F'{self._SYNTHETIC_STREAMS_TOPLEVEL}/{key}.csv', converter.json_to_csv(data))
            except Exception:
                continue
            streams[tbl.path] = tbl

        for path in sorted(streams):
            streams[path].path = path
            yield streams[path]

    @classmethod
    def handles(cls, data: bytearray):
        """
        Return whether the data starts with the compound file signature and is identified as
        an MSI file by its file magic.
        """
        if not data.startswith(B'\xD0\xCF\x11\xE0'):
            return False
        return FileMagicInfo(data).extension == 'msi'
class xtnode (*paths, entry=False, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date')
-
This unit is implemented in
refinery.units.formats.archive.xtnode
and has the following commandline Interface:usage: xtnode [-h] [-L] [-Q] [-0] [-v] [-F] [-u] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [path [path ...]] Extracts and decompiles files from compiled Node.Js applications. Supports both nexe and pkg, two utilities that are commonly used to generate stand-alone executables. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -u, --entry Only extract the entry point. -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". -D, --date NAME Name of the meta variable to receive the extracted file date. The default value is "date". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xtnode(ArchiveUnit):
    """
    Extracts and decompiles files from compiled Node.Js applications. Supports both nexe and pkg, two
    utilities that are commonly used to generate stand-alone executables.
    """
    # Marker that nexe places between the embedded resources and the two size fields.
    _NEXE_SENTINEL = B'<nexe~~sentinel>'
    # Names of the variables in a pkg bootstrap that hold payload and prelude coordinates.
    _PKG_PAYLOAD_P = B'PAYLOAD_POSITION'
    _PKG_PAYLOAD_S = B'PAYLOAD_SIZE'
    _PKG_PRELUDE_P = B'PRELUDE_POSITION'
    _PKG_PRELUDE_S = B'PRELUDE_SIZE'
    # Text searched for in the prelude; the file table is expected right after it.
    _PKG_COMMON_JS = B'sourceMappingURL=common.js.map'

    def __init__(
        self, *paths,
        entry: Arg.Switch('-u', help='Only extract the entry point.') = False,
        list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False,
        path=b'path', date=b'date',
    ):
        super().__init__(
            *paths,
            entry=entry,
            list=list,
            join_path=join_path,
            drop_path=drop_path,
            fuzzy=fuzzy,
            exact=exact,
            regex=regex,
            path=path,
            date=date,
        )

    def unpack(self, data: ByteStr) -> Iterable[UnpackResult]:
        # Dispatch on the detected packer; input matching neither format yields nothing.
        if self._is_nexe(data):
            self.log_info('unpacking as nexe')
            yield from self._unpack_nexe(data)
            return
        if self._is_pkg(data):
            self.log_info('unpacking as pkg')
            yield from self._unpack_pkg(data)
            return

    def _unpack_nexe(self, data: ByteStr):
        # Try to recover the entry point from the bootstrap code; exactly one match is required,
        # the tuple unpacking below raises otherwise and the entry point is treated as unknown.
        try:
            ep = re.compile(
                RB"entry\s*=\s*path\.resolve\(path\.dirname\(process\.execPath\),\s*(%s)\)" % formats.string)
            ep, = ep.finditer(data)
        except Exception:
            ep = None
            self.log_info('could not identify entry point')
        else:
            ep = ep.group(1) | esc(quoted=True) | str
            self.log_info(F'entry point: {ep}')
        view = memoryview(data)
        for marker in re.finditer(re.escape(self._NEXE_SENTINEL), data):
            end = marker.end() + 16
            sizes = data[marker.end():end]
            if sizes.startswith(b"')"):
                # the sentinel occurs as a string literal inside the code, not as the real marker
                continue
            if len(sizes) < 16:
                # not enough room for the two 8-byte size fields after the sentinel
                self.log_debug(F'found marker at 0x{marker.start():X}, but failed to read data')
                continue
            reader = StructReader(sizes)
            try:
                code_size = int(reader.f64())
                blob_size = int(reader.f64())
            except (ValueError, OverflowError):
                # NaN or infinite size fields cannot be converted to an integer
                self.log_debug(F'found marker at 0x{marker.start():X}, but failed to read data')
                continue
            start = marker.start() - code_size - blob_size
            if code_size < 0 or blob_size < 0 or start < 0:
                # A negative start would silently wrap around to the end of the buffer when
                # slicing, so implausible sizes are rejected before any data is read.
                self.log_debug(F'found marker at 0x{marker.start():X}, but failed to read data')
                continue
            try:
                reader = StructReader(view[start:end])
                code = reader.read_exactly(code_size)
                blob = reader.read_exactly(blob_size)
            except EOFError:
                self.log_debug(F'found marker at 0x{marker.start():X}, but failed to read data')
                continue
            else:
                self.log_debug(F'found marker at 0x{marker.start():X}, data start at {start:X}')
            for rsrc in re.finditer(RB'process\.__nexe\s*=', code):
                rsrc = JSONReader(code[rsrc.end():])
                rsrc = rsrc.read_json()
                if len(rsrc) == 1:
                    # a single top-level key wraps the actual resource table
                    _, rsrc = rsrc.popitem()
                for path, (offset, length) in rsrc.items():
                    if ep and self.args.entry and path != ep:
                        continue
                    yield UnpackResult(path, blob[offset:offset + length])

    def _unpack_pkg(self, data: ByteStr):

        def _extract_coordinates(*v: bytes):
            # Each name must be assigned exactly once as a quoted numeric string.
            for name in v:
                pattern = name + BR'''\s{0,3}=\s{0,3}(['"])([\s\d]+)\1'''
                value, = re.finditer(pattern, data)
                yield int(value.group(2).decode('utf8').strip(), 0)

        def _extract_data(*v: bytes):
            # Returns None when the (offset, length) pair cannot be determined.
            try:
                offset, length = _extract_coordinates(*v)
            except Exception:
                return None
            return data[offset:offset + length]

        payload = _extract_data(self._PKG_PAYLOAD_P, self._PKG_PAYLOAD_S)
        if not payload:
            raise ValueError('unable to extract payload')
        prelude = _extract_data(self._PKG_PRELUDE_P, self._PKG_PRELUDE_S)
        if not prelude:
            raise ValueError('unable to extract prelude')
        mapping = re.search(re.escape(self._PKG_COMMON_JS) + BR'\s*\},\s*\{', prelude)
        if not mapping:
            raise ValueError('unable to find common.js mapping')

        # The match ends just past an opening brace; step back one byte to include it.
        reader = JSONReader(prelude[mapping.end() - 1:])
        files: Dict[str, dict] = reader.read_json()
        if files is None:
            raise ValueError('failed to read file list')
        entry = reader.skip_comma().read_string()
        links = reader.skip_comma().read_json()
        # _unknown1 = reader.skip_comma().read_json()
        # _unknown2 = reader.skip_comma().read_terminated_array(B')').strip()

        # Determine the longest prefix shared by all file paths; it is stripped from the output.
        root = next(iter(files))
        skip = 0
        view = memoryview(payload)
        for k in range(len(root) + 1):
            test = root[:k].rstrip('/').rstrip('\\')
            if not all(path.startswith(test) for path in files):
                root = test[:-1]
                skip = k - 1
                break
        entry = entry[skip:]
        self.log_info(F'detected root directory {root}, entry point is {entry}')

        # Materialize linked locations: every file below a link source also appears below its target.
        for src, dst in links.items():
            new_files = {}
            self.log_info('link src:', src[skip:])
            self.log_info('link dst:', dst[skip:])
            for path, location in files.items():
                if not path.startswith(src):
                    continue
                new_path = dst + path[len(src):]
                new_files[new_path] = location
                self.log_debug('synthesizing linked file:', new_path)
            files.update(new_files)

        for path, location in files.items():
            path = path[skip:]
            if entry and self.args.entry and path != entry:
                continue
            data = None
            for kind, (offset, length) in location.items():
                stop = offset + length
                if kind == '3':  # metadata
                    continue
                if kind == '2':  # unknown
                    continue
                if kind in '01':
                    # when both kinds are present, the one iterated last wins
                    data = view[offset:stop]
            if data is not None:
                yield UnpackResult(path, data)

    @classmethod
    def _is_nexe(cls, data: ByteStr) -> bool:
        return cls._NEXE_SENTINEL in data

    @classmethod
    def _is_pkg(cls, data: ByteStr) -> bool:
        # All five markers have to be present for the input to be considered a pkg binary.
        if cls._PKG_PAYLOAD_P not in data:
            return False
        if cls._PKG_PAYLOAD_S not in data:
            return False
        if cls._PKG_PRELUDE_P not in data:
            return False
        if cls._PKG_PRELUDE_S not in data:
            return False
        if cls._PKG_COMMON_JS not in data:
            return False
        return True

    @classmethod
    def handles(cls, data: ByteStr) -> Optional[bool]:
        return cls._is_nexe(data) or cls._is_pkg(data)
class xtnsis (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
This unit is implemented in
refinery.units.formats.archive.xtnsis
and has the following commandline Interface:usage: xtnsis [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD] [path [path ...]] Extract files from NSIS archives. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". -D, --date NAME Name of the meta variable to receive the extracted file date. The default value is "date". -p, --pwd PWD Optionally specify an extraction password. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xtnsis(ArchiveUnit):
    """
    Extract files from NSIS archives.
    """

    @classmethod
    def _find_archive_offset(cls, data: bytearray, before: int = -1, flawmax=2):
        # Locate the start of an NSIS archive inside data. Signatures are tried with an increasing
        # number of wildcarded bytes, up to flawmax. When before is non-negative, only candidates
        # strictly below that offset are considered. Returns the offset of the best candidate, or
        # None when there is none; offsets aligned to 0x200 are preferred over unaligned ones.

        def signatures(*magics):
            for changes in range(flawmax + 1):
                for magic in magics:
                    if not changes:
                        yield 0, magic
                        continue
                    # The order in which positions are wildcarded is irrelevant, so combinations
                    # produce every distinct signature exactly once.
                    for positions in itertools.combinations(range(len(magic)), r=changes):
                        signature = bytearray(magic)
                        for p in positions:
                            signature[p] = 0x2E  # regex wildcard "."
                        yield changes, bytes(signature)

        best_guess = None
        search_space = memoryview(data)
        for flaws, sig in signatures(*NSArchive.MAGICS):
            if flaws > 1:
                # heavily damaged signatures are only searched for near the start of the data
                search_space = search_space[:0x20_000]
            # The candidate archive offset is 4 bytes before the signature; a signature within the
            # first 4 bytes would give a negative offset and can never be a valid candidate.
            matches = [
                m.start() - 4
                for m in re.finditer(sig, search_space, flags=re.DOTALL)
                if m.start() >= 4
            ]
            if before >= 0:
                matches = [match for match in matches if match < before]
            matches.reverse()
            archive = None
            for match in matches:
                if match % 0x200 == 0:
                    archive = match
                    break
            # Offset 0 is a legitimate result, so compare against None rather than truthiness.
            if archive is None:
                if matches and best_guess is None:
                    best_guess = matches[-1]
            else:
                msg = F'Archive signature was found at offset 0x{archive:X}'
                if flaws > 0:
                    msg = F'{msg}; it has {flaws} imperfections and was likely modified'
                cls.log_info(F'{msg}.')
                return archive
        if best_guess is not None:
            cls.log_info(F'A signature was found at offset 0x{best_guess:08X}; it is not properly aligned.')
            return best_guess
        return None

    def unpack(self, data):
        memory = memoryview(data)
        before = -1
        _error = None
        # Try candidate offsets from the best one downwards until one parses. The last parser
        # error is raised once no candidates remain.
        while True:
            offset = self._find_archive_offset(data, before)
            if offset is None:
                _error = _error or ValueError('Unable to find an NSIS archive marker.')
                raise _error
            try:
                arc = NSArchive(memory[offset:])
            except Exception as e:
                _error = e
                before = offset
            else:
                break

        def info():
            yield F'{arc.header.type.name} archive'
            yield F'compression type {arc.method.value}'
            yield F'mystery value 0x{arc.header.unknown_value:X}'
            yield 'solid archive' if arc.solid else 'fragmented archive'
            yield '64-bit header' if arc.header.is64bit else '32-bit header'
            yield 'unicode' if arc.header.unicode else 'ascii'

        self.log_info(', '.join(info()))

        for item in arc.header.items:
            # bind the item as a default argument so each lazy extractor refers to its own item
            yield self._pack(item.path, item.mtime, lambda i=item: arc._extract_item(i).data)

        yield self._pack('setup.bin', None, arc.header_data)
        yield self._pack('setup.nsis', None, arc.script.encode(self.codec))

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        return any(magic in data for magic in NSArchive.MAGICS)
class xtnuitka (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
This unit is implemented in
refinery.units.formats.archive.xtnuitka
and has the following commandline Interface:usage: xtnuitka [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]] Extracts files packed by Nuitka using the --onefile option. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xtnuitka(PathExtractorUnit):
    """
    Extracts files packed by Nuitka using the --onefile option.
    """
    # Expected first two bytes of the archive.
    _MAGIC = B'KA'

    @PathExtractorUnit.Requires('pyzstd', 'arc')
    def _pyzstd():
        import pyzstd
        return pyzstd

    def unpack(self, data: ByteStr) -> Iterable[UnpackResult]:

        class NuitkaData(Struct):
            # Parsed archive: a 2-byte magic, a 1-byte compression flag, then a sequence of
            # (UTF-16 path, 64-bit size, content) records ended by an empty path.
            unit = self

            def __init__(self, reader: StructReader):
                self.magic = reader.read_exactly(2)
                self.compression_flag = reader.read_exactly(1)
                if self.compressed:
                    zd = self.unit._pyzstd.ZstdDecompressor()
                    reader = StructReader(zd.decompress(reader.read()))
                self.files = {}
                self.truncated = False
                while not reader.eof:
                    path = reader.read_w_string('utf-16')
                    if not path:
                        break
                    size = reader.u64()
                    data = reader.read(size)
                    if len(data) == size:
                        self.files[path] = data
                    else:
                        # less data available than announced; the record is dropped
                        self.truncated = True

            @property
            def compressed(self):
                return self.compression_flag == b'Y'

        # For PE files, the archive is searched for in the overlay and the resources.
        if data.startswith(b'MZ'):
            arcs = list(self._pe_candidates(data))
        else:
            arcs = [data]

        for arc in arcs:
            archive = NuitkaData(arc)
            if archive.truncated:
                self.log_warn('the archive is truncated')
            if archive.magic != self._MAGIC:
                self.log_warn('the archive data does not start with the correct magic sequence')
            for path, content in archive.files.items():
                yield UnpackResult(path, content)

    @classmethod
    def handles(cls, data: ByteStr) -> Optional[bool]:
        if data.startswith(b'MZ'):
            # A PE file is handled exactly when it carries at least one payload candidate; only
            # the first candidate needs to be produced to decide this.
            for _ in cls._pe_candidates(data):
                return True
            return False
        return data.startswith(cls._MAGIC)

    @classmethod
    def _pe_candidates(cls, data: ByteStr):
        # Yields the PE overlay and every PE resource that starts with the archive magic.
        from refinery.units.formats.pe.peoverlay import peoverlay
        blob = data | peoverlay | bytearray
        if blob.startswith(cls._MAGIC):
            yield blob
        from refinery.units.formats.pe.perc import perc
        for blob in data | perc:
            if blob.startswith(cls._MAGIC):
                yield blob
class xtone (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
This unit is implemented in
refinery.units.formats.office.xtone
and has the following commandline Interface:usage: xtone [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]] Extract embedded files from OneNote documents. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xtone(PathExtractorUnit):
    """
    Extract embedded files from OneNote documents.
    """
    @PathExtractorUnit.Requires('pyonenote', 'formats', 'office', 'extended')
    def _pyOneNote():
        import pyOneNote
        import pyOneNote.OneDocument
        return pyOneNote.OneDocument

    def unpack(self, data: bytearray):
        with MemoryFile(memoryview(data)) as stream:
            # "OneDocment" is the spelling the attribute is accessed under; it is not a typo here.
            document = self._pyOneNote.OneDocment(stream)
            embedded = document.get_files()
            for guid, file in embedded.items():
                chunk = file['content']
                if 'extension' in file:
                    extension = file['extension']
                else:
                    # no extension recorded: derive one from the content's file magic
                    magic = get_cached_file_magic_info(chunk)
                    extension = F'.{magic.extension}'
                yield UnpackResult(F'{guid}{extension}', chunk)

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        # The input is recognized by the presence of this 16-byte identifier anywhere in it.
        signature = UUID('e4525c7b-8cd8-a74d-aeb1-5378d02996d3').bytes
        return signature in data
class xtp (*pattern, filter=0, min=1, max=None, len=None, stripspace=False, duplicates=False, longest=False, take=None)
-
This unit is implemented in
refinery.units.pattern.xtp
and has the following commandline Interface:usage: xtp [-h] [-L] [-Q] [-0] [-v] [-f] [-n N] [-m N] [-e N] [-x] [-r] [-l] [-t N] [pattern [pattern ...]] Extract Patterns: Uses regular expressions to extract indicators from the input data and optionally filters these results heuristically. The unit is designed to extract indicators such as domain names and IP addresses, see below for a complete list. To extract data formats such as hex-encoded data, use carve. positional arguments: pattern Choose the pattern to extract. The unit uses ('hostname', 'url', 'email') by default. Use an asterix character to select all available patterns. The available patterns are: domain, email, guid, ipv4, ipv6, md5, sha1, sha256, hostname, socket, subdomain, url, btc, pem, xmr, path, winpath, nixpath, environment-variable optional arguments: -f, --filter If this setting is enabled, the xtp unit will attempt to reduce the number of false positives by certain crude heuristics. Specify multiple times to make the filtering more aggressive. -n, --min N Matches must have length at least N. -m, --max N Matches must have length at most N. -e, --len N Matches must be of length N. -x, --stripspace Strip all whitespace from input data. -r, --duplicates Yield every (transformed) Match, even when it was found before. -l, --longest Sort results by length. -t, --take N Return only the first N occurrences in order of appearance. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class xtp(PatternExtractor):
    """
    Extract Patterns: Uses regular expressions to extract indicators from the input data and
    optionally filters these results heuristically. The unit is designed to extract indicators
    such as domain names and IP addresses, see below for a complete list. To extract data
    formats such as hex-encoded data, use `refinery.carve`.
    """

    def __init__(
        self,
        *pattern: Arg('pattern', type=str,
            default=(
                indicators.hostname.name,
                indicators.url.name,
                indicators.email.name,
            ), help=(
                'Choose the pattern to extract. The unit uses {{default}} by default. Use an '
                'asterix character to select all available patterns. The available patterns '
                'are: {}'.format(', '.join(p.display for p in indicators))
            )
        ),
        filter: Arg('-f', dest='filter', action='count',
            help=(
                'If this setting is enabled, the xtp unit will attempt to reduce the number '
                'of false positives by certain crude heuristics. Specify multiple times to '
                'make the filtering more aggressive.'
            )
        ) = 0,
        min=1, max=None, len=None, stripspace=False, duplicates=False, longest=False, take=None
    ):
        # NOTE: vars() must be evaluated before any other local is bound, because it forwards
        # exactly the constructor arguments to the parent initializer.
        self.superinit(super(), **vars(), ascii=True, utf16=True)

        patterns = {
            p for name in pattern for p in indicators if fnmatch(p.display, name)
        }
        # if indicators.hostname in patterns:
        #     patterns.remove(indicators.hostname)
        #     patterns.add(indicators.ipv4)
        #     patterns.add(indicators.domain)
        patterns = [F'(?P<{p.name}>{p.value})' for p in patterns]
        if not patterns:
            raise RefineryCriticalException('The given mask does not match any known indicator pattern.')
        pattern = '|'.join(patterns)
        self.args.pattern = re.compile(pattern.encode(self.codec), flags=re.DOTALL)
        self.args.filter = filter

    _ALPHABETIC = ascii_letters.encode('ASCII')

    # Maps a host suffix to the minimum filter level at which indicators under it are excluded.
    _LEGITIMATE_HOSTS = {
        'acm.org'                 : 1,
        'adobe.com'               : 1,
        'aka.ms'                  : 1,
        'android.com'             : 1,
        'apache.org'              : 1,
        'apple.com'               : 1,
        'archive.org'             : 2,
        'azure.com'               : 1,
        'baidu.com'               : 2,
        'bootstrapcdn.com'        : 2,
        'cdnjs.cloudflare.com'    : 4,
        'comodo.net'              : 1,
        'comodoca.com'            : 1,
        'curl.haxx.se'            : 1,
        'curl.se'                 : 1,
        'digicert.com'            : 1,
        'dublincore.org'          : 1,
        'facebook.com'            : 4,
        'fontawesome.com'         : 1,
        'github.com'              : 3,
        'globalsign.com'          : 1,
        'globalsign.net'          : 1,
        'godaddy.com'             : 1,
        'google.com'              : 4,
        'googleapis.com'          : 5,
        'googleusercontent.com'   : 5,
        'gov'                     : 2,
        'gstatic.com'             : 2,
        'iana.org'                : 1,
        'intel.com'               : 1,
        'jquery.com'              : 1,
        'jsdelivr.net'            : 2,
        'live.com'                : 1,
        'microsoft.com'           : 1,
        'msdn.com'                : 1,
        'msn.com'                 : 1,
        'newtonsoft.com'          : 3,  # json.net
        'nuget.org'               : 3,
        'office.com'              : 1,
        'office365.com'           : 2,
        'openssl.org'             : 1,
        'openxmlformats.org'      : 1,
        'oracle.com'              : 1,
        'purl.org'                : 1,
        'python.org'              : 1,
        'schema.org'              : 2,
        'sectigo.com'             : 1,
        'skype.com'               : 1,
        'sourceforge.net'         : 4,
        'stackoverflow.com'       : 1,
        'sun.com'                 : 1,
        'sway-cdn.com'            : 1,
        'sway-extensions.com'     : 1,
        'symantec.com'            : 1,
        'symauth.com'             : 1,
        'symcb.com'               : 1,
        'symcd.com'               : 1,
        'sysinternals.com'        : 3,
        'thawte.com'              : 1,
        'unicode.org'             : 2,
        'usertrust.com'           : 1,
        'verisign.com'            : 1,
        'w3.org'                  : 1,
        'wikipedia.org'           : 1,
        'wolfram.com'             : 1,
        'xml.org'                 : 1,
        'xmlsoap.org'             : 1,
        'yahoo.com'               : 1,
    }

    # These suffixes double as common file extensions; they are excluded from level 4 onwards.
    for _ext in [
        'build',
        'data',
        'do',
        'help',
        'java',
        'md',
        'mov',
        'name',
        'py',
        'so',
        'sys',
        'zip',
    ]:
        _LEGITIMATE_HOSTS[_ext] = 4

    # Hosts in this list are always excluded once filtering is enabled.
    _DOMAIN_WHITELIST = [
        'system.net',
        'wscript.shell',
    ]

    # Maps an opening bracket or quote (as byte value) to its closing counterpart.
    _BRACKETING = {
        B"'"[0]: B"'",
        B'"'[0]: B'"',
        B'('[0]: B')',
        B'{'[0]: B'}',
        B'['[0]: B']',
        B'<'[0]: B'>',
    }

    def _check_match(self, data: Union[memoryview, bytearray], pos: int, name: str, value: bytes):
        """
        Post-process a single match. The argument `data` is the searched buffer, `pos` is the
        offset of the match within it, `name` is the name of the matching indicator pattern and
        `value` is the matched bytes. Returns the (possibly trimmed) value, or `None` when the
        match is rejected by the heuristics enabled through the filter level.
        """
        # The byte in front of the match; there is none for a match at the very start. The
        # offset `pos` itself is never modified because later checks inspect its surroundings.
        before = data[pos - 1] if pos > 0 else None
        term = self._BRACKETING.get(before, None)
        if term:
            # the match is preceded by an opening bracket or quote: cut it at the closing one
            cut = value.find(term)
            if cut > 0:
                value = value[:cut]
        if not self.args.filter:
            return value
        if name == indicators.hostname.name:
            # refine the generic hostname into a more specific indicator type
            if all(part.isdigit() for part in value.split(B'.')):
                name = indicators.ipv4.name
            elif B'.' not in value:
                name = indicators.ipv6.name
            else:
                name = indicators.domain.name
        if name == indicators.ipv4.name:
            ocets = [int(x) for x in value.split(B'.')]
            if ocets.count(0) >= 3:
                return None
            if self.args.filter > 2 and sum(ocets) < 10:
                return None
            # Dotted quads near the word "version" are rejected. The second and third window
            # take every other byte at twice the offset.
            # NOTE(review): presumably meant for UTF-16 encoded input - confirm.
            for area in (
                bytes(data[pos - 20 : pos + 20]),
                bytes(data[pos * 2 - 40 : pos * 2 + 40 : 2]),
                bytes(data[pos * 2 - 41 : pos * 2 + 39 : 2]),
            ):
                if B'version' in area.lower():
                    return None
            ip = ip_address(value.decode(self.codec))
            if not ip.is_global:
                if self.args.filter >= 3 or not ip.is_private:
                    return None
        elif name in {
            indicators.url.name,
            indicators.socket.name,
            indicators.hostname.name,
            indicators.domain.name,
            indicators.subdomain.name
        }:
            if self.args.filter >= 2:
                if LetterWeights.IOC(value) < 0.6:
                    self.log_info(F'excluding indicator because with low score: {value}', clip=True)
                    return None
                if name != indicators.url.name and len(value) > 0x100:
                    self.log_info(F'excluding indicator because it is too long: {value}', clip=True)
                    return None
            ioc = value.decode(self.codec)
            if '://' not in ioc:
                # add a scheme so that urlparse populates the netloc component
                ioc = F'tcp://{ioc}'
            parts = urlparse(ioc)
            host, _, _ = parts.netloc.partition(':')
            hl = host.lower()
            for white, level in self._LEGITIMATE_HOSTS.items():
                if self.args.filter >= level and (hl == white or hl.endswith(F'.{white}')):
                    self.log_info(F'excluding indicator because domain {hl} is whitelisted via {white}: {value}', clip=True)
                    self.log_debug(F'reduce level below {level} to allow, current level is {self.args.filter}')
                    return None
            if name == indicators.url.name:
                # drop anything in front of a well-known scheme
                scheme = parts.scheme.lower()
                for p in ('http', 'https', 'ftp', 'file', 'mailto'):
                    if scheme.endswith(p):
                        value = value[scheme.find(p):]
                        break
            if any(hl == w for w in self._DOMAIN_WHITELIST):
                self.log_info(F'excluding indicator because domain {hl} is whitelisted: {value}')
                return None
            if name in {
                indicators.hostname.name,
                indicators.domain.name,
                indicators.subdomain.name
            }:
                # a host directly preceded by a slash or backslash is rejected at level 2
                if before is not None and before in b'/\\' and self.args.filter >= 2:
                    return None
                hostparts = host.split('.')
                if self.args.filter >= 2:
                    if not all(p.isdigit() for p in hostparts) and all(len(p) < 4 for p in hostparts):
                        self.log_info(F'excluding host with too many short parts: {value}')
                        return None
                if self.args.filter >= 3:
                    if len(hostparts) <= sum(3 for p in hostparts if p != p.lower() and p != p.upper()):
                        self.log_info(F'excluding host with too many mixed case parts: {value}')
                        return None
                # These heuristics attempt to filter out member access to variables in
                # scripts which can be mistaken for domains because of the TLD inflation
                # we've had.
                uppercase = sum(1 for c in host if c.isalpha() and c.upper() == c)
                lowercase = sum(1 for c in host if c.isalpha() and c.lower() == c)
                if lowercase and uppercase:
                    caseratio = uppercase / lowercase
                    if 0.1 < caseratio < 0.9:
                        self.log_info(F'excluding indicator with too much uppercase letters: {value}')
                        return None
                if all(x.isidentifier() for x in hostparts):
                    if len(hostparts) == 2 and hostparts[0] in ('this', 'self'):
                        self.log_info(F'excluding host that looks like a code snippet: {value}')
                        return None
                    if len(hostparts[-2]) < 3:
                        self.log_info(F'excluding host with too short root domain name: {value}')
                        return None
                    if any(x.startswith('_') for x in hostparts):
                        self.log_info(F'excluding host with underscores: {value}')
                        return None
                    if len(hostparts[-1]) > 3:
                        # count the distinct dotted names in the data that share this prefix
                        prefix = '.'.join(hostparts[:-1])
                        seen_before = len(set(re.findall(
                            R'{}(?:\.\w+)+'.format(prefix).encode('ascii'), data)))
                        if seen_before > 2:
                            self.log_debug(F'excluding indicator that was already seen: {value}')
                            return None
        elif name == indicators.email.name:
            at = value.find(B'@')
            ix = 0
            # skip leading non-alphabetic bytes without running past the end of the value
            while ix < len(value) and value[ix] not in self._ALPHABETIC:
                ix += 1
            return None if at - ix < 3 else value[ix:]
        elif name in (
            indicators.path.name,
            indicators.winpath.name,
            indicators.nixpath.name,
        ):
            if len(value) < 8:
                self.log_info(F'excluding path because it is too short: {value}')
                return None
            if len(value) > 16 and len(re.findall(RB'\\x\d\d', value)) > len(value) // 10:
                self.log_info(F'excluding long path containign hex: {value}', clip=True)
                return None
            try:
                path_string = value.decode(self.codec)
            except Exception:
                self.log_debug(F'excluding path which did not decode: {value!r}', clip=True)
                return None
            try:
                path = Path(path_string)
            except Exception as E:
                # the variable path is not bound when the constructor fails; report the input
                self.log_debug(F'error parsing path "{path_string}": {E!s}')
                return None
            path_likeness = sum(v for v, x in [
                (1, path.suffix),
                (1, path_string.startswith('/')),
                (2, path_string.startswith('%')),
                (2, path_string.startswith('\\\\')),
                (2, path_string[1:3] == ':\\'),
            ] if x)
            # NOTE(review): the left side is at least 2 and the right side at most 2, so this
            # condition can never hold - confirm the intended threshold.
            if 2 + path_likeness < min(self.args.filter, 2):
                self.log_info(F'excluding long path because it has no characteristic parts: {value}')
                return None
            bad_parts = 0
            all_parts = len(path.parts)
            if self.args.filter >= 1:
                date_likeness = sum(1 for t in ['yyyy', 'yy', 'mm', 'dd', 'hh', 'ss']
                    if t in path.parts or t.upper() in path.parts)
                if len(value) < 20 and date_likeness >= all_parts - 1:
                    self.log_info(F'excluding path that looks like a date format: {value}', clip=True)
                    return None
            if self.args.filter >= 2:
                for k, part in enumerate(path.parts):
                    if not k:
                        # a leading drive specification such as "C:" or "C:\" is not rated
                        drive, colon, slash = part.partition(':')
                        if colon and len(drive) == 1 and len(slash) <= 1:
                            continue
                    if part[0] == part[~0] == '%':
                        # parts wrapped in percent signs are not rated
                        continue
                    if len(part) == 1:
                        continue
                    if (
                        LetterWeights.Path(part) < 0.5 + (min(self.args.filter, 4) * 0.1)
                        or (self.args.filter >= 2 and LetterWeights.Path(part[:1]) < 0.5)
                    ):
                        bad_parts += 1
                        self.log_debug(F'bad part {k + 1} in path: {part}')
            # the tolerated share of bad parts shrinks as the filter level increases
            for filter_limit in (2, 3, 4):
                bad_ratio = 2 ** (filter_limit - 1)
                if self.args.filter >= filter_limit and bad_parts * bad_ratio >= all_parts:
                    self.log_info(F'excluding path with bad parts: {value}', clip=True)
                    return None
        return value

    def process(self, data):
        # values that were rejected once are not checked again
        whitelist = set()

        def check(match: re.Match):
            # exactly one named group takes part in each match; its name identifies the pattern
            for name, value in match.groupdict().items():
                if value is not None:
                    break
            else:
                raise RefineryCriticalException('Received empty match.')
            if value in whitelist:
                return None
            result = self._check_match(match.string, match.start(), name, value)
            if result is not None:
                return self.labelled(result, pattern=name)
            whitelist.add(value)

        transforms = [check]
        yield from self.matches_filtered(memoryview(data), self.args.pattern, *transforms)
class xtpdf (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
This unit is implemented in
refinery.units.formats.pdf
and has the following commandline Interface:usage: xtpdf [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]] Extract objects from PDF documents. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xtpdf(PathExtractorUnit):
    """
    Extract objects from PDF documents.
    """
    @PathExtractorUnit.Requires('pypdf>=3.1.0', 'formats', 'default', 'extended')
    def _pypdf2():
        # The imports live inside the function body; pypdf.generic is imported explicitly
        # because _walk reaches it as an attribute of the returned package.
        import pypdf
        import pypdf.generic
        return pypdf

    def _walk(self, blob, memo: Optional[Set[int]] = None, *path):
        # Recursively walk a PDF object and yield an UnpackResult for every leaf that
        # carries data. `memo` collects the id() of every object that was already visited
        # so that reference cycles do not recurse forever; `path` holds the path segments
        # that lead to `blob`, which are concatenated without a separator.

        # Follow indirect references until a direct object is reached; when resolving
        # fails, continue with the unresolved reference.
        while isinstance(blob, self._pypdf2.generic.IndirectObject):
            try:
                blob = blob.get_object()
            except Exception:
                break
        if memo is None:
            memo = {id(blob)}
        elif id(blob) in memo:
            return
        else:
            memo.add(id(blob))
        # Objects that provide both '/F' and '/EF' -> '/F' are replaced by the latter and
        # the last path segment is replaced by the value of '/F'.
        # NOTE(review): presumably these are embedded file specifications - confirm.
        try:
            name = blob['/F']
            blob = blob['/EF']['/F']
        except Exception:
            pass
        else:
            path = *path[:-1], F'/{name}'
        # Anything that exposes get_data is emitted with a lazy extractor. The closure is
        # defined first and get_data is bound afterwards; the AttributeError raised by the
        # attribute lookup is what selects the fallthrough to the checks below.
        try:
            def extract():
                with NoLogging():
                    return get_data()
            if TYPE_CHECKING:
                blob = cast(EncodedStreamObject, blob)
            get_data = blob.get_data
        except AttributeError:
            pass
        else:
            yield UnpackResult(''.join(path), extract, kind='object')
            return

        if isinstance(blob, self._pypdf2.generic.ByteStringObject):
            yield UnpackResult(''.join(path), blob, kind='bytes')
            return
        if isinstance(blob, self._pypdf2.generic.TextStringObject):
            yield UnpackResult(''.join(path), blob.encode(self.codec), kind='string')
            return

        if isinstance(blob, (
            self._pypdf2.generic.BooleanObject,
            self._pypdf2.generic.ByteStringObject,
            self._pypdf2.generic.FloatObject,
            self._pypdf2.generic.NameObject,
            self._pypdf2.generic.NullObject,
            self._pypdf2.generic.NumberObject,
            self._pypdf2.generic.RectangleObject,
        )):
            # unhandled PDF objects
            return

        if isinstance(blob, self._pypdf2.generic.TreeObject):
            blob = list(blob)

        pdf = self._pypdf2.generic.PdfObject

        if isinstance(blob, list):
            # A list of even length that alternates between strings and PDF objects is
            # converted to a dictionary of key/value pairs; every other list is walked
            # element by element with the index as the path segment.
            if (
                len(blob) % 2 == 0
                and all(isinstance(key, str) for key in islice(iter(blob), 0, None, 2))
                and all(isinstance(key, pdf) for key in islice(iter(blob), 1, None, 2))
            ):
                blob = dict(zip(*([iter(blob)] * 2)))
            else:
                for key, value in enumerate(blob):
                    yield from self._walk(value, memo, *path, F'/{key}')
                return

        if not isdict(blob):
            return

        for key, value in blob.items():
            if not isinstance(key, str):
                continue
            # every path segment starts with a forward slash
            if not key.startswith('/'):
                key = F'/{key}'
            yield from self._walk(value, memo, *path, key)

    def unpack(self, data):
        # Parse the document with logging suppressed and walk it starting at the
        # '/Root' entry of the trailer.
        with MemoryFile(data, read_as_bytes=True) as stream:
            with NoLogging():
                pdf = self._pypdf2.PdfReader(stream)
                catalog = pdf.trailer['/Root']
                yield from self._walk(catalog)

    @classmethod
    def handles(self, data: bytearray) -> Optional[bool]:
        # The input is handled when it begins with the PDF header marker.
        return data.startswith(B'%PDF-')
class xtpyi (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', decompile, user_code=False, unmarshal=0)
-
This unit is implemented in
refinery.units.formats.archive.xtpyi
and has the following commandline Interface:usage: xtpyi [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-c] [-u | -y] [path [path ...]] Extracts and decompiles files from a Python Installer (aka PyInstaller) archive. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". -D, --date NAME Name of the meta variable to receive the extracted file date. The default value is "date". -c, --decompile Attempt to decompile PYC files. -u, --user-code Extract only source code files from the root of the archive. These usually implement the actual domain logic. This implies the --decompile option. -y, --unmarshal (DANGEROUS) Unmarshal embedded PYZ archives. Warning: Maliciously crafted packages can potentially exploit this to execute code. It is advised to only use this option inside an isolated environment. Specify twice to decompile unmarshalled Python bytecode. 
generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xtpyi(ArchiveUnit):
    """
    Extracts and decompiles files from a Python Installer (aka PyInstaller) archive.
    """
    def __init__(
        self, *paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False,
        path=b'path', date=b'date',
        # Every other option has a default; without one here, the unit could not be
        # instantiated from Python code without passing decompile explicitly.
        decompile: Arg.Switch('-c', help='Attempt to decompile PYC files.') = False,
        user_code: Arg.Switch('-u', group='FILTER', help=(
            'Extract only source code files from the root of the archive. These usually implement '
            'the actual domain logic. This implies the --decompile option.')) = False,
        unmarshal: Arg('-y', action='count', group='FILTER', help=(
            '(DANGEROUS) Unmarshal embedded PYZ archives. Warning: Maliciously crafted packages can '
            'potentially exploit this to execute code. It is advised to only use this option inside '
            'an isolated environment. Specify twice to decompile unmarshalled Python bytecode.'
        )) = 0
    ):
        super().__init__(
            *paths,
            list=list, join_path=join_path, drop_path=drop_path, fuzzy=fuzzy, exact=exact, regex=regex,
            path=path, date=date, decompile=decompile, unmarshal=unmarshal, user_code=user_code,
        )

    @ArchiveUnit.Requires('xdis', 'arc', 'python', 'extended')
    def _xdis():
        # Import xdis and, when it does not know the running interpreter version, register
        # the magic number of the running interpreter together with a placeholder opcode
        # table for it.
        import xdis.load
        import xdis.magics
        import xdis.marsh
        import xdis.op_imports
        import xdis.version_info
        import xdis
        A, B, C, *_ = sys.version_info
        version = F'{A}.{B}.{C}'
        canonic = F'{A}.{B}'
        if version not in xdis.magics.canonic_python_version:
            class opcode_dummy:
                # Placeholder for an opcode module: attribute access yields further
                # placeholders and calling one returns None.
                version = float(canonic)

                def __init__(self, name):
                    self.name = name

                def __getattr__(self, key):
                    return opcode_dummy(F'{self.name}.{key}')

                def __call__(self, *a, **k):
                    return None

                def __str__(self):
                    return self.name

                def __repr__(self):
                    return self.name

            # The submodule has to be imported explicitly: importing the importlib package
            # alone does not guarantee that importlib.util is available as an attribute.
            import importlib.util
            magic = importlib.util.MAGIC_NUMBER
            xdis.magics.add_magic_from_int(xdis.magics.magic2int(magic), version)
            xdis.magics.by_magic.setdefault(magic, set()).add(version)
            xdis.magics.by_version[version] = magic
            xdis.magics.magics[canonic] = magic
            xdis.magics.canonic_python_version[canonic] = canonic
            xdis.magics.add_canonic_versions(version, canonic)
            xdis.op_imports.op_imports.setdefault(canonic, opcode_dummy('dummy'))
        del A, B, C, version
        import xdis.std
        return xdis

    @ArchiveUnit.Requires('uncompyle6', 'arc', 'python', 'extended')
    def _uncompyle6():
        import uncompyle6
        import uncompyle6.main
        return uncompyle6

    @ArchiveUnit.Requires('decompyle3', 'arc', 'python')
    def _decompyle3():
        import decompyle3
        import decompyle3.main
        return decompyle3

    def unpack(self, data):
        # Locate every occurrence of the PyInstaller magic signature, parse the archive
        # epilogue at the last occurrence, and yield the files of that archive.
        # Raises LookupError when the signature does not occur in the input.
        view = memoryview(data)
        positions = [m.start() for m in re.finditer(re.escape(PyInstallerArchiveEpilogue.MagicSignature), view)]
        # the switch is a counter; every count above two is treated like two
        mode = Unmarshal(min(2, int(self.args.unmarshal)))
        self.log_debug(F'unmarshal mode: {mode.name}')
        if not positions:
            raise LookupError('unable to find PyInstaller signature')
        if len(positions) > 2:
            # first position is expected to be the sentinel value in the unpacker stub
            width = max(len(F'{p:X}') for p in positions)
            for position in positions:
                self.log_info(F'magic signature found at offset 0x{position:0{width}X}')
            self.log_warn(F'found {len(positions) - 1} potential PyInstaller epilogue markers; using last one.')
        decompile = self.args.decompile
        # with the user code filter, only one file type is kept; which one depends on
        # whether decompilation was requested
        uc_target = PiType.USERCODE if decompile else PiType.SOURCE
        archive = PyInstallerArchiveEpilogue(view, positions[-1], mode, decompile)
        for name, file in archive.files.items():
            if self.args.user_code:
                if file.type != uc_target:
                    continue
                if name.startswith('pyiboot'):
                    continue
            yield self._pack(name, None, file.data, type=file.type.name)

    @classmethod
    def handles(cls, data: ByteStr) -> Optional[bool]:
        # The input is handled when the PyInstaller magic signature occurs anywhere in it.
        return PyInstallerArchiveEpilogue.MagicSignature in data
class xtrtf (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
This unit is implemented in
refinery.units.formats.office.xtrtf
and has the following commandline Interface:usage: xtrtf [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]] Extract embedded objects in RTF documents. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xtrtf(PathExtractorUnit):
    """
    Extract embedded objects in RTF documents.
    """
    @PathExtractorUnit.Requires('oletools', 'formats', 'office', 'extended')
    def _oletools():
        # The submodules are imported explicitly because unpack reaches them as
        # attributes of the returned package.
        import oletools
        import oletools.rtfobj
        import oletools.oleobj
        return oletools

    def unpack(self, data):
        # Parse the input with the RTF object parser from oletools and yield one result
        # for every object that it reports.
        parser = self._oletools.rtfobj.RtfObjParser(data)
        parser.parse()
        # number of digits required to zero-pad the index in generated file names
        width = len(str(len(parser.objects)))
        for k, item in enumerate(parser.objects):
            item: RtfObject
            # objects without a file name receive a generated, numbered name
            path = item.filename or F'carve{k:0{width}}.bin'
            data = item.rawdata
            meta = {}
            if item.is_ole:
                if item.format_id == self._oletools.oleobj.OleObject.TYPE_EMBEDDED:
                    meta['ole_type'] = 'EMBEDDED'
                elif item.format_id == self._oletools.oleobj.OleObject.TYPE_LINKED:
                    meta['ole_type'] = 'LINKED'
                if item.is_package:
                    meta['src_path'] = item.src_path
                    meta['tmp_path'] = item.temp_path
                if item.clsid is not None:
                    meta['ole_info'] = item.clsid_desc
                    meta['ole_guid'] = item.clsid
                meta['ole_name'] = item.class_name
            if item.oledata:
                # Prefer the OLE data over the raw data; whatever precedes it inside the
                # raw data is kept as metadata.
                data = item.oledata
                pos = item.rawdata.find(data)
                if pos > 0:
                    meta['raw_header'] = item.rawdata[:pos]
                if item.olepkgdata:
                    # Prefer the package data over the OLE data; whatever precedes it
                    # inside the OLE data is kept as metadata.
                    data = item.olepkgdata
                    pos = item.oledata.find(data)
                    if pos >= 0:
                        meta['ole_header'] = item.oledata[:pos]
            yield UnpackResult(path, data, **meta)

    @classmethod
    def handles(self, data: bytearray) -> bool:
        # Only the first 500 bytes are inspected: after lowercasing and removing leading
        # whitespace, they have to begin with the opening of an RTF group.
        return data[:500].lower().lstrip().startswith(b'{\\rtf')
class xtsql (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
This unit is implemented in
refinery.units.formats.archive.xtsql
and has the following commandline Interface:usage: xtsql [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]] Extract files from SQLite3 databases. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xtsql(PathExtractorUnit):
    """
    Extract files from SQLite3 databases.
    """
    def unpack(self, data: bytearray):
        # Load the database into memory, read every table completely, and yield:
        # - 'db': the JSON representation of all tables
        # - 'db/TABLE': the JSON representation of one table
        # - 'db/TABLE/ROW': the JSON representation of one row
        # - 'db/TABLE/ROW/COLUMN': the value of a single cell; NULL cells are skipped
        # Raises NotImplementedError on Python versions older than 3.11.

        def _json(object):
            with BytesAsStringEncoder as encoder:
                return encoder.dumps(object).encode(self.codec)

        if sys.version_info[:2] < (3, 11):
            raise NotImplementedError(F'python 3.11 is required to use {self.__class__.__name__}.')

        result: dict[str, list[dict[str, int | float | str | bytes]]] = {}
        database = sqlite3.connect(':memory:')

        try:
            database.deserialize(data)
            cursor = database.cursor()
            tables: list[str] = [name for name, in cursor.execute(
                "SELECT name FROM sqlite_master WHERE type='table';").fetchall()]
            for table in tables:
                # The table name originates from the input and may contain arbitrary
                # characters: quote it as an identifier, doubling embedded quotes.
                quoted = '"{}"'.format(table.replace('"', '""'))
                cursor.execute(F'SELECT * FROM {quoted}')
                # Column names are taken from the cursor rather than from the text of the
                # CREATE TABLE statement, whose syntax is too rich to be split on commas.
                names = [column[0] for column in cursor.description]
                result[table] = [dict(zip(names, row)) for row in cursor.fetchall()]
        finally:
            # all rows have been read into memory at this point
            database.close()

        yield UnpackResult('db', functools.partial(_json, result))

        for table, rows in result.items():
            yield UnpackResult(F'db/{table}', functools.partial(_json, rows))
            for k, row in enumerate(rows):
                root = F'db/{table}/{k}'
                yield UnpackResult(root, functools.partial(_json, row))
                for name, value in row.items():
                    path = F'{root}/{name}'
                    if value is None:
                        continue
                    # numbers are emitted as their string representation
                    if isinstance(value, (int, float)):
                        value = str(value)
                    if isinstance(value, str):
                        value = value.encode(self.codec)
                    if isinstance(value, bytes):
                        yield UnpackResult(path, value)

    @classmethod
    def handles(cls, data: bytearray):
        # The input is handled when its first 15 bytes are the SQLite header string.
        return memoryview(data)[:15] == B'SQLite format 3'
class xttar (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
This unit is implemented in
refinery.units.formats.archive.xttar
and has the following commandline Interface:usage: xttar [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD] [path [path ...]] Extract files from a Tar archive. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". -D, --date NAME Name of the meta variable to receive the extracted file date. The default value is "date". -p, --pwd PWD Optionally specify an extraction password. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xttar(ArchiveUnit):
    """
    Extract files from a Tar archive.
    """
    def unpack(self, data: bytearray):
        # Open the input as a tar archive and yield every regular file in it. When the
        # input cannot be opened from the start, the first occurrence of the ustar magic
        # is used to locate an archive header inside the input.
        with MemoryFile(data) as stream:
            try:
                archive = tarfile.open(fileobj=stream)
            except Exception:
                ustar = data.find(B'ustar')
                # the magic is located 257 bytes into a header, so there has to be
                # enough room in front of it; otherwise the original error is raised
                if ustar < 257:
                    raise
                stream.seek(ustar - 257)
                archive = tarfile.open(fileobj=stream)
            for info in archive.getmembers():
                if not info.isfile():
                    continue
                extractor = archive.extractfile(info)
                if extractor is None:
                    continue
                try:
                    date = datetime.datetime.fromtimestamp(info.mtime)
                except (OverflowError, OSError, ValueError) as e:
                    # a nonsensical time stamp should not prevent the extraction
                    self.log_info(F'{e!s} - unable to determine date from timestamp {info.mtime} for: {info.name}')
                    date = None
                # the extractor is bound as a default argument so that every result
                # reads from its own member rather than from the last one of the loop
                yield self._pack(info.name, date, lambda e=extractor: e.read())

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        # The input is handled when the ustar magic is found at offset 257, which is
        # where it is located in the first header of an archive, or when the magic is
        # followed by one of the two known version fields anywhere in the input.
        ustar = data.find(B'ustar')
        if ustar < 0:
            return False
        if ustar == 257:
            return True
        # the version field starts after the 5 bytes of the magic itself
        return data[ustar + 5:ustar + 8] in (B'\x00\x30\x30', B'\x20\x20\x00')
class xtvba (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
This unit is implemented in
refinery.units.formats.office.xtvba
and has the following commandline Interface:usage: xtvba [-h] [-L] [-Q] [-0] [-v] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]] Extract VBA macro code from Office documents. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class xtvba(PathExtractorUnit):
    """
    Extract VBA macro code from Office documents.
    """
    @PathExtractorUnit.Requires('oletools', 'formats', 'office', 'extended')
    def _olevba():
        from oletools import olevba
        return olevba

    def unpack(self, data):
        # A random UUID is passed to the parser in place of a file name. Results that
        # have no stream path and reproduce this value are skipped.
        marker = uuid4()
        olevba = self._olevba
        try:
            vba = olevba.VBA_Parser(marker, data=bytes(data), relaxed=True)
        except olevba.FileOpenError:
            raise ValueError('Input data not recognized by VBA parser')
        for head, stream_path, tail, code in vba.extract_all_macros():
            if not stream_path and (head == marker or tail == marker):
                continue
            yield UnpackResult(stream_path, code.encode(self.codec))
class xtw (stripspace=False, duplicates=False, longest=False, take=None)
-
This unit is implemented in
refinery.units.pattern.xtw
and has the following commandline Interface:usage: xtw [-h] [-L] [-Q] [-0] [-v] [-x] [-r] [-l] [-t N] Extract Wallets: Extracts anything that looks like a cryptocurrency wallet address. This works similar to the xtp unit. optional arguments: -x, --stripspace Strip all whitespace from input data. -r, --duplicates Yield every (transformed) Match, even when it was found before. -l, --longest Sort results by length. -t, --take N Return only the first N occurrences in order of appearance. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class xtw(PatternExtractor):
    """
    Extract Wallets: Extracts anything that looks like a cryptocurrency wallet address.
    This works similar to the `refinery.xtp` unit.
    """
    def __init__(self, stripspace=False, duplicates=False, longest=False, take=None):
        self.superinit(super(), **vars(), ascii=True, utf16=True)

    def process(self, data):
        # One named group per wallet pattern; the name of the group that matched is
        # attached to the result as its kind.
        alternatives = [FR'(?P<{w.name}>\b{w.value}\b)' for w in wallets]
        expression = '|'.join(alternatives)
        expression = FR'\b{expression}\b'.encode('latin1')

        def check(match: re.Match[bytes]):
            # pick the first named group that took part in the match
            hit = next((
                (kind, value)
                for kind, value in match.groupdict().items()
                if value is not None
            ), None)
            if hit is None:
                raise RefineryCriticalException('Received empty match.')
            kind, value = hit
            return self.labelled(value, kind=kind)

        yield from self.matches_filtered(memoryview(data), expression, check)
class xtxml (*paths, format=None, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
This unit is implemented in
refinery.units.formats.xml
and has the following commandline Interface:usage: xtxml [-h] [-L] [-Q] [-0] [-v] [-f F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [path [path ...]] Extract values from an XML document. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -f, --format F A format expression to be applied for computing the path of an item. This must use metadata that is available on the item. The current tag can be accessed as {tag}. If no format is specified, the unit attempts to derive a good attribute from the XML tree to use for generating paths. -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class xtxml(XMLToPathExtractorUnit):
    """
    Extract values from an XML document.
    """
    def unpack(self, data):
        document = xml.parse(data.strip())
        variables = metavars(data)
        segment = self._make_path_builder(variables, document)

        def walk(node: xml.XMLNode, *parts: str):
            # Yield the given node first and then all of its descendants, depth first.

            def extract(node: xml.XMLNode = node):
                # Nodes with children are serialized and pretty-printed; nodes without
                # children only produce their encoded text content.
                if node.children:
                    with MemoryFile() as stream:
                        node.write(stream)
                        return bytes(stream.getbuffer() | ppxml)
                return node.content.encode(self.codec)

            attributes = {}
            for name, value in node.attributes.items():
                key = self._normalize_key(name)
                attributes[key] = self._normalize_val(value)

            # When any attribute name cannot serve as a variable name, all of them
            # receive an underscore prefix.
            if any(not is_valid_variable_name(key) for key in attributes):
                attributes = {F'_{key}': value for key, value in attributes.items()}

            yield UnpackResult('/'.join(parts), extract, **attributes)

            for child in node.children:
                yield from walk(child, *parts, segment(child))

        yield from walk(document, segment(document))
class xtzip (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
This unit is implemented in
refinery.units.formats.archive.xtzip
and has the following commandline Interface:usage: xtzip [-h] [-L] [-Q] [-0] [-v] [-F] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD] [path [path ...]] Extract files from a Zip archive. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". -D, --date NAME Name of the meta variable to receive the extracted file date. The default value is "date". -p, --pwd PWD Optionally specify an extraction password. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xtzip(ArchiveUnit):
    """
    Extract files from a Zip archive.
    """
    @ArchiveUnit.Requires('chardet', 'default', 'extended')
    def _chardet():
        import chardet
        return chardet

    @ArchiveUnit.Requires('pyzipper', 'arc', 'default', 'extended')
    def _pyzipper():
        import pyzipper
        return pyzipper

    @classmethod
    def _carver(cls):
        return carve_zip

    def unpack(self, data: bytearray):
        # Open the archive, determine a working password, and yield every file entry
        # together with its decoded name and its modification date.
        # Raises RuntimeError when none of the candidate passwords is accepted.
        from zipfile import ZipInfo, ZipFile

        def password_invalid(password: Optional[bytes]):
            # Test the archive with the given password: returns False when the test
            # succeeds and True when it fails with an error that mentions a password.
            # When the test reports an unsupported feature, the archive is reopened with
            # pyzipper and the test is repeated; this happens at most once.
            nonlocal archive, fallback
            if password:
                archive.setpassword(password)
            try:
                archive.testzip()
            except NotImplementedError:
                if fallback:
                    raise
                self.log_debug('compression method unsupported, switching to pyzipper')
                archive = self._pyzipper.AESZipFile(MemoryFile(data))
                fallback = True
                return password_invalid(password)
            except RuntimeError as E:
                if 'password' not in str(E):
                    raise
                return True
            else:
                if password:
                    self.log_debug('using password:', password)
                return False

        password = bytes(self.args.pwd)
        fallback = False
        archive = ZipFile(MemoryFile(data))
        passwords = [password]

        # common passwords are only attempted when the user did not provide one
        if not password:
            passwords.extend(p.encode(self.codec) for p in self._COMMON_PASSWORDS)
        for p in passwords:
            if not password_invalid(p):
                break
        else:
            raise RuntimeError('Archive is password-protected.')

        for info in archive.infolist():
            def xt(archive: ZipFile = archive, info: ZipInfo = info):
                # Lazy extractor; the archive and the entry are bound as default
                # arguments so that later loop iterations do not affect it.
                try:
                    return archive.read(info.filename)
                except RuntimeError as E:
                    if 'password' not in str(E):
                        raise
                    if not password:
                        raise RuntimeError('archive is password-protected')
                    else:
                        raise RuntimeError(F'invalid password: {password.decode(self.codec)}') from E
            if info.filename:
                if info.is_dir():
                    continue

            # courtesy of https://stackoverflow.com/a/37773438/9130824
            filename = info.filename
            if info.flag_bits & ZIP_FILENAME_UTF8_FLAG == 0:
                # The name is not flagged as UTF8: recover its bytes via code page 437
                # and decode them with a guessed encoding, using cp1252 when no guess is
                # available.
                filename_bytes = filename.encode('437')
                try:
                    guessed_encoding = self._chardet.detect(filename_bytes)['encoding']
                except ImportError:
                    guessed_encoding = None
                guessed_encoding = guessed_encoding or 'cp1252'
                filename = filename_bytes.decode(guessed_encoding, 'replace')

            try:
                date = datetime(*info.date_time)
            except Exception as e:
                # an invalid date is logged and does not prevent the extraction
                self.log_info(F'{e!s} - unable to determine date from tuple {info.date_time} for: {filename}')
                date = None

            yield self._pack(filename, date, xt)

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        # The input is handled when the end of central directory signature occurs at a
        # position other than the very start of the data.
        return data.rfind(ZipEndOfCentralDirectory.SIGNATURE) > 0
class xtzpaq (*paths, index=False, pwd=b'', date=b'date', path=b'path', regex=False, exact=False, fuzzy=0, drop_path=False, join_path=False, list=False)
-
This unit is implemented in
refinery.units.formats.archive.xtzpaq
and has the following commandline Interface:usage: xtzpaq [-h] [-L] [-Q] [-0] [-v] [-F] [-i] [-l] [-j | -d] [-z | -e] [-r] [-P NAME] [-D NAME] [-p PWD] [path [path ...]] Extract files from a ZPAQ archive. positional arguments: path Wildcard pattern for the path of the item to be extracted. Each item is returned as a separate output of this unit. Paths may contain wildcards; The default argument is a single wildcard, which means that every item will be extracted. If a given path yields no results, the unit performs increasingly fuzzy searches with it. This can be disabled using the --exact switch. optional arguments: -i, --index Archive is an index (no d-blocks). -l, --list Return all matching paths as UTF8-encoded output chunks. -j, --join-path Join path names with the previously existing one. If the previously existing path has a file extension, it is removed. Then, if that path already exists on disk, a numeric extension is appended to avoid conflict with the file system. -d, --drop-path Do not modify the path variable for output chunks. -z, --fuzzy Specify once to add a leading wildcard to each patterns, twice to also add a trailing wildcard. -e, --exact Path patterns never match on substrings. -r, --regex Use regular expressions instead of wildcard patterns. -P, --path NAME Name of the meta variable to receive the extracted path. The default value is "path". -D, --date NAME Name of the meta variable to receive the extracted file date. The default value is "date". -p, --pwd PWD Optionally specify an extraction password. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class xtzpaq(ArchiveUnit):
    """
    Extract files from a ZPAQ archive.
    """

    # Byte sequence searched for by handles() to recognise ZPAQ input.
    _MAGIC = B'\x37\x6B\x53\x74\xA0\x31\x83\xD3\x8C\xB2\x28\xB0\xD3\x7A\x50\x51'

    def __init__(
        self, *paths,
        index: Arg.Switch('-i', help='Archive is an index (no d-blocks).') = False,
        **more
    ):
        # Sanity check: the array type codes referenced here must have the item
        # sizes listed below on this platform; otherwise refuse to construct the unit.
        for _code, _size in {
            _TCU32: 4,
            _TCI32: 4,
            _TCU16: 2,
            _TCI16: 2,
        }.items():
            _item_size = array(_code).itemsize
            if _item_size == _size:
                continue
            raise RuntimeError(
                F'Expected array type "{_code}" to have entries of size {_size}, but the API '
                F'reports a size of {_item_size}.')
        super().__init__(*paths, index=index, **more)

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        # The magic sequence may occur anywhere in the input, not only at offset 0.
        return cls._MAGIC in data

    def unpack(self, archive: bytearray):
        """
        Generator that walks all blocks and segments of the archive. For streaming
        archives, every segment is emitted directly under its segment file name. For
        journaling archives, the c/d/h/i blocks are validated and collected first, and
        the files described by the i blocks are assembled from their fragments and
        emitted afterwards.

        Raises RuntimeError (and in two places ValueError / LookupError) when the
        archive violates the checks performed below.
        """

        def mkdate(date) -> datetime:
            # Decodes a decimal YYYYMMDDHHMMSS value (int or digit string).
            date = int(date)
            year = date // 1000000 // 10000
            month = date // 100000000 % 100
            day = date // 1000000 % 100
            hour = date // 10000 % 100
            minute = date // 100 % 100
            second = date % 100
            return datetime(year, month, day, hour, minute, second, 0)

        @dataclass
        class DT:
            # One i-block entry: file date, attributes, name, and fragment IDs.
            date: int = 0
            attr: int = 0
            name: str = ""
            frag: List[int] = field(default_factory=list)

            @property
            def dt(self) -> Optional[datetime]:
                # A non-positive date yields None.
                if self.date > 0:
                    return mkdate(self.date)

        # TODO: implement password-protected archives
        # key = self.args.pwd
        index = self.args.index
        bsize: Dict[int, int] = {}  # frag ID -> d block compressed size
        dt: Dict[str, DT] = {}      # filename -> date, attr, frags
        frag: List[bytes] = []      # ID -> hash[20] size[4] data
        csize = 0                   # expected offset of next non d block
        # Archive offset just past the previously read block. This must be initialised:
        # it is read by the final log message even when no block was found at all, and
        # by the d-block size computation when a d block is the very first segment.
        offset = 0
        streaming = False
        journaling = False
        done = False
        dc = Decompressor()
        src = dc.set_input(archive)

        while not done and dc.read_block():
            while not done:
                filename = dc.read_filename()
                if filename is None:
                    # no further segment in this block
                    break
                self.log_info('reading file', filename)
                comment = dc.read_comment()
                jsize = 0
                if len(comment) >= 4 and comment[-4:] == "jDC\x01":
                    # Journaling segment: the comment starts with the decimal
                    # size of the uncompressed segment.
                    num = re.search('^\\d+', comment)
                    if not num:
                        raise RuntimeError('missing size in comment')
                    jsize = int(num[0])
                    if streaming:
                        raise RuntimeError('journaling block after streaming one')
                    journaling = True
                    self.log_info('archive type is journaling')
                else:
                    if journaling:
                        raise RuntimeError('streaming block after journaling one')
                    if index:
                        raise RuntimeError('streaming block in index')
                    streaming = True
                    self.log_info('archive type is streaming')

                # Test journaling filename. The format must be
                # jDC[YYYYMMDDHHMMSS][t][NNNNNNNNNN]
                # where YYYYMMDDHHMMSS is the date, t is the type {c,d,h,i}, and
                # NNNNNNNNNN is the 10 digit first fragment ID for types c,d,h.
                # They must be in ascending lexicographical order.
                frag_id = 0
                block_type = None

                if journaling:
                    if len(filename) != 28:
                        raise RuntimeError('filename size not 28')
                    if filename[:3] != 'jDC':
                        raise RuntimeError('filename not jDC')
                    block_type = filename[17]
                    if block_type not in 'cdhi':
                        raise RuntimeError('type not c,d,h,i')
                    try:
                        mkdate(filename[3:17])
                    except Exception as E:
                        raise RuntimeError('invalid date') from E
                    frag_id = int(filename[18:28])
                    if not 1 <= frag_id <= 4294967295:
                        raise RuntimeError('fragment ID out of range')

                # Decompress the segment into memory while hashing the output.
                seg = MemoryFile(size_limit=jsize)
                dc.set_output(seg)
                sha1 = hashlib.sha1()
                dc.set_hasher(sha1)
                dc.decompress_data()

                if journaling and len(seg) != jsize:
                    raise RuntimeError('incomplete output')

                checksum = dc.read_segment_end()
                if checksum is None:
                    self.log_debug('no checksum')
                elif checksum != sha1.digest():
                    raise RuntimeError('SHA1 mismatch')

                # check csize at first non-d block
                # (csize is only ever nonzero after a journaling c block, so
                # block_type is a string whenever the membership test runs)
                if csize and block_type in 'chi':
                    if csize != offset:
                        raise RuntimeError(F'csize={csize} does not point to offset={offset}')
                    csize = 0

                # get csize from c block
                seglen = len(seg)
                seg = StructReader(seg.getbuffer())
                if block_type == 'c':
                    if seglen < 8:
                        raise RuntimeError("c block too small")
                    csize = seg.u64()
                    offset = src.tell() + 1
                    self.log_debug(F'csize={csize} at offset={offset}')
                    if csize >> 63:
                        self.log_warn('incomplete transaction at end of archive')
                        done = True
                    elif index and csize != 0:
                        raise RuntimeError('nonzero csize in index')
                    # Set csize to expected offset of first non d block
                    # assuming 1 more byte for unread end of block marker.
                    csize += offset

                if block_type == 'd':
                    if index:
                        raise RuntimeError('d block in index')
                    bsize[frag_id] = src.tell() + 1 - offset  # compressed size
                    self.log_debug(F' {bsize[frag_id]} -> {len(seg)}')
                    # Test frag size list at end. The format is f[id..id+n-1] fid n
                    # where fid may be id or 0. sizes must sum to the rest of block.
                    if seglen < 8:
                        raise RuntimeError('d block too small')
                    seg.seekset(-8)
                    fid = seg.u32() or frag_id
                    n = seg.u32()
                    if fid != frag_id:
                        raise RuntimeError('missing ID')
                    if n > (seglen - 8) // 4:
                        raise RuntimeError('frag list too big')
                    fragsum = 0  # computed sum of frag sizes
                    seg.seekset(-4 * (n + 2))
                    for _ in range(n):
                        fragsum += seg.u32()
                    if fragsum + n * 4 + 8 != seglen:
                        raise RuntimeError('bad frag size list')
                    # Save frag hashes and sizes. For output, save data too.
                    seg.seekset(fragsum)
                    data = memoryview(seg.getbuffer())
                    assert seg.remaining_bytes == n * 4 + 8
                    for i in range(n):
                        while len(frag) <= frag_id + i:
                            frag.append(B'')
                        if frag[frag_id + i]:
                            raise RuntimeError('duplicate frag ID')
                        f = seg.u32()
                        h = hashlib.sha1(data[:f]).digest()
                        frag[frag_id + i] = h + f.to_bytes(4, 'little') + data[:f]
                        data = data[f:]
                    assert len(data) == n * 4 + 8
                    assert seg.remaining_bytes == 8

                # Test and save h block. Format is: bsize (sha1[20] size)...
                # where bsize is the compressed size of the d block with the same id,
                # and each size corresponds to a fragment in that block. The list
                # must match the list in the d block if present.
                if block_type == 'h':
                    if seglen % 24 != 4:
                        raise RuntimeError('bad h block size')
                    b = seg.u32()
                    self.log_debug(F'[{frag_id}..{frag_id + seglen // 24}[ {b}')
                    fragsum = 0  # uncompressed size of all frags
                    for i in range(seglen // 24):
                        fd = seg.read(24)
                        if index:
                            while len(frag) <= frag_id + i:
                                frag.append(B'')
                            if frag[frag_id + i]:
                                raise RuntimeError('data in index')
                            frag[frag_id + i] = fd
                        elif frag_id + i >= len(frag) or len(frag[frag_id + i]) < 24:
                            raise RuntimeError('no matching d block')
                        elif frag[frag_id + i][:24] != fd:
                            raise RuntimeError('frag size or hash mismatch')
                        fragsum += int.from_bytes(fd[20:24], 'little')

                # Test i blocks and save files to extract. Format is:
                # date filename 0 na attr[0..na) ni ptr[0..ni) (to update)
                # 0 filename (to delete)
                # Date is 64 bits in YYYYMMDDHHMMSS format.
                if block_type == 'i':
                    while not seg.eof:
                        f = DT(seg.u64())
                        f.name = seg.read_c_string('utf8')
                        if f.date > 0:
                            na = seg.u32()
                            if na > 65535:
                                raise ValueError('attr size > 65535')
                            f.attr = seg.read_integer(na * 8)
                            ni = seg.u32()
                            for i in range(ni):
                                a = seg.u32()
                                f.frag.append(a)
                                if index:
                                    continue
                                elif not 1 <= a < len(frag):
                                    raise RuntimeError('frag ID out of range')
                                elif not frag[a]:
                                    raise LookupError('missing frag data')
                        # later entries for the same name replace earlier ones
                        dt[f.name] = f

                if streaming:
                    yield self._pack(filename, None, seg.getvalue())

            # remember where this block ended; the next d block measures its
            # compressed size relative to this position
            offset = src.tell()

        self.log_debug(F'{offset} bytes of archive tested')

        if not journaling:
            return

        for name, f in dt.items():
            if not f.date:
                # entries with a zero date mark deleted files
                continue
            # expected size according to the size fields of the referenced fragments
            size = sum(
                int.from_bytes(frag[fp][20:24], 'little')
                for fp in f.frag if 0 < fp < len(frag) and len(frag[fp]) >= 24
            )
            out = MemoryFile()
            for fp in f.frag:
                if fp < len(frag):
                    # skip the 24 byte header (hash[20] size[4]) of each fragment
                    out.write(memoryview(frag[fp])[24:])
            if len(out) != size:
                self.log_warn('invalid size during unpacking')
            yield self._pack(name, f.dt, out.getvalue())
class xxh (seed=0, text=False)
-
This unit is implemented in
refinery.units.crypto.hash.xxhash
and has the following commandline Interface:usage: xxh [-h] [-L] [-Q] [-0] [-v] [-t] [seed] Implements the xxHash hashing algorithm. positional arguments: seed specify the seed value; the default is 0 optional arguments: -t, --text Output a hexadecimal representation of the hash. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level.
Expand source code Browse git
class xxh(HashUnit):
    """
    Implements the xxHash hashing algorithm.
    """

    def __init__(
        self,
        seed: HashUnit.Arg.Number(
            metavar='seed',
            help='specify the seed value; the default is {default}',
        ) = 0,
        text=False
    ):
        # The text switch goes to the base class positionally, the seed is
        # registered as a regular unit argument.
        super().__init__(text, seed=seed)

    def _algorithm(self, data):
        # Hash the input using the seed stored in the unit arguments.
        seed = self.args.seed
        return xxhash(data, seed)
class xxtea (key, iv=b'', padding=None, mode=None, raw=False, swap=False, block_size=1)
-
This unit is implemented in
refinery.units.crypto.cipher.xxtea
and has the following commandline Interface: usage: xxtea [-h] [-L] [-Q] [-0] [-v] [-R] [-i IV] [-p P] [-m M] [-r] [-s] [-b N] key positional arguments: key The encryption key. optional arguments: -i, --iv IV Specifies the initialization vector. If none is specified, then a block of zero bytes is used. -p, --padding P Choose a padding algorithm (pkcs7, iso7816, x923, raw). The raw algorithm does nothing. By default, all other algorithms are attempted. In most cases, the data was not correctly decrypted if none of these work. -m, --mode M Choose cipher mode to be used. Possible values are: CBC, CFB, CTR, ECB, OFB, PCBC. By default, the CBC mode is used when an IV is provided, and ECB otherwise. -r, --raw Set the padding to raw; ignored when a padding is specified. -s, --swap Decode blocks as big endian rather than little endian. -b, --block-size N Cipher block size in 32-bit words. The default value 1 implies that the input is treated as a single block, which is common behaviour of many implementations. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation.
Expand source code Browse git
class xxtea(TEAUnit, cipher=BlockCipherFactory(XXTEA)):

    # Block size in bytes; recomputed by _prepare_block before each operation.
    block_size: int = 8

    def __init__(
        self,
        key,
        iv=b'',
        padding=None,
        mode=None,
        raw=False,
        swap=False,
        block_size: Arg.Number('-b', help=(
            'Cipher block size in 32-bit words. The default value {default} implies that the input '
            'is treated as a single block, which is common behaviour of many implementations.')) = 1
    ):
        super().__init__(key, iv, padding, mode, raw, swap=swap, block_size=block_size)

    def _prepare_block(self, data: bytes):
        # Derive the byte size of one cipher block from the -b argument.
        words = self.args.block_size
        if words >= 2:
            # explicit block size, given in 32-bit words
            self.block_size = words * 4
            return
        # Otherwise the whole input forms a single block: round its length
        # up to the next multiple of four bytes.
        self.block_size = -(-len(data) // 4) * 4

    def encrypt(self, data: bytes) -> bytes:
        # The block size must be fixed before the base class runs the cipher.
        self._prepare_block(data)
        return super().encrypt(data)

    def decrypt(self, data: bytes) -> bytes:
        # The block size must be fixed before the base class runs the cipher.
        self._prepare_block(data)
        return super().decrypt(data)

    def _new_cipher(self, **optionals) -> CipherInterface:
        # Pass the currently computed block size on to the cipher constructor.
        size = self.block_size
        return super()._new_cipher(block_size=size, **optionals)
class zl (level=9, window=15, zlib_header=False, gzip_header=False)
-
This unit is implemented in
refinery.units.compression.zl
and has the following commandline Interface:usage: zl [-h] [-L] [-Q] [-0] [-v] [-R] [-F] [-l N] [-w N] [-z | -g] ZLib compression and decompression. optional arguments: -l, --level N Specify a compression level between 0 and 9. -w, --window N Manually specify the window size between 8 and 15. -z, --zlib-header Use a ZLIB header. -g, --gzip-header Use a GZIP header. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class zl(Unit):
    """
    ZLib compression and decompression.
    """

    def __init__(
        self,
        level      : Arg.Number('-l', bound=(0, 0X9), help='Specify a compression level between 0 and 9.') = 9,
        window     : Arg.Number('-w', bound=(8, 0XF), help='Manually specify the window size between 8 and 15.') = 15,
        zlib_header: Arg.Switch('-z', group='MODE', help='Use a ZLIB header.') = False,
        gzip_header: Arg.Switch('-g', group='MODE', help='Use a GZIP header.') = False
    ):
        if zlib_header and gzip_header:
            raise ValueError('You can only specify one header type (ZLIB or GZIP).')
        super().__init__(level=level, window=window, zlib_header=zlib_header, gzip_header=gzip_header)

    def _decompress_data(self, data, mode: int, step: int):
        """
        Decompress a single stream from the start of `data` using the zlib wbits
        value `mode`, feeding the decompressor at most `step` bytes at a time.
        Returns a tuple of the decompressed bytes and a view on the input, positioned
        where the decompressor stopped consuming data. Raises RefineryPartialResult,
        carrying everything decompressed so far, when zlib reports an error.
        """
        zl = zlib.decompressobj(mode)
        memory = memoryview(data)
        result = bytearray()
        while not zl.eof:
            read = min(step, len(memory))
            try:
                chunk = zl.decompress(memory[:read])
            except zlib.error as e:
                raise RefineryPartialResult(exception_to_string(e), result) from e
            else:
                result.extend(chunk)
            # bytes handed to zlib minus the bytes it reports as unused
            consumed = read - len(zl.unused_data)
            if not memory or consumed == 0:
                # input exhausted or no progress: stop instead of looping forever
                break
            memory = memory[consumed:]
        return result, memory

    def process(self, data):
        """
        Decompress `data`, which may consist of several concatenated streams. Each
        stream is emitted as exactly one output chunk. If no wbits mode can decode
        the stream at the current position, the first error encountered is raised.
        """
        window = self.args.window
        # Slices instead of data[0] so that empty input does not raise an IndexError.
        header_expected = (
            data[:1] == B'\x78'
            or data[:2] == B'\x1F\x8B'
            or self.args.zlib_header
            or self.args.gzip_header
        )
        # zlib wbits values: window | 0x20 auto-detects a zlib or gzip header, a
        # negative window means raw deflate, window | 0x10 means gzip only, and 0
        # takes the window size from the zlib header.
        if header_expected:
            modes = [window | 0x20, -window]
        else:
            modes = [-window, window | 0x20]
        modes.extend([0x10 | window, 0])
        view = memoryview(data)
        # In lenient mode the input is fed in 32 byte steps; presumably so that a
        # partial result loses as little as possible when an error occurs.
        step = 32 if self.leniency > 0 else len(data)
        for k in itertools.count(1):
            error = None
            rest = view
            for mode in modes:
                try:
                    out, rest = self._decompress_data(view, mode, step)
                except Exception as e:
                    # remember only the first failure; later modes are mere fallbacks
                    error = error or e
                else:
                    self.log_info(F'used mode {mode} to decompress chunk {k}')
                    # This is the only place where a stream is emitted. A second
                    # yield further down used to emit every non-final stream twice.
                    yield out
                    error = None
                    break
            if error:
                raise error
            if not rest:
                break
            if len(rest) == len(view):
                # nothing was consumed; stop to avoid an endless loop
                break
            if len(rest) > len(view):
                raise RuntimeError('Decompressor returned more tail data than input data.')
            view = rest

    def reverse(self, data):
        """
        Compress `data` as raw deflate by default, or with a zlib or gzip header
        when the corresponding switch is set.
        """
        mode = -self.args.window
        if self.args.zlib_header:
            mode = -mode
        if self.args.gzip_header:
            mode = -mode | 0x10
        self.log_debug(F'using mode {mode:+2d} for compression')
        zl = zlib.compressobj(self.args.level, zlib.DEFLATED, mode)
        zz = zl.compress(data)
        return zz + zl.flush(zlib.Z_FINISH)

    @classmethod
    def handles(cls, data: bytearray):
        # Returns True when the input starts with a known signature and None
        # otherwise; a missing signature is not reported as False.
        for sig in (
            B'\x1F\x8B',  # gzip header
            B'\x78\x01',  # zlib low compression
            B'\x78\x9C',  # zlib medium compression
            B'\x78\xDA',  # zlib high compression
        ):
            if data[:2] == sig:
                return True
class zstd
-
This unit is implemented in
refinery.units.compression.zstd
and has the following commandline Interface:usage: zstd [-h] [-L] [-Q] [-0] [-v] [-R] [-F] ZStandard (ZSTD) compression and decompression. generic options: -h, --help Show this help message and exit. -L, --lenient Allow partial results as output. -Q, --quiet Disables all log output. -0, --devnull Do not produce any output. -v, --verbose Specify up to two times to increase log level. -R, --reverse Use the reverse operation. -F, --iff Only apply unit if it can handle the input format. Specify twice to drop all other chunks.
Expand source code Browse git
class zstd(Unit):
    """
    ZStandard (ZSTD) compression and decompression.
    """

    @Unit.Requires('pyzstd', 'all')
    def _pyzstd():
        # The import is deferred into this accessor rather than done at module level.
        import pyzstd
        return pyzstd

    def process(self, data):
        # Decompress the whole input with a fresh decompressor object.
        return self._pyzstd.ZstdDecompressor().decompress(data)

    def reverse(self, data):
        # Compress the input and append whatever the compressor still buffers.
        compressor = self._pyzstd.ZstdCompressor()
        body = compressor.compress(data)
        tail = compressor.flush()
        return body + tail

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        # Compare the first four bytes of the input against the expected signature.
        signature = B'\x28\xB5\x2F\xFD'
        return data[:4] == signature