Module refinery.shell
Shell-Like Unit Interface
Any unit from the refinery module can also be imported from this module. When imported from here, the units are initialized differently: They can be given string arguments as they would receive on the command line. For example:
>>> from refinery.shell import *
>>> emit('ABC', 'DEF') [ pop('t') | xor('var:t') | pack('-R') ] | str
'575'
This especially gives easier access to the powerful refinery.lib.meta variables and the entire range of multibin format expressions; see refinery.lib.argformats.
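For instance, numeric arguments are parsed just as they would be on the command line; the following sketch (assuming standard multibin number parsing) XORs every byte of the input with 0x20:
>>> from refinery.shell import *
>>> emit('hello') | xor('0x20') | str
'HELLO'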
"""
# Shell-Like Unit Interface
Any unit from the `refinery` module can also be imported from this module. When imported from here,
the units are initialized differently: They can be given string arguments as they would receive on
the command line. For example:
>>> from refinery.shell import *
>>> emit('ABC', 'DEF') [ pop('t') | xor('var:t') | pack('-R') ] | str
'575'
This especially gives easier access to the powerful `refinery.lib.meta` variables and the entire
multibin format expressions, see `refinery.lib.argformats`.
"""
from functools import wraps
from refinery import __unit_loader__, Unit

with __unit_loader__:
    __all__ = sorted(__unit_loader__.units, key=lambda x: x.lower())


class __pdoc2__:
    def __class_getitem__(*_):
        return ''


def __getattr__(name):
    with __unit_loader__:
        unit: Unit = __unit_loader__.resolve(name)
    if unit is None:
        raise AttributeError(name)

    class _unit(unit):
        def __new__(cls, *args, **kwargs):
            return unit.assemble(*args, **kwargs)

    return wraps(unit, updated=[])(_unit)


def __dir__():
    return __all__
Units
class a3x (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
Extracts embedded resources from compiled AutoIt scripts and decompiles the embedded script bytecode. The unit also works on compiled AutoIt executables.
class a3x(PathExtractorUnit): """ Extracts embedded resources from compiled AutoIt scripts and decompiles the embedded script bytecode. The unit also works on compiled AutoIt executables. """ def unpack(self, data: bytearray): view = memoryview(data) cursor = 0 errors: Dict[int, Exception] = {} script_count = 0 truncated: Set[A3xRecord] = set() intact: Set[A3xRecord] = set() def _package(records: Iterable[A3xRecord]) -> Generator[UnpackResult, None, None]: for k, record in enumerate(records, 1): self.log_info(F'record {k} type:', record.type) self.log_info(F'record {k} path:', record.src_path) if record.path is None: continue yield UnpackResult( record.path, record.extract, srcpath=record.src_path, created=record.created.isoformat(' ', 'seconds'), written=record.written.isoformat(' ', 'seconds'), ) while cursor < len(view): self.log_debug(F'searching at offset 0x{cursor:08X}') nc = data.find(A3xScript.MAGIC, cursor) if nc >= 0: cursor = nc else: rp = data.find(A3xRecord.MAGIC, cursor) - A3xScript.WIDTH if rp <= cursor: break cursor = rp try: script = A3xScript(view[cursor:]) except Exception as E: errors[cursor] = E cursor += 1 continue else: valid = script.has_valid_magic() if valid: _m = 'correct' else: _m = 'invalid' if not script.body: cursor += A3xScript.WIDTH if not script.has_valid_magic(): cursor += len(A3xRecord.MAGIC) continue if script.truncated: _a = 'truncated' truncated.update(script.body) else: script_count += 1 _a = 'intact' intact.update(script.body) self.log_info( F'{_a} script of type', script.type, F'and length 0x{len(script):08X}', F'with {len(script.body)} records and {_m} magic:', script.magic ) cursor += len(script) if script.truncated: if not script.has_valid_magic(): cursor += len(A3xRecord.MAGIC) continue yield from _package(script.body) remaining = truncated - intact if remaining: self.log_warn('emitting records from truncated scripts') yield from _package(remaining) return elif truncated: self.log_debug('good news: intact scripts contained all records from truncated scripts') if script_count == 0: error = None for offset, error in errors.items(): self.log_warn(F'error at offset 0x{offset:08X}:', error) if error: raise error @classmethod def handles(cls, data: bytearray) -> Optional[bool]: return A3xScript.MAGIC in data or A3xRecord.MAGIC in data
class add (argument, bigendian=False, blocksize=None)
-
Add the given argument to each block.
class add(BinaryOperationWithAutoBlockAdjustment): """ Add the given argument to each block. """ @staticmethod def operate(a, b): return a + b @staticmethod def inplace(a, b): a += b
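A minimal usage sketch (assuming the default block size of one byte), incrementing every byte of the input:
>>> from refinery.shell import *
>>> emit('ABC') | add('1') | str
'BCD'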
class adler32 (text=False)
-
Returns the Adler32 Hash of the input data.
class adler32(HashUnit): """ Returns the Adler32 Hash of the input data. """ def _algorithm(self, data: bytes) -> bytes: return struct.pack('>I', zlib.adler32(data))
class aes (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)
-
AES encryption and decryption.
class aes(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(AES)): """ AES encryption and decryption. """ pass
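A round-trip sketch: the 16-byte literal key selects AES-128, and the example assumes that the unit's default mode and IV handling are the same for encryption (via the generic -R reverse flag) and decryption:
>>> from refinery.shell import *
>>> key = '0123456789ABCDEF'  # 16 literal bytes, i.e. AES-128
>>> emit('attack at dawn') | aes('-R', key) | aes(key) | str
'attack at dawn'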
class alu (operator, *argument, seed=0, prologue=None, epilogue=None, inc=False, dec=False, cbc=False, bigendian=False, blocksize=None, precision=None)
-
The arithmetic-logical unit. It allows you to specify a custom Python expression where the following variables are allowed:
- the variable A: same as V[0]
- the variable B: current block
- the variable N: number of bytes in the input
- the variable K: current index in the input
- the variable S: the internal state value
- the variable V: the vector of arguments
- the variable I: function that casts to a signed int in current precision
- the variable U: function that casts to unsigned int in current precision
- the variable R: function; R(x,4) rotates x by 4 to the right
- the variable L: function; L(x,4) rotates x by 4 to the left
- the variable M: function; M(x,8) picks the lower 8 bits of x
- the variable X: function that negates the bits of the input
(The rotation operations are interpreted as shifts when arbitrary precision is used.)
Each block of the input is replaced by the value of this expression. Additionally, it is possible to specify prologue and epilogue expressions which are used to update the state variable S before and after the update of each block, respectively.
class alu(ArithmeticUnit): """ The arithmetic-logical unit. It allows you to specify a custom Python expression where the following variables are allowed: - the variable `A`: same as `V[0]` - the variable `B`: current block - the variable `N`: number of bytes in the input - the variable `K`: current index in the input - the variable `S`: the internal state value - the variable `V`: the vector of arguments - the variable `I`: function that casts to a signed int in current precision - the variable `U`: function that casts to unsigned int in current precision - the variable `R`: function; `R(x,4)` rotates x by 4 to the right - the variable `L`: function; `L(x,4)` rotates x by 4 to the left - the variable `M`: function; `M(x,8)` picks the lower 8 bits of x - the variable `X`: function that negates the bits of the input (The rotation operations are interpreted as shifts when arbitrary precision is used.) Each block of the input is replaced by the value of this expression. Additionally, it is possible to specify prologue and epilogue expressions which are used to update the state variable `S` before and after the update of each block, respectively. """ @staticmethod def _parse_op(definition, default=None): definition = definition or default if not definition: raise ValueError('No definition given') return definition def __init__( self, operator: Arg(type=str, help='A Python expression defining the operation.'), *argument, seed: Arg('-s', type=str, help=( 'Optional seed value for the state variable S. The default is zero. This can be an expression ' 'involving the variable N.')) = 0, prologue: Arg('-p', type=str, metavar='E', help=( 'Optional expression with which the state variable S is updated before a block is operated on.')) = None, epilogue: Arg('-e', type=str, metavar='E', group='EPI', help=( 'Optional expression with which the state variable S is updated after a block was operated on.')) = None, inc: Arg('-I', group='EPI', help='equivalent to --epilogue=S+1') = False, dec: Arg('-D', group='EPI', help='equivalent to --epilogue=S-1') = False, cbc: Arg('-X', group='EPI', help='equivalent to --epilogue=(B)') = False, bigendian=False, blocksize=None, precision=None ): for flag, flag_is_set, expression in [ ('--cbc', cbc, '(B)'), ('--inc', inc, 'S+1'), ('--dec', dec, 'S-1'), ]: if flag_is_set: if epilogue is not None: raise ValueError( F'Ambiguous specification; epilogue was already set to {epilogue} ' F'when {flag} was parsed.' ) epilogue = expression self._index = IndexCounter() super().__init__( self._index, *argument, bigendian=bigendian, blocksize=blocksize, precision=precision, seed=seed, operator=self._parse_op(operator), prologue=self._parse_op(prologue, 'S'), epilogue=self._parse_op(epilogue, 'S'), ) @property def _is_ecb(self): return not self.args.epilogue and not self.args.prologue def _fastblock(self, _): raise FastBlockError def process(self, data): context = dict(metavars(data)) seed = self.args.seed fbits = self.fbits fmask = self.fmask if isinstance(seed, str): seed = PythonExpression(seed, 'N', constants=metavars(data), mask=fmask) if callable(seed): seed = seed(context, N=len(data)) self._index.init(self.fmask) def _expression(definition: str): return PythonExpression(definition, *'IBASMNVRLX', all_variables_allowed=True, mask=fmask) prologue = _expression(self.args.prologue).expression epilogue = _expression(self.args.epilogue).expression operator = _expression(self.args.operator).expression def cast_unsigned(n) -> int: return int(n) & fmask def cast_signed(n) -> int: n = int(n) & fmask if n >> (fbits - 1): return -((~n + 1) & fmask) else: return n if fbits is INF: def rotate_r(n, k): return n >> k def rotate_l(n, k): return n << k else: def rotate_r(n, k): return (n >> k) | (n << (fbits - k)) & fmask def rotate_l(n, k): return (n << k) | (n >> (fbits - k)) & fmask def negate_bits(n): return n ^ fmask def mask_to_bits(x, b): return x & ((1 << b) - 1) context.update( N=len(data), S=seed, I=cast_signed, U=cast_unsigned, R=rotate_r, L=rotate_l, X=negate_bits, M=mask_to_bits, ) def operate(block, index, *args): context.update(K=index, B=block, V=args) if args: context['A'] = args[0] context['S'] = eval(prologue, None, context) context['B'] = eval(operator, None, context) context['S'] = eval(epilogue, None, context) return context['B'] placeholder = self.operate self.operate = operate try: result = super().process(data) finally: self.operate = placeholder return result @staticmethod def operate(block, index, *args): raise RuntimeError('This operate method cannot be called.') def inplace(self, block, *args) -> None: super().inplace(block, *args)
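A small sketch of the expression syntax (assuming the default block size of one byte), subtracting one from every block:
>>> from refinery.shell import *
>>> emit('ifmmp') | alu('B-1') | str
'hello'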
class aplib
-
APLib compression and decompression.
class aplib(Unit): """ APLib compression and decompression. """ def reverse(self, buf): return compressor(buf).compress() def process(self, buf): view = memoryview(buf) size = 0 if view[:4] == B'AP32': size = int.from_bytes(buf[4:8], 'little') if size > 0x80: size = 0 else: self.log_info(F'detected aPLib header of size {size}') return decompressor(view[size:]).decompress() @classmethod def handles(self, data: bytearray): if data[:4] == B'AP32': return True return None
class asm (mode='x32', *, count=None, until=None, no_address=False, no_hexdump=False)
-
Disassembles the input data using capstone and produces a human-readable disassembly listing. It internally uses the opc unit for this, which is an alternative option if you are looking for more programmatic disassembly.
class asm(opc): """ Disassembles the input data using capstone and produces a human-readable disassembly listing. It internally uses the `refinery.opc` unit for this, which is an alternative option if you are looking for more programmatic disassembly. """ def __init__( self, mode='x32', *, count=None, until=None, no_address: Arg.Switch('-A', help='Disable address display.') = False, no_hexdump: Arg.Switch('-H', help='Disable opcodes hexdump.') = False, ): super().__init__( mode=mode, nvar='_name', avar='_addr', ovar='_arg', count=count, until=until, no_address=no_address, no_hexdump=no_hexdump, ) def process(self, data): insns = list(super().process(data)) if not insns: return no_address = self.args.no_address no_hexdump = self.args.no_hexdump def _hl(x): return len(hex(x)) args_width = max(len(insn['_args']) for insn in insns) memo_width = max(len(insn['_name']) for insn in insns) addr_width = max(_hl(insn['_addr']) for insn in insns) if no_address: addr_width = 0 memo_width = memo_width + 2 max_data_bytes_count = max(len(c) for c in insns) padding = addr_width + memo_width + args_width + 10 metrics_opc = HexDumpMetrics(max_data_bytes_count, padding=padding) for insn in insns: hd = one(hexdump(insn, metrics_opc)) name = insn.meta.pop('_name') args = insn.meta.pop('_args') addr = insn.meta.pop('_addr') msg = F' {name:<{memo_width}} {args:<{args_width}}' if not no_hexdump: msg = F'{msg} ; {hd}' if not no_address: msg = F'{addr:0{addr_width}X}: {msg}' yield msg.encode(self.codec)
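A usage sketch; the unit yields one encoded text line per instruction, and the exact column layout depends on the computed widths, so the output is only indicated in comments:
>>> from refinery.shell import *
>>> code = b'\x31\xc0\xc3'        # x86: xor eax, eax / ret
>>> for line in code | asm():
...     print(line.decode())      # one formatted listing line per instruction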
class atbash
-
https://en.wikipedia.org/wiki/Atbash Atbash encoding and decoding. Fairly useless in the 21st century, except for picking out crypto nerds.
class atbash(Unit): """ https://en.wikipedia.org/wiki/Atbash Atbash encoding and decoding. Fairly useless in the 21st century, except for picking out crypto nerds. """ def process(self, data: bytearray): uc = range(B'A'[0], B'Z'[0] + 1) lc = range(B'a'[0], B'z'[0] + 1) for k, letter in enumerate(data): if letter in uc: data[k] = uc[~uc.index(letter)] continue if letter in lc: data[k] = lc[~lc.index(letter)] continue return data reverse = process
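A quick sketch of the letter mapping:
>>> from refinery.shell import *
>>> emit('abc') | atbash() | str
'zyx'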
class autoxor (range=slice(1, 32, None))
-
Assumes a XOR-encoded input and automatically attempts to find the correct XOR key. The method is based on the assumption that the plaintext input contains one letter that occurs with a much higher frequency than all other letters; this is the case for the null byte in PEs, and also for the space character in many text files.
class autoxor(xkey): """ Assumes a XOR-encoded input and automatically attempts to find the correct XOR key. The method is based on the assumption that the plaintext input contains one letter that occurs with a much higher frequency than all other letters; this is the case for the null byte in PEs, and also for the space character in many text files. """ def process(self, data: bytearray): key = super().process(data) if not key: self.log_warn('No key was found; returning original data.') return data bin, = data | xor(key) txt, = bin | xor(0x20) if re.fullmatch(BR'[\s!-~]+', txt) and not txt.isspace(): key = bytes(key | xor(0x20)) bin = txt return self.labelled(bin, key=key)
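A sketch of the space-heavy text case described above; the assertion assumes that the key length heuristic settles on the correct single-byte key:
>>> from refinery.shell import *
>>> plain = b'the quick brown fox jumps over the lazy dog ' * 30
>>> masked, = plain | xor('0x3A')
>>> unmasked, = masked | autoxor()
>>> unmasked == plain
True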
class b32
-
Base32 encoding and decoding.
class b32(Unit): """ Base32 encoding and decoding. """ def reverse(self, data): return base64.b32encode(data) def process(self, data: bytearray): before_padding = 0 for before_padding in range(len(data), 0, -1): if data[before_padding - 1:before_padding] != B'=': break padding_size = -before_padding % 8 missing = before_padding + padding_size - len(data) if missing > 0: self.log_info(F'detected incorrect padding: added {missing} padding characters') data.extend(B'=' * missing) if missing < 0: self.log_info(F'detected incorrect padding: removed {-missing} padding characters') data[padding_size + before_padding:] = [] return base64.b32decode(data, casefold=True)
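A decoding sketch; missing padding would be repaired automatically as described in the source:
>>> from refinery.shell import *
>>> emit('JBSWY3DP') | b32() | str
'Hello'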
class b58
-
Base58 encoding and decoding. It is famously used as an encoding in Bitcoin addresses because the alphabet omits digits and letters that look similar.
class b58(base): """ Base58 encoding and decoding. It is famously used as an encoding in Bitcoin addresses because the alphabet omits digits and letters that look similar. """ def __init__(self): super().__init__(b'123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz')
class b62
-
Base62 encoding and decoding.
class b62(base): """ Base62 encoding and decoding. """ def __init__(self): super().__init__(b'0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz')
class b64 (urlsafe=False)
-
Base64 encoding and decoding.
class b64(Unit): """ Base64 encoding and decoding. """ def __init__(self, urlsafe: Arg.Switch('-u', help='use URL-safe alphabet') = False): super().__init__(urlsafe=urlsafe) def reverse(self, data): altchars = None if self.args.urlsafe: altchars = B'-_' return base64.b64encode(data, altchars=altchars) def process(self, data: bytearray): if not data: return data if len(data) == 1: raise ValueError('single byte can not be base64-decoded.') data.extend(B'===') altchars = None if (B'-' in data or B'_' in data) and (B'+' not in data and B'/' not in data) or self.args.urlsafe: altchars = B'-_' return base64.b64decode(data, altchars=altchars) @classmethod def handles(self, data: bytearray) -> bool: from refinery.lib.patterns import formats if not formats.spaced_b64.value.fullmatch(data): return False histogram = set() lcase_count = 0 ucase_count = 0 digit_count = 0 other_count = 0 total_count = len(data) for byte in data: histogram.add(byte) if len(histogram) > 60: return True elif byte in range(0x61, 0x7B): lcase_count += 1 elif byte in range(0x41, 0x5B): ucase_count += 1 elif byte in range(0x30, 0x40): digit_count += 1 elif byte in B'\v\f\t\r\n\x20': total_count -= 1 else: other_count += 1 for c in (lcase_count, ucase_count, digit_count, other_count): # Call this a false positive if more than 2/3ds of the data # consist of a single category of letters. if c * 3 > total_count * 2: return False return True
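Decoding and encoding sketches (encoding uses the generic -R reverse flag):
>>> from refinery.shell import *
>>> emit('SGVsbG8=') | b64() | str
'Hello'
>>> emit('Hello') | b64('-R') | str
'SGVsbG8='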
class b65536
-
Base65536 encoding and decoding. A relatively esoteric encoding scheme utilizing the UTF-16 / UTF-32 character set.
class b65536(Unit): """ Base65536 encoding and decoding. A relatively esoteric encoding scheme utilizing the UTF-16 / UTF-32 character set. """ def reverse(self, data): if not data: return B'' output = MemoryFile() length = len(data) for x in range(0, length, 2): b1 = data[x] b2 = data[x + 1] if x + 1 < length else -1 code_point = _BLOCK_START[b2] + b1 output.write(chr(code_point).encode()) return output.getvalue() def process(self, data): if not data: return B'' done = False output = MemoryFile() for ch in data.decode(): code_point = ord(ch) b1 = code_point & ((1 << 8) - 1) try: b2 = _B2[code_point - b1] except KeyError: self.log_info('Invalid base65536 code point: %d, skipping' % code_point) continue b = b1.to_bytes(1, "little") if b2 == -1 else b1.to_bytes(1, "little") + b2.to_bytes(1, "little") if len(b) == 1: if done: raise ValueError('base65536 sequence continued after final byte') done = True output.write(b) return output.getvalue()
class b85
-
Base85 encoding and decoding.
class b85(Unit): """ Base85 encoding and decoding. """ def reverse(self, data): return base64.b85encode(data) def process(self, data): if re.search(BR'\s', data) is not None: data = re.sub(BR'\s+', B'', data) return base64.b85decode(data) @classmethod def handles(self, data: bytearray): from refinery.lib.patterns import formats return formats.spaced_b85.value.fullmatch(data)
class b92
-
Base92 encoding and decoding.
class b92(Unit): """ Base92 encoding and decoding. """ def reverse(self, data): if not data: return B'~' reader = StructReader(data, bigendian=True) output = MemoryFile() while reader.remaining_bits > 0: try: block = reader.read_integer(13) except EOFError: count = reader.remaining_bits block = reader.read_integer(count) self.log_debug(F'reading {count} remaining bits: {block:0{count}b}') shift = 6 - count if shift >= 0: block <<= shift self.log_debug(F'encoding block: {block:06b}') output.write_byte(_B92_ALPHABET[block]) break block <<= 13 - count self.log_debug(F'encoding block: {block:013b}') hi, lo = divmod(block, 91) output.write_byte(_B92_ALPHABET[hi]) output.write_byte(_B92_ALPHABET[lo]) return output.getvalue() def process(self, data): if data == B'~': return B'' output = MemoryFile() buffer = 0 length = 0 view = memoryview(data) q, r = divmod(len(view), 2) if r > 0: bits = 6 tail = _B92_DECODING[data[~0]] else: bits = 13 tail = _B92_DECODING[data[~1]] * 91 + _B92_DECODING[data[~0]] view = view[:(q - 1) * 2] it = iter(view) for a, b in zip(it, it): block = _B92_DECODING[a] * 91 + _B92_DECODING[b] assert length < 8 buffer <<= 13 buffer |= block length += 13 size, length = divmod(length, 8) assert size > 0 output.write((buffer >> length).to_bytes(size, 'big')) buffer &= (1 << length) - 1 missing = 8 - length shift = bits - missing if shift < 8: bytecount = 1 else: bytecount = 2 shift -= 8 missing += 8 if shift < 0: raise RefineryPartialResult( F'Invalid padding, missing {-shift} bits.', output.getvalue()) buffer <<= missing buffer |= tail >> shift length += missing output.write(buffer.to_bytes(bytecount, 'big')) if tail & ((1 << shift) - 1) != 0: raise RefineryPartialResult( F'Invalid padding, lower {shift} bits of {tail:0{bits}b} are not zero.', output.getvalue()) return output.getvalue() @classmethod def handles(self, data: bytearray): from refinery.lib.patterns import formats return formats.b92.value.fullmatch(data)
class base (base=0, strip_padding=False, little_endian=False, strict_digits=False)
-
Encodes and decodes integers in arbitrary base.
class base(Unit): """ Encodes and decodes integers in arbitrary base. """ def __init__( self, base: Arg(type=numseq, metavar='base|alphabet', help=( R'Either the base to be used or an alphabet. If an explicit alphabet is given, its length ' R'determines the base. The default base 0 treats the input as a Python integer literal. If ' F'a numeric base is given, digits from the alphabet "{_DEFAULT_ALPH_STR}" are used. ')) = 0, strip_padding: Arg.Switch('-s', help='Do not add leading zeros to the output.') = False, little_endian: Arg.Switch('-e', help='Use little endian byte order instead of big endian.') = False, strict_digits: Arg.Switch('-d', help='Check that all input digits are part of the alphabet.') = False, ): super().__init__( base=base, strip_padding=strip_padding, little_endian=little_endian, strict_digits=strict_digits, ) @property def _args(self): base = self.args.base if isinstance(base, int): if not base: return 0, B'' if base in _LARGER_ALPHABETS: return base, _LARGER_ALPHABETS[base] if base not in range(2, len(_DEFAULT_ALPHABET) + 1): raise ValueError(F'base may only be an integer between 2 and {len(_DEFAULT_ALPHABET)}') return base, _DEFAULT_ALPHABET[:base] if len(set(base)) != len(base): raise ValueError('the given alphabet contains duplicate letters') return len(base), bytearray(base) @property def byteorder(self): return 'little' if self.args.little_endian else 'big' def reverse(self, data): base, alphabet = self._args self.log_info('using byte order', self.byteorder) number = int.from_bytes(data, byteorder=self.byteorder) if base == 0: return B'0x%X' % number if base > len(alphabet): raise ValueError(F'Only {len(alphabet)} available; not enough to encode base {base}') data_bits = len(data) * 8 base_bits = math.log2(base) result = bytearray() while data_bits >= 1: number, k = divmod(number, base) result.append(alphabet[k]) if not number and self.args.strip_padding: break data_bits -= base_bits result.reverse() return result def process(self, data: bytearray): base, alphabet = self._args if base and base != 64 and not self.args.strict_digits: check = set(alphabet) index = 0 it = iter(data) for b in it: if b not in check: break index += 1 for b in it: if b in check: data[index] = b index += 1 self.log_info(F'stripped {len(data) - index} invalid digits from input data') del data[index:] if len(alphabet) <= len(_DEFAULT_ALPHABET): defaults = _DEFAULT_ALPHABET[:base] if alphabet != defaults: self.log_info('translating input data to a default alphabet for faster conversion') data_translated = data.translate(bytes.maketrans(alphabet, defaults)) result = int(data_translated, base) else: result = int(data, base) elif len(alphabet) == 64: import base64 _b64_alphabet = _LARGER_ALPHABETS[64] if alphabet != _b64_alphabet: data = data.translate(bytes.maketrans(alphabet, _b64_alphabet)) return base64.b64decode(data + b'===', validate=self.args.strict_digits) elif len(alphabet) == 85: import base64 _b85_alphabet = _LARGER_ALPHABETS[85] if alphabet != _b85_alphabet: data = data.translate(bytes.maketrans(alphabet, _b85_alphabet)) return base64.b85decode(data) else: self.log_warn('very long alphabet, unable to use built-ins; reverting to (slow) fallback.') result = 0 lookup = {digit: k for k, digit in enumerate(alphabet)} for digit in data: result *= base result += lookup[digit] if not base or self.args.strip_padding: size, rest = divmod(result.bit_length(), 8) size += int(bool(rest)) else: size = (len(data) - 1 + alphabet.index(data[0]) / base) * math.log2(base) / 8 size = math.ceil(size) return result.to_bytes(size, byteorder=self.byteorder)
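A sketch using base 2 in both directions; without --strip-padding, the encoder pads to the full bit width of the input:
>>> from refinery.shell import *
>>> emit('Z') | base('-R', '2') | str
'01011010'
>>> emit('01011010') | base('2') | str
'Z'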
class bat (keep_all=False, keep_comment=False, keep_definitions=False, keep_echo=False)
-
Deobfuscates batch files, based on the batch deobfuscator by DissectMalware. The input script is interpreted, variables are substituted for previously defined values, including commonly defined operating system environment variables. Variable definitions that are later evaluated are removed from the script, as are all echo commands and comments.
class bat(Unit): """ Deobfuscates batch files, based on the batch deobfuscator by DissectMalware. The input script is interpreted, variables are substituted for previously defined values, including commonly defined operating system environment variables. Variable definitions that are later evaluated are removed from the script, as are all echo commands and comments. """ def __init__( self, keep_all : Unit.Arg.Switch('-a', help='Do not strip anything after deobfuscation.') = False, keep_comment : Unit.Arg.Switch('-c', help='Do not strip comments from the script.') = False, keep_definitions : Unit.Arg.Switch('-d', help='Do not strip variable definitions.') = False, keep_echo : Unit.Arg.Switch('-e', help='Do not strip echo calls in the script.') = False, ): ... @unicoded def process(self, data: str) -> str: mode = STRIP.ALL if self.args.keep_all: mode = STRIP.NONE elif self.args.keep_comment: mode ^= STRIP.COMMENT elif self.args.keep_definitions: mode ^= STRIP.DEFINITION elif self.args.keep_echo: mode ^= STRIP.ECHO return BatchDeobfuscator().deobfuscate(data, mode)
class bitrev (bigendian=False, blocksize=None)
-
Reverse the bits of every block. Any excess bytes at the end of the input that are not an integer multiple of the block size are ignored.
Unreadable bit reversal operations due to: https://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64BitsDiv https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
class bitrev(UnaryOperation): """ Reverse the bits of every block. Any excess bytes at the end of the input that are not an integer multiple of the block size are ignored. """ @staticmethod def operate(arg): raise RuntimeError('operate was called before the unit was initialized') def __init__(self, bigendian=False, blocksize=None): """ Unreadable bit reversal operations due to: https://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64BitsDiv https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel """ super().__init__(bigendian=bigendian, blocksize=blocksize, _truncate=1) if self.bytestream: def operate(v): return ((v * 0x202020202) & 0x10884422010) % 1023 elif self.blocksize in (2, 4, 8): def operate(v): s = self.fbits m = self.fmask w = v while s > 1: s >>= 1 m = m ^ (m << s) w = ((w << s) & ~m) | ((w >> s) & m) return w else: def operate(v): w = v & 0 for s in range(self.fbits): w |= ((v >> s) & 1) << (self.fbits - s - 1) return w self.operate = operate
class bitsnip (slices=[slice(0, 1, None)], bigendian=False, blocksize=None)
-
Pick a certain range of bits from each block of the input. The extracted ranges of bits are concatenated. Leftover bits that do not form at least one full byte are discarded. Bits are indexed from least significant at index 0 to most significant in each block. When the unit operates in big endian mode, the internal bit buffer is shifted left in each step and new bits are inserted as the least significant portion. Conversely, in default (little endian) mode, newly extracted bits are added as the now most significant ones. After concatenating all bit slices into a large integer, this integer is converted into a byte string according to the given byte ordering.
class bitsnip(BlockTransformationBase): """ Pick a certain range of bits from each block of the input. The extracted ranges of bits are concatenated. Leftover bits that do not form at least one full byte are discarded. Bits are indexed from least significant at index 0 to most significant in each block. When the unit operates in big endian mode, the internal bit buffer is shifted left in each step and new bits are inserted as the least significant portion. Conversely, in default (little endian) mode, newly extracted bits are added as the now most significant ones. After concatenating all bit slices into a large integer, this integer is converted into a byte string according to the given byte ordering. """ def __init__( self, slices: Arg(help=( 'Specify start:stop:size, where size can be used to pad or truncate the extracted ' 'bits. If size is omitted, it defaults to (stop-start). If no slice is specified, ' 'it defaults to 0, which corresponds to 0:1:1, i.e. extracting the lowest bit.') ) = [slice(0, 1)], bigendian=False, blocksize=None ): super().__init__(slices=slices, bigendian=bigendian, blocksize=blocksize) def process(self, data: bytearray): bitsnip_data = 0 bitsnip_size = 0 slices: List[Tuple[int, int, int]] = [] maxbits = 8 * self.blocksize args: Iterable[slice] = iter(self.args.slices) bigendian: bool = self.args.bigendian for s in args: start = s.start stop = s.stop if start is None: start = 0 if stop is None: stop = maxbits elif stop > maxbits: raise ValueError(F'the selection {start}:{stop} is out of bounds for the block size {self.blocksize}') if start >= stop: continue size = stop - start mask = (1 << size) - 1 size = s.step or size slices.append((start, mask, size)) for item in self.chunk(data): for shift, mask, size in slices: bits = (item >> shift) & mask if bigendian: bitsnip_data <<= size bitsnip_data |= bits else: bitsnip_data |= bits << bitsnip_size bitsnip_size += size length, remainder = divmod(bitsnip_size, 8) if remainder != 0: self.log_info(F'discarding {bitsnip_size % 8} bits') if bigendian: bitsnip_data >>= remainder else: bitsnip_data &= (1 << (8 * length)) - 1 if bigendian: return bitsnip_data.to_bytes(length, 'big') else: return bitsnip_data.to_bytes(length, 'little')
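A sketch using the default slice 0:1, which collects the least significant bit of each byte; in the default little endian mode, the bit of block k lands at bit position k of the output:
>>> from refinery.shell import *
>>> emit('ABABABAB') | bitsnip() | str   # low bits 1,0,1,0,... give 0b01010101
'U'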
class blabla (key, nonce=b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', rounds=10, discard=0, stateful=False)
-
Implements the BlaBla cipher, a 256-bit stream cipher designed by Jean-Philippe Aumasson. It is similar to ChaCha in design but operates on 64-bit blocks.
class blabla(StreamCipherUnit): """ Implements the BlaBla cipher, a 256-bit stream cipher designed by Jean-Philippe Aumasson. It is similar to ChaCha in design but operates on 64-bit blocks. """ key_size = {32} def __init__( self, key, nonce: Arg(help='The 16-byte nonce. The default are 16 null bytes.') = bytes(16), rounds: Arg.Number('-r', help='The number of rounds, default is {default}.') = 10, discard=0, stateful=False ): super().__init__(key=key, nonce=nonce, rounds=rounds, discard=discard, stateful=stateful) def keystream(self): r = self.args.rounds n = self.args.nonce k = struct.unpack('<4Q', self.args.key) try: n = struct.unpack('<2Q', n) except Exception: raise ValueError(F'The given nonce has invalid length of {len(n)}, it must be 16 bytes in size.') q = [ 0x6170786593810fab, # 0x0 0x3320646ec7398aee, # 0x1 0x79622d3217318274, # 0x2 0x6b206574babadada, # 0x3 *k, # 0x4 .. 0x7 0x2ae36e593e46ad5f, # 0x8 0xb68f143029225fc9, # 0x9 0x8da1e08468303aa6, # 0xA 0xa48a209acd50a4a7, # 0xB 0x7fdc12f23f90778c, # 0xC 1, # 0xD *n # 0xE .. 0xF ] while True: v = [*q] for _ in range(r): for a, b, c, d in [ (0x0, 0x4, 0x8, 0xC), (0x1, 0x5, 0x9, 0xD), (0x2, 0x6, 0xA, 0xE), (0x3, 0x7, 0xB, 0xF), (0x0, 0x5, 0xA, 0xF), (0x1, 0x6, 0xB, 0xC), (0x2, 0x7, 0x8, 0xD), (0x3, 0x4, 0x9, 0xE), ]: v[a] = v[a] + v[b] & _M64 v[d] = rotr64(v[d] ^ v[a], 32) v[c] = v[c] + v[d] & _M64 v[b] = rotr64(v[b] ^ v[c], 24) v[a] = v[a] + v[b] & _M64 v[d] = rotr64(v[d] ^ v[a], 16) v[c] = v[c] + v[d] & _M64 v[b] = rotr64(v[b] ^ v[c], 63) v = [x + y & _M64 for x, y in zip(q, v)] q[0xD] += 1 yield from struct.pack('<16Q', *v)
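Since this is a keystream XOR, applying the unit twice with the same parameters should restore the input; a sketch with an arbitrary 32-byte key:
>>> from refinery.shell import *
>>> key = 'k' * 32   # any 32-byte value
>>> emit('attack at dawn') | blabla(key) | blabla(key) | str
'attack at dawn'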
class blk224 (text=False)
-
Returns the BLK224 hash of the input data.
class blk256 (text=False)
-
Returns the BLK256 hash of the input data.
class blk384 (text=False)
-
Returns the BLK384 hash of the input data.
class blk512 (text=False)
-
Returns the BLK512 hash of the input data.
class blowfish (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)
-
Blowfish encryption and decryption.
class blowfish(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(Blowfish)): """ Blowfish encryption and decryption. """ pass
class blz
-
BriefLZ compression and decompression. The compression algorithm uses a pure Python suffix tree implementation: It requires a lot of time & memory.
class blz(Unit): """ BriefLZ compression and decompression. The compression algorithm uses a pure Python suffix tree implementation: It requires a lot of time & memory. """ def _begin(self, data): self._src = StructReader(memoryview(data)) self._dst = MemoryFile(bytearray()) return self def _reset(self): self._src.seek(0) self._dst.seek(0) self._dst.truncate() return self def _decompress(self): ( signature, version, src_count, src_crc32, dst_count, dst_crc32, ) = self._src.read_struct('>6L') if signature != 0x626C7A1A: raise ValueError(F'Invalid BriefLZ signature: {signature:08X}, should be 626C7A1A.') if version > 10: raise ValueError(F'Invalid version number {version}, should be less than 10.') self.log_debug(F'signature: 0x{signature:08X} V{version}') self.log_debug(F'src count: 0x{src_count:08X}') self.log_debug(F'src crc32: 0x{src_crc32:08X}') self.log_debug(F'dst count: 0x{dst_count:08X}') self.log_debug(F'dst crc32: 0x{dst_crc32:08X}') src = self._src.getbuffer() src = src[24:24 + src_count] if len(src) < src_count: self.log_warn(F'Only {len(src)} bytes in buffer, but header announced a length of {src_count}.') if src_crc32: check = zlib.crc32(src) if check != src_crc32: self.log_warn(F'Invalid source data CRC {check:08X}, should be {src_crc32:08X}.') dst = self._decompress_chunk(dst_count) if not dst_crc32: return dst check = zlib.crc32(dst) if check != dst_crc32: self.log_warn(F'Invalid result data CRC {check:08X}, should be {dst_crc32:08X}.') return dst def _decompress_modded(self): self._src.seekrel(8) total_size = self._src.u64() chunk_size = self._src.u64() remaining = total_size self.log_debug(F'total size: 0x{total_size:016X}') self.log_debug(F'chunk size: 0x{chunk_size:016X}') while remaining > chunk_size: self._decompress_chunk(chunk_size) remaining -= chunk_size return self._decompress_chunk(remaining) def _decompress_chunk(self, size=None): bitcount = 0 bitstore = 0 decompressed = 1 def readbit(): nonlocal bitcount, bitstore if not bitcount: bitstore = int.from_bytes(self._src.read_exactly(2), 'little') bitcount = 0xF else: bitcount = bitcount - 1 return (bitstore >> bitcount) & 1 def readint(): result = 2 + readbit() while readbit(): result <<= 1 result += readbit() return result self._dst.write(self._src.read_exactly(1)) try: while not size or decompressed < size: if readbit(): length = readint() + 2 sector = readint() - 2 offset = self._src.read_byte() + 1 delta = offset + 0x100 * sector available = self._dst.tell() if delta not in range(available + 1): raise RefineryPartialResult( F'Requested rewind by 0x{delta:08X} bytes with only 0x{available:08X} bytes in output buffer.', partial=self._dst.getvalue()) quotient, remainder = divmod(length, delta) replay = memoryview(self._dst.getbuffer()) replay = bytes(replay[-delta:] if quotient else replay[-delta:length - delta]) replay = quotient * replay + replay[:remainder] self._dst.write(replay) decompressed += length else: self._dst.write(self._src.read_exactly(1)) decompressed += 1 except EOFError as E: raise RefineryPartialResult(str(E), partial=self._dst.getbuffer()) dst = self._dst.getbuffer() if decompressed < size: raise RefineryPartialResult( F'Attempted to decompress {size} bytes, got only {len(dst)}.', dst) if decompressed > size: raise RuntimeError('Decompressed buffer contained more bytes than expected.') return dst def _compress(self): from refinery.lib.suffixtree import SuffixTree try: self.log_info('computing suffix tree') tree = SuffixTree(self._src.getbuffer()) except Exception: raise bitstore = 0 # The bit stream to be written bitcount = 0 # The number of bits in the bit stream buffer = MemoryFile(bytearray()) # Write empty header and first byte of source self._dst.write(bytearray(24)) self._dst.write(self._src.read_exactly(1)) def writeint(n: int) -> None: """ Write an integer to the bit stream. """ nonlocal bitstore, bitcount nbits = n.bit_length() if nbits < 2: raise ValueError # The highest bit is implicitly assumed: n ^= 1 << (nbits - 1) remaining = nbits - 2 while remaining: remaining -= 1 bitstore <<= 2 bitcount += 2 bitstore |= ((n >> remaining) & 3) | 1 bitstore <<= 2 bitcount += 2 bitstore |= (n & 1) << 1 src = self._src.getbuffer() remaining = len(src) - 1 self.log_info('compressing data') while True: cursor = len(src) - remaining rest = src[cursor:] if bitcount >= 0x10: block_count, bitcount = divmod(bitcount, 0x10) info_channel = bitstore >> bitcount bitstore = info_channel << bitcount ^ bitstore # The decompressor will read bits from top to bottom, and each 16 bit block has to be # little-endian encoded. The bit stream is encoded top to bottom bit in the bitstore # variable, and by encoding it as a big endian integer, the stream is in the correct # order. However, we need to swap adjacent bytes to achieve little endian encoding for # each of the blocks: info_channel = bytearray(info_channel.to_bytes(block_count * 2, 'big')) for k in range(block_count): k0 = 2 * k + 0 k1 = 2 * k + 1 info_channel[k0], info_channel[k1] = info_channel[k1], info_channel[k0] info_channel = memoryview(info_channel) data_channel = memoryview(buffer.getbuffer()) self._dst.write(info_channel[:2]) self._dst.write(data_channel[:-1]) self._dst.write(info_channel[2:]) data_channel = bytes(data_channel[-1:]) buffer.truncate(0) store = buffer if bitcount else self._dst store.write(data_channel) if remaining + bitcount < 0x10: buffer = buffer.getbuffer() if rest or buffer: bitstore <<= 0x10 - bitcount self._dst.write(bitstore.to_bytes(2, 'little')) self._dst.write(buffer) self._dst.write(rest) elif bitcount: raise RuntimeError('Bitbuffer Overflow') break node = tree.root length = 0 offset = 0 sector = None while node.children and length < len(rest): for child in node.children.values(): if tree.data[child.start] == rest[length]: node = child break if node.start >= cursor: break offset = node.start - length length = node.end + 1 - offset length = min(remaining, length) if length >= 4: sector, offset = divmod(cursor - offset - 1, 0x100) bitcount += 1 bitstore <<= 1 if sector is None: buffer.write(rest[:1]) remaining -= 1 continue bitstore |= 1 buffer.write(bytes((offset,))) writeint(length - 2) writeint(sector + 2) remaining -= length self._dst.seek(24) dst = self._dst.peek() self._dst.seek(0) self._dst.write(struct.pack('>6L', 0x626C7A1A, 1, len(dst), zlib.crc32(dst), len(src), zlib.crc32(src))) return self._dst.getbuffer() def process(self, data): self._begin(data) partial = None try: return self._decompress() except ValueError as error: if isinstance(error, RefineryPartialResult): partial = error self.log_warn(F'Reverting to modified BriefLZ after decompression error: {error!s}') self._reset() try: return self._decompress_modded() except RefineryPartialResult: raise except Exception as error: if not partial: raise raise partial from error def reverse(self, data): return self._begin(data)._compress()
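A round-trip sketch (compression via the generic -R reverse flag; as noted above, compression can be slow for large inputs):
>>> from refinery.shell import *
>>> data = 'binary refinery! ' * 10
>>> emit(data) | blz('-R') | blz() | str == data
True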
class brotli
-
Brotli compression and decompression.
class brotli(Unit): """ Brotli compression and decompression. """ @Unit.Requires('brotlipy', 'all') def _brotli(): import brotli return brotli def process(self, data): return self._brotli.decompress(bytes(data)) def reverse(self, data): return self._brotli.compress(bytes(data))
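A round-trip sketch (this assumes the optional brotli dependency is installed):
>>> from refinery.shell import *
>>> emit('binary refinery') | brotli('-R') | brotli() | str
'binary refinery'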
class bruteforce (name, length=slice(1, None, None), format=None, alphabet=None, pattern=None, printable=False, digits=False, identifier=False, letters=False)
-
Generates all possible combinations of letters in a given alphabet. For each generated string, one copy of each input chunk is generated and populated with a meta variable containing that string. This can be used for simple brute forcing checks.
class bruteforce(Unit): """ Generates all possible combinations of letters in a given alphabet. For each generated string, one copy of each input chunk is generated and populated with a meta variable containing that string. This can be used for simple brute forcing checks. """ def __init__( self, name : Arg.String(help='Name of the meta variable to be populated.'), length: Arg.Bounds(metavar='length', help=( 'Specifies the range of characters to brute force, default is {default}.' )) = slice(1, None), format: Arg.String(help=( 'Optional format expression for the output string. The format sequence "{0}" is the ' 'current brute force string, the sequence "{1}" represents the input data.' )) = None, alphabet : Arg.Binary('-a', group='ALPH', help=( 'The alphabet from which to choose the letters. Entire byte range by default.' )) = None, pattern : Arg.RegExp('-r', group='ALPH', help='Provide a regular expression pattern to define the alphabet.') = None, printable : Arg.Switch('-p', group='ALPH', help='Equivalent to --pattern=[\\s\\x20-\\x7E]') = False, digits : Arg.Switch('-d', group='ALPH', help='Equivalent to --pattern=\\d') = False, identifier: Arg.Switch('-i', group='ALPH', help='Equivalent to --pattern=\\w') = False, letters : Arg.Switch('-l', group='ALPH', help='Equivalent to --pattern=[a-zA-Z]') = False, ): options = sum(1 for x in [printable, digits, identifier, letters] if x) if options > 1 or options and pattern: raise ValueError('Invalid selection.') if printable: pattern = b'[\\s\\x20-\\x7E]' if digits: pattern = b'\\d' if identifier: pattern = b'\\w' if letters: pattern = b'[a-zA-Z]' super().__init__( name=name, length=length, format=format, alphabet=alphabet, pattern=pattern, ) def _alphabet(self) -> bytes: alphabet = self.args.alphabet if alphabet: return alphabet alphabet = bytes(range(0x100)) pattern = self.args.pattern if not pattern: return alphabet alphabet = B''.join(re.findall(pattern, alphabet, flags=re.DOTALL)) if alphabet: return alphabet raise ValueError(F'Invalid regular expression: {pattern}') def process(self, data: bytearray): format_spec: str = self.args.format meta = metavars(data) name = self.args.name kwargs = {name: None} for length in integers_of_slice(self.args.length): self.log_info(F'generating {length} digits') if not isinstance(length, int) or length < 0: raise ValueError(F'Unable to brute force {length} characters.') for string in itertools.product(self._alphabet(), repeat=length): string = bytes(string) if format_spec: string = meta.format_bin(format_spec, self.codec, [string, data]) kwargs[name] = string yield self.labelled(data, **kwargs)
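A generation sketch: brute force a single digit into the variable k and prepend it to each output copy via the var:k multibin handler:
>>> from refinery.shell import *
>>> b''.join(b'?' | bruteforce('k', '1:2', '-d') | ccp('var:k'))
b'0?1?2?3?4?5?6?7?8?9?'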
class byteswap (size=4)
-
Reverses the order of bytes in each block. Excess bytes that are not an integer multiple of the block size are discarded.
class byteswap(UnaryOperation): """ Reverses the order of bytes in each block. Excess bytes that are not an integer multiple of the block size are discarded. """ def __init__(self, size: Arg.Number(help='the block size in bytes; the default is {default}.') = 4): super().__init__(blocksize=size, _truncate=2) def inplace(self, block: ndarray) -> None: block.byteswap(True) operate = NotImplemented def process(self, data): try: return self._fastblock(data) except FastBlockError: b = self.blocksize n = len(data) m = n - n % b v = memoryview(data) if b == 1: self.log_warn('running this unit with a block size of 1 does not have any effect') return data for k in range(0, m, b): _end = k and k - 1 or None data[k : k + b] = v[k + b - 1:_end:-1] if m < n: del v del data[m:] return data
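A sketch with the default block size of 4:
>>> from refinery.shell import *
>>> emit('ABCDEFGH') | byteswap() | str
'DCBAHGFE'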
class bz2 (level=9)
-
BZip2 compression and decompression.
class bz2(Unit): """ BZip2 compression and decompression. """ def __init__(self, level: Arg('-l', type=number[1:9], help='compression level preset between 1 and 9') = 9): super().__init__(level=level) def process(self, data): return bz2_.decompress(data) def reverse(self, data): return bz2_.compress(data, self.args.level) @classmethod def handles(self, data: bytearray): return data[:3] == B'BZh'
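A round-trip sketch:
>>> from refinery.shell import *
>>> emit('binary refinery') | bz2('-R') | bz2() | str
'binary refinery'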
class camellia (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)
-
Camellia encryption and decryption.
class camellia(StandardBlockCipherUnit, cipher=BlockCipherFactory(Camellia)): """ Camellia encryption and decryption. """ pass
class carve (format, unique=False, decode=False, single=False, min=1, max=None, len=None, stripspace=False, longest=False, take=None, utf16=True, ascii=True)
-
Extracts patches of data in particular formats from the input.
class carve(PatternExtractor): """ Extracts patches of data in particular formats from the input. """ def __init__( self, format: Arg.Choice(choices=[p.display for p in formats], metavar='format', help='Specify one of the following formats: {choices}'), unique: Arg.Switch('-q', help='Yield every match only once.') = False, decode: Arg.Switch('-d', help='Automatically decode known patterns.') = False, single: Arg.Switch('-s', help='Only get the biggest match; equivalent to -qlt1') = False, min=1, max=None, len=None, stripspace=False, longest=False, take=None, utf16=True, ascii=True ): if single: take = 1 longest = True unique = True super().__init__( min=min, max=max, len=len, stripspace=stripspace, duplicates=not unique, longest=longest, take=take, ascii=ascii, utf16=utf16, format=formats.from_dashname(format) ) if not decode: decoder = NotImplemented elif self.args.format in (formats.multiline_string, formats.string): from ..encoding.esc import esc decoder = esc(unicode=True, quoted=True) elif self.args.format is formats.integer: from ..encoding.base import base decoder = base() elif self.args.format in (formats.uppercase_hex, formats.spaced_hex, formats.hex): from ..encoding.hex import hex decoder = hex() elif self.args.format is formats.hexdump: from ..formats.hexload import hexload decoder = hexload() elif self.args.format is formats.intarray: from ..blockwise.pack import pack decoder = pack() elif self.args.format in (formats.b64, formats.b64any, formats.spaced_b64): from ..encoding.b64 import b64 decoder = b64() elif self.args.format in (formats.b85, formats.spaced_b85): from ..encoding.b85 import b85 decoder = b85() elif self.args.format is formats.b64url: from ..encoding.b64 import b64 decoder = b64(urlsafe=True) elif self.args.format is formats.b32: from ..encoding.b32 import b32 decoder = b32() elif self.args.format is formats.ps1str: from ..encoding.ps1str import ps1str decoder = ps1str() elif self.args.format is formats.vbastr: from ..encoding.ps1str import ps1str decoder = ps1str() elif self.args.format is formats.hexarray: from ..blockwise.pack import pack decoder = pack(0x10) elif self.args.format is formats.wshenc: from ..encoding.wshenc import wshenc decoder = wshenc() elif self.args.format is formats.uuencode: from ..encoding.uuenc import uuenc decoder = uuenc() elif self.args.format in ( formats.urlquote, formats.urlquote_coarse, formats.urlquote_narrow, ): from ..encoding.url import url decoder = url() else: decoder = NotImplemented self.decoder = decoder def process(self, data): it = iter(self.matches_filtered(memoryview(data), self.args.format.value.bin_compiled)) if self.decoder is NotImplemented: yield from it for chunk in it: try: yield self.decoder(chunk) except Exception as E: self.log_info(F'decoder failure: {E!s}')
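A sketch that carves the longest base64-looking match and decodes it (-d decode, -s single):
>>> from refinery.shell import *
>>> emit('junk SGVsbG8gV29ybGQ= junk') | carve('b64', '-ds') | str
'Hello World'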
class carve_7z
-
Extracts anything from the input data that looks like a 7zip archive file.
class carve_7z(Unit): """ Extracts anything from the input data that looks like a 7zip archive file. """ @Unit.Requires('py7zr', 'arc', 'default', 'extended') def _py7zr(): import py7zr return py7zr HEADER_SIGNATURE = B'7z\xBC\xAF\x27\x1C' def process(self, data: bytearray): cursor = 0 mv = memoryview(data) while True: start = data.find(self.HEADER_SIGNATURE, cursor) if start < cursor: break self.log_debug(F'found header at offset: 0x{start:08X}') try: mf = MemoryFileRecorder(mv[start:]) self.log_debug('attempting to read archive') archive = self._py7zr.SevenZipFile(mf) self.log_debug('attempting to test archive') success = archive.test() is not False except ImportError: raise except Exception as error: self.log_debug('parsing archive failed:', error) success = False if success: self.log_info(F'identified archive of size 0x{mf.max_cursor:08X} at offset 0x{start:08X}') cursor = start + mf.max_cursor yield self.labelled(mv[start:cursor], offset=start) else: cursor = start + 5
class carve_json (dictonly=False)
-
Extracts anything from the input data that looks like JSON.
class carve_json(Unit): """ Extracts anything from the input data that looks like JSON. """ def __init__(self, dictonly: Arg.Switch('-d', help='only extract JSON dictionaries, do not extract lists.') = False): super().__init__(dictonly=dictonly) def process(self, data): for start, chunk in JSONCarver(data, dictonly=self.args.dictonly): yield self.labelled(chunk, offset=start)
class carve_lnk
-
Extracts anything from the input data that looks like a Windows shortcut (i.e. an LNK file)
class carve_lnk(Unit): """ Extracts anything from the input data that looks like a Windows shortcut (i.e. an LNK file) """ @Unit.Requires('LnkParse3>=1.4.0', 'formats', 'extended') def _LnkParse3(): import LnkParse3 import LnkParse3.extra_factory return LnkParse3 def process(self, data: bytearray): pos = 0 mem = memoryview(data) sig = B'\x4C\x00\x00\x00\x01\x14\x02\x00' lnk = self._LnkParse3 while True: pos = data.find(sig, pos) if pos < 0: break try: parsed = lnk.lnk_file(indata=mem[pos:]) except Exception: pos += 1 continue end = pos + parsed.header.size() + parsed.string_data.size() if parsed.has_target_id_list(): end += parsed.targets.size() if parsed.has_link_info() and not parsed.force_no_link_info(): with suppress(AttributeError): end += parsed.info.size() with NoLogging(): while end < len(mem): extra = lnk.extra_factory.ExtraFactory(mem[end:]) try: ec = extra.extra_class() except Exception: break if ec is None: break if 'UNKNOWN' in ec().name(): break end += extra.item_size() terminal_block = mem[end:end + 4] if terminal_block != B'\0\0\0\0': self.log_warn(F'detected LNK at offset 0x{pos:X}, but size calculation did not end on a terminal block') continue else: end += 4 yield self.labelled(mem[pos:end], offset=pos) pos = end
class carve_pe (*paths, list=False, join_path=False, drop_path=False, path=b'name', recursive=False, keep_root=False, memdump=False, fileinfo=False)
-
Extracts anything from the input data that looks like a Portable Executable (PE) file.
class carve_pe(PathExtractorUnit): """ Extracts anything from the input data that looks like a Portable Executable (PE) file. """ def __init__( self, *paths, list=False, join_path=False, drop_path=False, path=b'name', recursive: Arg.Switch('-r', help='Extract PE files that are contained in already extracted PEs.') = False, keep_root: Arg.Switch('-k', help='If the input chunk is itself a PE, include it as an output chunk.') = False, memdump : Arg.Switch('-m', help='Use the virtual memory layout of a PE file to calculate its size.') = False, fileinfo : Arg.Switch('-f', help='Use the PE meta information to deduce a file name meta variable.') = False ): super().__init__( *paths, list=list, join_path=join_path, drop_path=drop_path, path=path, recursive=recursive, keep_root=keep_root, memdump=memdump, fileinfo=fileinfo, ) def unpack(self, data): cursor = 0 mv = memoryview(data) while True: offset = data.find(B'MZ', cursor) if offset < cursor: break cursor = offset + 2 ntoffset = mv[offset + 0x3C:offset + 0x3E] if len(ntoffset) < 2: return ntoffset, = unpack('H', ntoffset) if mv[offset + ntoffset:offset + ntoffset + 2] != B'PE': self.log_debug(F'invalid NT header signature for candidate at 0x{offset:08X}') continue try: pe = PE(data=data[offset:], fast_load=True) except PEFormatError as err: self.log_debug(F'parsing of PE header at 0x{offset:08X} failed:', err) continue pesize = get_pe_size(pe, memdump=self.args.memdump) pedata = mv[offset:offset + pesize] info = {} if self.args.fileinfo: pe_meta_parser = pemeta() try: info = pe_meta_parser.parse_version(pe) or {} except Exception as error: self.log_warn(F'Unable to obtain file information: {error!s}') try: info.update(pe_meta_parser.parse_header(pe) or {}) except Exception: pass try: path = info['OriginalFilename'] except KeyError: try: path = info['ExportName'] except KeyError: extension = 'exe' if pe.is_exe() else 'dll' if pe.is_dll() else 'sys' path = F'carve-0x{offset:08X}.{extension}' if offset > 0 or self.args.keep_root: yield UnpackResult(path, pedata, offset=offset) self.log_info(F'extracted PE file of size 0x{pesize:08X} from 0x{offset:08X}') else: self.log_info(F'ignored root file of size 0x{pesize:08X} from 0x{offset:08X}') continue if not offset or self.args.recursive: cursor += pe.OPTIONAL_HEADER.SizeOfHeaders else: cursor += pesize - 2
class carve_rtf
-
Extracts anything from the input data that looks like an RTF document.
class carve_rtf(Unit): """ Extracts anything from the input data that looks like an RTF document. """ def process(self, data: bytearray): pos = 0 mem = memoryview(data) sig = re.escape(b'{\\rtf') while True: match = re.search(sig, mem[pos:], flags=re.IGNORECASE) if match is None: break pos = pos + match.start() end = pos + 1 depth = 1 while depth and end < len(mem): if mem[end] == 0x7B: # { depth += 1 if mem[end] == 0x7D: # } depth -= 1 end += 1 if depth > 0: break yield self.labelled(mem[pos:end], offset=pos) pos = end
class carve_xml
-
Extracts anything from the input data that looks like XML.
class carve_xml(Unit): """ Extracts anything from the input data that looks like XML. """ def process(self, data): for offset, chunk in XMLCarver(data): yield self.labelled(chunk, offset=offset)
class carve_zip
-
Extracts anything from the input data that looks like a zip archive file.
class carve_zip(Unit): """ Extracts anything from the input data that looks like a zip archive file. """ def process(self, data: bytearray): end = len(data) mem = memoryview(data) rev = [] while True: end = data.rfind(ZipEndOfCentralDirectory.SIGNATURE, 0, end) if end < 0: break try: end_marker = ZipEndOfCentralDirectory(mem[end:]) except ValueError as e: self.log_info(F'error parsing end of central directory at 0x{end:X}: {e!s}') continue else: self.log_info(F'successfully parsed end of central directory at 0x{end:X}') start = end - end_marker.directory_size shift = start - end_marker.directory_offset if start < 0: self.log_debug('end of central directory size is invalid') continue try: central_directory = ZipCentralDirectory(mem[start:]) except ValueError: self.log_debug('computed location of central directory is invalid') end = end - len(ZipEndOfCentralDirectory.SIGNATURE) continue start = central_directory.header_offset + shift if mem[start:start + 4] not in (B'PK\x03\x04', B'\0\0\0\0'): # SFX payloads seem to have a nulled header, so we permit this. self.log_debug('computed start of ZIP archive does not have the correct signature bytes') continue rev.append((start, end + len(end_marker))) end = start for start, end in reversed(rev): zip = mem[start:end + len(end_marker)] yield self.labelled(zip, offset=start)
class cast (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)
-
CAST encryption and decryption.
class cast(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(CAST)): """ CAST encryption and decryption. """ pass
class cca (data)
-
Short for ConCatAppend: This unit concatenates the input data with its argument by appending the latter to the former. See also ccp for the unit that prepends instead.
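A minimal sketch in the shell interface:
>>> emit('refine') | cca('ry') | str
'refinery'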
class cca(Unit): """ Short for ConCatAppend: This unit concatenates the input data with its argument by appending the latter to the former. See also `refinery.ccp` for the unit that prepends instead. """ def __init__(self, data: Arg(help='Binary string to be appended to the input.')): super().__init__(data=data) def process(self, data: bytearray): data.extend(self.args.data) return data
class ccp (data)
-
Short for ConCatPrepend: This unit concatenates the input data with its argument by prepending the latter to the former. See also cca for the unit that appends instead.
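The mirror image of the cca example:
>>> emit('finery') | ccp('re') | str
'refinery'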
class ccp(Unit): """ Short for ConCatPrepend: This unit concatenates the input data with its argument by prepending the latter to the former. See also `refinery.cca` for the unit that appends instead. """ def __init__(self, data: Arg(help='Binary string to be prepended to the input.')): super().__init__(data=data) def process(self, data: bytearray): data[:0] = self.args.data return data
class cfmt (*formats, variable=None, separator=' ', multiplex=False, binary=False, unescape=False)
-
Stands for "Convert to ForMaT": Transform a given chunk by applying a format string operation. The positional format string placeholder
{}
will be replaced by the incoming data, named placeholders have to exist as meta variables in the current chunk. For example, the following pipeline can be used to print all files in a given directory with their corresponding SHA-256 hash:ef ** [| sha256 -t | cfmt {} {path} ]]
By default, format string arguments are simply joined along a space character to form a single format string.
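An illustrative doctest for the positional placeholder; the two format arguments are joined with the default space separator before formatting:
>>> emit('world') | cfmt('hello,', '{}!') | str
'hello, world!'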
class cfmt(Unit): """ Stands for "Convert to ForMaT": Transform a given chunk by applying a format string operation. The positional format string placeholder `{}` will be replaced by the incoming data, named placeholders have to exist as meta variables in the current chunk. For example, the following pipeline can be used to print all files in a given directory with their corresponding SHA-256 hash: ef ** [| sha256 -t | cfmt {} {path} ]] By default, format string arguments are simply joined along a space character to form a single format string. """ def __init__( self, *formats : Arg(help='Format strings.', type=str, metavar='format'), variable : Arg('-n', type=str, metavar='N', help='Store the formatted string in a meta variable.') = None, separator: Arg('-s', group='SEP', metavar='S', help='Separator to insert between format strings. The default is a space character.') = ' ', multiplex: Arg.Switch('-m', group='SEP', help='Do not join the format strings along the separator, generate one output for each.') = False, binary : Arg.Switch('-b', help='Use the binary formatter instead of the string formatter.') = False, unescape : Arg.Switch('-e', help='Interpret escape sequences in format strings.') = False, ): def fixfmt(fmt: bytes): if unescape: if isinstance(fmt, str): fmt = fmt.encode('latin1') return fmt.decode('unicode-escape') elif not isinstance(fmt, str): fmt = fmt.decode(self.codec) return fmt formats = [fixfmt(f) for f in formats] if not multiplex: formats = [fixfmt(separator).join(formats)] super().__init__(formats=formats, variable=variable, binary=binary) def process(self, data): meta = metavars(data) meta.ghost = True args = [data] variable = self.args.variable if self.args.binary: formatter = partial(meta.format_bin, codec=self.codec, args=args) else: def formatter(spec): return meta.format_str(spec, self.codec, args).encode(self.codec) for spec in self.args.formats: result = formatter(spec) if variable is not None: result = self.labelled(data, **{variable: result}) yield result
class chacha (key, stateful=False, discard=0, nonce=b'REFINERY', magic=b'', offset=0, rounds=20)
-
ChaCha encryption and decryption. The nonce must be 8 bytes long, as currently only the original Bernstein algorithm is implemented. When 64 bytes are provided as the key, this data is interpreted as the initial state box and all other parameters are ignored.
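Because the keystream is simply XORed into the data, applying the unit twice with identical parameters restores the input. A round-trip sketch with an arbitrary 32-byte key:
>>> key = 'thirty-two byte key for chacha!!'
>>> emit('BinaryRefinery') | chacha(key) | chacha(key) | str
'BinaryRefinery'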
class chacha(LatinCipherUnit): """ ChaCha encryption and decryption. The nonce must be 8 bytes long as currently, only the original Bernstein algorithm is implemented. When 64 bytes are provided as the key, this data is interpreted as the initial state box and all other parameters are ignored. """ def keystream(self) -> Iterable[int]: key = self.args.key if len(key) == 64: it = ChaChaCipher.FromState(key) else: it = ChaChaCipher( key, self.args.nonce, self.args.magic, self.args.rounds, self.args.offset, ) yield from it
class chacha20 (key, nonce=b'REFINERY')
-
ChaCha20 and XChaCha20 encryption and decryption. For ChaCha20, the IV (nonce) must be 8 or 12 bytes long; for XChaCha20, choose an IV which is 24 bytes long. Invoking this unit for ChaCha20 is functionally equivalent to chacha with 20 rounds, but this unit uses the PyCryptodome library C implementation rather than the pure Python implementation used by chacha.
class chacha20(LatinCipherStandardUnit, cipher=PyCryptoFactoryWrapper(ChaCha20)): """ ChaCha20 and XChaCha20 encryption and decryption. For ChaCha20, the IV (nonce) must be 8 or 12 bytes long; for XChaCha20, choose an IV which is 24 bytes long. Invoking this unit for ChaCha20 is functionally equivalent to `refinery.chacha` with 20 rounds, but this unit uses the PyCryptodome library C implementation rather than the pure Python implementation used by `refinery.chacha`. """ pass
class chacha20poly1305 (key, nonce=b'REFINERY')
-
ChaCha20-Poly1305 and XChaCha20-Poly1305 encryption and decryption. For the ChaCha20 variant, the nonce must be 8 or 12 bytes long; for XChaCha20, provide a 24-byte nonce instead.
class chacha20poly1305(LatinCipherStandardUnit, cipher=PyCryptoFactoryWrapper(ChaCha20_Poly1305)): """ ChaCha20-Poly1305 and XChaCha20-Poly1305 encryption and decryption. For the ChaCha20 variant, the nonce must be 8 or 12 bytes long; for XChaCha20, provide a 24 bytes nonce instead. """ def _get_cipher(self, reset_cache=False): cipher = super()._get_cipher(reset_cache) cipher.block_size = 1 return cipher
class chaskey (key, iv=b'', padding=None, mode=None, raw=False, rounds=12, swap=False, *, assoc_len=0, mac_len=0, segment_size=0, little_endian=False)
-
This implements a block cipher based on the Chaskey algorithm. No subkeys are computed and the default Chaskey operation is performed on all blocks. Notably, the Donut framework uses Chaskey with 16 rounds and in CTR mode.
class chaskey(StandardBlockCipherUnit, cipher=BlockCipherFactory(Chaskey)): """ This implements a block cipher based on the Chaskey algorithm. No subkeys are computed and the default Chaskey operation is performed on all blocks. Notably, the Donut framework uses Chaskey with 16 rounds and in CTR mode. """ def __init__( self, key, iv=b'', padding=None, mode=None, raw=False, rounds: Arg.Number('-k', help='Number of rounds to use, the default is {default}') = _R, swap: Arg.Switch('-s', help='Use big endian byte order for all blocks.') = False, **more ): super().__init__(key, iv, padding=padding, mode=mode, raw=raw, rounds=rounds, swap=swap, **more) def _new_cipher(self, **optionals) -> CipherInterface: return super()._new_cipher( swap=self.args.swap, rounds=self.args.rounds, **optionals )
class chop (size, step=None, truncate=False)
-
Reinterprets the input as a sequence of equally sized chunks and outputs this sequence.
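An illustrative sketch that chops a chunk inside a frame and rejoins the pieces with the sep unit (assuming sep's default behavior of interspersing its separator between chunks):
>>> emit('refineryrefinery') [ chop(8) | sep('-') ] | str
'refinery-refinery'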
class chop(Unit): """ Reinterprets the input as a sequence of equally sized chunks and outputs this sequence. """ def __init__( self, size: Arg.Number('size', help='Chop data into chunks of this size'), step: Arg.Number('step', help=( 'Optionally specify a step size (which is equal to the size by default) which indicates the number of bytes by ' 'which the cursor will be increased after extracting a chunk.')) = None, truncate: Arg.Switch('-t', help=( 'Truncate possible excess bytes at the end of the input, by default they are appended as a single chunk.')) = False, ): return super().__init__(size=size, step=step, truncate=truncate) def process(self, data): view = memoryview(data) size = self.args.size step = self.args.step if size < 1: raise ValueError('The chunk size has to be a positive integer value.') yield from splitchunks(view, size, step, self.args.truncate)
class clower
-
Stands for "Convert to LOWER case"; The unit simply converts all latin alphabet chacters in the input to lowercase.
class clower(Unit): """ Stands for "Convert to LOWER case"; The unit simply converts all latin alphabet characters in the input to lowercase. """ def process(self, data): return data.lower()
class cm (invert=False, all=False, reset=False, size=False, ext=False, entropy=False, ic=False, magic=False, sha1=False, sha256=False, crc32=False, md5=False, hashes=False, *names)
-
The Common Meta variables unit populates the set of meta variables of the current chunk with commonly used metadata. The unit has no effect outside a frame.
class cm(Unit): """ The Common Meta variables unit populates the set of meta variables of the current chunk with commonly used metadata. The unit has no effect outside a frame. """ def __init__( self, invert : Arg.Switch('-x', group='ALL', help='populate only options that have not been specified') = False, all : Arg.Switch('-a', group='ALL', help='populate all options') = False, reset : Arg.Switch('-r', help='discard all meta variables that were not explicitly specified') = False, size : Arg.Switch('-S', help='size of the chunk') = False, ext : Arg.Switch('-X', help='guess file extension') = False, entropy : Arg.Switch('-E', help='compute data entropy') = False, ic : Arg.Switch('-C', help='compute the index of coincidence') = False, magic : Arg.Switch('-M', help='compute file magic') = False, sha1 : Arg.Switch('-1', help='compute hash: SHA-1') = False, sha256 : Arg.Switch('-2', help='compute hash: SHA-256') = False, crc32 : Arg.Switch('-3', help='compute hash: CRC32') = False, md5 : Arg.Switch('-5', help='compute hash: MD5') = False, hashes : Arg.Switch('-H', help='compute all common hashes') = False, *names : Arg(metavar='name', help=( F'A variable name that can include the common properties: {_COMMON_PROPERTIES_LIST}.' R' If none is given, the size variable is populated. For most of these, an optional ' R'argument is available that can be used as a shorthand:')) ): def stringify(name): if isinstance(name, str): return name return name.decode(self.codec) names = {stringify(name) for name in names} if hashes: md5 = sha256 = sha1 = crc32 = True if size: names.add('size') if ext: names.add('ext') if entropy: names.add('entropy') if ic: names.add('ic') if magic: names.add('magic') if sha1: names.add('sha1') if sha256: names.add('sha256') if crc32: names.add('crc32') if md5: names.add('md5') if not names and not reset: names.add('size') if all: if invert: raise ValueError('invert and all are both enabled, resulting in empty configuration.') names = set(LazyMetaOracle.derivations) elif invert: names = set(LazyMetaOracle.derivations) - names super().__init__(names=names, reset=reset) def process(self, data): return data def filter(self, chunks): names = self.args.names reset = self.args.reset for chunk in chunks: chunk: Chunk if not chunk.visible: yield chunk continue meta = metavars(chunk) if reset: chunk.meta.clear() for name in names: chunk[name] = meta[name] yield chunk
class couple (*commandline, buffer=False, noerror=False, timeout=0.0)
-
Turns any command into a refinery unit. Data is processed by feeding it to the standard input of a process spawned from the given command line, and then reading the standard output of that process as the result of the operation. The main purpose of this unit is to allow using the syntax from refinery.lib.frame with other command line tools. By default, the couple unit streams the output from the executed command as individual outputs, but the buffer option can be set to buffer all output of a single execution. The format string expression {} or {0} can be used as one of the arguments passed to the external command to represent the incoming data. In this case, the data will not be sent to the standard input device of the new process.
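For example, on a system where the POSIX tr utility is available (the choice of command is arbitrary and only serves as illustration):
>>> emit('hello world') | couple('tr', 'a-z', 'A-Z') | str
'HELLO WORLD'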
class couple(Unit): """ Turns any command into a refinery unit. Data is processed by feeding it to the standard input of a process spawned from the given command line, and then reading the standard output of that process as the result of the operation. The main purpose of this unit is to allow using the syntax from `refinery.lib.frame` with other command line tools. By default, the `refinery.couple` unit streams the output from the executed command as individual outputs, but the `buffer` option can be set to buffer all output of a single execution. The format string expression `{}` or `{0}` can be used as one of the arguments passed to the external command to represent the incoming data. In this case, the data will not be sent to the standard input device of the new process. """ _JOIN_TIME = 0.1 def __init__( self, *commandline : Arg(nargs='...', type=str, metavar='(all remaining)', help=( 'All remaining command line tokens form an arbitrary command line to be executed. Use format string syntax ' 'to insert meta variables and incoming data chunks.')), buffer: Arg.Switch('-b', help='Buffer the command output for one execution rather than streaming it.') = False, noerror: Arg('-e', help='do not merge stdin and stderr; stderr will only be output if -v is also specified.') = False, timeout: Arg('-t', metavar='T', help='Set an execution timeout as a floating point number in seconds, there is none by default.') = 0.0 ): if not commandline: raise ValueError('you need to provide a command line.') super().__init__(commandline=commandline, noerror=noerror, buffer=buffer, timeout=timeout) def process(self, data): def shlexjoin(): import shlex return ' '.join(shlex.quote(cmd) for cmd in commandline) meta = metavars(data) meta.ghost = True used = set() commandline = [ meta.format(cmd, self.codec, [data], None, False, used=used) for cmd in self.args.commandline ] if 0 in used: self.log_info('input used as command-line argument; sending no input to process stdin') data = None self.log_debug(shlexjoin) posix = 'posix' in sys.builtin_module_names process = Popen(commandline, stdin=PIPE, stdout=PIPE, stderr=PIPE, shell=False, close_fds=posix) if self.args.buffer and not self.args.timeout: out, err = process.communicate(data) for line in err.splitlines(): self.log_info(line) yield out return import io from threading import Thread, Event from queue import Queue, Empty from time import process_time, sleep start = 0 result = None qerr = Queue() qout = Queue() done = Event() def adapter(stream, queue: Queue, event: Event): while not event.is_set(): out = stream.read1() if out: queue.put(out) else: break stream.close() recvout = Thread(target=adapter, args=(process.stdout, qout, done), daemon=True) recverr = Thread(target=adapter, args=(process.stderr, qerr, done), daemon=True) recvout.start() recverr.start() if data: process.stdin.write(data) process.stdin.close() start = process_time() if self.args.buffer or self.args.timeout: result = io.BytesIO() def queue_read(q: Queue): try: return q.get_nowait() except Empty: return None errbuf = io.BytesIO() while True: out = queue_read(qout) err = None if self.args.noerror: err = queue_read(qerr) else: out = out or queue_read(qerr) if err and self.log_info(): errbuf.write(err) errbuf.seek(0) lines = errbuf.readlines() errbuf.seek(0) errbuf.truncate() if lines: if not (done.is_set() or lines[~0].endswith(B'\n')): errbuf.write(lines.pop()) for line in lines: msg = line.rstrip(B'\n') if msg: self.log_info(msg) if out: if self.args.buffer or self.args.timeout: 
result.write(out) if not self.args.buffer: yield out if done.is_set(): if recverr.is_alive(): self.log_warn('stderr receiver thread zombied') if recvout.is_alive(): self.log_warn('stdout receiver thread zombied') break elif not err and not out and process.poll() is not None: recverr.join(self._JOIN_TIME) recvout.join(self._JOIN_TIME) done.set() elif self.args.timeout: if process_time() - start > self.args.timeout: self.log_info('terminating process after timeout expired') done.set() process.terminate() for wait in range(4): if process.poll() is not None: break sleep(self._JOIN_TIME) else: self.log_warn('process termination may have failed') recverr.join(self._JOIN_TIME) recvout.join(self._JOIN_TIME) if not len(result.getbuffer()): result = RuntimeError('timeout reached, process had no output') else: result = RefineryPartialResult( 'timeout reached, returning all collected output', partial=result.getvalue()) if isinstance(result, Exception): raise result elif self.args.buffer: yield result.getvalue()
class cp1252
-
Encodes and decodes Windows CP 1252 (aka Latin1) encoded string data.
class cp1252(Unit): """ Encodes and decodes Windows CP 1252 (aka Latin1) encoded string data. """ def process(self, data): return data.decode(self.codec).encode('cp1252') def reverse(self, data): return data.decode('cp1252').encode(self.codec)
class crc32 (text=False)
-
Returns the CRC32 Hash of the input data.
class crc32(HashUnit): """ Returns the CRC32 Hash of the input data. """ def _algorithm(self, data: bytes) -> bytes: return struct.pack('>I', zlib.crc32(data))
class csb (format, utf16=True, ascii=True)
-
Short for carve single buffer; carves the single largest buffer of a given format from the input data and returns it.
class csb(carve): """ Short for carve single buffer; carves the single largest buffer of a given format from the input data and returns it. """ def __init__(self, format, utf16=True, ascii=True): super().__init__( format, decode=False, single=True, utf16=utf16, ascii=ascii, )
class csd (format, utf16=True, ascii=True)
-
Short for carve & decode; carves the single largest buffer of a given format from the input and decodes it with the appropriate decoder.
class csd(carve): """ Short for carve & decode; carves the single largest buffer of a given format from the input and decodes it with the appropriate decoder. """ def __init__(self, format, utf16=True, ascii=True): super().__init__( format, decode=True, single=True, utf16=utf16, ascii=ascii, )
class csv (quote=b'"', delim=b',')
-
Extracts the rows of a CSV document with header and converts them into JSON chunks.
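A minimal sketch; note that purely numeric fields are converted to integers in the resulting JSON:
>>> print(emit('a,b\n1,2') | csv() | str)
{
    "a": 1,
    "b": 2
}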
class csv(Unit): """ Extracts the rows of a CSV document with header and converts them into JSON chunks. """ def __init__( self, quote: Unit.Arg('-q', help='Specify the quote character, the default is a double quote.') = B'"', delim: Unit.Arg('-d', help='Specify the delimiter, the default is a single comma.') = B',' ): super().__init__(quote=quote, delim=delim) def json_to_csv(self, table: dict): quote = self.args.quote.decode(self.codec) delim = self.args.delim.decode(self.codec) if not isinstance(table, list): raise ValueError('Input must be a JSON list.') out = MemoryFile() with io.TextIOWrapper(out, self.codec, newline='') as stream: writer = _csv.writer(stream, quotechar=quote, delimiter=delim, skipinitialspace=True) for row in table: if not isinstance(row, list): break if not all(isinstance(item, str) for item in row): break writer.writerow(row) else: return out.getvalue() keys = {} # A dictionary is used here over a set because dictionaries remember insertion order. # When feeding the unit a sequence of JSON objects, the user would likely expect the # column order in the resulting CSV to derive from the entry oder in the JSON data. for row in table: for key in row: if not isinstance(key, str): continue keys[key] = None keys = list(keys) out = MemoryFile() with io.TextIOWrapper(out, self.codec, newline='') as stream: writer = _csv.writer(stream, quotechar=quote, delimiter=delim, skipinitialspace=True) writer.writerow(keys) for row in table: writer.writerow([str(row.get(key, '')) for key in keys]) return out.getvalue() def reverse(self, data: bytearray): try: table: List[Dict[str, Any]] = json.loads(data) except Exception: table: List[Dict[str, Any]] = [json.loads(line) for line in data.splitlines()] return self.json_to_csv(table) def process(self, data): quote = self.args.quote.decode(self.codec) delim = self.args.delim.decode(self.codec) def convert(field: str): if field.isdigit() and not field.startswith('0'): return int(field) date = isodate(field) if date is not None: return date.isoformat(' ', 'seconds') return field with io.TextIOWrapper(MemoryFile(data), self.codec) as stream: rows = _csv.reader(stream, quotechar=quote, delimiter=delim, skipinitialspace=True) keys = next(rows) for row in rows: out = {key: convert(value) for key, value in zip(keys, row)} yield json.dumps(out, indent=4).encode(self.codec)
class cswap
-
Swap the case of the input string; all lowercase letters are turned into their uppercase variant and vice-versa.
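For example:
>>> emit('Binary Refinery') | cswap() | str
'bINARY rEFINERY'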
class cswap(Unit): """ Swap the case of the input string; all lowercase letters are turned into their uppercase variant and vice-versa. """ def process(self, data: bytearray): lcase = bytes(range(B'a'[0], B'z'[0] + 1)) ucase = bytes(range(B'A'[0], B'Z'[0] + 1)) delta = lcase[0] - ucase[0] for k, letter in enumerate(data): if letter in ucase: data[k] += delta elif letter in lcase: data[k] -= delta return data
class cupper
-
Stands for "Convert to UPPER case"; The unit simply converts all latin alphabet chacters in the input to uppercase.
class cupper(Unit): """ Stands for "Convert to UPPER case"; The unit simply converts all latin alphabet characters in the input to uppercase. """ def process(self, data): return data.upper()
class datefix (format='%Y-%m-%d %H:%M:%S', dos=False)
-
Parses all kinds of date formats and unifies them into the same format.
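A sketch, assuming the 13-digit input is interpreted as a millisecond UNIX timestamp and converted in UTC:
>>> emit('1458016535000') | datefix() | str
'2016-03-15 04:35:35'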
class datefix(Unit): """ Parses all kinds of date formats and unifies them into the same format. """ _FORMATS = [ '%B %dth %Y %H:%M:%S (UTC)', # November 27th 2019 17:37:02 (UTC) '%B %dnd %Y %H:%M:%S (UTC)', # November 22nd 2019 17:37:02 (UTC) '%B %dst %Y %H:%M:%S (UTC)', # November 21st 2019 17:37:02 (UTC) '%Y-%m-%dT%H:%M:%S', # 2010-03-15T06:27:50 '%Y-%m-%d %H:%M:%S', # iso (2010-03-15 06:27:50.000000) '%Y-%m-%d %H:%M:%SZ%f', '%Y-%m-%dT%H:%M:%S.%f', '%Y-%m-%dT%H:%M:%SZ%f', '%a %b %d %Y %H:%M:%S', # Thu Apr 24 2014 12:32:21 '%m/%d/%Y %H:%M:%S', '%m/%d/%Y', ] _TIMEZONE_REGEXES = [re_compile(p) for p in [ R'([+-])(\d{2})(\d{2})$', # Thu Apr 24 2014 12:32:21 GMT-0700 R'([+-])(\d{2}):(\d{2})$', # 2017:09:11 23:47:22+02:00 R'GMT([+-])(\d{2})(\d{2}) \(.+\)$' # Thu Apr 24 2014 12:32:21 GMT-0700 (PDT) ]] def __init__( self, format: Arg(help='Specify the output format as a strftime-like string, using ISO by default.') = '%Y-%m-%d %H:%M:%S', dos: Arg('-d', help='Parse timestamps in DOS rather than Unix format.') = False ): super().__init__(format=format, dos=dos) @staticmethod def dostime(stamp: int) -> datetime: """ Parses a given DOS timestamp into a datetime object. """ d, t = stamp >> 16, stamp & 0xFFFF s = (t & 0x1F) << 1 return datetime( year = ((d & 0xFE00) >> 0x9) + 1980, # noqa month = ((d & 0x01E0) >> 0x5), # noqa day = ((d & 0x001F) >> 0x0), # noqa hour = ((t & 0xF800) >> 0xB), # noqa minute = ((t & 0x07E0) >> 0x5), # noqa second = 59 if s == 60 else s, # noqa ) def _format(self, dt: datetime) -> str: return dt.strftime(self.args.format) def _extract_timezone(self, data): for r in self._TIMEZONE_REGEXES: m = r.search(data) if not m: continue pm = m[1] td = timedelta( hours=int(m[2]), minutes=int(m[3])) if pm == '-': td = -td return data[:-len(m[0])].strip(), td return data, None @linewise def process(self, data: str) -> str: data = data.strip() # replace colons (i.e. for exiftool dates: 2017:01:01) if len(data) > 10 and data[4] == ':' and data[7] == ':': data = F'{data[0:4]}-{data[5:7]}-{data[8:]}' # strips Z at end (i.e. 20171022055144Z) if data.endswith('Z'): data = data[:-1] if data.startswith('0x'): try: data = str(int(data, 16)) except Exception: pass # parses timestamps and dates without much format if data.isdigit(): time_stamp = int(data) if len(data) > 14: raise Exception('cannot parse all-numeric string as date: %s' % data) elif len(data) == 14: # i.e. 20111020193727 return self._format(datetime.strptime(data, '%Y%m%d%H%M%S')) elif len(data) == 13: # i.e. 1458016535000 time_stamp //= 1000 data = data[:-3] if self.args.dos: return self._format(self.dostime(time_stamp)) else: return self._format(date_from_timestamp(time_stamp)) data, time_delta = self._extract_timezone(data) for f in self._FORMATS: try: dt = datetime.strptime(data, f) except ValueError: continue return self._format(dt if time_delta is None else dt - time_delta) return data
class decompress (prepend=True, tolerance=12, max_ratio=1.0, min_ratio=0.0001, strict_limits=False)
-
Attempts all available decompression units against the input and returns the output of the first successful one. If none succeeds, the data is returned unaltered. The process is heavily biased against LZNT1 decompression due to a large tendency for LZNT1 false positives.
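A round-trip sketch that first compresses with zl in reverse mode; since zlib data carries a known magic signature, the ratio heuristics do not apply here:
>>> emit('binary refinery') | zl('-R') | decompress() | str
'binary refinery'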
class decompress(Unit): """ Attempts all available decompression units against the input and returns the output of the first successful one. If none succeeds, the data is returned unaltered. The process is heavily biased against LZNT1 decompression due to a large tendency for LZNT1 false positives. """ def __init__( self, prepend: Arg.Switch('-P', '--no-prepend', off=True, help=( 'By default, if decompression fails, the unit attempts to prefix ' 'the data with all possible values of a single byte and decompress ' 'the result. This behavior can be disabled with this flag.') ) = True, tolerance: Arg.Number('-t', help=( 'Maximum number of bytes to strip from the beginning of the data; ' 'The default value is 12.') ) = 12, max_ratio: Arg('-m', metavar='R', help=( 'To determine whether a decompression algorithm was successful, the ' 'ratio of compressed size to decompressed size may at most be as large ' 'as this number, a floating point value R; default value is {default}.') ) = 1.0, min_ratio: Arg('-n', metavar='R', help=( 'Require that compression ratios must be at least as large as R. This ' 'is a "too good to be true" heuristic against algorithms like lznt1 ' 'that can produce false positives. The default is {default}.') ) = 0.0001, strict_limits: Arg('-l', action='store_true', help=( 'For recognized formats, i.e. when a magic signature is present, the ' 'above limits are disabled by default. Activate this flag to enforce ' 'them in every case.') ) = False ): if min_ratio <= 0: raise ValueError('The compression factor must be nonnegative.') super().__init__( tolerance=tolerance, prepend=prepend, min_ratio=min_ratio, max_ratio=max_ratio, strict_limits=strict_limits, ) self.engines: List[Unit] = [ engine.assemble() for engine in [ szdd, brotli, zl, lzma, aplib, qlz, lzf, lzw, jcalg, bz2, blz, lzjb, lz4, lzo, lznt1, nrv2e, nrv2d, nrv2b] ] for engine in self.engines: engine.log_detach() def process(self, data): data = memoryview(data) class Decompression(NamedTuple): engine: Unit rating: _R result: Optional[ByteString] = None cutoff: int = 0 prefix: Optional[int] = None def __str__(self): status = self.rating.summary engine = self.engine.name prefix = self.prefix if prefix is not None: prefix = F'0x{prefix:02X}' return F'prefix={prefix}, cutoff=0x{self.cutoff:02X}, [{status}] engine={engine}' def __len__(self): return len(self.result) @property def ratio(self): if not self.result: return INF return len(data) / len(self) @property def unmodified(self): return self.prefix is None and self.cutoff == 0 @property def method(self): return self.engine.name if self.args.prepend: buffer = bytearray(1 + len(data)) buffer[1:] = data best_by_rating: Dict[_R, Decompression] = {} def best_current_rating(): return max(best_by_rating, default=_R.InvalidData) def decompress(engine: Unit, cutoff: int = 0, prefix: Optional[int] = None): ingest = data[cutoff:] rating = _R.ValidData if prefix is not None: buffer[0] = prefix ingest = buffer is_handled = engine.handles(ingest) if is_handled is True: rating |= _R.KnownFormat if is_handled is False: return Decompression(engine, _R.InvalidData, None, cutoff, prefix) try: result = next(engine.act(ingest)) except RefineryPartialResult as pr: rating |= _R.HadOutput result = pr.partial except Exception: result = None else: rating |= _R.Successful return Decompression(engine, rating, result, cutoff, prefix) def update(new: Decompression, discard_if_too_good=False): ratio = new.ratio if self.args.strict_limits or not new.rating & _R.KnownFormat: if ratio > 
self.args.max_ratio: return if ratio < self.args.min_ratio: return best = best_by_rating.get(new.rating, None) prefix = new.prefix if prefix is not None: prefix = F'0x{prefix:02X}' if new.unmodified and best and not best.unmodified: threshold = 1 else: threshold = 0.95 if not best or len(new) < len(best): q = 0 else: q = len(best) / len(new) ratio *= 100 brief = new.rating.brief if q < threshold: if best and discard_if_too_good: if q < 0.5: return if new.failed: return self.log_info(lambda: F'[switch] [{brief}] [q={q:07.4f}] compression ratio {ratio:07.4f}% with: {new!s}') best_by_rating[new.rating] = new else: self.log_debug(lambda: F'[reject] [{brief}] [q={q:07.4f}] compression ratio {ratio:07.4f}% with: {new!s}') for engine in self.engines: self.log_debug(F'attempting engine: {engine.name}') careful = isinstance(engine, (lznt1, lzf, lzjb)) for t in range(self.args.tolerance): if best_current_rating() >= _R.Successful and careful and t > 0: break update(decompress(engine, t), careful) if self.args.prepend and best_current_rating() < _R.Successful: for p in range(0x100): update(decompress(engine, 0, p), careful) for r in sorted(best_by_rating, reverse=True): if dc := best_by_rating[r]: if not dc.rating & _R.HadOutput: continue self.log_info(F'settling on {dc.method} decompression.') if dc.rating & _R.KnownFormat: self.log_info('supporting evidence: found a known magic signature') if dc.rating & _R.HadNoErrors: self.log_info('supporting evidence: engine produced output without errors') elif dc.rating & _R.HadOutput: self.log_info('supporting evidence: there were errors, but the engine produced output') if not dc.rating & _R.Successful: self.log_info('the only decompression with result returned only a partial result.') return self.labelled(dc.result, method=dc.method) raise ValueError('no compression engine worked')
class dedup (key=None, count=False)
-
Deduplicates a sequence of multiple inputs. The deduplication is limited to the current refinery.lib.frame.
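An illustrative sketch, rejoining the deduplicated chunks with sep:
>>> emit('foo', 'bar', 'foo') [ dedup() | sep('-') ] | str
'foo-bar'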
class dedup(Unit): """ Deduplicates a sequence of multiple inputs. The deduplication is limited to the current `refinery.lib.frame`. """ def __init__( self, key: Arg('key', type=str, help='An optional meta variable expression to deduplicate.') = None, count: Arg.Switch('-c', help='Store the count of each deduplicated chunk.') = False ): super().__init__(key=key, count=count) def filter(self, chunks): keyvar = self.args.key if keyvar is not None: def key(chunk): v = PythonExpression.Evaluate(keyvar, metavars(chunk)) if isbuffer(v): v = md5(v).digest() return v else: def key(chunk): return md5(chunk).digest() if self.args.count: counts = {} buffer = {} hashes = None else: hashes = set() counts = None buffer = None for chunk in chunks: if not chunk.visible: yield chunk continue uid = key(chunk) if hashes is None: counts[uid] = counts.get(uid, 0) + 1 buffer.setdefault(uid, chunk) elif uid in hashes: continue else: hashes.add(uid) yield chunk if hashes is None: for uid, chunk in buffer.items(): yield self.labelled(chunk, count=counts[uid])
class defang (url_only=False, url_protocol=False, dot_only=False, quote_md=False)
-
Defangs all URL, domain and IPv4 address indicators in the input data by replacing the last dot in the expression by [.]. For example, 127.0.0.1 will be replaced by 127.0.0[.]1. For URL indicators, the colon after the protocol scheme is also wrapped in brackets.
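For example, with the default options:
>>> emit('http://127.0.0.1/') | defang() | str
'http[:]//127.0.0[.]1/'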
class defang(Unit): """ Defangs all URL, domain and IPv4 address indicators in the input data by replacing the last dot in the expression by `[.]`. For example, `127.0.0.1` will be replaced by `127.0.0[.]1`. For URL indicators, the colon after the procol scheme is also wrapped in brackets. """ _WHITELIST = [ B'wscript.shell', ] _PROTOCOL_ESCAPES = { B'http': B'hxxp', B'https': B'hxxps', B'ftp': B'fxp', B'ftps': B'fxps', } def __init__( self, url_only: Arg.Switch('-u', help='Only defang URLs, do not look for domains or IPs.') = False, url_protocol: Arg.Switch('-p', help='Escape the protocol in URLs.') = False, dot_only: Arg.Switch('-d', help='Do not escape the protocol colon in URLs.') = False, quote_md: Arg.Switch('-q', help='Wrap all indicators in backticks for markdown code.') = False ): self.superinit(super(), **vars()) def _quote(self, word): return word if not self.args.quote_md else B'`%s`' % word def reverse(self, data: bytearray): def refang(hostname): return hostname[0].replace(B'[.]', B'.') data = defanged.hostname.sub(refang, data) data = data.replace(B'[:]//', B'://') data = data.replace(B'[://]', B'://') data = re.sub(B'h.{3}?(s?)://', B'http\\1://', data) data = re.sub(B'fxp(s?)://', B'ftp\\1://', data) return data def process(self, data): def replace_hostname(hostname: bytes, match=True): if match: return self._quote(replace_hostname(hostname[0], False)) self.log_info('replace:', hostname) host = hostname user, atsgn, host = host.rpartition(B'@') host, colon, port = host.rpartition(B':') host = host.lower() if not colon: host = port port = B'' if host in self._WHITELIST: return hostname host = re.split(R'(?:\[\.\]|\.)', host.decode('latin1')) if len(host) == 1: return hostname components = iter(reversed(host)) defanged_parts = [next(components)] separator = '[.]' for part in components: defanged_parts.append(separator) defanged_parts.append(part) separator = '[.]' if part in tlds else '.' defanged_host = ''.join(reversed(defanged_parts)).encode('latin1') return user + atsgn + defanged_host + colon + port def replace_url(url: bytes): if not url: return url self.log_info('replace:', url) url = url.replace(B'[:]//', B'://', 1) url = url.replace(B'[.]', B'.') prefix = B'tcp' if url.startswith(B'://'): scheme = 0 elif url.startswith(B'//'): scheme = 1 prefix = prefix + B':' else: scheme = 2 prefix = B'' parsed = urlparse(prefix + url) operations = { name: self.process(getattr(parsed, name)) for name in ('path', 'params', 'query', 'fragment') } if self.args.url_protocol and parsed.scheme: operations.update(scheme=self._PROTOCOL_ESCAPES.get(parsed.scheme.lower(), scheme)) if scheme < 2: operations.update(scheme=B'') operations.update(netloc=replace_hostname(parsed.netloc, False)) url = urlunparse(parsed._replace(**operations)) if scheme == 0: url = B':' + url if not self.args.dot_only: url = url.replace(B'://', B'[:]//') return self._quote(url) urlsplit = defanged.url.split(data) step = defanged.url.value.groups + 1 urlsplit[1::step] = [replace_url(t) for t in itertools.islice(iter(urlsplit), 1, None, step)] if not self.args.url_only: urlsplit[0::step] = [ indicators.hostname.sub(replace_hostname, t) for t in itertools.islice(iter(urlsplit), 0, None, step) ] def fuse(urlsplit): txt = itertools.islice(iter(urlsplit), 0, None, step) url = itertools.islice(iter(urlsplit), 1, None, step) while True: try: yield next(txt) yield next(url) except StopIteration: break return B''.join(fuse(urlsplit))
class deob_js_arrays
-
JavaScript deobfuscator to turn ["Z", "t", "s", "e"][0] into "Z".
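A doctest mirroring the description:
>>> emit('["Z", "t", "s", "e"][0]') | deob_js_arrays() | str
'"Z"'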
class deob_js_arrays(Deobfuscator): """ JavaScript deobfuscator to turn `["Z", "t", "s", "e"][0]` into `"Z"`. """ def deobfuscate(self, data): def litpick(match): try: array = match[1] index = int(match[2]) lpick = array.split(',')[index].strip() self.log_debug(lambda: F'{lpick} = {match[0]}') except (TypeError, IndexError): lpick = match[0] return lpick p = R'\s{{0,5}}'.join([ '\\[', '((?:{i}|{s})', '(?:,', '(?:{i}|{s})', ')*)', '\\]', '\\[', '({i})', '\\]' ]).format(i=formats.integer, s=formats.string) return re.sub(p, litpick, data)
class deob_js_getattr
-
JavaScript deobfuscator to turn WScript["CreateObject"] into WScript.CreateObject.
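A doctest mirroring the description:
>>> emit('WScript["CreateObject"]') | deob_js_getattr() | str
'WScript.CreateObject'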
class deob_js_getattr(Deobfuscator): """ JavaScript deobfuscator to turn `WScript["CreateObject"]` into `WScript.CreateObject`. """ def deobfuscate(self, data): def dottify(match): name = match[2][1:-1] if name.isidentifier(): return F'{match[1]}.{name}' return match[0] return re.sub(FR'(\w+)\[({formats.string})\]', dottify, data)
class deob_js_tuples
-
JavaScript deobfuscator to turn ("Z", "t", "s", "e") into "e".
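A doctest mirroring the description:
>>> emit('("Z", "t", "s", "e")') | deob_js_tuples() | str
'"e"'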
class deob_js_tuples(Deobfuscator): """ JavaScript deobfuscator to turn `("Z", "t", "s", "e")` into `"e"`. """ def deobfuscate(self, data): def litpick(match): try: array = match[1] lpick = array.split(',')[-1].strip() self.log_debug(lambda: F'{lpick} = {match[0]}') except (TypeError, IndexError): lpick = match[0] return lpick p = R'\s{{0,5}}'.join([ '\\(', '((?:{i}|{s})', '(?:,', '(?:{i}|{s})', ')*)', '\\)' ]).format(i=formats.integer, s=formats.string) return re.sub(p, litpick, data)
class deob_ps1 (timeout=100)
-
class deob_ps1(IterativeDeobfuscator): _SUBUNITS = [sub() for sub in [ deob_ps1_escape, deob_ps1_cases, deob_ps1_brackets, deob_ps1_format, deob_ps1_typecast, deob_ps1_stringreplace, deob_ps1_b64convert, deob_ps1_encodings, deob_ps1_concat, deob_ps1_invoke, deob_ps1_uncurly ]] def deobfuscate(self, data): for u in self._SUBUNITS: u.log_level = self.log_level for unit in self._SUBUNITS: self.log_debug(lambda: F'invoking {unit.name}') checkpoint = hash(data) data = unit.deobfuscate(data) if checkpoint != hash(data) and not self.log_debug('data has changed.'): self.log_info(F'used {unit.name}') return re.sub(R'[\r\n]+', '\n', data)
class deob_ps1_b64convert
-
class deob_ps1_b64convert(Deobfuscator): _SENTINEL = re.compile('\\s*'.join( (re.escape('[System.Convert]::FromBase64String'), '\\(', '({s})', '\\)') ).format(s=formats.ps1str), flags=re.IGNORECASE) def deobfuscate(self, data): strlit = Ps1StringLiterals(data) def replacer(match: re.Match[str]): if strlit.get_container(match.start()): return match[0] try: string, = string_unquote(match[1]) except ValueError: return match[0] try: bytes = base64.b64decode(string) except Exception: return match[0] return '@({})'.format(','.join(F'0x{b:02X}' for b in bytes)) return self._SENTINEL.sub(replacer, data)
class deob_ps1_brackets
-
PowerShell deobfuscation that removes superfluous brackets around constant literals, i.e. ("{0}{2}{1}") is transformed to "{0}{2}{1}". Currently, only integer and string constants are supported.
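A doctest mirroring the description:
>>> emit('("{0}{2}{1}")') | deob_ps1_brackets() | str
'"{0}{2}{1}"'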
class deob_ps1_brackets(Deobfuscator): """ PowerShell deobfuscation that removes superfluous brackets around constant literals, i.e. `("{0}{2}{1}")` is transformed to `"{0}{2}{1}"`. Currently, only integer and string constants are supported. """ _SENTINEL = re.compile( RF'''(?<![\w"']{{2}})''' # this may be a function call RF'''(\-\w+)?''' # not a function call but an argument RF'''\(\s*({formats.integer}|{formats.ps1str})\s*(\S)''', flags=re.IGNORECASE ) def deobfuscate(self, data): strlit = Ps1StringLiterals(data) repeat = True @strlit.outside def replacement(match): nonlocal repeat if match[3] == ')': repeat = True return (match[1] or '') + match[2] while repeat: repeat = False data = self._SENTINEL.sub(replacement, data) return data
class deob_ps1_cases
-
class deob_ps1_cases(Deobfuscator): _NAMES = [ '-BXor', '-Exec Bypass', '-NoLogo', '-NonInter', '-Replace', '-Windows Hidden', '.Invoke', 'Assembly', 'Byte', 'Char', 'ChildItem', 'CreateThread', 'Get-Variable', 'GetType', 'IntPtr', 'Invoke-Expression', 'Invoke', 'Length', 'Net.WebClient', 'PowerShell', 'PSVersionTable', 'Set-Item', 'Set-Variable', 'Start-Sleep', 'ToString', 'Type', 'Value', 'Void', ] @outside(formats.ps1str) def deobfuscate(self, data): for name in self._NAMES: data = re.sub(RF'\b{re.escape(name)}\b', name, data, flags=re.IGNORECASE) return data
class deob_ps1_concat (timeout=100)
-
class deob_ps1_concat(IterativeDeobfuscator): _SENTINEL = re.compile(R'''['"]\s*[+&]\s*['"]''') def deobfuscate(self, data): def concat(data): strlit = Ps1StringLiterals(data) repeat = True while repeat: for match in self._SENTINEL.finditer(data): a, b = match.span() a = strlit.get_container(a) if a is None: continue b = strlit.get_container(b) if b is None or b != a + 1: continue a = strlit.ranges[a] b = strlit.ranges[b] stra = data[slice(*a)] strb = data[slice(*b)] parts = list(string_unquote(stra)) it = iter(string_unquote(strb)) parts[~0] += next(it) parts.extend(it) yield data[:a[0]] + string_quote(parts) data = data[b[1]:] strlit.update(data) break else: repeat = False yield data return ''.join(concat(data))
class deob_ps1_encodings
-
class deob_ps1_encodings(Deobfuscator): _SENTINEL = re.compile('\\s*'.join( (re.escape('[System.Text.Encoding]::') + '(\\w+)\\.GetString', '\\(', '@\\(', '({a})', '\\)', '\\)') ).format(a=formats.intarray), flags=re.IGNORECASE) def deobfuscate(self, data): strlit = Ps1StringLiterals(data) def replacer(match: re.Match[str]): if strlit.get_container(match.start()): return match[0] try: bytes = bytearray(int(x.strip(), 0) for x in match[2].split(',')) except Exception: return match[0] encoding = { 'ASCII': 'ascii', 'BigEndianUnicode': 'utf-16be', 'Default': 'latin1', 'Unicode': 'utf-16le', }.get(match[1], match[1]) try: codecs.lookup(encoding) except LookupError: encoding = 'utf8' try: string = bytes.decode(encoding) except Exception: return match[0] return string_quote(string) return self._SENTINEL.sub(replacer, data)
class deob_ps1_escape
-
class deob_ps1_escape(Deobfuscator): def deobfuscate(self, data): strlit = Ps1StringLiterals(data) @strlit.outside def repl(m): return m[1] return re.sub(R'''`([^0abfnrtv`#'"\$])''', repl, data)
class deob_ps1_format
-
PowerShell deobfuscation for the following "format string"-based technique:
"{0}{2}{1}"-f 'signa','ures','t'
"{0}na{2}{1}"-f 'sig','ures','t'
class deob_ps1_format(Deobfuscator): """ PowerShell deobfuscation for the following "format string"-based technique: - `"{0}{2}{1}"-f 'signa','ures','t'` - `"{0}na{2}{1}"-f 'sig','ures','t'` """ def deobfuscate(self, data): repeat = True while repeat: repeat = False for string in re.finditer(str(formats.ps1str), data): argmatch = re.search(R'^\s*-[fF]\s*((?:{s},\s*)*{s})'.format(s=formats.ps1str), data[string.end():]) if not argmatch: continue def dbgmsg(): sample = string[0] if len(sample) > 33: sample = F"{sample[1:30]}...{sample[0]}" return F'found match at {string.start()}: {sample}' self.log_debug(dbgmsg) args = re.split(F'({formats.ps1str})', argmatch[1]) args = [list(string_unquote(a.strip())) for a in args[1::2]] def formatter(string): buffer = [] for k, part in enumerate(re.split(R'(\{\d+\})', string)): if k % 2 == 0: if part: buffer.append(part) continue try: index = int(part[1:-1]) arg = args[index] except IndexError as IE: raise IndexError(F'only found {len(args)} arguments and format sequence {index}, aborting.') from IE it = iter(arg) buffer.append(next(it)) if len(arg) > 1: yield ''.join(buffer) buffer = [] for last, part in lookahead(it): if last: buffer.append(part) break yield part yield ''.join(buffer) try: result = string_apply(string[0], formatter) except IndexError: continue data = data[:string.start()] + result + data[argmatch.end() + string.end():] repeat = True break return data
class deob_ps1_invoke
-
class deob_ps1_invoke(Deobfuscator): def deobfuscate(self, data): strlit = Ps1StringLiterals(data) @strlit.outside def invrepl1(m): return m[1] + m[3] data = re.sub( R'''(\.|::)''' # preceeded by dot or namespace delimiter R'''(['"])(\w{1,200})\2''' # quoted string (actually a method name) R'''(?=[\s\(\.\,\;\+\-])''', # only if followed by certain characters invrepl1, data # remove quotes around symbol ) @strlit.outside def invrepl2(m): return m[1] + '(' data = re.sub( '\\s{0,5}'.join([ '[.&]', '(\\(', # sourcing operator '(?:gcm|get-command)', ')?', # potentially a get-command '([\'"])([-a-z]{1,100})\\2' # string enclosing a command '(?(1)\\s{0,5}\\)|)', # closing bracket for get-command ]), '\\3', data, flags=re.IGNORECASE ) data = re.sub( R'''(\w{1,200})\.Invoke\s*\(''', invrepl2, data, flags=re.IGNORECASE ) return data
class deob_ps1_secstr (*a)
-
class deob_ps1_secstr(Deobfuscator): def __init__(self, *a, **kw): super().__init__(*a, **kw) self._pack = pack() self._secstr = secstr() self._pattern = re.compile( R'\s{{0,20}}'.join([ R'''(['"])({b})\1''', R'\|', R'\.?', R'&?', R'''(['"]?)ConvertTo-SecureString\3''', R'-ke?y?', R'''(\(?)({a}|{i}\s{{0,20}}\.\.\s{{0,20}}{i})''', R'((?:\)\s{{0,20}}){{0,10}})?' ]).format( b=formats.b64, a=formats.intarray, i=formats.integer ), flags=re.IGNORECASE | re.DOTALL ) def _decrypt_block(self, data, match): if '..' in match[5]: a, b = [int(x.strip(), 0) for x in match[5].split('..')] key = range(min(a, b), max(a, b) + 1) if a > b: key = reversed(key) self._secstr.args.key = bytes(bytearray(key)) else: self._secstr.args.key = self._pack(match[5].encode(self.codec)) decoded = self._secstr(match[2].encode(self.codec)) decoded = decoded.decode(self.codec) result = F'\n\n{decoded}\n\n' brackets = match[6].count(')') start = match.start() if match[4]: brackets -= 1 if brackets <= 0: if brackets < 0: result += ')' return start, result while brackets: start -= 1 if data[start] == '(': brackets -= 1 if data[start] == ')': brackets += 1 return start, result def deobfuscate(self, data): while True: match = self._pattern.search(data) if not match: break start, result = self._decrypt_block(data, match) data = data[:start] + result + data[match.end():] return data
class deob_ps1_stringreplace
-
class deob_ps1_stringreplace(Deobfuscator): _SENTINEL = re.compile(( R'(?i)[\'"]\s*' # end of haystack string R'(-c|-i|-|\.)replace' # the replace call R'([\(\s]*)({s})([\)\s]*),' # needle (with brackets) R'([\(\s]*)({s})([\)\s]*)' # insert (with brackets) ).format(s=formats.ps1str), flags=re.IGNORECASE) def deobfuscate(self, data): repeat = True strlit = Ps1StringLiterals(data) while repeat: repeat = False needle = None for match in self._SENTINEL.finditer(data): k = strlit.get_container(match.start()) if k is None: continue offset, end = strlit.ranges[k] if match.start() != end - 1: continue string = data[offset:end] pf, bl1, needle, bl2, br1, insert, br2 = match.groups() end = match.end() case = '' if pf[0] in '.c' else '(?i)' bl = bl1.count('(') - bl2.count(')') br = br2.count(')') - br1.count('(') if pf[0] == '.': bl -= 1 br -= 1 if bl != 0 or br < 0: continue needle = list(string_unquote(needle)) if len(needle) > 1: continue needle = needle[0] head, *body = string_unquote(insert) self.log_info('replacing', needle, 'by', insert) if not body: def perform_replacement(string): return re.sub(F'{case}{re.escape(needle)}', lambda _: head, string) else: *body, tail = body def perform_replacement(string): # noqa parts = re.split(F'{case}{re.escape(needle)}', string) if len(parts) == 1: yield string return it = iter(parts) yield next(it) + head yield from body for last, part in lookahead(it): if last: yield tail + part else: yield tail + part + head yield from body replaced = string_apply(string, perform_replacement) + (br * ')') strlit.ranges[k] = offset, offset + len(replaced) - br strlit.ranges[k + 1: k + 3] = [] strlit.shift(len(replaced) + offset - end, k + 1) data = data[:offset] + replaced + data[end:] repeat = True break return data
class deob_ps1_typecast
-
Replaces sequences like [Char]120 with their string representation, in this case the string "x".
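For example:
>>> emit('[Char]120') | deob_ps1_typecast() | str
"'x'"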
class deob_ps1_typecast(Deobfuscator): """ Replaces sequences like [Char]120 to their string representation, in this case the string "x". """ def deobfuscate(self, data): strlit = Ps1StringLiterals(data) @strlit.outside def strip_typecast(m): return m[1] data = re.sub( FR'\[(?:string|char\[\])\]\s*({formats.ps1str!s})', strip_typecast, data, flags=re.IGNORECASE ) @strlit.outside def char_literal(match): c = chr(int(match[1].lower(), 0)) if c == "'": return '''"'"''' return F"'{c}'" data = re.sub( R'\[char\]\s*0*(0x[0-9a-f]+|\d+)', char_literal, data, flags=re.IGNORECASE ) def char_array(match): result = bytes(int(x, 0) for x in match[1].split(',')) try: result = result.decode('ascii') if not all(x in string.printable or x.isspace() for x in result): raise ValueError except ValueError: return match[0] else: return string_quote(result) data = re.sub( R'\s*'.join([ R'\[char\[\]\]', R'\((', R'(?:\s*(?:0x[0-9a-f]+|\d+)\s*,)+', R'(?:0x[0-9a-f]+|\d+)', R')\)' ]), char_array, data, flags=re.IGNORECASE ) return data
class deob_ps1_uncurly
-
PowerShell deobfuscation that removes superfluous curly braces around variable names that do not require them, i.e. ${variable} is transformed to just $variable.
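A doctest mirroring the description:
>>> emit('${variable}') | deob_ps1_uncurly() | str
'$variable'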
class deob_ps1_uncurly(Deobfuscator): """ PowerShell deobfuscation that removes superfluous curly braces around variable names that do not require it, i.e. `${variable}` is transformed to just `$variable`. """ _SENTINEL = re.compile(R'\$\{(\w+)\}') def deobfuscate(self, data): strlit = Ps1StringLiterals(data) @strlit.outside def strip(m): return F'${m[1]}' return self._SENTINEL.sub(strip, data)
class deob_vba (timeout=100)
-
class deob_vba(IterativeDeobfuscator): _SUBUNITS = [sub() for sub in [ deob_vba_comments, deob_vba_brackets, deob_vba_char_function, deob_vba_concat, deob_vba_arithmetic, deob_vba_constants, deob_vba_dummy_variables, deob_vba_stringreplace, deob_vba_stringreverse, ]] def deobfuscate(self, data): for u in self._SUBUNITS: u.log_level = self.log_level for unit in self._SUBUNITS: self.log_debug(lambda: F'invoking {unit.name}') checkpoint = hash(data) data = unit.deobfuscate(data) if checkpoint != hash(data) and not self.log_debug('data has changed.'): self.log_info(F'used {unit.name}') return re.sub(R'[\r\n]+', '\n', data)
class deob_vba_arithmetic
-
class deob_vba_arithmetic(Deobfuscator): def deobfuscate(self, data): strings = StringLiterals(formats.vbastr, data) def vba_int_eval(match: re.Match[str]) -> str: s = match[0].lower() if not s.startswith('&'): return s t, s = s[1], s[2:].rstrip('&') if t == 'h': return str(int(s, 16)) if t == 'b': return str(int(s, 2)) if t == 'o': return str(int(s, 8)) @strings.outside def evaluate(match: re.Match[str]): expression = match[0] expression = expression.strip() if not any(c.isdigit() for c in expression): return expression expression = re.sub(str(formats.vbaint), vba_int_eval, expression) brackets = 0 positions = [] ok = True head = tail = rest = '' for end, character in enumerate(expression): if character == '(': brackets += 1 positions.append(end) continue if character == ')': brackets -= 1 if brackets < 0: expression, tail = expression[:end], expression[end:] break else: positions.pop() if brackets == 0 and expression[0] == '(': expression, rest = expression[:end + 1], expression[end + 1:] break if expression.isdigit(): return match[0] if brackets > 0: pos = positions[~0] + 1 head = expression[:pos] expression = expression[pos:] try: result = str(_cautious_vba_eval(expression + rest)) except Exception: ok = False else: rest = '' if not ok and rest: try: result = str(_cautious_vba_eval(expression)) except Exception: expression += rest else: ok = True if not ok: result = expression self.log_info(F'error trying to parse arithmetic expression at offset {match.start()}: ({expression})') else: if expression.startswith('(') and expression.endswith(')'): result = F'({result})' if tail: tail = self.deobfuscate(tail) return F'{head}{result}{rest}{tail}' pattern = re.compile(R'(?:{i}|{f}|[-+(])(?:[^\S\r\n]{{0,20}}(?:{i}|{f}|[-%|&~<>()+/*^]))+'.format( i=str(formats.vbaint), f=str(formats.float))) return pattern.sub(evaluate, data)
class deob_vba_brackets
-
class deob_vba_brackets(Deobfuscator): _SENTINEL = re.compile( RF'''(?<![\w"']{{2}})''' # this may be a function call RF'''\(\s*({formats.vbaint}|{formats.vbastr}|{formats.float})\s*(\S)''', flags=re.IGNORECASE ) def deobfuscate(self, data): strlit = StringLiterals(formats.vbastr, data) repeat = True @strlit.outside def replacement(match): nonlocal repeat if match[2] == ')': repeat = True return match[1] while repeat: repeat = False data = self._SENTINEL.sub(replacement, data) return data
class deob_vba_char_function
-
class deob_vba_char_function(Deobfuscator): def deobfuscate(self, data): strings = StringLiterals(formats.vbastr, data) @strings.outside def evaluate_char_function(match: re.Match[str]): try: c = chr(int(match[1])) except ValueError: return match[0] if c == '"': return '""""' if c == '\\': return '"\\"' c = repr(c)[1:-1] if len(c) > 1: return match[0] return '"{}"'.format(c) return re.sub(R'(?i)\bchrw?\s*\(\s*(\d+)\s*\)', evaluate_char_function, data)
class deob_vba_chr_literals
-
class deob_vba_chr_literals(Unit): def process(self, data): def _chr(m): code = int(m[1], 0) if code == 34: return B'""""' return B'"%s"' % chr(code).encode('unicode_escape') data = re.sub(BR'Chr\((\d+x?\d+)\)', _chr, data, flags=re.IGNORECASE) data = re.sub(BR'"\s*\&\s*"', B'', data) return data
class deob_vba_comments
-
class deob_vba_comments(Deobfuscator): def deobfuscate(self, data): return re.sub(R"(?im)^\s{0,20}(?:'|rem\b|dim\b).*(?:\Z|$\n\r?)", '', data)
class deob_vba_concat (timeout=100)
-
class deob_vba_concat(IterativeDeobfuscator): _SENTINEL = re.compile(R'''"\s*(\++|&)\s*"''') def deobfuscate(self, data): def concat(data): strlit = StringLiterals(formats.vbastr, data) repeat = True while repeat: for match in self._SENTINEL.finditer(data): a, b = match.span() a = strlit.get_container(a) if a is None: continue b = strlit.get_container(b) if b is None or b != a + 1: continue _, a = strlit.ranges[a] b, c = strlit.ranges[b] yield data[:a - 1] + data[b + 1:c] data = data[c:] strlit.update(data) break else: repeat = False yield data return ''.join(concat(data))
class deob_vba_constants
-
class deob_vba_constants(Deobfuscator): def deobfuscate(self, data): codelines = data.splitlines(keepends=True) constants = {} constline = {} variables = set() for k, line in enumerate(codelines): match = re.match(R'(?im)^\s*(?:sub|function)\s*(\w+)', line) if match: variables.add(match[1]) continue match = re.match( R'(?im)^(?:\s*const)?\s*(\w+)\s*=\s*({i}|{s})\s*(?:\'|rem|$)'.format( s=formats.ps1str, i=formats.integer ), line) if match is None or match[1] in variables: pass elif match[2] != constants.get(match[1], match[2]): self.log_debug(F'del {match[1]}') del constants[match[1]] del constline[match[1]] variables.add(match[1]) else: self.log_debug(F'add {match[1]} = {match[2]}') constants[match[1]] = match[2] constline[match[1]] = k codelines = [line for k, line in enumerate(codelines) if k not in constline.values()] data = ''.join(codelines) for name, value in constants.items(): data = re.sub(RF'\b{re.escape(name)!s}\b', lambda _: value, data) return data
class deob_vba_dummy_variables
-
class deob_vba_dummy_variables(Deobfuscator): def deobfuscate(self, data): lines = data.splitlines(keepends=False) names = collections.defaultdict(list) def might_be_used_in(name, line): # avoid finding the name within a string literal line = '""'.join(re.split(str(formats.ps1str), line)) line = re.split(RF'\b{name}\b', line) try: L, R = line except ValueError: return False L = L.strip().lower() if L.startswith("'") or L.startswith('rem'): return False R = R.strip().lower() if R.startswith('=') and 'if' not in L: return False if L.startswith('dim'): return False return True pattern = re.compile( R'(?i)^\s{0,8}(?:const\s{1,8})?(\w+)\s{1,8}=\s{1,8}.*$' ) for k, line in enumerate(lines): try: name = pattern.match(line)[1] except (AttributeError, TypeError): continue if re.search(r'\w+\(', line): # might be a function call continue names[name].append(k) for line in lines: while True: for name in names: if might_be_used_in(name, line): del names[name] break else: break return '\n'.join(line for k, line in enumerate(lines) if not any( k in rows for rows in names.values()))
class deob_vba_stringreplace
-
Statically evaluates calls to the VBA Replace function when all three arguments are string literals.
Expand source code Browse git
class deob_vba_stringreplace(Deobfuscator): _SENTINEL = re.compile(( R'(?i)\bReplace\s*\(' # the replace call R'\s*({s}),' # haystack (with brackets) R'\s*({s}),' # needle (with brackets) R'\s*({s})\s*\)' # insert (with brackets) ).format(s=formats.vbastr), flags=re.IGNORECASE) def deobfuscate(self, data): strlit = StringLiterals(formats.vbastr, data) @strlit.outside def replacement(match: re.Match[str]): return string_quote( string_unquote(match[1]).replace( string_unquote(match[2]), string_unquote(match[3]) ) ) return self._SENTINEL.sub(replacement, data)
class deob_vba_stringreverse
-
Statically evaluates calls to the VBA StrReverse function when the argument is a string literal.
Expand source code Browse git
class deob_vba_stringreverse(Deobfuscator): _SENTINEL = re.compile(( R'(?i)\bStrReverse\s*\(' # the reverse call R'\s*({s})\s*\)' # string ).format(s=formats.vbastr), flags=re.IGNORECASE) def deobfuscate(self, data): strlit = StringLiterals(formats.vbastr, data) @strlit.outside def replacement(match: re.Match[str]): return string_quote(''.join(reversed(string_unquote(match[1])))) return self._SENTINEL.sub(replacement, data)
class des (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)
-
DES encryption and decryption.
Expand source code Browse git
class des(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(DES)): """ DES encryption and decryption. """ pass
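A round trip is an easy way to sanity-check the unit from the shell interface; the 8-byte key below is an arbitrary example, and -R selects the reverse (encryption) operation just as it does on the command line:
>>> from refinery.shell import *
>>> emit('attack at dawn') | des('-R', 'mainkey!') | des('mainkey!') | str
'attack at dawn'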
class des3 (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)
-
3-DES encryption and decryption.
Expand source code Browse git
class des3(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(DES3)): """ 3-DES encryption and decryption. """ pass
class deskd (size=8)
-
Stands for "DES Key Derivation". It implements the same functionality as
DES_string_to_key
in OpenSSL. It converts a string to an 8 byte DES key with odd byte parity, per FIPS specification. This is not a modern key derivation function.Expand source code Browse git
class deskd(KeyDerivation): """ Stands for "DES Key Derivation". It implements the same functionality as `DES_string_to_key` in OpenSSL. It converts a string to an 8 byte DES key with odd byte parity, per FIPS specification. This is not a modern key derivation function. """ def __init__(self, size: Arg(help='The number of bytes to generate, default is the maximum of 8.') = 8): super().__init__(size=size, salt=None) def process(self, password): from Cryptodome.Cipher import DES from Cryptodome.Util.strxor import strxor key = bytearray(8) for i, j in enumerate(password): if ((i % 16) < 8): key[i % 8] ^= (j << 1) & 0xFF else: j = (((j << 4) & 0xf0) | ((j >> 4) & 0x0f)) j = (((j << 2) & 0xcc) | ((j >> 2) & 0x33)) j = (((j << 1) & 0xaa) | ((j >> 1) & 0x55)) key[7 - (i % 8)] ^= j des_set_odd_parity(key) if password: n = len(password) password = password.ljust(n + 7 - ((n - 1) % 8), b'\0') des = DES.new(key, DES.MODE_ECB) for k in range(0, n, 8): key[:] = des.encrypt(strxor(password[k:k + 8], key)) des_set_odd_parity(key) if self.args.size > 8: raise RefineryPartialResult('can provide at most 8 bytes.', partial=key) return key[:self.args.size]
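For example, deriving a key from an arbitrary password via the shell interface (a minimal sketch; the derived bytes depend on the input):
>>> from refinery.shell import *
>>> key = emit('refinery') | deskd | bytes
>>> len(key)
8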
class dexstr
-
Extract strings from DEX (Dalvik Executable) files.
Expand source code Browse git
class dexstr(Unit): """ Extract strings from DEX (Dalvik Executable) files. """ def process(self, data): dex = DexFile(data) for string in dex.read_strings(): yield string.encode(self.codec)
class dnblob
-
Extracts all blobs defined in the #Blob stream of .NET executables.
Expand source code Browse git
class dnblob(Unit): """ Extracts all blobs defined in the `#Blob` stream of .NET executables. """ def process(self, data): header = DotNetHeader(data, parse_resources=False) for blob in header.meta.Streams.Blob.values(): yield blob
class dncfx
-
Extracts the encrypted strings from ConfuserEx protected .NET executables. Each decrypted string is returned as a single output.
Expand source code Browse git
class dncfx(Unit): """ Extracts the encrypted strings from ConfuserX protected .NET execuctables. Each decrypted string is returned as a single output. """ _PATTERN_ARRAY_INIT = ( BR'(\x1F.|\x20....)' # load size of a chunk BR'\x8D.\x00\x00\x01' # create a UInt32 array BR'\x25' # dup BR'\xD0%s\x04' # ldtoken: RVA of array data BR'\x28.\x00\x00.' # call to InitializeArray ) def process(self, data): header = DotNetHeader(data, parse_resources=False) decompressor = lzma() class IntegerAssignment: def __init__(self, match): self.offset = match.start() self.value, = struct.unpack('<I', match[1]) def get_size(match): ins = match[1] fmt = '<B' if ins[0] == 0x1F else '<I' result, = struct.unpack(fmt, ins[-struct.calcsize(fmt):]) return result potential_seeds = [ IntegerAssignment(m) for m in re.finditer(br'\x20(....)', data, re.DOTALL) ] for entry in header.meta.RVAs: offset = header.pe.get_offset_from_rva(entry.RVA) index = struct.pack('<I', entry.Field.Index) strings_found = 0 for match in re.finditer(self._PATTERN_ARRAY_INIT % re.escape(index[:3]), data, flags=re.DOTALL): ms = match.start() def sortkey(t): weight = abs(t.offset - ms) if t.offset < ms: # this weights assignments after the array initialization down, but still # prefers them over assignments that are further away than 2kb weight += 2000 return weight size = get_size(match) if size % 0x10 or size > 10000: continue self.log_debug(F'found RVA {entry.Field.Index} initialized with length {size}.') potential_seeds.sort(key=sortkey) for seed in potential_seeds[1:400]: # the first potential_seed will always be the assignment of the size variable ciphertext = data[offset:offset + size * 4] key = self._xs64star(seed.value) key = chunks.pack(key, 4) + ciphertext[:-0x40] decrypted = strxor(key, ciphertext) try: decompressed = decompressor(decrypted) except Exception as e: self.log_debug( F'decompression failed for seed {seed.value:08X} at offset {seed.offset:08X}: {e}') continue else: self.log_info( F'decompression worked for seed {seed.value:08X} at offset {seed.offset:08X}.') if len(decompressed) < 0x100: continue for string in self._extract_strings(decompressed): strings_found += 1 yield string if strings_found > 10: break def _xs64star(self, state): for i in range(16): state ^= (state >> 12) & 0xFFFFFFFF state ^= (state << 25) & 0xFFFFFFFF state ^= (state >> 27) & 0xFFFFFFFF yield state & 0xFFFFFFFF def _extract_strings(self, blob): reader = StreamReader(blob) while reader.tell() < len(blob): try: size = reader.expect(UInt32) string = reader.expect(StringPrimitive, size=size, codec='UTF8', align=4) except ParserEOF: return if string: yield string.encode(self.codec)
class dnds (dereference=True, encode=None, digest=None)
-
Stands for "DotNet DeSerialize": Expects data that has been serialized using the .NET class "BinaryFormatter". The output is a representation of the deserialized data in JSON format.
Expand source code Browse git
class dnds(JSONEncoderUnit): """ Stands for "DotNet DeSerialize": Expects data that has been serialized using the .NET class "BinaryFormatter". The output is a representation of the deserialized data in JSON format. """ def __init__( self, dereference: Arg.Switch('-r', '--keep-references', off=True, help='Do not resolve Object references in serialized data.') = True, encode=None, digest=None ): super().__init__(encode=encode, digest=digest, dereference=dereference) def process(self, data): self.log_debug('initializing parser, will fail on malformed stream') bf = BinaryFormatterParser( data, keep_meta=True, dereference=self.args.dereference, ignore_errors=not self.log_debug(), ) return self.to_json([ { 'Type': repr(record), 'Data': record } for record in bf ])
class dnfields (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
This unit can extract data from constant field variables in classes of .NET executables. Since the .NET header stores only the offset and not the size of constant fields, heuristics are used to search for opcode sequences that load the data and additional heuristics are used to guess the size of the data type.
Expand source code Browse git
class dnfields(PathExtractorUnit): """ This unit can extract data from constant field variables in classes of .NET executables. Since the .NET header stores only the offset and not the size of constant fields, heuristics are used to search for opcode sequences that load the data and additional heuristics are used to guess the size of the data type. """ _SIZEMAP = { '^s?byte$' : 1, '^s?char$' : 2, '^[us]?int.?16$' : 2, '^[us]?int.?32$' : 4, '^[us]?int.?64$' : 8, } def _guess_field_info(self, tables, data, t) -> FieldInfo: pattern = ( BR'(\x20....|\x1F.)' # ldc.i4 count BR'\x8D(...)([\x01\x02])' # newarr col|row BR'\x25' # dup BR'\xD0\x%02x\x%02x\x%02x\x04' # ldtoken t BR'(?:.{0,12}?' # ... BR'\x80(...)\x04)?' % ( # stsfld variable (t >> 0x00) & 0xFF, (t >> 0x08) & 0xFF, (t >> 0x10) & 0xFF ) ) for match in re.finditer(pattern, data, flags=re.DOTALL): count, j, r, name = match.groups() count, j, r = struct.unpack('<LLB', B'%s%s\0%s' % (count[1:].ljust(4, B'\0'), j, r)) if name: try: name = struct.unpack('<L', B'%s\0' % name) name = name[0] name = tables[4][name - 1].Name except Exception as E: self.log_info(F'attempt to parse field name failed: {E!s}') name = None element = tables[r][j - 1] for pattern, size in self._SIZEMAP.items(): if re.match(pattern, element.TypeName, flags=re.IGNORECASE): return FieldInfo(element.TypeName, count, size, name) def unpack(self, data): header = DotNetHeader(data, parse_resources=False) tables = header.meta.Streams.Tables fields = tables.FieldRVA if not fields: return iwidth = len(str(len(fields))) rwidth = max(len(F'{field.RVA:X}') for field in fields) rwidth = max(rwidth, 4) remaining_field_indices = set(range(len(tables.Field))) for k, rv in enumerate(fields): _index = rv.Field.Index field = tables.Field[_index - 1] remaining_field_indices.discard(_index - 1) fname = field.Name ftype = None if len(field.Signature) == 2: # Crude signature parser for non-array case. Reference: # https://www.codeproject.com/Articles/42649/NET-File-Format-Signatures-Under-the-Hood-Part-1 # https://www.codeproject.com/Articles/42655/NET-file-format-Signatures-under-the-hood-Part-2 guess = { 0x03: FieldInfo('Char', 1, 1, None), # noqa 0x04: FieldInfo('SByte', 1, 1, None), # noqa 0x05: FieldInfo('Byte', 1, 1, None), # noqa 0x06: FieldInfo('Int16', 1, 2, None), # noqa 0x07: FieldInfo('UInt16', 1, 2, None), # noqa 0x08: FieldInfo('Int32', 1, 4, None), # noqa 0x09: FieldInfo('UInt32', 1, 4, None), # noqa 0x0A: FieldInfo('Int64', 1, 8, None), # noqa 0x0B: FieldInfo('UInt64', 1, 8, None), # noqa 0x0C: FieldInfo('Single', 1, 4, None), # noqa 0x0D: FieldInfo('Double', 1, 8, None), # noqa }.get(field.Signature[1], None) else: guess = self._guess_field_info(tables, data, _index) if guess is None: self.log_debug(lambda: F'field {k:0{iwidth}d} with signature {field.Signature.hex()}: unable to guess type information') continue totalsize = guess.count * guess.size if guess.name is not None: fname = guess.name if not fname.isprintable(): fname = F'F{rv.RVA:0{rwidth}X}' ext = ftype = guess.type.lower() if guess.count > 1: ftype += F'[{guess.count}]' self.log_info( F'field {k:0{iwidth}d}; token 0x{_index:06X}; RVA 0x{rv.RVA:04X}; count {guess.count}; type {guess.type}; name {fname}') offset = header.pe.get_offset_from_rva(rv.RVA) yield UnpackResult( F'{fname}.{ext}', lambda t=offset, s=totalsize: data[t:t + s], name=fname, type=ftype, ) for _index in remaining_field_indices: field = tables.Field[_index] index = _index + 1 name = field.Name if field.Flags.HasFieldRVA: self.log_warn(F'field {name} has RVA flag set, but no RVA was found') token = index.to_bytes(3, 'little') values = set() for match in re.finditer(( BR'\x72(?P<token>...)\x70' # ldstr BR'(?:\x6F(?P<function>...)\x0A)?' # call GetBytes BR'\x80%s\x04' # stsfld ) % re.escape(token), data, re.DOTALL): md = match.groupdict() fn_token = md.get('function') fn_index = fn_token and int.from_bytes(fn_token, 'little') or None if fn_index is not None: fn_name = tables.MemberRef[fn_index].Name if fn_name != 'GetBytes': self.log_info(F'skipping string assignment passing through call to {fn_name}') continue k = int.from_bytes(md['token'], 'little') values.add(header.meta.Streams.US[k].encode(self.codec)) if not values: continue if len(values) == 1: yield UnpackResult( F'{name}.str', next(iter(values)), name=name, type='string' )
class dnhdr (resources=False, encode=None, digest=None)
-
Expects a .NET executable as input; the unit parses the .NET header and outputs a representation of its contents in JSON format.
Expand source code Browse git
class dnhdr(JSONEncoderUnit): """ Expects data that has been formatted with the `BinaryFormatter` class. The output is a representation of the deserialized data in JSON format. """ def __init__( self, resources: Arg.Switch('-r', '--resources', help='Also parse .NET resources.') = False, encode=None, digest=None ): super().__init__(encode=encode, digest=digest, resources=resources) def process(self, data): dn = DotNetHeader(data, parse_resources=self.args.resources) dn = { 'Head': dn.head, 'Meta': dn.meta } if self.args.resources: dn['RSRC'] = dn.resources return self.to_json(dn)
class dnmr (*paths, list=False, join_path=False, drop_path=False, exact=False, fuzzy=0, regex=False, path=b'name', raw=False)
-
Extracts subfiles from .NET managed resources.
Expand source code Browse git
class dnmr(PathExtractorUnit): """ Extracts subfiles from .NET managed resources. """ def __init__( self, *paths, list=False, join_path=False, drop_path=False, exact=False, fuzzy=0, regex=False, path=b'name', raw: Arg.Switch('-w', help='Do not deserialize the managed resource entry data.') = False ): super().__init__( *paths, list=list, join_path=join_path, drop_path=drop_path, path=path, raw=raw, fuzzy=fuzzy, exact=exact, regex=regex, ) def unpack(self, data): try: managed = NetStructuredResources(data) except NoManagedResource: managed = None if not managed: raise RefineryPartialResult('no managed resources found', partial=data) for entry in managed: if entry.Error: self.log_warn(F'entry {entry.Name} carried error message: {entry.Error}') data = entry.Data if not self.args.raw: if isinstance(entry.Value, str): data = entry.Value.encode('utf-16le') elif isbuffer(entry.Value): data = entry.Value yield UnpackResult(entry.Name, data)
class dnrc (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
Extracts all .NET resources whose name matches any of the given patterns and outputs them. Use the refinery.units.formats.pe.dotnet.dnmr unit to extract subfiles from managed .NET resources.
Expand source code Browse git
class dnrc(PathExtractorUnit): """ Extracts all .NET resources whose name matches any of the given patterns and outputs them. Use the `refinery.units.formats.pe.dotnet.dnmr` unit to extract subfiles from managed .NET resources. """ def unpack(self, data): header = DotNetHeader(data) if not header.resources: if self.args.list: return raise ValueError('This file contains no resources.') for resource in header.resources: yield UnpackResult(resource.Name, resource.Data)
class dnsdomain (min=1, max=None, len=None, stripspace=False, duplicates=False, longest=False, take=None)
-
Extracts domain names in the wire format in which they appear in DNS requests. This can be used as a quick and dirty way to extract domains from PCAP files, for example.
Expand source code Browse git
class dnsdomain(PatternExtractorBase): """ Extracts domain names in the format as they appear in DNS requests. This can be used as a quick and dirty way to extract domains from PCAP files, for example. """ _DOMAIN_CHARACTERS = ( B'ABCDEFGHIJKLMNOPQRSTUVWXYZ' B'abcdefghijklmnopqrstuvwxyz' B'0123456789-_' ) _DOMAIN_PATTERN = BR'(?:%s){1,20}(?:%s)\b' % (_lps(0xFF), _lps(25)) def process(self, data): def transform(match): match = bytearray(match[0]) pos = 0 while pos < len(match): length = match[pos] match[pos] = 0x2E if len(match) < length + pos: return None if any(x not in self._DOMAIN_CHARACTERS for x in match[pos + 1 : pos + length]): return None pos += 1 + length return match[1:] yield from self.matches_filtered(memoryview(data), self._DOMAIN_PATTERN, transform)
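As a sketch of the wire format, each DNS label is prefixed with its length byte; the input below is hand-crafted for illustration:
>>> from refinery.shell import *
>>> data = bytes([6]) + b'google' + bytes([3]) + b'com'
>>> data | dnsdomain | str
'google.com'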
class dnsfx (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
Extracts files from .NET single file applications.
Expand source code Browse git
class dnsfx(PathExtractorUnit): """ Extracts files from .NET single file applications. """ _SIGNATURE = bytes([ # 32 bytes represent the bundle signature: SHA-256 for '.net core bundle' 0x8b, 0x12, 0x02, 0xb9, 0x6a, 0x61, 0x20, 0x38, 0x72, 0x7b, 0x93, 0x02, 0x14, 0xd7, 0xa0, 0x32, 0x13, 0xf5, 0xb9, 0xe6, 0xef, 0xae, 0x33, 0x18, 0xee, 0x3b, 0x2d, 0xce, 0x24, 0xb3, 0x6a, 0xae ]) def unpack(self, data): reader = StreamReader(data) reader.seek(self._find_bundle_manifest_offset(data)) major_version = reader.expect(UInt32) minor_version = reader.expect(UInt32) self.log_info(F'version {major_version}.{minor_version}') count = reader.expect(UInt32) bhash = reader.expect(StringPrimitive) self.log_info(F'bundle {bhash} contains {count} files') if major_version >= 2: reader.expect(UInt64) # depsOffset reader.expect(UInt64) # depsSize reader.expect(UInt64) # runtimeConfigOffset reader.expect(UInt64) # runtimeConfigSize reader.expect(UInt64) # flags for _ in range(count): try: offset = reader.expect(UInt64) size = reader.expect(UInt64) compressed_size = 0 if major_version >= 6: compressed_size = reader.expect(UInt64) type = reader.expect(Byte) path = reader.expect(StringPrimitive) def _logmsg(): _log = F'read item at offset 0x{offset:08X}, type 0x{type:02X}, size {SizeInt(size)!r}' if compressed_size: return F'{_log}, compressed to size {SizeInt(compressed_size)!r}' return F'{_log}, uncompressed' self.log_debug(_logmsg) with reader.checkpoint(): reader.seek(offset) if compressed_size: item_data = reader.read(compressed_size) | zl | bytearray else: item_data = reader.read(size) yield UnpackResult(path, item_data) except ParserEOF: self.log_warn('unexpected EOF while parsing bundle, terminating') break def _find_bundle_manifest_offset(self, data: bytearray) -> int: bundle_sig_offset = data.find(self._SIGNATURE, 0) if bundle_sig_offset < 0: raise ValueError('Cannot find valid Bundle Manifest offset. Is this a .NET Bundle?') return int.from_bytes(data[bundle_sig_offset - 8:bundle_sig_offset], 'little') @classmethod def handles(self, data: bytearray): return self._SIGNATURE in data
class dnstr (user=True, meta=True)
-
Extracts all strings defined in the #Strings and #US streams of .NET executables.
Expand source code Browse git
class dnstr(Unit): """ Extracts all strings defined in the `#Strings` and `#US` streams of .NET executables. """ def __init__( self, user: Arg.Switch('-m', '--meta', off=True, group='HEAP', help='Only extract from #Strings.') = True, meta: Arg.Switch('-u', '--user', off=True, group='HEAP', help='Only extract from #US.') = True, ): if not meta and not user: raise ValueError('Either ascii or utf16 strings must be enabled.') super().__init__(meta=meta, user=user) def process(self, data): header = DotNetHeader(data, parse_resources=False) if self.args.meta: for string in header.meta.Streams.Strings.values(): yield string.encode(self.codec) if self.args.user: for string in header.meta.Streams.US.values(): yield string.encode(self.codec)
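A usage sketch, assuming that iterating a bound unit yields its output chunks; sample.exe is a placeholder for a .NET executable on disk:
>>> from refinery.shell import *
>>> for string in b'' | ef('sample.exe') | dnstr:
...     print(bytes(string))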
class doctxt
-
Extracts the text body from Word documents.
Expand source code Browse git
class doctxt(Unit): """ Extracts the text body from Word documents. """ @Unit.Requires('olefile', 'formats', 'office', 'extended') def _olefile(): import olefile return olefile def process(self, data: bytearray): extractors: Dict[str, Callable[[bytearray], str]] = OrderedDict( doc=self._extract_ole, docx=self._extract_docx, odt=self._extract_odt, ) if data.startswith(B'PK'): self.log_debug('document contains zip file signature, likely a odt or docx file') extractors.move_to_end('doc') if 'opendocument' in str(data | xtzip('mimetype')): self.log_debug('odt signature detected') extractors.move_to_end('odt', last=False) for filetype, extractor in extractors.items(): self.log_debug(F'trying to extract as {filetype}') try: result = extractor(data) except ImportError: raise except Exception as error: self.log_info(F'failed extractring as {filetype}: {error!s}') else: return result.encode(self.codec) raise ValueError('All extractors failed, the input data is not recognized as any known document format.') def _extract_docx(self, data: Chunk) -> str: NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}' PARAGRAPH = F'{NAMESPACE}p' TEXT = F'{NAMESPACE}t' chunk = data | xtzip('word/document.xml') | bytearray if not chunk: raise ValueError('No document.xml file found.') root: Element = XML(chunk) with StringIO() as output: for index, paragraph in enumerate(root.iter(PARAGRAPH)): if index > 0: output.write('\n') for node in paragraph.iter(TEXT): if node.text: output.write(node.text) return output.getvalue() def _extract_odt(self, data: bytes): def _extract_text(node: Element): NAMESPACE = '{urn:oasis:names:tc:opendocument:xmlns:text:1.0}' PARAGRAPH = F'{NAMESPACE}p' SPAN = F'{NAMESPACE}span' SPACE = F'{NAMESPACE}s' with StringIO() as res: for element in node: tag = element.tag text = element.text or '' tail = element.tail or '' if tag in [PARAGRAPH, SPAN]: res.write(text) elif tag == SPACE: res.write(' ') else: self.log_debug(F'unknown tag: {tag}') res.write(_extract_text(element)) res.write(tail) if tag == PARAGRAPH: res.write('\n') return res.getvalue() NAMESPACE = '{urn:oasis:names:tc:opendocument:xmlns:office:1.0}' BODY = F'{NAMESPACE}body' TEXT = F'{NAMESPACE}text' for part in xtzip().unpack(data): if part.path != 'content.xml': continue xml_content: bytes = part.get_data() root: Element = XML(xml_content) body: Element = root.find(BODY) text: Element = body.find(TEXT) return _extract_text(text) else: raise ValueError('found no text') def _extract_ole(self, data: bytearray) -> str: stream = MemoryFile(data) with self._olefile.OleFileIO(stream) as ole: doc = ole.openstream('WordDocument').read() with StructReader(doc) as reader: table_name = F'{(doc[11] >> 1) & 1}Table' reader.seek(0x1A2) offset = reader.u32() length = reader.u32() with StructReader(ole.openstream(table_name).read()) as reader: reader.seek(offset) table = reader.read(length) piece_table = self._load_piece_table(table) return self._get_text(doc, piece_table) def _load_piece_table(self, table: bytes) -> bytes: with StructReader(table) as reader: while not reader.eof: entry_type = reader.read_byte() if entry_type == 1: reader.seekrel(reader.read_byte()) continue if entry_type == 2: length = reader.u32() return reader.read(length) raise NotImplementedError(F'Unsupported table entry type value 0x{entry_type:X}.') def _get_text(self, doc: bytes, piece_table: bytes) -> str: piece_count: int = 1 + (len(piece_table) - 4) // 12 with StringIO() as text: with StructReader(piece_table) as reader: character_positions = [reader.u32() for _ in range(piece_count)] for i in range(piece_count - 1): cp_start = character_positions[i] cp_end = character_positions[i + 1] fc_value = reader.read_struct('xxLxx', unwrap=True) is_ansi = bool((fc_value >> 30) & 1) fc = fc_value & 0xBFFFFFFF cb = cp_end - cp_start if is_ansi: encoding = 'cp1252' fc = fc // 2 else: encoding = 'utf16' cb *= 2 raw = doc[fc : fc + cb] text.write(raw.decode(encoding).replace('\r', '\n')) return text.getvalue()
class drp (consecutive=False, align=False, min=1, max=∞, len=None, all=False, threshold=20, weight=0, buffer=1024, chug=False)
-
Detect Repeating Patterns - detects the most prevalent repeating byte pattern in a chunk of data. The unit computes a suffix tree which may require a lot of memory for large buffers.
Expand source code Browse git
class drp(Unit): """ Detect Repeating Patterns - detects the most prevalent repeating byte pattern in a chunk of data. The unit computes a suffix tree which may require a lot of memory for large buffers. """ def __init__( self, consecutive: Arg.Switch('-c', help='Assume that the repeating pattern is consecutive when observable.') = False, align: Arg.Switch('-d', help='Assume that the pattern occurs at offsets that are multiples of its length.') = False, min: Arg.Number('-n', help='Minimum size of the pattern to search for. Default is {default}.') = 1, max: Arg.Number('-N', help='Maximum size of the pattern to search for. Default is {default}.') = INF, len: Arg.Number('-l', help='Set the exact size of the pattern. This is equivalent to --min=N --max=N.') = None, all: Arg.Switch('-a', help='Produce one output for each repeating pattern that was detected.') = False, threshold: Arg.Number('-t', help='Patterns must match this performance threshold in percent, lest they be discarded.') = 20, weight: Arg.Number('-w', help='Specifies how much longer patterns are favored over small ones. Default is {default}.') = 0, buffer: Arg.Number('-b', group='BFR', help='Maximum number of bytes to inspect at once. The default is {default}.') = 1024, chug : Arg.Switch('-g', group='BFR', help='Compute the prefix tree for the entire buffer instead of chunking it.') = False ): if len is not None: min = max = len super().__init__( min=min, max=max, all=all, consecutive=consecutive, align=align, weight=weight, buffer=buffer, chug=chug, threshold=threshold ) def _get_patterns(self, data): with stackdepth(len(data)): tree = SuffixTree(data) min_size = self.args.min max_size = self.args.max patterns = set() cursor = 0 while cursor < len(data): node = tree.root rest = data[cursor:] remaining = len(rest) length = 0 offset = None while node.children and length < remaining: for child in node.children.values(): if tree.data[child.start] == rest[length]: node = child break if node.start >= cursor: break offset = node.start - length length = node.end + 1 - offset if offset is None: cursor += 1 continue length = min(remaining, length) if max_size >= length >= min_size: pattern = rest[:length].tobytes() patterns.add(pattern) cursor += length del tree return patterns @staticmethod def _consecutive_count(data, pattern): length = len(pattern) if length == 1: return data.count(pattern) view = memoryview(data) return max(sum(1 for i in range(k, len(view), length) if view[i:i + length] == pattern) for k in range(len(pattern))) @staticmethod def _truncate_pattern(pattern): offset = 0 for byte in pattern[1:]: if byte == pattern[offset]: offset += 1 else: offset = 0 if offset > 0: pattern = pattern[:-offset] return pattern def process(self, data: bytearray): if len(data) <= 1: yield data return memview = memoryview(data) weight = 1 + (self.args.weight / 10) if self.args.chug: patterns = self._get_patterns(memview) else: patterns = set() chunksize = self.args.buffer for k in range(0, len(memview), chunksize): patterns |= self._get_patterns(memview[k:k + chunksize]) if not patterns: raise RefineryPartialResult('no repeating sequences found', data) self.log_debug('removing duplicate pattern detections') duplicates = set() maxlen = max(len(p) for p in patterns) for pattern in sorted(patterns, key=len): for k in range(2, maxlen // len(pattern) + 1): repeated = pattern * k if repeated in patterns: duplicates.add(repeated) patterns -= duplicates self.log_debug(F'counting coverage of {len(patterns)} patterns') pattern_count = {p: data.count(p) for p in patterns} pattern_performance = dict(pattern_count) for consecutive in (False, True): if consecutive: self.log_debug(F're-counting coverage of {len(patterns)} patterns') patterns = {self._truncate_pattern(p) for p in patterns} pattern_performance = {p: self._consecutive_count(data, p) for p in patterns} self.log_debug('evaluating pattern performance') for pattern, count in pattern_performance.items(): pattern_performance[pattern] = count * (len(pattern) ** weight) best_performance = max(pattern_performance.values()) for pattern, performance in pattern_performance.items(): pattern_performance[pattern] = performance / best_performance self.log_debug('removing patterns below performance threshold') threshold = self.args.threshold patterns = {p for p in patterns if pattern_performance[p] * 100 >= threshold} pattern_count = {p: data.count(p) for p in patterns} if not self.args.consecutive: break if self.args.all: for pattern in sorted(patterns, key=pattern_performance.get, reverse=True): yield self.labelled(pattern, count=pattern_count[pattern]) return best_patterns = [p for p in patterns if pattern_performance[p] == 1.0] if len(best_patterns) > 1: self.log_warn('could not determine unique best repeating pattern, returning the first of these:') for k, pattern in enumerate(best_patterns): self.log_warn(F'{k:02d}.: {pattern.hex()}') result = best_patterns[0] if self.args.align: def rotated(pattern): for k in range(len(pattern)): yield pattern[k:] + pattern[:k] rotations = {k % len(result): r for k, r in ( (data.find(r), r) for r in rotated(result)) if k >= 0} result = rotations[min(rotations)] yield result
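For example, applied to a buffer that consists of one repeated sequence (a minimal sketch; real inputs are rarely this clean):
>>> from refinery.shell import *
>>> emit('REFINED.' * 32) | drp | str
'REFINED.'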
class dsjava
-
Deserialize Java serialized data and re-serialize as JSON.
Expand source code Browse git
class dsjava(Unit): """ Deserialize Java serialized data and re-serialize as JSON. """ @Unit.Requires('javaobj-py3>=0.4.0.1', 'formats') def _javaobj(): import javaobj.v2 return javaobj.v2 def process(self, data): with JavaEncoder as encoder: return encoder.dumps(self._javaobj.loads(data)).encode(self.codec)
class dsphp
-
Deserialize PHP serialized data and re-serialize as JSON.
Expand source code Browse git
class dsphp(Unit): """ Deserialize PHP serialized data and re-serialize as JSON. """ @Unit.Requires('phpserialize', 'formats') def _php(): import phpserialize return phpserialize def reverse(self, data): return self._php.dumps(json.loads(data)) def process(self, data): phpobject = self._php.phpobject class encoder(json.JSONEncoder): def default(self, obj): try: return super().default(obj) except TypeError: pass if isinstance(obj, bytes) or isinstance(obj, bytearray): return obj.decode('utf8') if isinstance(obj, phpobject): return obj._asdict() return json.dumps( self._php.loads( data, object_hook=phpobject, decode_strings=True ), indent=4, cls=encoder ).encode(self.codec)
class dump (*files, tee=False, stream=False, plain=False, force=False)
-
Dump incoming data to files on disk. It is possible to specify filenames with format fields. Any metadata field on an incoming chunk is available. Additionally, any field that can be populated by the cm unit is also available. These include the following:
{ext}    : Automatically guessed file extension
{crc32}  : CRC32 checksum of the data
{index}  : Index of the data in the input stream, starting at 0
{size}   : Size of the data in bytes
{md5}    : MD5 hash of the data
{sha1}   : SHA1 hash of the data
{sha256} : SHA-256 hash of the data
{path}   : Associated path; defaults to {sha256} if none is given.
When not using formatted file names, the unit ingests as many incoming inputs as filenames were specified on the command line. Unless connected to a terminal, the remaining inputs will be forwarded on STDOUT. The -t or --tee switch can be used to forward all inputs, under all circumstances, regardless of whether or not they have been processed.
If no file is specified, all ingested inputs are concatenated and written to the clipboard. This will only succeed when the data can successfully be encoded.
Expand source code Browse git
class dump(Unit): """ Dump incoming data to files on disk. It is possible to specify filenames with format fields. Any metadata field on an incoming chunk is available. Additionally, any field that can be populated by the `refinery.cm` unit is also available. These include the following: {ext} : Automatically guessed file extension {crc32} : CRC32 checksum of the data {index} : Index of the data in the input stream, starting at 0 {size} : Size of the data in bytes {md5} : MD5 hash of the data {sha1} : SHA1 hash of the data {sha256} : SHA-256 hash of the data {path} : Associated path; defaults to {sha256} if none is given. When not using formatted file names, the unit ingests as many incoming inputs as filenames were specified on the command line. Unless connected to a terminal, the remaining inputs will be forwarded on STDOUT. The `-t` or `--tee` switch can be used to forward all inputs, under all circumstances, regardless of whether or not they have been processed. If no file is specified, all ingested inputs are concatenated and written to the clipboard. This will only succeed when the data can successfully be encoded. """ def __init__( self, *files: Arg(metavar='file', type=str, help='Optionally formatted filename.'), tee : Arg.Switch('-t', help='Forward all inputs to STDOUT.') = False, stream : Arg.Switch('-s', help='Dump all incoming data to the same file.') = False, plain : Arg.Switch('-p', help='Never apply any formatting to file names.') = False, force : Arg.Switch('-f', help='Remove files if necessary to create dump path.') = False, ): if stream and len(files) != 1: raise ValueError('Can only use exactly one file in stream mode.') super().__init__(files=files, tee=tee, stream=stream, force=force) self.stream = None self._formatted = not plain and any(self._has_format(f) for f in files) self._reset() @staticmethod def _has_format(filename): if not isinstance(filename, str): return False formatter = Formatter() return any( any(t.isalnum() for t in fields) for _, fields, *__ in formatter.parse(filename) if fields ) def _reset(self): self.exhausted = False self.paths = cycle(self.args.files) if self._formatted else iter(self.args.files) self._close() @property def _clipcopy(self): return not self.args.files def _components(self, path): def _reversed_components(path): while True: path, component = os.path.split(path) if not component: break yield component yield path components = list(_reversed_components(path)) components.reverse() return components def _open(self, path, unc=False): if hasattr(path, 'close'): return path path = os.path.abspath(path) base = os.path.dirname(path) if not unc: self.log_info('opening:', path) try: os.makedirs(base, exist_ok=True) except FileExistsError: self.log_info('existed:', path) part, components = '', self._components(path) while components: component, *components = components part = os.path.join(part, component) if os.path.exists(part) and os.path.isfile(part): if self.args.force: os.unlink(part) return self._open(path, unc) break raise RefineryCriticalException(F'Unable to dump to {path} because {part} is a file.') except FileNotFoundError: if unc or os.name != 'nt': raise path = F'\\\\?\\{path}' return self._open(path, unc=True) except OSError as e: if not self.log_info(): self.log_warn('opening:', path) self.log_warn('errored:', e.args[1]) return open(os.devnull, 'wb') else: mode = 'ab' if self.args.stream else 'wb' return open(path, mode) def _close(self, final=False): if not self.stream: return self.stream.flush() if self.args.stream and not final: return if self._clipcopy: if os.name == 'nt': from refinery.lib.winclip import ClipBoard, CF try: img = self._image.open(self.stream) with io.BytesIO() as out: img.save(out, 'BMP') except Exception: with ClipBoard(CF.TEXT) as cpb: cpb.copy(self.stream.getvalue()) else: with ClipBoard(CF.DIB) as cpb: out.seek(14, io.SEEK_SET) cpb.copy(out.read()) else: data = self.stream.getvalue() data = data.decode(self.codec, errors='backslashreplace') self._pyperclip.copy(data) self.stream.close() self.stream = None @Unit.Requires('pyperclip') def _pyperclip(): import pyperclip return pyperclip @Unit.Requires('Pillow', 'formats') def _image(): from PIL import Image return Image def process(self, data: bytes): forward_input_data = self.args.tee if self._clipcopy: self.stream.write(data) elif not self.exhausted: if not self.stream: # This should happen only when the unit is called from Python code # rather than via the command line. try: path = next(self.paths) except StopIteration: raise RefineryCriticalException('the list of filenames was exhausted.') else: with self._open(path) as stream: stream.write(data) else: self.stream.write(data) self.log_debug(F'wrote 0x{len(data):08X} bytes') self._close() else: forward_input_data = forward_input_data or not self.isatty if not forward_input_data: size = metavars(data).size self.log_warn(F'discarding unprocessed chunk of size {size!s}.') if forward_input_data: yield data def filter(self, chunks): if self.exhausted: self._reset() nostream = not self.args.stream clipcopy = self._clipcopy if clipcopy: self.stream = io.BytesIO() for index, chunk in enumerate(chunks, 0): if not chunk.visible: continue if not clipcopy and not self.exhausted and (nostream or not self.stream): try: path = next(self.paths) except StopIteration: self.exhausted = True else: if self._has_format(path): meta = metavars(chunk) meta.ghost = True meta.update_index(index) path = meta.format_str(path, self.codec, [chunk]) self.stream = self._open(path) yield chunk self._close(final=True) self.exhausted = True
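For example, writing every chunk of a frame to disk with file names derived from metadata; the output directory name is arbitrary, and piping to bytes merely drains the pipeline:
>>> from refinery.shell import *
>>> emit('foo', 'bar') [ dump('out/{index}.bin') ] | bytes
b''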
class eat (name)
-
Consume a meta variable and replace the contents of the current chunk with it. If the variable contains a string, it is encoded with the default codec. If the variable cannot be converted to a byte string, the data is lost and an empty chunk is returned.
Expand source code Browse git
class eat(Unit): """ Consume a meta variable and replace the contents of the current chunk with it. If the variable contains a string, it is encoded with the default codec. If the variable cannot be converted to a byte string, the data is lost and an empty chunk is returned. """ def __init__( self, name: Arg(help='The name of the variable to be used.', type=str), ): super().__init__(name=check_variable_name(name)) def process(self, data: Chunk): def invalid_type(): return F'variable {name} is of type "{type}", unable to convert to byte string - data is lost' name = self.args.name meta = metavars(data) data = meta.pop(name) type = data.__class__.__name__ if isinstance(data, int): self.log_info(F'variable {name} is an integer, converting to string.') data = str(data).encode(self.codec) if isinstance(data, str): self.log_info(F'variable {name} is a string, encoding as {self.codec}') data = data.encode(self.codec) elif not isbuffer(data): try: wrapped = bytearray(data) except Exception: self.log_warn(invalid_type()) data = None else: data = wrapped return data
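For example, in combination with the put unit, which stores a value in a meta variable:
>>> from refinery.shell import *
>>> emit('ABC') [ put('t', 'XYZ') | eat('t') ] | str
'XYZ'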
class ef (*filenames, list=False, meta=False, size=None, read=0, wild=False, tame=False, symlinks=False, linewise=False)
-
Short for "emit file". The unit reads files from disk and outputs them individually. Has the ability to read large files in chunks.
Expand source code Browse git
class ef(Unit): """ Short for "emit file". The unit reads files from disk and outputs them individually. Has the ability to read large files in chunks. """ def __init__(self, *filenames: Arg(metavar='FILEMASK', nargs='+', type=str, help=( 'A list of file masks. Each matching file will be read from disk and ' 'emitted. The file masks can include format string expressions which ' 'will be substituted from the current meta variables. The masks can ' 'use wild-card expressions, but this feature is disabled by default on ' 'Posix platforms, where it has to be enabled explicitly using the -w ' 'switch. On Windows, the feature is enabled by default and can be ' 'disabled using the -t switch.' )), list: Arg.Switch('-l', help='Only lists files with metadata.') = False, meta: Arg.Switch('-m', help=( 'Adds the atime, mtime, ctime, and size metadata variables.' )) = False, size: Arg.Bounds('-s', range=True, help=( 'If specified, only files are read whose size is in the given range.')) = None, read: Arg.Number('-r', help=( 'If specified, files will be read in chunks of size N and each ' 'chunk is emitted as one element in the output list.' )) = 0, wild: Arg.Switch('-w', group='W', help='Force use of wildcard patterns in file masks.') = False, tame: Arg.Switch('-t', group='W', help='Disable wildcard patterns in file masks.') = False, symlinks: Arg.Switch('-y', help='Follow symbolic links and junctions, these are ignored by default.') = False, linewise: Arg.Switch('-i', help=( 'Read the file linewise. By default, one line is read at a time. ' 'In line mode, the --read argument can be used to read the given ' 'number of lines in each chunk.' )) = False ): if wild and tame: raise ValueError('Cannot be both wild and tame!') super().__init__( size=size, read=read, list=list, meta=meta, wild=wild, tame=tame, symlinks=symlinks, linewise=linewise, filenames=filenames ) def _read_chunks(self, fd): while True: buffer = fd.read(self.args.read) if not buffer: break yield buffer def _read_lines(self, fd): count = self.args.read or 1 if count == 1: while True: buffer = fd.readline() if not buffer: break yield buffer return with MemoryFile() as out: while True: for _ in range(count): buffer = fd.readline() if not buffer: break out.write(buffer) if not out.tell(): break yield out.getvalue() out.seek(0) out.truncate() def _absolute_path(self, path_string: str): path = Path(path_string).absolute() if os.name == 'nt' and not path.parts[0].startswith('\\\\?\\'): # The pathlib glob method will simply fail mid-traversal if it attempts to descend into # a folder or to a file whose path exceeds MAX_PATH on Windows. As a workaround, we use # UNC paths throughout and truncate to relative paths after enumeration. path = Path(F'\\\\?\\{path!s}') return path def _glob(self, pattern: str) -> Iterable[Path]: if pattern.endswith('**'): pattern += '/*' wildcard = re.search(R'[\[\?\*]', pattern) if wildcard is None: yield self._absolute_path(pattern) return k = wildcard.start() base, pattern = pattern[:k], pattern[k:] path = self._absolute_path(base or '.') last = path.parts[-1] if base.endswith(last): # /base/something.* pattern = F'{last}{pattern}' path = path.parent scandir = os.scandir class EmptyIterator: def __enter__(self): return self def __exit__(self, *_, **__): pass def __next__(self): raise StopIteration def __iter__(self): return self if sys.version_info >= (3, 12): def islink(path): return os.path.islink(path) or os.path.isjunction(path) else: def islink(path): try: return bool(os.readlink(path)) except OSError: return False paths_scanned = set() def _patched_scandir(path): if islink(path): if not self.args.symlinks: return EmptyIterator() try: rp = os.path.realpath(path, strict=True) except OSError: return EmptyIterator() if rp in paths_scanned: self.log_warn(F'file system loop at: {path!s}') return EmptyIterator() paths_scanned.add(rp) path = rp try: return scandir(path) except Exception as e: self.log_warn('error calling scandir:', exception_to_string(e)) return EmptyIterator() try: os.scandir = _patched_scandir for match in path.glob(pattern): yield match finally: os.scandir = scandir def process(self, data): meta = metavars(data) size = self.args.size size = size and range(size.start, size.stop, size.step) meta.ghost = True wild = (os.name == 'nt' or self.args.wild) and not self.args.tame root = self._absolute_path('.') paths = self._glob if wild else lambda mask: [self._absolute_path(mask)] do_meta = self.args.meta do_stat = size or do_meta class SkipErrors: unit = self def __init__(self): self._history: Set[type] = set() self._message: Dict[type, Optional[str]] = { ValueError: ( None ), PermissionError: ( 'access error while scanning: {}' ), OSError: ( 'system error while scanning: {}' ), FileNotFoundError: ( 'file unexpectedly not found: {}' ), Exception: ( 'unknown error while reading: {}' ), } self.path = None def reset(self, path): self._history.clear() self.path = path return self def __enter__(self): return self def __exit__(self, et, ev, trace): if et is None: return False for t, msg in self._message.items(): if issubclass(et, t): if t not in self._history: self._history.add(t) if msg is not None: self.unit.log_info(msg.format(self.path)) return True else: return False for mask in self.args.filenames: mask = meta.format_str(mask, self.codec, [data]) self.log_debug('scanning for mask:', mask) kwargs = dict() skip_errors = SkipErrors() for path in paths(mask): skip_errors.reset(path) filesize = None with skip_errors: path = path.relative_to(root) with skip_errors: if wild and not path.is_file(): continue with skip_errors: if do_stat: stat = path.stat() filesize = stat.st_size if do_meta: kwargs.update( size=filesize, atime=datetime.fromtimestamp(stat.st_atime).isoformat(' ', 'seconds'), ctime=datetime.fromtimestamp(stat.st_ctime).isoformat(' ', 'seconds'), mtime=datetime.fromtimestamp(stat.st_mtime).isoformat(' ', 'seconds') ) if size is not None and filesize not in size: continue with skip_errors: if self.args.list: yield self.labelled(str(path).encode(self.codec), **kwargs) continue with path.open('rb') as stream: if self.args.linewise: yield from self._read_lines(stream) elif self.args.read: yield from self._read_chunks(stream) else: data = stream.read() self.log_info(lambda: F'reading: {path!s} ({len(data)} bytes)') yield self.labelled(data, path=path.as_posix(), **kwargs)
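A minimal usage sketch; sample.bin is a placeholder for a file on disk:
>>> from refinery.shell import *
>>> data = b'' | ef('sample.bin') | bytes
>>> data == open('sample.bin', 'rb').read()
True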
class emit (*data)
-
Emits the given data as output. If no argument is specified, data is retrieved from the clipboard. Multiple arguments are output in framed format.
Expand source code Browse git
class emit(Unit): def __init__(self, *data: Arg(help=( 'Data to be emitted. If no argument is specified, data is retrieved from ' 'the clipboard. Multiple arguments are output in framed format.' ))): super().__init__(data=data) @Unit.Requires('pyperclip') def _pyperclip(): import pyperclip return pyperclip def process(self, data): if self.args.data: yield from self.args.data return if os.name == 'nt': from refinery.lib.winclip import get_any_data mode, data = get_any_data() if mode is not None: self.log_info(F'retrieved clipboard data in {mode.name} format') yield data else: data = self._pyperclip.paste() if not data: return yield data.encode(self.codec, 'replace')
class esc (hex=False, unicode=False, greedy=False, unquoted=False, quoted=False, bare=False)
-
Encodes and decodes common ASCII escape sequences.
Expand source code Browse git
class esc(Unit): """ Encodes and decodes common ASCII escape sequences. """ _ESCAPE = { 0x00: BR'\0', 0x07: BR'\a', 0x08: BR'\b', 0x0C: BR'\f', 0x0A: BR'\n', 0x0D: BR'\r', 0x09: BR'\t', 0x0B: BR'\v', 0x5C: BR'\\', 0x27: BR'\'', 0x22: BR'\"' } _UNESCAPE = { BR'0': B'\x00', BR'a': B'\x07', BR'b': B'\x08', BR'f': B'\x0C', BR'n': B'\x0A', BR'r': B'\x0D', BR't': B'\x09', BR'v': B'\x0B', B'\\': B'\x5C', BR"'": B'\x27', BR'"': B'\x22' } def __init__(self, hex : Arg.Switch('-x', help='Hex encode everything, do not use C escape sequences.') = False, unicode : Arg.Switch('-u', help='Use unicode escape sequences and UTF-8 encoding.') = False, greedy : Arg.Switch('-g', help='Replace \\x by x and \\u by u when not followed by two or four hex digits, respectively.') = False, unquoted: Arg.Switch('-p', group='Q', help='Never remove enclosing quotes.') = False, quoted : Arg.Switch('-q', group='Q', help='Remove enclosing quotes while decoding and add them for encoding.') = False, bare : Arg.Switch('-b', help='Do not escape quote characters.') = False, ) -> Unit: pass # noqa def process(self, data): data = memoryview(data) if self.args.quoted: quote = data[0] if data[-1] != quote: self.log_info('string is not correctly quoted') else: data = data[1:-1] elif not self.args.unquoted: quote = data[:1] strip = data[1:-1] if data[-1:] == quote and not re.search(br'(?<!\\)' + re.escape(quote), strip): self.log_info('removing automatically detected quotes') data = strip def unescape(match): c = match[1] if len(c) > 1: if c[0] == 0x75: # unicode upper = int(c[1:3], 16) lower = int(c[3:5], 16) if self.args.unicode: return bytes((lower, upper)).decode('utf-16le').encode(self.codec) return bytes((lower,)) elif c[0] == 0x78: # hexadecimal return bytes((int(c[1:3], 16),)) else: # octal escape sequence return bytes((int(c, 8) & 0xFF,)) elif c in B'ux': return c if self.args.greedy else match[0] return self._UNESCAPE.get(c, c) data = re.sub( RB'\\(u[a-fA-F0-9]{4}|x[a-fA-F0-9]{1,2}|[0-7]{3}|.)', unescape, data) return data def reverse(self, data): if self.args.unicode: string = data.decode(self.codec).encode('UNICODE_ESCAPE') else: if not self.args.hex: def escape(match): c = match[0][0] return self._ESCAPE.get(c, RB'\x%02x' % c) pattern = RB'[\x00-\x1F\x22\x27\x5C\x7F-\xFF]' if self.args.bare: pattern = RB'[\x00-\x1F\x5C\x7F-\xFF]' string = re.sub(pattern, escape, data) else: string = bytearray(4 * len(data)) for k in range(len(data)): a = k * 4 b = k * 4 + 4 string[a:b] = RB'\x%02x' % data[k] if self.args.quoted: string = B'"%s"' % string return string
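For example, decoding C-style escape sequences:
>>> from refinery.shell import *
>>> emit(R'hello\x20world\x21') | esc | str
'hello world!'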
class evtx (raw=False)
-
Extracts data from Windows Event Log files (EVTX). Each extracted log entry is returned as a single output chunk in XML format.
Expand source code Browse git
class evtx(Unit): """ Extracts data from Windows Event Log files (EVTX). Each extracted log entry is returned as a single output chunk in XML format. """ def __init__(self, raw: Unit.Arg.Switch('-r', help='Extract raw event data rather than XML.') = False): super().__init__(raw=raw) @Unit.Requires('python-evtx', 'formats') def _evtx(): from Evtx.Evtx import Evtx return Evtx def process(self, data): with VirtualFileSystem() as vfs: raw = self.args.raw with self._evtx(vfs.new(data)) as log: for record in log.records(): yield record.data() if raw else record.xml().encode(self.codec)
class fernet (key)
-
Decrypt Fernet messages.
Expand source code Browse git
class fernet(Unit): """ Decrypt Fernet messages. """ def __init__(self, key: Arg(help='A fernet key, either in base64 or raw binary.')): super().__init__(key=key) def _b64(self, data): try: return data | b64(urlsafe=True) | bytearray except Exception: return data def process(self, data): fk = self._b64(self.args.key) if len(fk) != 32: raise ValueError(F'The given Fernet key has length {len(fk)}, expected 32 bytes.') signing_key = fk[:16] encryption_key = fk[16:] decoded = self._b64(data) reader = StructReader(memoryview(decoded), bigendian=True) signed_data = reader.peek(reader.remaining_bytes - 32) version = reader.u8() timestamp = datetime.fromtimestamp(reader.u64()) iv = reader.read(16) if version != 0x80: self.log_warn(F'The Fernet version is 0x{version:02X}, the only documented one is 0x80.') ciphertext = reader.read(reader.remaining_bytes - 32) if len(ciphertext) % 16 != 0: raise ValueError('The encoded ciphertext is not 16-byte block aligned.') signature = reader.read(32) hmac = HMAC.new(signing_key, digestmod=SHA256) hmac.update(signed_data) if hmac.digest() != signature: self.log_warn('HMAC verification failed; the message has been tampered with.') self.log_info(F'computed signature: {hmac.hexdigest().upper()}') self.log_info(F'provided signature: {signature.hex().upper()}') plaintext = ciphertext | aes(mode='cbc', iv=iv, key=encryption_key) | bytearray return self.labelled(plaintext, timestamp=timestamp.isoformat(' ', 'seconds'))
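A usage sketch with placeholders; a real Fernet key is 32 bytes, urlsafe-base64 encoded, and token.bin stands in for a captured message:
>>> from refinery.shell import *
>>> key = 'BASE64KEY...'  # placeholder for the real key
>>> b'' | ef('token.bin') | fernet(key) | str  # the decrypted message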
class gost (key, iv=b'', padding=None, mode=None, raw=False, swap=False, sbox=SBOX.R34, *, assoc_len=0, mac_len=0, segment_size=0, little_endian=False)
-
GOST encryption and decryption.
Expand source code Browse git
class gost(StandardBlockCipherUnit, cipher=BlockCipherFactory(GOST)): """ GOST encryption and decryption. """ def __init__( self, key, iv=B'', padding=None, mode=None, raw=False, swap: Arg.Switch('-s', help='Decode blocks as big endian rather than little endian.') = False, sbox: Arg.Option('-x', choices=SBOX, help=( 'Choose an SBOX. The default is {default}, which corresponds to the R-34.12.2015 standard. ' 'The other option is CBR, which is the SBOX used by the Central Bank of Russia.' )) = SBOX.R34, **more ): sbox = Arg.AsOption(sbox, SBOX) super().__init__(key, iv, padding=padding, mode=mode, raw=raw, swap=swap, sbox=sbox, **more) def _new_cipher(self, **optionals) -> CipherInterface: return super()._new_cipher( swap=self.args.swap, sbox=self.args.sbox, **optionals )
class group (size)
-
Group incoming chunks into frames of the given size.
Expand source code Browse git
class group(Unit): """ Group incoming chunks into frames of the given size. """ def __init__(self, size: Arg.Number(help='Size of each group; must be at least 2.', bound=(2, None))): super().__init__(size=size) def process(self, data: Chunk): if not data.temp: return yield data yield from islice(data.temp, 0, self.args.size - 1) def filter(self, chunks): it = iter(chunks) while True: try: head: Chunk = next(it) except StopIteration: return head.temp = it yield head
class groupby (name)
-
Group incoming chunks by the contents of a meta variable. Note that the unit blocks and cannot stream any output until the input frame is consumed: It has to read every input chunk to make sure that all groupings are complete.
Expand source code Browse git
class groupby(Unit): """ Group incoming chunks by the contents of a meta variable. Note that the unit blocks and cannot stream any output until the input frame is consumed: It has to read every input chunk to make sure that all groupings are complete. """ def __init__(self, name: Arg(type=str, help='name of the meta variable')): super().__init__(name=check_variable_name(name)) def process(self, data): yield from data.temp def filter(self, chunks: Iterable[Chunk]) -> Generator[Chunk, None, None]: name = self.args.name members = defaultdict(list) for chunk in chunks: try: value = chunk.meta[name] except KeyError: value = None members[value].append(chunk) for chunklist in members.values(): dummy = chunklist[0] dummy.temp = chunklist yield dummy
class hc128 (key, discard=0, stateful=False)
-
HC-128 encryption and decryption.
Expand source code Browse git
class hc128(StreamCipherUnit): """ HC-128 encryption and decryption. """ key_size = {32} def keystream(self) -> Iterable[int]: return HC128(self.args.key)
class hc256 (key, iv=b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', discard=0, stateful=False)
-
HC-256 encryption and decryption.
Expand source code Browse git
class hc256(StreamCipherUnit): """ HC-256 encryption and decryption. """ key_size = {32} def __init__( self, key, iv: Arg(help='An initialization vector; the default is a sequence of 32 zero bytes.') = bytes(32), discard=0, stateful=False, ): super().__init__(key=key, iv=iv, stateful=stateful, discard=discard) self._keystream = None def keystream(self) -> Iterable[int]: for num in HC256(self.args.key, self.args.iv): yield from num.to_bytes(4, 'little')
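Like every stream cipher unit, it is an involution: applying it twice with the same key and IV restores the input. The key below is an arbitrary 32-byte example:
>>> from refinery.shell import *
>>> key = 'A' * 32
>>> emit('top secret') | hc256(key) | hc256(key) | str
'top secret'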
class hex
-
Hex-decodes and encodes binary data. Non-hex characters are removed from the input. For decoding, a trailing odd hex digit is stripped, since two hex digits are required to represent each byte.
Expand source code Browse git
class hex(Unit): """ Hex-decodes and encodes binary data. Non-hex characters are removed from the input. For decoding, any odd trailing hex digits are stripped as two hex digits are required to represent a byte. """ def reverse(self, data): import base64 return base64.b16encode(data) def process(self, data): import re import base64 data = re.sub(B'[^A-Fa-f0-9]+', B'', data) if len(data) % 2: data = data[:-1] return base64.b16decode(data, casefold=True) @classmethod def handles(self, data: bytearray): from refinery.lib.patterns import formats if formats.spaced_hex.fullmatch(data): return True
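For example:
>>> from refinery.shell import *
>>> emit('48656c6c6f20576f726c64') | hex | str
'Hello World'
>>> emit('Hello World') | hex('-R') | str
'48656C6C6F20576F726C64'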
class hexload (blocks=1, dense=False, expand=False, narrow=False, width=0)
-
Convert hex dumps back to the original data and vice versa. All options of this unit apply to its reverse operation where binary data is converted to a readable hexdump format. The default mode of the unit expects the input data to contain a readable hexdump and converts it back to binary.
Expand source code Browse git
class hexload(HexViewer): """ Convert hex dumps back to the original data and vice versa. All options of this unit apply to its reverse operation where binary data is converted to a readable hexdump format. The default mode of the unit expects the input data to contain a readable hexdump and converts it back to binary. """ @regex class _ENCODED_BYTES: R""" (?ix)(?:^|(?<=\s)) # encoded byte patches must be prefixed by white space (?: (?: # separated chunks of hex data [a-f0-9]{2} # hexadecimal chunk; single byte (two hexadecimal letters) \s{1,2} # encoded byte followed by whitespace (?: # at least one more encoded byte [a-f0-9]{2} # followed by more encoded bytes (?:\s{1,2}[a-f0-9]{2})* # unless it was just a single byte )? ) | (?:[a-f0-9]{4}\s{1,2} # 2-byte chunks (?:[a-f0-9]{4} (?:\s{1,2}[a-f0-9]{4})*)?) | (?:[a-f0-9]{8}\s{1,2} # 4-byte chunks (?:[a-f0-9]{8} (?:\s{1,2}[a-f0-9]{8})*)?) | (?:(?:[a-f0-9]{2})+) # continuous line of hexadecimal characters )(?=\s|$) # terminated by a whitespace or line end """ def __init__(self, blocks=1, dense=False, expand=False, narrow=False, width=0): super().__init__(blocks=blocks, dense=dense, expand=expand, narrow=narrow, width=width) self._hexline_pattern = re.compile(F'{make_hexline_pattern(1)}(?:[\r\n]|$)', flags=re.MULTILINE) def process(self, data: bytearray): lines = data.decode(self.codec).splitlines(keepends=False) if not lines: return None decoded_bytes = bytearray() encoded_byte_matches: List[Dict[int, int]] = [] for line in lines: matches: Dict[int, int] = {} encoded_byte_matches.append(matches) for match in self._ENCODED_BYTES.finditer(line): a, b = match.span() matches[a] = b - a it = iter(encoded_byte_matches) offsets = set(next(it).keys()) for matches in it: offsets.intersection_update(matches.keys()) if not offsets: raise ValueError('unable to determine the position of the hex bytes in this dump') lengths: Dict[int, List[int]] = {offset: [] for offset in offsets} del offsets for matches in encoded_byte_matches: for offset in lengths: lengths[offset].append(matches[offset]) for offset in lengths: lengths[offset].sort() midpoint = len(encoded_byte_matches) // 2 offset, length = max(((offset, lengths[offset][midpoint]) for offset in lengths), key=operator.itemgetter(1)) end = offset + length del lengths for k, line in enumerate(lines, 1): encoded_line = line[offset:end] onlyhex = re.search(r'^[\sA-Fa-f0-9]+', encoded_line) if not onlyhex: self.log_warn(F'ignoring line without hexadecimal data: {line}') continue if onlyhex.group(0) != encoded_line: if k != len(lines): self.log_warn(F'ignoring line with mismatching hex data length: {line}') continue encoded_line = onlyhex.group(0) self.log_debug(F'decoding: {encoded_line.strip()}') decoded_line = bytes.fromhex(encoded_line) decoded_bytes.extend(decoded_line) txt = line[end:] txt_stripped = re.sub('\\s+', '', txt) if not txt_stripped: continue if len(decoded_line) not in range(len(txt_stripped), len(txt) + 1): self.log_warn(F'preview size {len(txt_stripped)} does not match decoding: {line}') if decoded_bytes: yield decoded_bytes def reverse(self, data): metrics = self._get_metrics(len(data)) if not self.args.width: metrics.fit_to_width(allow_increase=True) for line in self.hexdump(data, metrics): yield line.encode(self.codec)
class HKDF (size, salt, hash='SHA512')
-
HKDF key derivation.
Expand source code Browse git
class HKDF(KeyDerivation): """HKDF Key derivation""" def __init__(self, size, salt, hash='SHA512'): super().__init__(size=size, salt=salt, hash=hash) def process(self, data): from Cryptodome.Protocol.KDF import HKDF return HKDF(data, self.args.size, self.args.salt, self.hash)
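As a hedged sketch, deriving a 32-byte key from a passphrase; this assumes the pipeline output can be collected with | bytes in the same way it converts with | str:
>>> key = emit('correct horse battery staple') | HKDF('32', 'salt') | bytes
>>> len(key)
32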
class hmac (salt, hash='SHA1', size=None)
-
HMAC-based key derivation.
Expand source code Browse git
class hmac(KeyDerivation): """ HMAC based key derivation """ def __init__(self, salt, hash='SHA1', size=None): super().__init__(salt=salt, size=size, hash=hash) def process(self, data): from Cryptodome.Hash import HMAC return HMAC.new(data, self.args.salt, digestmod=self.hash).digest()
class htmlesc
-
Encodes and decodes HTML entities.
Expand source code Browse git
class htmlesc(Unit): """ Encodes and decodes HTML entities. """ @unicoded def process(self, data: str) -> str: return html_entities.unescape(data) @unicoded def reverse(self, data: str) -> str: return html_entities.escape(data)
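For example, decoding HTML entities, and encoding them again via the reverse operation:
>>> emit('&lt;p&gt;Hello &amp; goodbye&lt;/p&gt;') | htmlesc | str
'<p>Hello & goodbye</p>'
>>> emit('<p>Hello & goodbye</p>') | htmlesc('-R') | str
'&lt;p&gt;Hello &amp; goodbye&lt;/p&gt;'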
class httprequest
-
Parses HTTP request data, as you would obtain from a packet dump. The unit extracts POST data in any format; each uploaded file is emitted as a separate chunk.
Expand source code Browse git
class httprequest(Unit): """ Parses HTTP request data, as you would obtain from a packet dump. The unit extracts POST data in any format; each uploaded file is emitted as a separate chunk. """ def process(self, data): def header(line: bytes): name, colon, data = line.decode('utf8').partition(':') if colon: yield (name.strip().lower(), data.strip()) head, _, body = data.partition(b'\r\n\r\n') request, *headers = head.splitlines(False) headers = dict(t for line in headers for t in header(line)) method, path, _, *rest = request.split() info = {} mode = _Fmt.RawBody if rest: self.log_warn('unexpected rest data while parsing HTTP request:', rest) if method == b'GET' and not body: mode = _Fmt.UrlEncode body = path.partition(B'?')[1] if method == b'POST' and (ct := headers.get('content-type', None)): ct, info = parse_header(ct) mode = _Fmt(ct) def chunks(upload: Dict[Union[str, bytes], List[bytes]]): for key, values in upload.items(): if not isinstance(key, str): key = key.decode('utf8') for value in values: yield self.labelled(value, name=key) if mode is _Fmt.RawBody: yield body return if mode is _Fmt.Multipart: boundary = info['boundary'] headers = Message() headers.set_type(F'{_Fmt.Multipart.value}; boundary={boundary}') try: headers['Content-Length'] = info['CONTENT-LENGTH'] except KeyError: pass fs = FieldStorage(MemoryFile(body, read_as_bytes=True), headers=headers, environ={'REQUEST_METHOD': method.decode()}) for name in fs: fields = fs[name] if not isinstance(fields, list): fields = [fields] for field in fields: field: FieldStorage chunk = self.labelled(field.value) if fn := field.filename: chunk.meta['name'] = fn yield chunk if mode is _Fmt.UrlEncode: yield from chunks(parse_qs(body, keep_blank_values=1)) @classmethod def handles(self, data: bytearray) -> bool | None: return data.startswith(B'POST ') or data.startswith(B'GET ')
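A hedged sketch of splitting a URL-encoded POST body into its form fields; it assumes that raw bytes can be piped into a unit and that the primed unit can be iterated to obtain its output chunks:
>>> request = (
...     B'POST /login HTTP/1.1\r\n'
...     B'Content-Type: application/x-www-form-urlencoded\r\n'
...     B'\r\n'
...     B'user=admin&pass=hunter2'
... )
>>> [bytes(chunk) for chunk in request | httprequest]
[b'admin', b'hunter2']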
class httpresponse
-
Parses HTTP response text, as you would obtain from a packet dump. This can be useful if chunked or compressed transfer encoding was used.
Expand source code Browse git
class httpresponse(Unit): """ Parses HTTP response text, as you would obtain from a packet dump. This can be useful if chunked or compressed transfer encoding was used. """ def process(self, data): with SockWrapper(data) as mock: mock.seek(0) parser = HTTPResponse(mock) parser.begin() try: return parser.read() except IncompleteRead as incomplete: msg = F'incomplete read: {len(incomplete.partial)} bytes processed, {incomplete.expected} more expected' raise RefineryPartialResult(msg, incomplete.partial) from incomplete
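For example, recovering the body of a simple response; as above, piping raw bytes directly into the unit is an assumption:
>>> response = B'HTTP/1.1 200 OK\r\nContent-Length: 5\r\n\r\nhello'
>>> response | httpresponse | str
'hello'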
class iemap (legend=False, background=False, block_char='#', *label)
-
The information entropy map displays a colored bar on the terminal visualizing the file's local entropy from beginning to end.
Expand source code Browse git
class iemap(Unit): """ The information entropy map displays a colored bar on the terminal visualizing the file's local entropy from beginning to end. """ def __init__( self, legend: Unit.Arg.Switch('-l', help='Show entropy color legend.') = False, background: Unit.Arg.Switch('-b', help='Generate the bar by coloring the background.') = False, block_char: Unit.Arg('-c', '--block-char', type=str, metavar='C', help='Character used for filling the bar, default is {default}') = '#', *label: Unit.Arg(type=str, metavar='label-part', help=( 'The remaining command line specifies a format string expression that will be printed ' 'over the heat map display of each processed chunk.' )) ): super().__init__(label=' '.join(label), background=background, legend=legend, block_char=block_char) @Unit.Requires('colorama', 'display', 'default', 'extended') def _colorama(): import colorama return colorama def process(self, data): from sys import stderr from os import name as os_name colorama = self._colorama colorama.init(autoreset=False, convert=(os_name == 'nt')) nobg = not self.args.background meta = metavars(data) label = meta.format_str(self.args.label, self.codec, [data]) if label: if not label.endswith(' '): label = F'{label} ' if not label.startswith(' '): label = F' {label}' bgmap = [ colorama.Back.BLACK, colorama.Back.WHITE, colorama.Back.YELLOW, colorama.Back.CYAN, colorama.Back.BLUE, colorama.Back.GREEN, colorama.Back.LIGHTRED_EX, colorama.Back.MAGENTA, ] fgmap = [ colorama.Fore.LIGHTBLACK_EX, colorama.Fore.LIGHTWHITE_EX, colorama.Fore.LIGHTYELLOW_EX, colorama.Fore.LIGHTCYAN_EX, colorama.Fore.LIGHTBLUE_EX, colorama.Fore.LIGHTGREEN_EX, colorama.Fore.LIGHTRED_EX, colorama.Fore.LIGHTMAGENTA_EX, ] _reset = colorama.Back.BLACK + colorama.Fore.WHITE + colorama.Style.RESET_ALL clrmap = fgmap if nobg else bgmap header = '[' header_length = 1 footer_length = 4 + 7 if self.args.legend: header = '[{1}{0}] {2}'.format(_reset, ''.join(F'{bg}{k}' for k, bg in enumerate(clrmap, 1)), header) header_length += 3 + len(clrmap) _tw = get_terminal_size() width = _tw - header_length - footer_length if width < 16: raise RuntimeError(F'computed terminal width {_tw} is too small for heatmap') def entropy_select(value, map): index = min(len(map) - 1, math.floor(value * len(map))) return map[index] view = memoryview(data) size = len(data) chunk_size = 0 for block_size in range(1, width + 1): block_count = width // block_size chunk_size = size // block_count if chunk_size > 1024: break q, remainder = divmod(width, block_size) assert q == block_count indices = list(range(q)) random.seed(sum(view[:1024])) random.shuffle(indices) block_sizes = [block_size] * q q, r = divmod(remainder, block_count) for i in indices: block_sizes[i] += q for i in indices[:r]: block_sizes[i] += 1 assert sum(block_sizes) == width q, remainder = divmod(size, block_count) assert q == chunk_size chunk_sizes = [chunk_size] * block_count for i in indices[:remainder]: chunk_sizes[i] += 1 assert sum(chunk_sizes) == size stream = MemoryFile(view) filler = self.args.block_char if nobg else ' ' try: stderr.write(header) if label is not None: stderr.write(colorama.Fore.WHITE) stderr.flush() it = itertools.chain(itertools.repeat(filler, 3), label, itertools.cycle(filler)) cp = None for chunk_size, block_size in zip(chunk_sizes, block_sizes): chunk = stream.read(chunk_size) chunk_entropy = entropy(chunk) pp = entropy_select(chunk_entropy, clrmap) string = ''.join(itertools.islice(it, block_size)) if pp != cp: string = F'{pp}{string}' cp = pp stderr.write(string) 
stderr.flush() except BaseException: eraser = ' ' * width stderr.write(F'\r{_reset}{eraser}\r') raise else: stderr.write(F'{_reset}] [---.--%]') te = meta['entropy'] stderr.write('\b' * footer_length) stderr.write(F'] [{te!r:>7}]\n') stderr.flush() if not self.isatty: yield data
class iff (*expression, ge=None, gt=None, le=None, lt=None, ct=None, iN=None, eq=None, retain=False)
-
Filter incoming chunks depending on whether a given Python expression evaluates to true. If no expression is given, the unit filters out empty chunks.
Note: The reverse operation of a conditional unit uses the logical negation of its condition.
Expand source code Browse git
class iff(ConditionalUnit, extend_docs=True): """ Filter incoming chunks depending on whether a given Python expression evaluates to true. If no expression is given, the unit filters out empty chunks. """ def __init__( self, *expression: Arg(metavar='token', type=str, help=( 'All "token" arguments to this unit are joined with spaces to produce the expression ' 'to be evaluated. This is done so that unnecessary shell quoting is avoided.')), ge: Arg('-ge', type=str, metavar='RHS', group='OP', help='check that the expression is greater or equal to {varname}') = None, gt: Arg('-gt', type=str, metavar='RHS', group='OP', help='check that the expression is greater than {varname}') = None, le: Arg('-le', type=str, metavar='RHS', group='OP', help='check that the expression is less or equal to {varname}') = None, lt: Arg('-lt', type=str, metavar='RHS', group='OP', help='check that the expression is less than {varname}') = None, ct: Arg('-ct', type=str, metavar='RHS', group='OP', help='check that the expression contains {varname}') = None, iN: Arg('-in', '-i', type=str, metavar='RHS', group='OP', help='check that the expression is contained in {varname}') = None, eq: Arg('-eq', '-e', type=str, metavar='RHS', group='OP', help='check that the expression is equal to {varname}') = None, retain=False, ): operators = [ (ge, operator.__ge__), (gt, operator.__gt__), (le, operator.__le__), (lt, operator.__lt__), (eq, operator.__eq__), (ct, operator.__contains__), (iN, lambda a, b: operator.__contains__(b, a)), ] operators = [ (rhs, cmp) for (rhs, cmp) in operators if rhs is not None ] rhs, cmp, lhs = None, None, '\x20'.join(expression) or None if len(operators) > 0: if not lhs: raise ValueError('Comparison operator with empty left hand side.') if len(operators) > 1: raise ValueError('Only one comparison operation can be specified.') rhs, cmp = operators[0] super().__init__( lhs=lhs, rhs=rhs, cmp=cmp, retain=retain, ) def match(self, chunk): meta = metavars(chunk) lhs: Optional[str] = self.args.lhs rhs: Optional[Any] = self.args.rhs cmp: Optional[Callable[[Any, Any], bool]] = self.args.cmp if cmp is None and rhs is not None: raise ValueError('right hand side defined but no operator') if lhs is not None: if rhs is not None: lhs = DelayedNumSeqArgument(lhs, additional_types=(float, str))(chunk) else: lhs = PythonExpression.Evaluate(lhs, meta) rhs = rhs and DelayedNumSeqArgument(rhs, additional_types=(float, str))(chunk) self.log_info(F'lhs: type={lhs.__class__.__name__}; value={lhs!r}') self.log_info(F'rhs: type={rhs.__class__.__name__}; value={rhs!r}') if lhs is None: return bool(chunk) if rhs is None: return bool(lhs) return cmp(lhs, rhs)
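For example, with size being the meta variable that holds the chunk's length, a chunk passes through when the comparison holds and is discarded otherwise:
>>> emit('refinery') | iff('size', '-ge', '3') | str
'refinery'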
class iffp (*patterns, partial=False, retain=False)
-
Filter incoming chunks depending on whether they match any of a given set of patterns. The available patterns are the following: integer, float, number, string, multiline_string, cmdstr, ps1str, vbastr, vbaint, printable, urlquote, urlquote_coarse, urlquote_narrow, intarray, numarray, word, letters, wshenc, alphanumeric, b32, b64, b85, b92, b64any, b64url, hex, uppercase_hex, spaced_hex, spaced_b64, spaced_b85, utf8, hexdump, hexarray, uuencode, domain, email, guid, ipv4, ipv6, md5, sha1, sha256, hostname, socket, subdomain, url, btc, pem, xmr, path, winpath, nixpath, environment_variable.
Note: The reverse operation of a conditional unit uses the logical negation of its condition.
Expand source code Browse git
class iffp(ConditionalUnit, extend_docs=True): """ Filter incoming chunks depending on whether it matches any of a given set of patterns. The available patterns are the following: {}. """ def __init__( self, *patterns: Arg.Choice(metavar='pattern', choices=_PATTERNS), partial: Arg.Switch('-p', help='Allow partial matches on the data.') = False, retain=False ): super().__init__( retain=retain, patterns=patterns, partial=partial ) def match(self, chunk): for name in self.args.patterns: p: pattern = _PATTERNS[name] matcher = p.match if self.args.partial else p.fullmatch if matcher(chunk): return True return False
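For example, keeping only the chunks that fully match the domain pattern:
>>> emit('example.com', 'hello world') [ iffp('domain') ] | str
'example.com'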
class iffs (needle, retain=False)
-
Filter incoming chunks depending on whether they contain a given binary substring.
Note: The reverse operation of a conditional unit uses the logical negation of its condition.
Expand source code Browse git
class iffs(ConditionalUnit, extend_docs=True): """ Filter incoming chunks depending on whether they contain a given binary substring. """ def __init__( self, needle: Arg(help='the string to search for'), retain=False, ): super().__init__( needle=needle, retain=retain, ) def match(self, chunk): return self.args.needle in chunk
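For example, keeping only the chunks that contain the given substring:
>>> emit('malware.exe', 'readme.txt') [ iffs('.exe') ] | str
'malware.exe'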
class iffx (regex, count=0, fullmatch=False, multiline=False, ignorecase=False)
-
Filter incoming chunks by discarding those that do not match the given regular expression.
Note: The reverse operation of a conditional unit uses the logical negation of its condition.
Expand source code Browse git
class iffx(SingleRegexUnit, ConditionalUnit, extend_docs=True): """ Filter incoming chunks by discarding those that do not match the given regular expression. """ def match(self, chunk): return self.matcher(chunk) is not None
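For example, keeping only the chunks that contain a version number:
>>> emit('version 1.2.3', 'no version here') [ iffx(r'\d+\.\d+\.\d+') ] | str
'version 1.2.3'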
class ifps
-
Disassembles compiled Pascal script files that start with the magic sequence "IFPS". These scripts can be found, for example, when unpacking InnoSetup installers using innounp.
Expand source code Browse git
class ifps(Unit): """ Disassembles compiled Pascal script files that start with the magic sequence "IFPS". These scripts can be found, for example, when unpacking InnoSetup installers using innounp. """ def process(self, data): return str(IFPSFile(data)).encode(self.codec) @classmethod def handles(self, data: bytearray) -> bool: return data.startswith(IFPSFile.Magic)
class ifpsstr
-
Extracts strings from compiled Pascal script files that start with the magic sequence "IFPS". These scripts can be found, for example, when unpacking InnoSetup installers using innounp.
Expand source code Browse git
class ifpsstr(Unit): """ Extracts strings from compiled Pascal script files that start with the magic sequence "IFPS". These scripts can be found, for example, when unpacking InnoSetup installers using innounp. """ def process(self, data): ifps = IFPSFile(data) for string in ifps.strings: yield string.encode(self.codec) @classmethod def handles(self, data: bytearray) -> bool: return data.startswith(IFPSFile.Magic)
class imphash (text=False)
-
Implements the import hash for PE files.
Expand source code Browse git
class imphash(HashUnit): """ Implements the import hash for PE files. """ def _algorithm(self, data): pe = PE(data=data, fast_load=True) pe.parse_data_directories(directories=[IMAGE_DIRECTORY_ENTRY_IMPORT]) th = pe.get_imphash() if not th: raise ValueError('no import directory.') return bytes.fromhex(th)
class isaac (key, discard=0, stateful=False)
-
The ISAAC (Indirection, Shift, Accumulate, Add, Count) cipher.
Expand source code Browse git
class isaac(StreamCipherUnit): """ The ISAAC (Indirection, Shift, Accumulate, Add, Count) cipher. """ def keystream(self) -> Iterable[int]: key = self.args.key A: int = 0 B: int = 0 C: int = 0 S: List[int] = [0x9E3779B9] * 8 T: List[int] = [] K = list(chunks.unpack(key + bytearray(0x400 - len(key)), 4, bigendian=False)) U = 0xFFFFFFFF def _mix_state(): a, b, c, d, e, f, g, h = S a ^= (b << 0x0B) & U; d = d + a & U; b = b + c & U # noqa b ^= (c >> 0x02) & U; e = e + b & U; c = c + d & U # noqa c ^= (d << 0x08) & U; f = f + c & U; d = d + e & U # noqa d ^= (e >> 0x10) & U; g = g + d & U; e = e + f & U # noqa e ^= (f << 0x0A) & U; h = h + e & U; f = f + g & U # noqa f ^= (g >> 0x04) & U; a = a + f & U; g = g + h & U # noqa g ^= (h << 0x08) & U; b = b + g & U; h = h + a & U # noqa h ^= (a >> 0x09) & U; c = c + h & U; a = a + b & U # noqa S[:] = a, b, c, d, e, f, g, h return S def _initialize_with(R: List[int]): for i in range(0, 0x100, 8): S[:] = (x + R[j] & U for j, x in enumerate(S, i)) T[i:i + 8] = _mix_state() for _ in range(4): _mix_state() _initialize_with(K) _initialize_with(T) operations = [ (__lshift__, 0x0D), (__rshift__, 0x06), (__lshift__, 0x02), (__rshift__, 0x10), ] while True: C = (C + 1) & U B = (B + C) & U for i in range(0x100): X = T[i] shift, k = operations[i % 4] A = (A ^ shift(A, k)) & U A = (A + T[i ^ 0x80]) & U Y = T[+i] = T[X // 4 & 0xFF] + A + B & U B = K[~i] = X + T[Y // 1024 & 0xFF] & U yield from chunks.pack(K, 4, True)
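Because this is a stream cipher unit, the keystream is combined with the input by XOR; applying the unit twice with the same key therefore restores the original data:
>>> emit('attack at dawn') | isaac('key') | isaac('key') | str
'attack at dawn'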
class jcalg (ignore_header=False)
-
JCALG decompression.
Expand source code Browse git
class jcalg(Unit): """ JCALG decompression. """ def __init__( self, ignore_header: Unit.Arg('-g', help=( 'Keep decompressing even after the output has reached the final size as given by the header value.')) = False, ): super().__init__(ignore_header=ignore_header) def process(self, data: bytearray): with MemoryFile() as output, StructReader(data) as reader: if reader.read(2) != B'JC': self.log_warn('data does not begin with magic sequence, assuming that header is missing') reader.seek(0) size = checksum = None else: size = reader.u32() checksum = reader.u32() if self.args.ignore_header: size = None self._decompress(output, reader, size) if size is not None: if len(output) > size: self.log_info(F'tuncating to size {size}') output.truncate(size) elif len(output) < size: self.log_warn(F'header size was {size}, but only {len(data)} bytes were decompressed') data = output.getvalue() if checksum: c = self._checksum(data) if c != checksum: self.log_warn(F'header checksum was {checksum:08X}, computed value is {c:08X}') return data @classmethod def handles(cls, data: bytearray): if data[:2] == B'JC': return True def _checksum(self, data): from refinery.lib import chunks checksum = 0 it = chunks.unpack(data, 4) if len(data) % 4: import itertools it = itertools.chain(it, (int.from_bytes(data[-4:], 'little'),)) for chunk in it: checksum += chunk checksum ^= ((chunk & 0x7FFFFFFF) << 1) + (chunk >> 31) + 1 checksum &= 0xFFFFFFFF return checksum def _decompress(self, writer: MemoryFile, reader_: StructReader[bytearray], size: Optional[int] = None): index = 1 base = 8 literal_bits = None literal_offset = None flags = BitBufferedReader(reader_, 32) while True: if size and len(writer) >= size: break if flags.next(): b = flags.read(literal_bits) + literal_offset b = b & 0xFF writer.write_byte(b) continue if flags.next(): high = flags.variable_length_integer() if high == 2: match_length = flags.variable_length_integer() else: index = ((high - 3) << base) + flags.read(base) match_length = flags.variable_length_integer() if index >= 0x10000: match_length += 3 elif index >= 0x37FF: match_length += 2 elif index >= 0x27F: match_length += 1 elif index <= 127: match_length += 4 writer.replay(index, match_length) continue if not flags.next(): new_index = flags.read(7) match_length = 2 + flags.read(2) if new_index == 0: if match_length == 2: break base = flags.read(match_length + 1) else: index = new_index writer.replay(index, match_length) continue one_byte_phrase_value = flags.read(4) - 1 if one_byte_phrase_value == 0: writer.write_byte(0) elif one_byte_phrase_value > 0: b = writer.getbuffer()[-one_byte_phrase_value] writer.write_byte(b) else: if not flags.next(): literal_bits = 7 + flags.next() literal_offset = 0 if literal_bits != 8: literal_offset = flags.read(8) continue while True: for _ in range(0x100): b = flags.read(8) writer.write_byte(b) if not flags.next(): break
class jvdasm (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
Disassembles the JVM bytecode instructions of methods of classes defined in Java class files. The unit is implemented as a path extractor and each path name corresponds to the name of one method defined in the class file.
Expand source code Browse git
class jvdasm(PathExtractorUnit): """ Disassembles the JVM bytecode instructions of methods of classes defined in Java class files. The unit is implemented as a path extractor and each path name corresponds to the name of one method defined in the class file. """ _OPC_STRLEN = max(len(op.name) for op in opc) def _hex(self, bytestring, sep=''): return sep.join(F'{x:02x}' for x in bytestring) def unpack(self, data): jc = JvClassFile(data) tt = ' ' opcw = self._OPC_STRLEN for method in jc.methods: for attribute in method.attributes: if attribute.name == 'Code': break else: self.log_warn(F'no code found for method: {method.name}') continue code: JvCode = attribute.parse(JvCode) with io.StringIO() as display: args, retval = re.match(R'^\((.*?)\)(.*?)$', method.descriptor).groups() print(F'{jc.this!s}::{method!s}{method.descriptor}', file=display) for op in code.disassembly: olen = len(op.raw) if op.table is None: args = ', '.join(repr(a) for a in op.arguments) else: ow = 4 if op.code is opc.tableswitch else 8 olen = olen - (len(op.table) - 1) * ow args = F'defaultjmp => {op.table[None]:#010x}' jmps = [] for k, (key, jmp) in enumerate(op.table.items()): if key is None: continue raw = self._hex(op.raw[olen + k * ow: olen + k * ow + ow], ' ') jmps.append(F'{tt}{raw!s:<{opcw + 15}} {key:#010x} => {jmp:#010x}') args = '\n'.join((args, *jmps)) opch = self._hex(op.raw[:olen], ' ') if len(opch) > 14: opch += F'\n{tt}{tt:<15}' print(F'{tt}{opch:<15}{op.code!r:<{opcw}} {args}', file=display) name = method.name if name.startswith('<'): this = jc.this.value.split('/') this = this[-1] name = F'{this}${name[1:-1]}' yield UnpackResult(F'{name}.jd', display.getvalue().encode(self.codec))
class jvstr
-
Extract string constants from Java class files.
Expand source code Browse git
class jvstr(Unit): """ Extract string constants from Java class files. """ def process(self, data): jc = JvClassFile(data) for string in jc.strings: yield string.encode(self.codec)
class kblob
-
Extracts a key from a Microsoft Crypto API BLOB structure.
Expand source code Browse git
class kblob(Unit): """ Extracts a key from a Microsoft Crypto API BLOB structure. """ def process(self, data): blob = CRYPTOKEY(data) try: return self.labelled( bytes(blob.key), type=blob.header.type.name, algorithm=blob.header.algorithm.name ) except AttributeError as A: raise ValueError(F'unable to derive key from {blob.header.type!s}') from A
class keccak256 (text=False)
-
Returns the KECCAK256 hash of the input data.
class kramer
-
Deobfuscate Python samples obfuscated with Kramer.
Expand source code Browse git
class kramer(Unit): """ Deobfuscate Python samples obfuscated with Kramer. """ _LINEBREAK_MAGIC = 950 def process(self, data): kramer = str() secret = set() _pyver = None def crawl(code: CodeType, depth=1): nonlocal kramer nonlocal secret for instruction in disassemble_code(code, _pyver): arg = instruction.argval if arg is None: continue if isinstance(arg, tuple): continue if isinstance(arg, str): if len(arg) > len(kramer): kramer = arg continue if isinstance(arg, int): secret.add(arg) continue try: crawl(arg, depth + 1) except Exception as E: self.log_info(F'error crawling arg of type {type(arg).__name__} at depth {depth}: {E}') for code in extract_code_from_buffer(bytes(data)): _pyver = code.version crawl(code.container) if not kramer: raise ValueError('could not find the encoded string') separator = re.search('[^a-fA-F0-9]+', kramer) if not separator: raise ValueError('no separator detected; encoding method may have changed') def rotchar(c: int): if c in range(0x61, 0x7a) or c in range(0x30, 0x39): return c + 1 if c == 0x7a: return 0x30 if c == 0x39: return 0x61 return c def decrypt(c: int, k: int): if c >= k: out = rotchar(c - k) if out not in range(0x100): raise _WrongKey return out if c == self._LINEBREAK_MAGIC: return 0x0A raise _WrongKey def decrypt_with_key(key: int): decrypted = bytearray(decrypt(c, key) for c in encrypted) if not re.fullmatch(B'[\\s!-~]+', decrypted): raise _WrongKey return decrypted separator = separator.group(0) encrypted = [ord(bytes.fromhex(e).decode()) for e in kramer.split(separator)] ubound = min(x for x in encrypted if x != self._LINEBREAK_MAGIC) lbound = ubound - 0xFF secret = {k for k in secret if k > lbound and k < ubound} self.log_debug('potential secrets from code:', secret) for key in sorted(secret, reverse=True): try: return decrypt_with_key(key) except _WrongKey: pass self.log_info(F'all candidates failed, searching [{lbound}, {ubound}]') for key in range(ubound, lbound - 1, -1): try: self.log_debug('attempting key:', key) return decrypt_with_key(key) except _WrongKey: pass raise RuntimeError('could not find decryption key')
class lnk (tabular=False)
-
Parses Windows Shortcuts (LNK files) and returns the parsed information in JSON format. This unit is a thin wrapper around the LnkParse3 library.
Expand source code Browse git
class lnk(Unit): """ Parse Windows Shortcuts (LNK files) and returns the parsed information in JSON format. This unit is a thin wrapper around the LnkParse3 library. """ @Unit.Requires('LnkParse3>=1.4.0', 'formats', 'default', 'extended') def _LnkParse3(): import LnkParse3 return LnkParse3 def __init__(self, tabular: Unit.Arg('-t', help='Print information in a table rather than as JSON') = False): super().__init__(tabular=tabular) def process(self, data): with NoLogging(): parsed = self._LnkParse3.lnk_file(MemoryFile(data)).get_json() with JSONEncoderEx as encoder: pp = ppjson(tabular=self.args.tabular) yield from pp._pretty_output( parsed, indent=4, cls=encoder, ensure_ascii=False)
class loop (count, suffix, do_while, do_until, fullmatch=False, multiline=False, ignorecase=False)
-
Applies a given multibin suffix to the input chunk repeatedly. For example, the following command would carve the largest base64-encoded buffer from the input, decode it, and then decompress the result 20 times:
emit data | loop 20 csd[b64]:zl
Notably, the argument after the count is a suffix, which means that handlers are applied from left to right (not from right to left). The loop is aborted and the previous result is returned if the newly computed result is empty. If an error occurs while computing the suffix and the unit is lenient (i.e. the -L switch is set), the last known result is returned.
Expand source code Browse git
class loop(RegexUnit): """ Applies a given multibin suffix to the input chunk repeatedly. For example, the following command would carve the largest base64-encoded buffer from the input, decode it, and then decompress the result 20 times: emit data | loop 20 csd[b64]:zl Notably, the argument after the count is a suffix, which means that handlers are applied from left to right (not from right to left). The loop is aborted and the previous result returned if the newly computed result is empty. If the an error occurs while computing the suffix and the unit is lenient (i.e. the `-L` switch is set), the last known result is returned. """ def __init__( self, count: Arg.Number(metavar='count', help='The number of repeated applications of the suffix.'), suffix: Arg(type=str, help='A multibin expression suffix.'), do_while: Arg('-w', '--while', type=regexp, metavar='RE', help='Halt when the given regular expression does not match the data.'), do_until: Arg('-u', '--until', type=regexp, metavar='RE', help='Halt when the given regular expression matches the data.'), fullmatch=False, multiline=False, ignorecase=False, ): super().__init__( count=count, suffix=suffix, do_while=do_while, do_until=do_until, fullmatch=fullmatch, multiline=multiline, ignorecase=ignorecase, ) def process(self, data): _count = self.args.count _width = len(str(_count)) _while = self._while _until = self._until for k in range(_count): if _while and not _while(data): self.log_info(F'step {k:0{_width}}: stopping, while-condition violated') break if _until and _until(data): self.log_info(F'step {k:0{_width}}: stopping, until-condition satisfied') break try: out = DelayedBinaryArgument( self.args.suffix, reverse=True, seed=data)(data) except Exception as error: self.log_info(F'step {k:0{_width}}: error;', exception_to_string(error)) msg = F'Stopped after {k} steps, increase verbosity for additional details.' raise RefineryPartialResult(msg, data) from error if not out: self.log_info(F'step {k:0{_width}}: stopping after empty result') break data[:] = out self.log_debug(F'step {k:0{_width}}: data =', data, clip=True) return data @property def _while(self): return self._make_matcher(self.args.do_while) @property def _until(self): return self._make_matcher(self.args.do_until)
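The same mechanics can be sketched in the shell interface with a value that was base64-encoded three times; looping the b64 handler three times recovers the original data:
>>> emit('VVZWSlBRPT0=') | loop('3', 'b64') | str
'AB'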
class lz4
-
LZ4 block decompression. See also: https://github.com/lz4/lz4/blob/master/doc/lz4_Block_format.md#compressed-block-format
Expand source code Browse git
class lz4(Unit): """ LZ4 block decompression. See also: https://github.com/lz4/lz4/blob/master/doc/lz4_Block_format.md#compressed-block-format """ def _read_block(self, reader: StructReader, output: io.BytesIO, ubound=None): entry = reader.tell() lastend = 0 def ubound_check(): if ubound is None: return False consumed = reader.tell() - entry if consumed > ubound: raise ValueError(F'upper bound {ubound} exceeded by {consumed - ubound} in LZ4 block') return consumed == ubound while not reader.eof: reflen = reader.read_nibble() litlen = reader.read_nibble() litlen = reader.read_size(litlen) literal = reader.read(litlen) output.write(literal) if ubound_check(): break try: refpos = reader.u16() except EOFError: break if refpos - 1 not in range(output.tell()): with StreamDetour(output, lastend): if output.read(len(literal)) == literal: # This literal could have been encoded in the last match, but it wasn't. # Therefore, it is very likely that we have reached the end of the stream. break position = reader.tell() remaining = len(literal) - position raise RefineryPartialResult( F'encountered invalid match offset value {refpos} at position {position} with {remaining} bytes remaining', partial=output.getvalue()) reflen = reader.read_size(reflen) if ubound_check(): raise ValueError('last sequence in block contained a match') reflen += 4 available_bytes = min(refpos, reflen) q, r = divmod(reflen, available_bytes) with StreamDetour(output, -refpos, io.SEEK_CUR): match = output.read(available_bytes) match = q * match + match[:r] assert len(match) == reflen lastend = output.tell() - available_bytes + r output.write(match) def process(self, data): output = io.BytesIO() reader = LZ4Reader(memoryview(data)) try: magic = reader.u32() == 0x184D2204 except EOFError: magic = False if not magic: reader.seek(0) self._read_block(reader, output) return output.getbuffer() (dict_id, rsrv1, content_checksummed, content_size, blocks_checksummed, blocks_independent, v2, v1) = reader.read_bits(8) rsrv2 = reader.read_nibble() try: block_maximum = { 7: 0x400000, 6: 0x100000, 5: 0x040000, 4: 0x010000, }[reader.read_integer(3)] except KeyError: raise ValueError('unknown maximum block size value in LZ4 frame header') rsrv3 = reader.read_bit() if any((rsrv1, rsrv2, rsrv3)): self.log_warn('nonzero reserved value in LZ4 frame header') if (v1, v2) != (0, 1): self.log_warn(F'invalid version ({v1},{v2}) in LZ4 frame header') content_size = content_size and reader.u64() or None dict_id = dict_id and reader.u32() or None # Header Checksum xxh = xxhash(data[4:reader.tell()]).intdigest() >> 8 & 0xFF chk = reader.read_byte() if chk != xxh: self.log_warn(F'header checksum {chk:02X} does not match computed value {xxh:02X}') self.log_debug(lambda: F'dictionary id: {dict_id}') self.log_debug(lambda: F'block max: 0x{block_maximum:X}') if content_size is not None: self.log_debug(lambda: F'chunk max: 0x{content_size:X}') self.log_debug(lambda: F'blocks independent: {bool(blocks_independent)}') self.log_debug(lambda: F'blocks checksummed: {bool(blocks_checksummed)}') blockindex = 0 while True: blockindex += 1 size = reader.read_integer(31) uncompressed = reader.read_bit() if not size: assert not uncompressed break self.log_info(F'reading block of size 0x{size:06X}') assert reader.byte_aligned assert size <= block_maximum, 'block size exceeds maximum size' if uncompressed: output.write(reader.read_exactly(size)) else: self._read_block(reader, output, size) if blocks_checksummed: with StreamDetour(reader, -size, io.SEEK_CUR): xxh = 
xxhash(reader.read_exactly(size)).intdigest() chk = reader.u32() if chk != xxh: self.log_warn(F'block {blockindex} had checksum {chk:08X} which did not match computed value {xxh:08X}') if content_checksummed: self.log_info('computing checksum') xxh = xxhash(output.getbuffer()).intdigest() chk = reader.u32() if chk != xxh: self.log_warn(F'the given checksum {chk:08X} did not match the computed checksum {xxh:08X}') if not reader.eof: pos = reader.tell() self.log_warn(F'found {len(data) - pos} additional bytes starting at position 0x{pos:X} after compressed data') return output.getbuffer()
class lzf (fast=False)
-
This unit implements LZF compression and decompression.
Expand source code Browse git
class lzf(Unit): """ This unit implements LZF compression and decompression. """ def __init__(self, fast: Arg.Switch('-x', help='Enable fast compression mode.') = False): super().__init__(fast=fast) def reverse(self, data): def FRST(p: memoryview) -> int: return ((p[0]) << 8) | p[1] def NEXT(v: int, p: memoryview) -> int: return ((v << 8) | p[2]) & 0xFFFFFFFF def DELTA(p: memoryview): return view.nbytes - p.nbytes if self.args.fast: def HIDX(h: int) -> int: return (((h >> (3 * 8 - _HSLOG)) - h * 5) & (_HSIZE - 1)) else: def HIDX(h: int) -> int: q = (h ^ (h << 5)) return (((q >> (3 * 8 - _HSLOG)) - h * 5) & (_HSIZE - 1)) if not data: return data ip = view = memoryview(data) op = bytearray() if len(data) == 1: op.append(0) op.extend(data) return op hval = FRST(ip) htab = [0] * _HSIZE fast = 1 if self.args.fast else 0 lit = 0 def begin_literal(): nonlocal lit op.append(0) lit = 0 def advance_literal(): nonlocal lit, ip lit += 1 op.append(ip[0]) ip = ip[1:] if lit == _MAX_LIT: op[-lit - 1] = lit - 1 begin_literal() def commit_literal(): if lit > 0: op[-lit - 1] = lit - 1 else: op.pop() begin_literal() while ip.nbytes > 2: hval = NEXT(hval, ip) hpos = HIDX(hval) ipos = DELTA(ip) length = 2 r, htab[hpos] = htab[hpos], ipos off = ipos - r - 1 ref = view[r:] if off >= _MAX_OFF or r <= 0 or ref[:3] != ip[:3]: advance_literal() continue else: commit_literal() maxlen = min(_MAX_REF, ip.nbytes - length) while True: length += 1 if length >= maxlen or ref[length] != ip[length]: length -= 2 break if length < 7: op.append((off >> 8) + (length << 5)) else: op.append((off >> 8) + (7 << 5)) op.append(length - 7) op.append(off & 0xFF) begin_literal() if ip.nbytes <= length + 3: ip = ip[length + 2:] break if fast: ip = ip[length:] hval = FRST(ip) for _ in range(2): hval = NEXT(hval, ip) htab[HIDX(hval)] = DELTA(ip) ip = ip[1:] else: ip = ip[1:] for _ in range(length + 1): hval = NEXT(hval, ip) htab[HIDX(hval)] = DELTA(ip) ip = ip[1:] while ip.nbytes: advance_literal() commit_literal() return op def _decompress_chunk(self, data: memoryview, out: MemoryFile): ip = StructReader(data) while not ip.eof: ctrl = ip.u8() if ctrl < 0B100000: ctrl += 1 out.write(ip.read_exactly(ctrl)) else: length = ctrl >> 5 offset = 1 + ((ctrl & 0B11111) << 8) if length == 7: length += ip.u8() offset += ip.u8() length += 2 out.replay(offset, length) def process(self, data): mem = memoryview(data) out = MemoryFile() try: reader = StructReader(mem) header = LZFHeader(reader) except Exception: self.log_info('no header detected, decompressing as raw stream') self._decompress_chunk(mem, out) return out.getvalue() for k in itertools.count(1): self.log_info(F'chunk: e=0x{header.encoded_size:04X} d=0x{header.decoded_size:04X}') chunk = reader.read(header.encoded_size) if header.compressed: self._decompress_chunk(chunk, out) else: out.write(chunk) if reader.eof: break try: header = LZFHeader(reader) except Exception as E: msg = F'failed parsing next header after {k} chunks: {E!s}' raise RefineryPartialResult(msg, out.getvalue()) return out.getvalue()
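A hedged round trip, compressing with the reverse operation and decompressing again; as before, collecting the result with | bytes is an assumption:
>>> data = b'binary refinery ' * 64
>>> (data | lzf('-R') | lzf | bytes) == data
True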
class lzg
-
LZG decompression.
Expand source code Browse git
class lzg(Unit): """ LZG decompression. """ def process(self, data: bytearray): stream = LZGStream(data) out = stream.decompress() if len(out) != stream.decoded_size: msg = F'LZG header announced {stream.decoded_size} bytes, but decompressed buffer had size {len(out)}.' raise RefineryPartialResult(msg, out) return out @classmethod def handles(cls, data: bytearray): if data[:3] == B'LZG': return True
class lzip
-
LZIP decompression.
Expand source code Browse git
class lzip(Unit): """ LZIP decompression """ def process(self, data: bytearray): view = memoryview(data) with MemoryFile() as output, StructReader(view) as reader: for k in count(1): if reader.eof: break trailing_size = len(data) - reader.tell() try: ID, VN, DS = reader.read_struct('4sBB') if ID != B'LZIP': if k > 1: raise EOF else: self.log_warn(F'ignoring invalid LZIP signature: {ID.hex()}') if VN != 1: self.log_warn(F'ignoring invalid LZIP version: {VN}') dict_size = 1 << (DS & 0x1F) dict_size -= (dict_size // 16) * ((DS >> 5) & 7) if dict_size not in range(_MIN_DICT_SIZE, _MAX_DICT_SIZE + 1): raise ValueError( F'The dictionary size {dict_size} is out of the valid range ' F'[{_MIN_DICT_SIZE}, {_MAX_DICT_SIZE}]; unable to proceed.' ) decoder = MemberDecoder(dict_size, reader, output) if not decoder(): raise ValueError(F'Data error in stream {k}.') crc32, data_size, member_size = reader.read_struct('<LQQ') if crc32 != decoder.crc32: self.log_warn(F'checksum in stream {k} was {decoder.crc:08X}, should have been {crc32:08X}.') if member_size - 20 != decoder.member_position: self.log_warn(F'member size in stream {k} was {decoder.member_position}, should have been {member_size}.') if data_size != decoder.data_position: self.log_warn(F'data size in stream {k} was {decoder.data_position}, should have been {data_size}.') except EOFError: if k <= 1: raise self.log_info(F'silently ignoring {trailing_size} bytes of trailing data') break return output.getvalue() @classmethod def handles(self, data: bytearray): return data[:4] == B'LZIP'
class lzjb
-
LZJB compression and decompression. This LZ-type compression is used in the ZFS file system.
Expand source code Browse git
class lzjb(Unit): """ LZJB compression and decompression. This LZ-type compression is used in the ZFS file system. """ def reverse(self, src): # https://web.archive.org/web/20100807223517/ .. # .. http://cvs.opensolaris.org/source/xref/onnv/onnv-gate/usr/src/uts/common/fs/zfs/lzjb.c output = bytearray() lempel = [0] * _LEMPEL_SIZE copymask = 0x80 position = 0 while position < len(src): copymask <<= 1 if copymask >= 0x100: copymask = 1 copymap = len(output) output.append(0) if position > len(src) - _MATCH_MAX: output.append(src[position]) position += 1 continue hsh = (src[position] << 16) + (src[position + 1] << 8) + src[position + 2] hsh += hsh >> 9 hsh += hsh >> 5 hsh %= len(lempel) offset = (position - lempel[hsh]) & _OFFSET_MASK lempel[hsh] = position cpy = position - offset if cpy >= 0 and cpy != position and src[position:position + 3] == src[cpy:cpy + 3]: output[copymap] |= copymask for mlen in range(_MATCH_MIN, min(len(src) - position, _MATCH_MAX)): if src[position + mlen] != src[cpy + mlen]: break output.append(((mlen - _MATCH_MIN) << (8 - _MATCH_LEN)) | (offset >> 8)) output.append(offset & 255) position += mlen else: output.append(src[position]) position += 1 return output def process(self, data): dst = bytearray() src = StructReader(data) while not src.eof: copy = src.read_byte() for mask in (0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80): if src.eof: break if not copy & mask: dst.append(src.read_byte()) continue elif not dst: raise ValueError('copy requested against empty buffer') with src.be: match_len = src.read_integer(6) + _MATCH_MIN match_pos = src.read_integer(10) if not match_pos or match_pos > len(dst): raise RuntimeError(F'invalid match offset at position {src.tell()}') match_pos = len(dst) - match_pos while match_len > 0: match = dst[match_pos:match_pos + match_len] dst.extend(match) match_pos += len(match) match_len -= len(match) return dst
class lzma (filter=None, raw=False, alone=False, xz=False, level=9, delta=None)
-
LZMA compression and decompression.
Expand source code Browse git
class lzma(Unit): """ LZMA compression and decompression. """ _LZMA_FILTER = extract_options(std_lzma, 'FILTER_', 'DELTA') _LZMA_PARSER = OptionFactory(_LZMA_FILTER) def __init__( self, filter: Arg.Choice(choices=list(_LZMA_FILTER), metavar='FILTER', help=( 'Specifies a bcj filter to be applied. Possible values are: {choices}')) = None, raw : Arg.Switch('-r', group='MODE', help='Use raw (no container) format.') = False, alone : Arg.Switch('-a', group='MODE', help='Use the lzma container format.') = False, xz : Arg.Switch('-x', group='MODE', help='Use the default xz format.') = False, level : Arg.Number('-l', bound=(0, 9), help='The compression level preset; between 0 and 9.') = 9, delta : Arg.Number('-d', help='Add a delta filter when compressing.') = None, ): filter = filter and self._LZMA_PARSER(filter) if (raw, alone, xz).count(True) > 1: raise ValueError('Only one container format can be enabled.') if level not in range(10): raise ValueError('Compression level must be a number between 0 and 9.') super().__init__(filter=filter, raw=raw, alone=alone, xz=xz, delta=delta, level=level | std_lzma.PRESET_EXTREME) def _get_lz_mode_and_filters(self, reverse=False): mode = std_lzma.FORMAT_AUTO filters = [] if self.args.filter is not None: filters.append({'id': self.args.filter.value}) if self.args.delta is not None: self.log_debug('adding delta filter') filters.append({ 'id': std_lzma.FILTER_DELTA, 'dist': self.args.delta }) if self.args.alone: self.log_debug('setting alone format') mode = std_lzma.FORMAT_ALONE filters.append({ 'id': std_lzma.FILTER_LZMA1, 'preset': self.args.level }) elif self.args.raw: self.log_debug('setting raw format') mode = std_lzma.FORMAT_RAW filters.append({ 'id': std_lzma.FILTER_LZMA2, 'preset': self.args.level }) elif self.args.xz or reverse: if reverse and not self.log_debug('setting xz container format'): self.log_info('choosing default .xz container format for compression.') mode = std_lzma.FORMAT_XZ filters.append({ 'id': std_lzma.FILTER_LZMA2, 'preset': self.args.level }) return mode, filters def reverse(self, data): mode, filters = self._get_lz_mode_and_filters(True) lz = std_lzma.LZMACompressor(mode, filters=filters) output = lz.compress(data) output += lz.flush() return output def _process_stream(self, data: ByteString, strategy: F, keywords): if strategy & F.STEPWISE: sizes = repeat(1) else: sizes = [len(data)] lz = std_lzma.LZMADecompressor(**keywords) with MemoryFile() as output: with MemoryFile(data) as stream: if strategy & F.INJECT: output.write(lz.decompress(stream.read(5))) output.write(lz.decompress(B'\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF')) for size in sizes: if stream.eof or stream.closed: break try: position = stream.tell() output.write(lz.decompress(stream.read(size))) except (EOFError, std_lzma.LZMAError) as error: msg = error.args[0] if len(error.args) == 1 else error.__class__.__name__ raise RefineryPartialResult(F'compression failed at offset {position}: {msg!s}', output.getvalue()) return output.getvalue() def process(self, data: bytearray): errors: List[RefineryPartialResult] = [] view = memoryview(data) keywords = {} keywords['format'], filters = self._get_lz_mode_and_filters(False) if self.args.raw: keywords['filters'] = filters for strategy in (F.DEFAULT, F.INJECT, F.STEPWISE, F.INJECT | F.STEPWISE): try: return self._process_stream(view, strategy, keywords) except RefineryPartialResult as p: self.log_info(F'decompression failed with strategy {strategy}: {p.message}') errors.append(p) raise max(errors, key=lambda e: len(e.partial)) 
@classmethod def handles(self, data: bytearray): if data[:4] == B'\x5D\0\0\0': return True if data[:5] == B'\xFD7zXZ': return True
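A similarly hedged round trip through the default xz container format:
>>> data = b'refined binary ' * 100
>>> (data | lzma('-R') | lzma | bytes) == data
True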
class lznt1 (chunk_size=4096)
-
LZNT1 compression and decompression. This compression algorithm is expected by the Win32 API routine RtlDecompressBuffer, for example.
Expand source code Browse git
class lznt1(Unit): """ LZNT1 compression and decompression. This compression algorithm is expected by the Win32 API routine `RtlDecompressBuffer`, for example. """ def _decompress_chunk(self, chunk): out = B'' while chunk: flags = chunk[0] chunk = chunk[1:] for i in range(8): if not (flags >> i & 1): out += chunk[:1] chunk = chunk[1:] else: flag = struct.unpack('<H', chunk[:2])[0] pos = len(out) - 1 l_mask = 0xFFF o_shift = 12 while pos >= 0x10: l_mask >>= 1 o_shift -= 1 pos >>= 1 length = (flag & l_mask) + 3 offset = (flag >> o_shift) + 1 if length >= offset: tmp = out[-offset:] * (0xFFF // len(out[-offset:]) + 1) out += tmp[:length] else: out += out[-offset:length - offset] chunk = chunk[2:] if len(chunk) == 0: break return out def _find(self, src, target, max_len): result_offset = 0 result_length = 0 for i in range(1, max_len): offset = src.rfind(target[:i]) if offset == -1: break tmp_offset = len(src) - offset tmp_length = i if tmp_offset == tmp_length: tmp = src[offset:] * (0xFFF // len(src[offset:]) + 1) for j in range(i, max_len + 1): offset = tmp.rfind(target[:j]) if offset == -1: break tmp_length = j if tmp_length > result_length: result_offset = tmp_offset result_length = tmp_length if result_length < 3: return 0, 0 return result_offset, result_length def _compress_chunk(self, chunk): blob = copy.copy(chunk) out = B'' pow2 = 0x10 l_mask3 = 0x1002 o_shift = 12 while len(blob) > 0: bits = 0 tmp = B'' for i in range(8): bits >>= 1 while pow2 < (len(chunk) - len(blob)): pow2 <<= 1 l_mask3 = (l_mask3 >> 1) + 1 o_shift -= 1 if len(blob) < l_mask3: max_len = len(blob) else: max_len = l_mask3 offset1, length1 = self._find( chunk[:len(chunk) - len(blob)], blob, max_len) # try to find more compressed pattern offset2, length2 = self._find( chunk[:len(chunk) - len(blob) + 1], blob[1:], max_len) if length1 < length2: length1 = 0 if length1 > 0: symbol = ((offset1 - 1) << o_shift) | (length1 - 3) tmp += struct.pack('<H', symbol) bits |= 0x80 # set the highest bit blob = blob[length1:] else: tmp += blob[:1] blob = blob[1:] if len(blob) == 0: break out += struct.pack('B', bits >> (7 - i)) out += tmp return out def reverse(self, buf): out = B'' while buf: chunk = buf[:self.args.chunk_size] compressed = self._compress_chunk(chunk) if len(compressed) < len(chunk): # chunk is compressed flags = 0xB000 header = struct.pack('<H', flags | (len(compressed) - 1)) out += header + compressed else: flags = 0x3000 header = struct.pack('<H', flags | (len(chunk) - 1)) out += header + chunk buf = buf[self.args.chunk_size:] return out def process(self, data): out = io.BytesIO() offset = 0 while offset < len(data): try: header, = struct.unpack('<H', data[offset:offset + 2]) except struct.error as err: raise RefineryPartialResult(str(err), partial=out.getvalue()) offset += 2 size = (header & 0xFFF) + 1 if size + 1 >= len(data): raise RefineryPartialResult( F'chunk header indicates size {size}, but only {len(data)} bytes remain.', partial=out.getvalue() ) chunk = data[offset:offset + size] offset += size if header & 0x8000: chunk = self._decompress_chunk(chunk) out.write(chunk) return out.getvalue() def __init__(self, chunk_size: Arg.Number('-c', help='Optionally specify the chunk size for compression, default is 0x1000.') = 0x1000): super().__init__(chunk_size=chunk_size)
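And a hedged round trip for this unit as well:
>>> data = b'binary refinery ' * 64
>>> (data | lznt1('-R') | lznt1 | bytes) == data
True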
class lzo
-
LZO decompression. The code works against simple test cases, but it is known to fail for certain outputs produced by the lzop command-line tool when high compression ratio is favoured (i.e. when the -9 switch is used).
Expand source code Browse git
class lzo(Unit): """ LZO decompression. The code works against simple test cases, but it is known to fail for certain outputs produced by the lzop command-line tool when high compression ratio is favoured (i.e. when the -9 switch is used). """ def decompress_stream(self, data: ByteString, LZOv1: bool = False) -> bytearray: """ An implementation of LZO decompression. We use the article "[LZO stream format as understood by Linux's LZO decompressor](https://www.kernel.org/doc/html/latest/staging/lzo.html)" as a reference since no proper specification is available. """ def integer() -> int: length = 0 while True: byte = src.read_byte() if byte: return length + byte length += 0xFF if length > 0x100000: raise LZOError('Too many zeros in integer encoding.') def literal(count): dst.write(src.read_bytes(count)) def copy(distance: int, length: int): if distance > len(dst): raise LZOError(F'Distance {distance} > bufsize {len(dst)}') buffer = dst.getbuffer() if distance > length: start = len(buffer) - distance end = start + length dst.write(buffer[start:end]) else: block = buffer[-distance:] while len(block) < length: block += block[:length - len(block)] if len(block) > length: block[length:] = () dst.write(block) src = StructReader(memoryview(data)) dst = MemoryFile() state = 0 first = src.read_byte() if first == 0x10: raise LZOError('Invalid first stream byte 0x10.') elif first <= 0x12: src.seekrel(-1) elif first <= 0x15: state = first - 0x11 literal(state) else: state = 4 literal(first - 0x11) while True: instruction = src.read_byte() if instruction < 0x10: if state == 0: length = instruction or integer() + 15 state = length + 3 if state < 4: raise LZOError('Literal encoding is too short.') else: state = instruction & 0b0011 D = (instruction & 0b1100) >> 2 H = src.read_byte() distance = (H << 2) + D + 1 if state >= 4: distance += 0x800 length = 3 else: length = 2 copy(distance, length) elif instruction < 0x20: L = instruction & 0b0111 H = instruction & 0b1000 length = L or integer() + 7 argument = src.u16() state = argument & 3 distance = (H << 11) + (argument >> 2) if not distance: return dst.getbuffer() if LZOv1 and distance & 0x803F == 0x803F and length in range(261, 265): raise LZOError('Compressed data contains sequence that is banned in LZOv1.') if LZOv1 and distance == 0xBFFF: X = src.read_byte() count = ((X << 3) | L) + 4 self.log_debug(F'Writing run of {X} zero bytes according to LZOv1.') dst.write(B'\0' * count) else: copy(distance + 0x4000, length + 2) elif instruction < 0x40: L = instruction & 0b11111 length = L or integer() + 31 argument = src.u16() state = argument & 3 distance = (argument >> 2) + 1 copy(distance, length + 2) else: if instruction < 0x80: length = 3 + ((instruction >> 5) & 1) else: length = 5 + ((instruction >> 5) & 3) H = src.read_byte() D = (instruction & 0b11100) >> 2 state = instruction & 3 distance = (H << 3) + D + 1 copy(distance, length) if state: literal(state) def process(self, data): try: lzo = LZO(data) except LZOError: self.log_info('Not an LZO archive, processing raw stream.') return self.decompress_stream(data) with MemoryFile() as output: for k, chunk in enumerate(lzo, 1): self.log_debug(F'decompressing chunk {k}') output.write(self.decompress_stream(chunk.data)) return self.labelled( output.getbuffer(), path=lzo.name, date=date_from_timestamp(lzo.mtime) ) @classmethod def handles(self, data: bytearray) -> Optional[bool]: sig = LZO.SIGNATURE if data[:len(sig)] == sig: return True
class lzw
-
LZW decompression based on ancient Unix sources.
Expand source code Browse git
class lzw(Unit): ''' LZW decompression based on ancient Unix sources. ''' _MAGIC = B'\x1F\x9D' def process(self, data: bytearray): out = MemoryFile() inf = StructReader(memoryview(data)) if inf.peek(2) != self._MAGIC: self.log_info('No LZW signature found, assuming raw stream.') maxbits = LZW.BITS block_mode = True else: inf.seekrel(2) maxbits = inf.read_integer(5) if inf.read_integer(2) != 0: self.log_info('reserved bits were set in LZW header') block_mode = bool(inf.read_bit()) if maxbits > LZW.BITS: raise ValueError(F'Compressed with {maxbits} bits; cannot handle file.') maxmaxcode = 1 << maxbits ibuf = inf.read() tab_suffix = bytearray(LZW.WSIZE * 2) tab_prefix = array('H', itertools.repeat(0, 1 << LZW.BITS)) n_bits = LZW.INIT_BITS maxcode = (1 << n_bits) - 1 bitmask = (1 << n_bits) - 1 oldcode = ~0 finchar = +0 posbits = +0 free_entry = LZW.FIRST if block_mode else 0x100 tab_suffix[:0x100] = range(0x100) resetbuf = True while resetbuf: resetbuf = False ibuf = ibuf[posbits >> 3:] insize = len(ibuf) posbits = 0 inbits = (insize << 3) - (n_bits - 1) while inbits > posbits: if free_entry > maxcode: n = n_bits << 3 p = posbits - 1 posbits = p + (n - (p + n) % n) n_bits += 1 if (n_bits == maxbits): maxcode = maxmaxcode else: maxcode = (1 << n_bits) - 1 bitmask = (1 << n_bits) - 1 resetbuf = True break p = ibuf[posbits >> 3:] code = int.from_bytes(p[:3], 'little') >> (posbits & 7) & bitmask posbits += n_bits if oldcode == -1: if code >= 256: raise ValueError('corrupt input.') oldcode = code finchar = oldcode out.write_byte(finchar) continue if code == LZW.CLEAR and block_mode: tab_prefix[:0x100] = array('H', itertools.repeat(0, 0x100)) free_entry = LZW.FIRST - 1 n = n_bits << 3 p = posbits - 1 posbits = p + (n - (p + n) % n) n_bits = LZW.INIT_BITS maxcode = (1 << n_bits) - 1 bitmask = (1 << n_bits) - 1 resetbuf = True break incode = code stack = bytearray() if code >= free_entry: if code > free_entry: raise RefineryPartialResult('corrupt input.', out.getbuffer()) stack.append(finchar) code = oldcode while code >= 256: stack.append(tab_suffix[code]) code = tab_prefix[code] finchar = tab_suffix[code] stack.append(finchar) stack.reverse() out.write(stack) code = free_entry if code < maxmaxcode: tab_prefix[code] = oldcode & 0xFFFF tab_suffix[code] = finchar & 0x00FF free_entry = code + 1 oldcode = incode return out.getvalue() @classmethod def handles(self, data: bytearray) -> Optional[bool]: sig = self._MAGIC if data[:len(sig)] == sig: return True
class machometa (all=True, header=False, linked_images=False, signatures=False, version=False, load_commands=False, exports=False, imports=False, tabular=False)
-
Extract metadata from Mach-O files.
Expand source code Browse git
class machometa(Unit): """ Extract metadata from Mach-O files. """ def __init__( self, all: Arg('-c', '--custom', help='Unless enabled, all default categories will be extracted.') = True, header: Arg('-H', help='Parse basic data from the Mach-O header.') = False, linked_images: Arg('-K', help='Parse all library images linked by the Mach-O.') = False, signatures: Arg('-S', help='Parse signature and entitlement information.') = False, version: Arg('-V', help='Parse version information from the Mach-O load commands.') = False, load_commands: Arg('-D', help='Parse load commands from the Mach-O header.') = False, exports: Arg('-E', help='List all exported functions.') = False, imports: Arg('-I', help='List all imported functions.') = False, tabular: Arg('-t', help='Print information in a table rather than as JSON') = False, ): super().__init__( header=all or header, linked_images=all or linked_images, version=all or version, signatures=all or signatures, load_commands=load_commands, imports=imports, exports=exports, tabular=tabular, ) @Unit.Requires('k2l>=2.0', 'all') def _ktool(): import ktool import ktool.macho import ktool.codesign return ktool def compute_symhash(self, macho_image: Image) -> Dict: def _symbols(symbols: Iterable[Symbol]): for sym in symbols: if sym.types: continue yield sym.fullname symbols = sorted(set(_symbols(macho_image.symbol_table.ext))) symbols: str = ','.join(symbols) return md5(symbols.encode('utf8')).hexdigest() def parse_macho_header(self, macho_image: Image, data=None) -> Dict: info = {} macho_header = macho_image.macho_header dyld_header = macho_image.macho_header.dyld_header if dyld_header is not None: info['Type'] = dyld_header.type_name info['Magic'] = dyld_header.magic info['CPUType'] = macho_image.slice.type.name info['CPUSubType'] = macho_image.slice.subtype.name info['FileType'] = macho_image.macho_header.filetype.name info['LoadCount'] = dyld_header.loadcnt info['LoadSize'] = dyld_header.loadsize info['Flags'] = [flag.name for flag in macho_header.flags] info['Reserved'] = dyld_header.reserved return info def parse_linked_images(self, macho_image: Image, data=None) -> Dict: load_command_images = {} linked_images = macho_image.linked_images LOAD_COMMAND = self._ktool.macho.LOAD_COMMAND for linked_image in linked_images: load_command_name = LOAD_COMMAND(linked_image.cmd.cmd).name load_command_images.setdefault(load_command_name, []).append(linked_image.install_name) return load_command_images def parse_signature(self, macho_image: Image, data=None) -> Dict: _km = self._ktool.macho _kc = self._ktool.codesign class CodeDirectoryBlob(_km.Struct): FIELDS = { 'magic': _km.uint32_t, 'length': _km.uint32_t, 'version': _km.uint32_t, 'flags': _km.uint32_t, 'hashOffset': _km.uint32_t, 'identOffset': _km.uint32_t, 'nSpecialSlots': _km.uint32_t, 'nCodeSlots': _km.uint32_t, 'codeLimit': _km.uint32_t, 'hashSize': _km.uint8_t, 'hashType': _km.uint8_t, 'platform': _km.uint8_t, 'pageSize': _km.uint8_t, 'spare2': _km.uint32_t } def __init__(self, byte_order='little'): super().__init__(byte_order=byte_order) self.magic = 0 self.length = 0 self.version = 0 self.flags = 0 self.hashOffset = 0 self.identOffset = 0 self.nSpecialSlots = 0 self.nCodeSlots = 0 self.codeLimit = 0 self.hashSize = 0 self.hashType = 0 self.platform = 0 self.pageSize = 0 self.spare2 = 0 info = {} if macho_image.codesign_info is not None: superblob: SuperBlob = macho_image.codesign_info.superblob for blob in macho_image.codesign_info.slots: blob: BlobIndex # ktool does not include code for extracting 
Blobs of types # CSSLOT_CODEDIRECTORY, CSSLOT_CMS_SIGNATURE # so we must do it ourselves here. if blob.type == _kc.CSSLOT_CODEDIRECTORY: start = superblob.off + blob.offset codedirectory_blob = macho_image.read_struct(start, CodeDirectoryBlob) # Ad-hoc signing flags = _kc.swap_32(codedirectory_blob.flags) if flags & CS_ADHOC != 0: info['AdHocSigned'] = True else: info['AdHocSigned'] = False # Signature identifier identifier_offset = _kc.swap_32(codedirectory_blob.identOffset) identifier_data = macho_image.read_cstr(start + identifier_offset) info['SignatureIdentifier'] = identifier_data if blob.type == 0x10000: # CSSLOT_CMS_SIGNATURE start = superblob.off + blob.offset blob_data = macho_image.read_struct(start, _kc.Blob) blob_data.magic = _kc.swap_32(blob_data.magic) blob_data.length = _kc.swap_32(blob_data.length) cms_signature = macho_image.read_bytearray(start + _kc.Blob.SIZE, blob_data.length - _kc.Blob.SIZE) if len(cms_signature) != 0: try: parsed_cms_signature = pemeta.parse_signature(bytearray(cms_signature)) info['Signature'] = parsed_cms_signature except ValueError as pkcs7_parse_error: self.log_warn(F'Could not parse the data in CSSLOT_CMS_SIGNATURE as valid PKCS7 data: {pkcs7_parse_error!s}') if macho_image.codesign_info.req_dat is not None: # TODO: Parse the requirements blob, # which is encoded according to the code signing requirements language: # https://developer.apple.com/library/archive/documentation/Security/Conceptual/CodeSigningGuide/RequirementLang/RequirementLang.html info['Requirements'] = macho_image.codesign_info.req_dat.hex() if macho_image.codesign_info.entitlements is not None: entitlements: str = macho_image.codesign_info.entitlements if entitlements: try: entitlements = plistlib.loads(entitlements.encode('utf8')) except Exception as error: self.log_warn(F'failed to parse entitlements: {error!s}') else: info['Entitlements'] = entitlements return info def parse_version(self, macho_image: Image, data=None) -> Dict: info = {} load_commands = macho_image.macho_header.load_commands SVC = self._ktool.macho.source_version_command BVC = self._ktool.macho.build_version_command for load_command in load_commands: if isinstance(load_command, SVC): if 'SourceVersion' not in info: info['SourceVersion'] = load_command.version else: self.log_warn('More than one load command of type source_version_command found; the MachO file is possibly malformed') elif isinstance(load_command, BVC): if 'BuildVersion' not in info: info['BuildVersion'] = {} info['BuildVersion']['Platform'] = macho_image.platform.name info['BuildVersion']['MinOS'] = F'{macho_image.minos.x}.{macho_image.minos.y}.{macho_image.minos.z}' info['BuildVersion']['SDK'] = F'{macho_image.sdk_version.x}.{macho_image.sdk_version.y}.{macho_image.sdk_version.z}' info['BuildVersion']['Ntools'] = load_command.ntools else: self.log_warn('More than one load command of type build_version_command found; the MachO file is possibly malformed') return info def parse_load_commands(self, macho_image: Image, data=None) -> List: info = [] load_commands = macho_image.macho_header.load_commands for load_command in load_commands: info.append(load_command.serialize()) return info def parse_imports(self, macho_image: Image, data=None) -> List: info = [] for imp in macho_image.imports: info.append(imp.name) return info def parse_exports(self, macho_image: Image, data=None) -> List: info = [] for exp in macho_image.exports: info.append(exp.name) return info def process(self, data: bytearray): result = {} ktool = self._ktool with 
NoLogging(NoLogging.Mode.ALL): macho = ktool.load_macho_file(fp=MemoryFile(memoryview(data)), use_mmaped_io=False) if macho.type is ktool.MachOFileType.FAT: result['FileType'] = 'FAT' elif macho.type is ktool.MachOFileType.THIN: result['FileType'] = 'THIN' slices = [] for macho_slice in macho.slices: slice_result = {} macho_image = ktool.load_image(fp=macho_slice) for switch, resolver, name in [ (self.args.header, self.parse_macho_header, 'Header'), (self.args.linked_images, self.parse_linked_images, 'LinkedImages'), (self.args.signatures, self.parse_signature, 'Signatures'), (self.args.version, self.parse_version, 'Version'), (self.args.load_commands, self.parse_load_commands, 'LoadCommands'), (self.args.imports, self.parse_imports, 'Imports'), (self.args.exports, self.parse_exports, 'Exports'), ]: if not switch: continue self.log_debug(F'parsing: {name}') try: info = resolver(macho_image, data) except Exception as E: self.log_info(F'failed to obtain {name}: {E!s}') continue if info: slice_result[name] = info if macho_image.uuid is not None: uuid: bytes = macho_image.uuid slice_result['UUID'] = uuid.hex() slice_result['SymHash'] = self.compute_symhash(macho_image) slice_result['BaseName'] = macho_image.base_name slice_result['InstallName'] = macho_image.install_name slices.append(slice_result) if slices: result['Slices'] = slices yield from ppjson(tabular=self.args.tabular)._pretty_output(result, indent=4, ensure_ascii=False)
class map (index, image, blocksize=None)
-
Each block of the input data which occurs as a block of the index argument is replaced by the corresponding block of the image argument. If a block size is specified, and if the index or image arguments are byte sequences, they are unpacked into chunks of that size, and excess bytes that are not an integer multiple of the block size are discarded. To prevent any automatic chunking, the DelayedArgument.btoi() handler can be used.
Expand source code Browse git
class map(BlockTransformation): """ Each block of the input data which occurs as a block of the index argument is replaced by the corresponding block of the image argument. If a block size is specified, and if the index or image argument are byte sequences, they are unpacked into chunks of that size, and excess bytes that are not an integer multiple of the block size are discarded. To prevent any automatic chunking, the `refinery.lib.argformats.DelayedArgument.btoi` handler can be used. """ _map: Optional[Dict[int, int]] def __init__( self, index: Arg.NumSeq(help='index characters'), image: Arg.NumSeq(help='image characters'), blocksize=None ): super().__init__(blocksize=blocksize, index=index, image=image, _truncate=2) self._map = None def reverse(self, data): return self._process(data, self.args.image, self.args.index) def process(self, data): return self._process(data, self.args.index, self.args.image) def _process(self, data: bytearray, index: Sequence[int], image: Sequence[int]): if not self.bytestream: if isbuffer(index): self.log_info(F'chunking index sequence into blocks of size {self.blocksize}') index = list(self.chunk(index)) self.log_debug(F'index sequence: {index}') if isbuffer(image): self.log_info(F'chunking image sequence into blocks of size {self.blocksize}') image = list(self.chunk(image)) self.log_debug(F'image sequence: {image}') if len(set(index)) != len(index): raise ValueError('The index sequence contains duplicates.') if len(index) > len(image): raise ValueError('The index sequence is longer than the image sequence.') if self.bytestream: mapping = dict(zip(index, image)) mapping = bytes(mapping.get(c, c) for c in range(0x100)) if not isinstance(data, bytearray): data = bytearray(data) data[:] = (mapping[b] for b in data) return data try: self._map = dict(zip(index, image)) return super().process(data) finally: self._map = None def process_block(self, token): return self._map.get(token, token)
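For example, with the default block size of one, each index byte is simply substituted by the corresponding image byte. A minimal sketch using the shell interface (output illustrative):
>>> emit('ABCABC') | map('ABC', 'XYZ') | str
'XYZXYZ'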
class max_ (key=None)
-
Picks the maximum of all elements in the current refinery.lib.frame.
Expand source code Browse git
class max_(Unit): """ Picks the maximum of all elements in the current `refinery.lib.frame`. """ def __init__( self, key: Arg('key', type=str, help='A meta variable expression to sort by instead of sorting the content.') = None, ): super().__init__(key=key) def filter(self, chunks: Iterable[Chunk]): def get_value(chunk: Chunk): if key is None: return chunk return metavars(chunk).get(key) key = self.args.key it = iter(chunks) for max_chunk in it: if not max_chunk.visible: yield max_chunk else: max_index = 0 max_value = get_value(max_chunk) break else: return for index, chunk in enumerate(chunks, 1): if not chunk.visible: yield chunk continue value = get_value(chunk) try: is_max = value > max_value except TypeError: if max_value is None: self.log_info( F'Discarding chunk {max_index} in favor of {index} because {key} was not ' F'set on the former; new maximum is {value!r}.') is_max = True else: self.log_info( F'Discarding chunk {index} because {key} had value {value!r}; it could not ' F'be compared to the current maximum {max_value!r} on chunk {max_index}.') is_max = False if is_max: max_value = value max_chunk = chunk max_index = index yield max_chunk
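As a sketch, the key can be a derived meta variable such as size (assuming the automatically derived size variable from refinery.lib.meta), which picks the largest chunk in the frame:
>>> emit('AB', 'ABCD', 'A') [ max_('size') ] | str
'ABCD'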
class md2 (text=False)
-
Returns the MD2 hash of the input data.
class md4 (text=False)
-
Returns the MD4 hash of the input data.
class md5 (text=False)
-
Returns the MD5 hash of the input data.
class mimewords
-
Implements the decoding of MIME encoded-word syntax from RFC-2047.
Expand source code Browse git
class mimewords(Unit): """ Implements the decoding of MIME encoded-word syntax from RFC-2047. """ @unicoded def process(self, data: str) -> str: def replacer(match): self.log_info('encoded mime word:', match[0]) decoded, = decode_header(match[0]) raw, codec = decoded return codecs.decode(raw, codec, errors='surrogateescape') return re.sub(R"=(?:\?[^\?]*){3}\?=", replacer, data)
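A minimal sketch using a Q-encoded word (output illustrative):
>>> emit('=?utf-8?q?hello=20world?=') | mimewords | str
'hello world'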
class min_ (key=None)
-
Picks the minimum of all elements in the current refinery.lib.frame.
Expand source code Browse git
class min_(Unit): """ Picks the minimum of all elements in the current `refinery.lib.frame`. """ def __init__( self, key: Arg('key', type=str, help='A meta variable expression to sort by instead of sorting the content.') = None, ): super().__init__(key=key) def filter(self, chunks: Iterable[Chunk]): def get_value(chunk: Chunk): if key is None: return chunk return metavars(chunk).get(key) key = self.args.key it = iter(chunks) for min_chunk in it: if not min_chunk.visible: yield min_chunk else: min_index = 0 min_value = get_value(min_chunk) break else: return for index, chunk in enumerate(chunks, 1): if not chunk.visible: yield chunk continue value = get_value(chunk) try: is_min = value < min_value except TypeError: if min_value is None: self.log_info( F'Discarding chunk {min_index} in favor of {index} because {key} was not ' F'set on the former; new minimum is {value!r}.') is_min = True else: self.log_info( F'Discarding chunk {index} because {key} had value {value!r}; it could not ' F'be compared to the current minimum {min_value!r} on chunk {min_index}.') is_min = False if is_min: min_value = value min_chunk = chunk min_index = index yield min_chunk
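Analogous to max_, a sketch using the derived size meta variable to pick the smallest chunk:
>>> emit('AB', 'ABCD', 'A') [ min_('size') ] | str
'A'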
class mmh128x32 (seed=0, text=False)
-
Returns the 128bit Murmur Hash of the input data, 32bit variant.
Expand source code Browse git
class mmh128x32(MurMurHash): """ Returns the 128bit Murmur Hash of the input data, 32bit variant. """ def _algorithm(self, data: bytes) -> bytes: return mmh128digest32(data, self.args.seed)
class mmh128x64 (seed=0, text=False)
-
Returns the 128bit Murmur Hash of the input data, 64bit variant.
Expand source code Browse git
class mmh128x64(MurMurHash): """ Returns the 128bit Murmur Hash of the input data, 64bit variant. """ def _algorithm(self, data: bytes) -> bytes: return mmh128digest64(data, self.args.seed)
class mmh32 (seed=0, text=False)
-
Returns the 32bit Murmur Hash of the input data.
Expand source code Browse git
class mmh32(MurMurHash): """ Returns the 32bit Murmur Hash of the input data. """ def _algorithm(self, data: bytes) -> bytes: return mmh32digest(data, self.args.seed)
class mscdk (size, hash='MD5')
-
An implementation of the CryptDeriveKey routine available from the Win32 API.
Expand source code Browse git
class mscdk(KeyDerivation): """ An implementation of the CryptDeriveKey routine available from the Win32 API. """ def __init__(self, size, hash='MD5'): super().__init__(size=size, salt=None, hash=hash) def process(self, data): def digest(x): return self.hash.new(x).digest() size = self.args.size if self.args.hash in (HASH.SHA224, HASH.SHA256, HASH.SHA384, HASH.SHA512): buffer = digest(data) max_size = len(buffer) else: max_size = 2 * self.hash.digest_size value = digest(data) del data buffer1 = bytearray([0x36] * 64) buffer2 = bytearray([0x5C] * 64) for k, b in enumerate(value): buffer1[k] ^= b buffer2[k] ^= b buffer = digest(buffer1) + digest(buffer2) if size > max_size: raise RefineryPartialResult(F'too many bytes requested, can only provide {max_size}', partial=buffer) return buffer[:size]
class mscf (mode=None)
-
The Microsoft Compression Format unit implements the format and algorithms used by the Microsoft Compression API. The implementation for LZMS is currently missing, but MSZIP and XPRESS (both with and without Huffman table) are supported. This pure Python implementation is very slow when compared to native code, so decompressing very large inputs can take several minutes.
Expand source code Browse git
class mscf(Unit): """ The Microsoft Compression Format unit implements the format and algorithms used by the Microsoft Compression API. The implementation for LZMS is currently missing, but MSZIP and XPRESS (both with and without Huffman table) are supported. This pure Python implementation is very slow when compared to native code, so decompressing very large inputs can take several minutes. """ _SIGNATURE = B'\x0A\x51\xE5\xC0' def __init__( self, mode: Unit.Arg.Option(choices=MODE, help=( 'Manually select decompression mode ({choices}); by default the unit attempts to derive the ' 'mode from the header, but this will fail for raw streams. However, even if a header is ' 'found, a manually specified mode will take precedence.')) = None, ): mode = Unit.Arg.AsOption(mode, MODE) super().__init__(mode=mode) def process(self, data): mode: MODE = self.args.mode with StructReader(memoryview(data)) as reader, MemoryFile() as writer: reader: StructReader[memoryview] check = zlib.crc32(reader.peek(6)) magic = reader.read(4) if magic != self._SIGNATURE: if mode is None: self.log_warn( F'data starts with {magic.hex().upper()} rather than the expected sequence ' F'{self._SIGNATURE.hex().upper()}; this could be a raw stream.') else: reader.seek(0) handler = self._get_handler(mode) handler(reader, writer, None) return writer.getbuffer() header_size = reader.u16() if header_size != 24: self.log_warn(F'the header size {header_size} was not equal to 24') crc32byte = reader.u8() check = zlib.crc32(reader.peek(0x11), check) & 0xFF if check != crc32byte: self.log_warn(F'the CRC32 check byte was {crc32byte}, computed value was {check}') _mode_code = reader.u8() try: _mode = MODE(_mode_code) except ValueError: msg = F'header contains unknown compression type code {_mode_code}' if mode is None: raise ValueError(msg) else: self.log_warn(msg) else: if mode is not None and mode != _mode: logger = self.log_warn else: logger = self.log_info mode = _mode logger(F'header specifies algorithm {_mode.name}') self.log_info(F'using algorithm {mode.name}') decompress = self._get_handler(mode) final_size = reader.u32() _unknown_1 = reader.u32() chunk_size = reader.u32() _unknown_2 = reader.u32() if _unknown_1 != 0: self.log_warn(F'unknown value 1 was unexpectedly nonzero: 0x{_unknown_1:08X}') if _unknown_2 != 0: self.log_warn(F'unknown value 2 was unexpectedly nonzero: 0x{_unknown_2:08X}') self.log_debug(F'final size: 0x{final_size:08X}') self.log_debug(F'chunk size: 0x{chunk_size:08X}') if chunk_size > COMPRESS_MAX_CHUNK: raise ValueError('the header chunk size is greater than the maximum value') while len(writer) < final_size: src_size = reader.u32() src_data = reader.read(src_size) if len(src_data) != src_size: raise IndexError(F'Attempted to read {src_size} bytes, but got only {len(src_data)}.') if src_size + len(writer) == final_size: self.log_debug(F'final chunk is uncompressed, appending {src_size} raw bytes to output') writer.write(src_data) break self.log_debug(F'reading chunk of size {src_size}') start = writer.tell() chunk = StructReader(src_data) target = min(chunk_size, final_size - len(writer)) decompress(chunk, writer, target) writer.flush() written = writer.tell() - start if written != target: raise RuntimeError(F'decompressed output had unexpected size {written} instead of {chunk_size}') if not reader.eof: self.log_info(F'compression complete with {reader.remaining_bytes} bytes remaining in input') return writer.getbuffer() def _get_handler(self, mode: MODE) -> Callable[[StructReader, MemoryFile, 
Optional[int]], None]: decompress = { mode.MSZIP : self._decompress_mszip, mode.XPRESS_HUFF : self._decompress_xpress_huffman, mode.XPRESS : self._decompress_xpress, }.get(mode, None) if decompress is None: raise NotImplementedError(F'algorithm {mode.name} is not yet implemented') return decompress def _decompress_mszip(self, reader: StructReader, writer: MemoryFile, target: Optional[int] = None): header = bytes(reader.read(2)) if header != B'CK': raise ValueError(F'chunk did not begin with CK header, got {header!r} instead') decompress = zlib.decompressobj(-zlib.MAX_WBITS, zdict=writer.getbuffer()) writer.write(decompress.decompress(reader.read())) writer.write(decompress.flush()) def _decompress_xpress_huffman( self, reader: StructReader, writer: MemoryFile, target: Optional[int] = None, max_chunk_size: int = 0x10000 ) -> None: limit = writer.tell() if target is not None: target += limit while not reader.eof: if reader.remaining_bytes < XPRESS_NUM_SYMBOLS // 2: raise IndexError( F'There are only {reader.remaining_bytes} bytes reamining in the input buffer,' F' but at least {XPRESS_NUM_SYMBOLS // 2} are required to read a Huffman table.') table = bytearray(reader.read_integer(4) for _ in range(XPRESS_NUM_SYMBOLS)) table = make_huffman_decode_table(table, XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN) limit = limit + max_chunk_size flags = BitBufferedReader(reader, 16) while True: position = writer.tell() if position == target: if reader.remaining_bytes: self.log_info(F'chunk decompressed with {reader.remaining_bytes} bytes remaining in input buffer') return if position >= limit: if position > limit: limit = position self.log_info(F'decompression of one chunk generated more than the limit of {max_chunk_size} bytes') flags.collect() break try: sym = read_huffman_symbol(flags, table, XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN) except EOFError: self.log_debug('end of file while reading huffman symbol') break if sym < XPRESS_NUM_CHARS: writer.write_byte(sym) continue length = sym & 0xF offsetlog = (sym >> 4) & 0xF flags.collect() if reader.eof: break offset = (1 << offsetlog) | flags.read(offsetlog) if length == 0xF: nudge = reader.read_byte() if nudge < 0xFF: length += nudge else: length = reader.u16() or reader.u32() length += XPRESS_MIN_MATCH_LEN writer.replay(offset, length) def _decompress_xpress(self, reader: StructReader, writer: MemoryFile, target: Optional[int] = None) -> bytearray: if target is not None: target += writer.tell() flags = BitBufferedReader(reader) nibble_cache = None while not reader.eof: if target is not None and writer.tell() >= target: return if not flags.next(): writer.write(reader.read(1)) continue offset, length = divmod(reader.u16(), 8) offset += 1 if length == 7: length = nibble_cache if length is None: length_pair = reader.u8() nibble_cache = length_pair >> 4 length = length_pair & 0xF else: nibble_cache = None if length == 15: length = reader.u8() if length == 0xFF: length = reader.u16() or reader.u32() length -= 22 if length < 0: raise RuntimeError(F'Invalid match length of {length} for long delta sequence') length += 15 length += 7 length += 3 writer.replay(offset, length) @classmethod def handles(cls, data: bytearray) -> Optional[bool]: sig = cls._SIGNATURE if data[:len(sig)] == sig: return True
class msgpack
-
Converts a message-pack (msgpack) buffer to JSON and vice-versa.
Expand source code Browse git
class msgpack(Unit): """ Converts a message-pack (msgpack) buffer to JSON and vice-versa. """ def reverse(self, data): return mp.dumps(json.loads(data)) def process(self, data): unpacker: mp.fallback.Unpacker = mp.Unpacker(MemoryFile(data, read_as_bytes=True)) for k in itertools.count(): try: last = unpacker.tell() item = unpacker.unpack() except Exception as E: if isinstance(E, mp.OutOfData) and k == 1: break raise RefineryPartialResult(str(E), memoryview(data)[last:]) from E else: yield json.dumps(item).encode(self.codec)
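A round-trip sketch: the reverse operation (selected with -R, as for other units) packs JSON into msgpack, and the forward operation unpacks it again:
>>> emit('{"a": 1}') | msgpack('-R') | msgpack | str
'{"a": 1}'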
class mspdb (size, salt, iter=100, hash='SHA1')
-
An implementation of the PasswordDeriveBytes routine available from the .NET standard library. According to documentation, it is an extension of PBKDF1.
Expand source code Browse git
class mspdb(KeyDerivation): """ An implementation of the PasswordDeriveBytes routine available from the .NET standard library. According to documentation, it is an extension of PBKDF1. """ def __init__(self, size, salt, iter=100, hash='SHA1'): self.superinit(super(), **vars()) def process(self, data): if self.codec != 'UTF8': data = data.decode(self.codec).encode('UTF8') data += self.args.salt for _ in range(self.args.iter - 1): data = self.hash.new(data).digest() counter, seedhash = 1, data data = self.hash.new(data).digest() while len(data) < self.args.size: data += self.hash.new(B'%d%s' % (counter, seedhash)).digest() counter += 1 return data[:self.args.size]
class mvg (*names, top=False)
-
Short for "Make Variable Global": This unit can move meta variables into the scope of the parent frame. If used at the end of a frame, the variables will be moved the scope of the frame that the pipeline will return to. Otherwise and if the –top switch is being used, variables will be moved to scope 0, i.e. to the topmost frame in the current tree.
Note that it is not possible to promote a variable to a parent frame if that variable does not have the same value on all chunks in the current frame - such variables will always be removed when the frame closes.
Expand source code Browse git
class mvg(Unit): """ Short for "Make Variable Global": This unit can move meta variables into the scope of the parent frame. If used at the end of a frame, the variables will be moved the scope of the frame that the pipeline will return to. Otherwise and if the --top switch is being used, variables will be moved to scope 0, i.e. to the topmost frame in the current tree. Note that it is not possible to promote a variable to a parent frame if that variable does not have the same value on all chunks in the current frame - such variables will always be removed when the frame closes. """ def __init__( self, *names: Arg(type=str, metavar='name', help=( 'Name of a variable to be removed. If no variables are explicitly specified, all ' 'variables in the current chunk will be rescoped.' )), top: Arg.Switch('-t', help='Move the variable(s) to the topmost frame layer.') = False ): super().__init__(names=names, top=top) def process(self, data): meta = metavars(data) nest = self.args.nesting if nest < 0 and not self.args.top: spot = meta.scope + nest else: spot = 1 for name in self.args.names or meta.variable_names(): try: if meta.get_scope(name) <= spot: continue meta.set_scope(name, spot) except KeyError: self.log_info(F'variable not defined: {name}') return data
class n40 (key)
-
Decrypts hex-encoded strings in various Latin American banker families, including N40.
Expand source code Browse git
class n40(Unit): """ Decrypts hex-encoded strings in various latin-american banker families, including N40. """ def __init__(self, key: Arg(help='Decryption key.')): ... def process(self, data): try: data = b16decode(data, casefold=True) except Error: self.log_info('Input was not hex-encoded; ignoring this step.') mask = data[1:] | xor(self.args.key) | bytearray return bytearray(0xFF + b - a if b <= a else b - a for a, b in zip(data, mask))
class neg (bigendian=False, blocksize=None)
-
Each block of the input data is negated bitwise. This is sometimes also called the bitwise complement or inverse.
Expand source code Browse git
class neg(UnaryOperation): """ Each block of the input data is negated bitwise. This is sometimes also called the bitwise complement or inverse. """ def operate(self, a): return ~a def inplace(self, a): a ^= self.fmask
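A minimal sketch, assuming the h: multibin handler for hexadecimal input:
>>> emit('h:F00F') | neg | bytearray
bytearray(b'\x0f\xf0')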
class netbios (key=b'A')
-
Encodes and decodes strings using the same algorithm that is used for NetBIOS labels. Each byte 0xUL is encoded as two bytes: the high nibble 0xU and the low nibble 0xL are each added to an offset character. The default offset is the capital letter A.
Expand source code Browse git
class netbios(Unit): """ Encodes and decodes strings using the same algorithm that is used for NetBIOS labels. Each byte 0xUL is encoded as two bytes, which are the sum of 0xU and 0xL with an offset character, respectively. The default offset is the capital letter A. """ def __init__(self, key: Arg(help="Provide a single letter to use as the offset.") = B'A'): if len(key) != 1: raise ValueError("The key must be a binary string of length exactly 1") super().__init__(key=key[0]) def reverse(self, data): result = bytearray(2 * len(data)) for k, byte in enumerate(data): hi, lo = byte >> 4, byte & 15 result[2 * k + 0] = hi + self.args.key result[2 * k + 1] = lo + self.args.key return result def process(self, data): def merge(it): while True: try: hi = next(it) - self.args.key lo = next(it) - self.args.key if hi not in range(16) or lo not in range(16): raise ValueError(F'Invalid character encoding detected: hi={hi:X}, lo={lo:X}.') yield (hi << 4) | lo except StopIteration: break return bytearray(merge(iter(data)))
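For example, the classic first-level encoding of the NetBIOS name FRED decodes as follows (a sketch with the default offset A):
>>> emit('EGFCEFEE') | netbios | str
'FRED'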
class ngrams (size=slice(2, None, None))
-
Extract all n-grams from the input. The algorithm is naive, i.e. it simply iterates all n-grams and deduplicates using a set data structure. The number n is taken from an arbitrary range given as a Python slice expression.
Expand source code Browse git
class ngrams(Unit): """ Extract all n-grams from the input. The algorithm is naive, i.e. it simply iterates all n-grams and deduplicates using a set data structure. The number n is taken from an arbitrary range given as a Python slice expression. """ def __init__( self, size: Arg.Bounds( help='Specifies the sizes of each n-gram, i.e. the number n. Defaults to {default}.') = slice(2, None), ): super().__init__(size=size) def process(self, data: bytearray): for n in integers_of_slice(self.args.size): self.log_info(F'emitting {n}-grams') if n > len(data): break deduplicator = set() view = memoryview(data) for index in range(len(data) - n + 1): block = bytes(view[index:index + n]) if block in deduplicator: continue deduplicator.add(block) yield self.labelled(block, offset=index)
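A sketch extracting only the 2-grams of a short input, assuming the sep unit to join the resulting chunks with commas:
>>> emit('ABAB') [ ngrams('2:3') | sep(',') ] | str
'AB,BA'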
class nop
-
The unit generates the exact output that was received as input. All unknown arguments passed to nop are completely ignored, which is different from the behavior of other units. As such, nop can be used to comment out other units in longer refinery pipelines by simply prefixing a command with nop.
Expand source code Browse git
class nop(Unit): """ The unit generates the exact output that was received as input. All unknown arguments passed to nop are completely ignored, which is different from the behavior of other units. As such, nop can be used to comment out other units in longer refinery pipelines by simply prefixing a command with nop. """ @classmethod def argparser(cls, **keywords): argp = NopArgParser( keywords, prog=cls.name, description=documentation(cls), add_help=False) argp.set_defaults(nesting=0) return cls._interface(argp)
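A sketch illustrating that arguments are ignored and the input passes through unchanged (output illustrative):
>>> emit('ABC') | nop('these', 'args', 'are', 'ignored') | str
'ABC'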
class nrv2b (bits=32)
-
Decompress data using the NRV2B algorithm.
Expand source code Browse git
class nrv2b(NRVUnit): """ Decompress data using the NRV2B algorithm. """ def _decompress(self, src: StructReader, dst: MemoryFile, bb: BitBufferedReader): last_offset = 1 while not src.eof: while next(bb): dst.write_byte(src.read_byte()) offset = 2 + next(bb) while not next(bb): offset = 2 * offset + next(bb) if offset == 2: offset = last_offset else: offset = (offset - 3) * 0x100 + src.read_byte() if offset & 0xFFFFFFFF == 0xFFFFFFFF: break offset += 1 last_offset = offset length = next(bb) length = 2 * length + next(bb) if length == 0: length = 2 + next(bb) while not next(bb): length = 2 * length + next(bb) length += 2 length += int(bool(offset > 0xD00)) dst.replay(offset, length + 1)
class nrv2d (bits=32)
-
Decompress data using the NRV2D algorithm.
Expand source code Browse git
class nrv2d(NRVUnit): """ Decompress data using the NRV2D algorithm. """ def _decompress(self, src: StructReader, dst: MemoryFile, bb: BitBufferedReader): last_offset = 1 while not src.eof: while next(bb): dst.write_byte(src.read_byte()) offset = 2 + next(bb) while not next(bb): offset = 2 * (offset - 1) + next(bb) # noqa offset = 2 * offset + next(bb) # noqa if offset == 2: offset = last_offset length = next(bb) else: offset = (offset - 3) * 0x100 + src.read_byte() if offset & 0xFFFFFFFF == 0xFFFFFFFF: break length = (offset ^ 1) & 1 # noqa offset = (offset >> 1) + 1 last_offset = offset length = 2 * length + next(bb) if length == 0: length = 2 + next(bb) while not next(bb): length = 2 * length + next(bb) length += 2 length += int(bool(offset > 0x500)) dst.replay(offset, length + 1)
class nrv2e (bits=32)
-
Decompress data using the NRV2E algorithm.
Expand source code Browse git
class nrv2e(NRVUnit): """ Decompress data using the NRV2E algorithm. """ def _decompress(self, src: StructReader, dst: MemoryFile, bb: BitBufferedReader): last_offset = 1 while not src.eof: while next(bb): dst.write_byte(src.read_byte()) offset = 2 + next(bb) while not next(bb): offset = 2 * (offset - 1) + next(bb) # noqa offset = 2 * offset + next(bb) # noqa if offset == 2: offset = last_offset length = next(bb) else: offset = (offset - 3) * 0x100 + src.read_byte() if offset & 0xFFFFFFFF == 0xFFFFFFFF: break length = (offset ^ 1) & 1 # noqa offset = (offset >> 1) + 1 last_offset = offset if length: length = 1 + next(bb) elif next(bb): length = 3 + next(bb) else: length = 2 + next(bb) while not next(bb): length = 2 * length + next(bb) length += 3 length += int(bool(offset > 0x500)) dst.replay(offset, length + 1)
class ntlm (text=False)
-
Returns the Windows NTLM hash of the input.
Expand source code Browse git
class ntlm(HashUnit): """ Returns the Windows NTLM hash of the input. """ def _algorithm(self, data: bytes) -> bytes: from Cryptodome.Hash import MD4 return MD4.new(data.decode(self.codec).encode('utf-16le'))
class officecrypt (password=b'VelvetSweatshop')
-
A simple proxy for the msoffcrypto package to decrypt office documents.
Expand source code Browse git
class officecrypt(Unit): """ A simple proxy for the `msoffcrypto` package to decrypt office documents. """ def __init__(self, password: Arg.Binary(help=( 'The document password. By default, the Excel default password "{default}" is used.' )) = b'VelvetSweatshop'): super().__init__(password=password) @Unit.Requires('msoffcrypto-tool', 'formats', 'office') def _msoffcrypto(): import msoffcrypto return msoffcrypto def process(self, data): password: bytes = self.args.password with MemoryFile(data) as stream: doc = self._msoffcrypto.OfficeFile(stream) if not doc.is_encrypted(): self.log_warn('the document is not encrypted; returning input') return data if password: doc.load_key(password=password.decode(self.codec)) with MemoryFile(bytearray()) as output: doc.decrypt(output) return output.getvalue()
class opc (mode='x32', *, count=None, until=None, nvar='name', avar='addr', ovar='arg')
-
Disassembles the input data using capstone and generates opcodes with metadata as output. This is useful for programmatic disassembly, while the asm unit outputs a human-readable representation. Internally, asm uses this unit and pretty-prints the output.
Expand source code Browse git
class opc(Unit): """ Disassembles the input data using capstone and generates opcodes with metadata as output. This is useful for programmatic disassembly, while the `refinery.asm` unit outputs a human-readable representation. Internally, `refinery.asm` uses this unit and pretty-prints the output. """ def __init__( self, mode: Arg.Choice( help='Machine code architecture, default is {default}. Select from the following list: {choices}.', choices=_ARCHES, metavar='[x32|x64|..]') = 'x32', *, count: Arg.Number('-c', help='Maximum number of bytes to disassemble, infinite by default.') = None, until: Arg.String('-u', help='Disassemble until the given string appears among the disassembly.') = None, nvar: Arg.String('-n', help=( 'Variable to receive the disassembled mnemonic. Default is "{default}".')) = 'name', avar: Arg.String('-a', help=( 'Variable to receive the address of the instruction. Default is "{default}".')) = 'addr', ovar: Arg.String('-o', help=( 'Variable prefix for instruction operands. Default is "{default}". The complete operand ' 'string will be in {default}s, the first argument in {default}1, the second in {default}2, ' 'and so on.')) = 'arg', **more ): super().__init__( mode=mode, count=count, until=until, nvar=nvar, avar=avar, ovar=ovar, **more) @Unit.Requires('capstone') def _capstone(): import capstone return capstone @property def _capstone_engine(self) -> Cs: cs = self._capstone return cs.Cs(*{ 'arm' : (cs.CS_ARCH_ARM, cs.CS_MODE_ARM), 'mips32' : (cs.CS_ARCH_MIPS, cs.CS_MODE_MIPS32), 'mips64' : (cs.CS_ARCH_MIPS, cs.CS_MODE_MIPS64), 'ppc32' : (cs.CS_ARCH_PPC, cs.CS_MODE_32), 'ppc64' : (cs.CS_ARCH_PPC, cs.CS_MODE_64), 'x16' : (cs.CS_ARCH_X86, cs.CS_MODE_16), 'x32' : (cs.CS_ARCH_X86, cs.CS_MODE_32), 'x64' : (cs.CS_ARCH_X86, cs.CS_MODE_64), }.get(self.args.mode.lower())) def process(self, data): count = self.args.count or 0 until = self.args.until nvar = self.args.nvar avar = self.args.avar ovar = self.args.ovar if isinstance(until, str): until = until.lower() for insn in self._capstone_engine.disasm(data, 0, count): kwargs = { avar: insn.address, nvar: insn.mnemonic, } try: ops = insn.op_str operands = [op.strip() for op in ops.split(',')] except Exception: operands = [] else: kwargs[F'{ovar}s'] = ops for k, op in enumerate(operands, 1): if not op: break try: op = int(op, 0) except Exception: pass kwargs[F'{ovar}{k}'] = op yield self.labelled(insn.bytes, **kwargs) if until is None: continue if until in ops.lower() or until in insn.mnemonic.lower(): break
class p1
-
A shortcut for pick with the argument 0:1.
Expand source code Browse git
class p1(pick): """ A shortcut for `refinery.pick` with the argument `0:1`. """ def __init__(self): super().__init__(slice(0, 1))
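A sketch: inside a frame, p1 keeps only the first chunk.
>>> emit('A', 'B', 'C') [ p1() ] | str
'A'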
class p2
-
A shortcut for pick with the argument 0:2.
Expand source code Browse git
class p2(pick): """ A shortcut for `refinery.pick` with the argument `0:2`. """ def __init__(self): super().__init__(slice(0, 2))
class p3
-
A shortcut for pick with the argument 0:3.
Expand source code Browse git
class p3(pick): """ A shortcut for `refinery.pick` with the argument `0:3`. """ def __init__(self): super().__init__(slice(0, 3))
class pack (base=0, prefix=False, strict=False, width=0, single_floats=False, double_floats=False, bigendian=False, blocksize=None)
-
Scans the input data for numeric constants and packs them into a binary format. This is useful to convert the textual representation of an array of numbers into its binary form. For example, 123,34,256,12,1,234 would be transformed into the byte sequence 7B22000C01EA, where 256 was wrapped and packed as a null byte because the default block size is one byte. If the above sequence were packed with options -EB2, the result would be 007B00220100000C000100EA in hexadecimal.
Expand source code Browse git
class pack(BlockTransformationBase): """ Scans the input data for numeric constants and packs them into a binary format. This is useful to convert the textual representation of an array of numbers into its binary form. For example, `123,34,256,12,1,234` would be transformed into the byte sequence `7B22000C01EA`, where `256` was wrapped and packed as a null byte because the default block size is one byte. If the above sequence would be packed with options -EB2, the result would be equal to `007B00220100000C000100EA` in hexadecimal. """ def __init__(self, base: Arg(type=number[2:36], help=( 'Find only numbers in given base. Default of 0 means that ' 'common expressions for hexadecimal, octal and binary are ' 'accepted.')) = 0, prefix : Arg.Switch('-r', group='FLT', help='Add numeric prefixes like 0x, 0b, and 0o in reverse mode.') = False, strict : Arg.Switch('-s', help='Only parse integers that fit in one block of the given block size.') = False, width : Arg.Number('-w', help='Pad numbers with the specified amount of leading zeros.') = 0, single_floats: Arg.Switch('-f', group='FLT', help='Pack single-precision floating-point numbers. Implies -B4.') = False, double_floats: Arg.Switch('-d', group='FLT', help='Pack double-precision floating-point numbers. Implies -B8.') = False, bigendian=False, blocksize=None ): if single_floats and double_floats: raise ValueError('The floats and doubles option are mutually exclusive.') elif single_floats: fmode = FMode.SINGLE blocksize = 4 elif double_floats: fmode = FMode.DOUBLE blocksize = 8 else: fmode = FMode.TO_INT super().__init__( base=base, prefix=prefix, strict=strict, width=width, bigendian=bigendian, blocksize=blocksize, fmode=fmode, _truncate=2, ) @property def bytestream(self): # never alow bytes to be left unchunked return False def reverse(self, data): base = self.args.base or 10 width = self.args.width mode: FMode = self.args.fmode prefix = B'' self.log_debug(F'using base {base:d}') if self.args.prefix: prefix = { 0x02: b'0b', 0x08: b'0o', 0x10: b'0x' }.get(base, B'') if mode is FMode.TO_INT: converter = BaseUnit( base, little_endian=not self.args.bigendian, strip_padding=True, ) for n in self.chunk_into_bytes(data): converted = converter.reverse(n) if width: converted = converted.rjust(width, B'0') if prefix: converted = prefix + converted yield converted return elif mode is FMode.SINGLE: float_format = 'f' float_size = 4 elif mode is FMode.DOUBLE: float_format = 'd' float_size = 8 count, rest = divmod(len(data), float_size) if rest: self.log_warn(F'data contained {rest} trailing bytes that were ignored') data = memoryview(data)[:-rest] float_format *= count if self.args.bigendian: float_format = F'>{float_format}' else: float_format = F'<{float_format}' for n in struct.unpack(float_format, data): yield str(n).encode(self.codec) def process(self, data): base: int = self.args.base strict: bool = self.args.strict mode: FMode = self.args.fmode ep = '>' if self.args.bigendian else '<' def evaluate_literals(literals: Iterable[bytes]): for literal in literals: if mode is FMode.TO_INT: if base == 0 and literal[0] == 0x30 and literal[1:].isdigit(): literal = B'0o%s' % literal N = int(literal, base) elif mode is FMode.SINGLE: N, = struct.unpack(F'{ep}I', struct.pack(F'{ep}f', float(literal))) elif mode is FMode.DOUBLE: N, = struct.unpack(F'{ep}Q', struct.pack(F'{ep}d', float(literal))) else: raise TypeError('unexpected floating point mode') M = N & self.fmask if strict and M != N: continue yield M if base == 0: pattern = formats.number elif base <= 
10: pattern = re.compile(B'[-+]?[0-%d]{1,64}' % (base - 1)) else: pattern = re.compile(B'[-+]?[0-9a-%c]{1,20}' % (0x57 + base), re.IGNORECASE) return self.unchunk(evaluate_literals(m[0] for m in pattern.finditer(data)))
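The example from the description, sketched in the shell interface; the packed bytes 7B and 22 render as the characters { and ":
>>> emit('123,34,256,12,1,234') | pack | bytearray
bytearray(b'{"\x00\x0c\x01\xea')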
class pad (width, padding=b'\x00', left=False, absolute=False)
-
Allows padding of the input data.
Expand source code Browse git
class pad(Unit): """ Allows padding of the input data. """ def __init__( self, width: Arg.Number(help='Input is padded to the nearest multiple of this size.'), padding: Arg(help=( 'This custom binary sequence is used (repeatedly, if necessary) to pad the ' 'input. The default is a zero byte.')) = B'\0', left: Arg.Switch('-l', help='Pad on the left instead of the right.') = False, absolute: Arg.Switch('-a', help=( 'The width argument specifies an absolute size, not a block size.')) = False ): super().__init__(width=width, padding=padding, left=left, absolute=absolute) def process(self, data): width = self.args.width if self.args.absolute and len(data) >= width: return data q, r = divmod(len(data), width) size = (q + bool(r)) * width missing = (size - len(data)) if missing <= 0: return data pad = self.args.padding if missing > len(pad): pad *= missing // len(pad) if self.args.left: return pad[:missing] + data else: data += pad[:missing] return data
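A sketch padding to the nearest multiple of four bytes with the default zero byte:
>>> emit('ABC') | pad('4') | bytearray
bytearray(b'ABC\x00')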
class pbkdf1 (size, salt=b'\x00\x00\x00\x00\x00\x00\x00\x00', iter=1000, hash='SHA1')
-
PBKDF1 Key derivation
Expand source code Browse git
class pbkdf1(KeyDerivation): """PBKDF1 Key derivation""" @Arg('salt', help='Salt for the derivation; default are 8 null bytes.') def __init__(self, size, salt=bytes(8), iter=1000, hash='SHA1'): self.superinit(super(), **vars()) def process(self, data): from Cryptodome.Protocol.KDF import PBKDF1 return multidecode(data, lambda pwd: ( PBKDF1(pwd, self.args.salt, dkLen=self.args.size, count=self.args.iter, hashAlgo=self.hash) ))
class pbkdf2 (size, salt, iter=1000, hash='SHA1')
-
PBKDF2 Key derivation. This is implemented as Rfc2898DeriveBytes in .NET binaries.
Expand source code Browse git
class pbkdf2(KeyDerivation): """ PBKDF2 Key derivation. This is implemented as Rfc2898DeriveBytes in .NET binaries. """ def __init__(self, size, salt, iter=1000, hash='SHA1'): self.superinit(super(), **vars()) def process(self, data: ByteStr): from Cryptodome.Protocol.KDF import PBKDF2 return multidecode(data, partial( PBKDF2, salt=self.args.salt, dkLen=self.args.size, hmac_hash_module=self.hash, count=self.args.iter ))
class pcap (merge=False, client=False, server=False)
-
Performs TCP stream reassembly from packet capture (PCAP) files. By default, the unit emits the parts of each TCP conversation, attaching several pieces of metadata to each such output: included are the source and destination socket addresses as well as the variable stream, which identifies the conversation of which it was a part. The chunks are returned in the order that the bytes were exchanged between source and destination. When the --merge parameter is specified, the unit instead collects all bytes going forward and backward, respectively, and emits these as two chunks for each TCP conversation that took place.
Expand source code Browse git
class pcap(Unit): """ Performs TCP stream reassembly from packet capture (PCAP) files. By default, the unit emits the parts of each TCP conversation, attaching several pieces of metadata to each such output: Included are the source and destination socket address as well as the variable `stream` which identifies the conversation which it was part of. The chunks are returned in the order that the bytes were exchanged between source and destination. When the `--merge` parameter is specified, the unit instead collects all bytes going forward and backwards, respectively, and emitting these as two chunks, for each TCP conversation that took place. """ def __init__( self, merge: Arg.Switch('-m', help='Merge both parts of each TCP conversation into one chunk.') = False, client: Arg.Switch('-c', group='D', help='Show only the client part of each conversation.') = False, server: Arg.Switch('-s', group='D', help='Show only the server part of each conversation.') = False, ): super().__init__(merge=merge, client=client, server=server) @Unit.Requires('pypcapkit[scapy]>=1.3', 'all') def _pcapkit(): with NoLogging(): import scapy.layers.tls.session # noqa import pcapkit return pcapkit @Unit.Requires('scapy', 'all') def _scapy(): import scapy import scapy.packet return scapy def process(self, data): pcapkit = self._pcapkit merge = self.args.merge with NoLogging(), VirtualFileSystem() as fs: vf = VirtualFile(fs, data, 'pcap') pcap = pcapkit.extract( fin=vf.path, engine=pcapkit.Scapy, store=True, nofile=True, extension=False, ip=True, tcp=True, reassembly=True, reasm_strict=True, ) tcp: List[Datagram] = list(pcap.reassembly.tcp) tcp.sort(key=lambda p: min(p.index, default=0)) count, convo = 0, None src_buffer = MemoryFile() dst_buffer = MemoryFile() self.log_debug(F'extracted {len(pcap.frame)} packets, assembled {len(tcp)} datagrams') PT = self._scapy.packet def payload(packet: Packet): ok = (bytes, bytearray, PT.Raw) no = (PT.NoPayload, PT.Padding) circle = set() while True: try: inner = packet.payload except AttributeError: break if isinstance(packet, ok) and not isinstance(packet, no): return packet.original if id(inner) in circle: break packet = inner circle.add(id(inner)) return B'' def sequence(i: int): packet = pcap.frame[i - 1] while len(packet): try: return packet.seq except AttributeError: pass try: packet = packet.payload except AttributeError: break return 0 client = self.args.client server = self.args.server def commit(): if src_buffer.tell(): if not server: yield self.labelled(src_buffer.getvalue(), **convo.src_to_dst()) src_buffer.truncate(0) if dst_buffer.tell(): if not client: yield self.labelled(dst_buffer.getvalue(), **convo.dst_to_src()) dst_buffer.truncate(0) for datagram in tcp: this_convo = Conversation.FromID(datagram.id) if this_convo != convo: if count and merge: yield from commit() count = count + 1 convo = this_convo data = bytearray() for index in sorted(datagram.index, key=sequence): data.extend(payload(pcap.frame[index - 1])) if not data: continue if not merge: yield self.labelled(data, **this_convo.src_to_dst(), stream=count) elif this_convo.src == convo.src: src_buffer.write(data) elif this_convo.dst == convo.src: dst_buffer.write(data) else: raise RuntimeError(F'direction of packet {convo!s} in conversation {count} is unknown') yield from commit()
class pcap_http
-
Extracts HTTP payloads from packet capture (PCAP) files.
Expand source code Browse git
class pcap_http(Unit): """ Extracts HTTP payloads from packet capture (PCAP) files. """ def process(self, data): http_parser = httpresponse() requests: List[_HTTP_Request] = [] responses: List[bytearray] = [] def lookup(src, dst): for k, request in enumerate(requests): if request.src == dst and request.dst == src: requests.pop(k) return self.labelled(data, url=request.url) return None for stream in data | pcap(): try: data = http_parser.process(stream) except Exception: try: rq = _parse_http_request(stream) requests.append(rq) except _HTTPParseError as E: self.log_info(F'error parsing http request: {E!s}') except Exception: pass continue if not data: continue src, dst = stream['src'], stream['dst'] item = lookup(src, dst) if item is None: responses.append((src, dst, data)) continue yield item while responses: src, dst, data = responses.pop() item = lookup(src, dst) yield data if item is None else item
class pedebloat (*names, certificate=False, directories=False, memdump=False, resources=False, sections=False, trim_code=False, trim_rsrc=False, threshold=0.05, size_limit=10.0 MB, keep_limit=False, aggressive=False)
-
Removes junk or excess data from PE files and returns the stripped executable. By default, only the PE overlay is considered; use the flags -r and -s to also consider resources and entire sections. Any buffer is only considered for removal if it exceeds a certain size. If this condition is met, a binary search is performed to determine the offset inside the buffer up to which the compression ratio is above a certain threshold; everything beyond that point is then removed. By setting the threshold compression ratio to 1, each large buffer is removed entirely.
Expand source code Browse git
class pedebloat(OverlayUnit): """ Removes junk or excess data from PE files and returns the stripped executable. By default, only the PE overlay is considered; use the flags `-r` and `-s` to also consider resources and entire sections. Any buffer is only considered for removal if it exceeds a certain size. If this condition is met, a binary search is performed to determine the offset inside the buffer up to which the compression ratio is above a certain threshold; everything beyond that point is then removed. By setting the threshold compression ratio to 1, each large buffer is removed entirely. """ def __init__( self, *names: Arg(type=str), certificate=False, directories=False, memdump=False, resources: Arg.Switch('-r', help='Strip large resources.') = False, sections : Arg.Switch('-s', help='Strip large sections.') = False, trim_code: Arg.Switch('-X', help='Lift the exception on code sections for stripping.') = False, trim_rsrc: Arg.Switch('-Y', help='Lift the exception on rsrc sections for stripping.') = False, threshold: Arg('-t', metavar='T', type=percent, help=( 'Trailing data from resources and sections is stripped until the compression ratio ' 'of the remaining data rises above this threshold. The default value is {default}. ' 'Set this to 1 to ignore the limit entirely and trim every structure as much as ' 'possible without violating alignment. Setting this value to 0 will only strip repeated ' 'occurrences of the last byte.')) = 0.05, size_limit: Arg.Number('-l', help=( 'Structures below this size are not stripped. Default is {default!r}.')) = _STRIP, keep_limit: Arg.Switch('-k', help=( 'Do not strip structures to below the above size limit.')) = False, aggressive: Arg.Switch('-a', help=( 'Equivalent to -srt1: Strip large sections and resources aggressively.')) = False, ): if aggressive: sections = True resources = True threshold = 1 super().__init__( certificate, directories, memdump, sections=sections, resources=resources, size_limit=size_limit, keep_limit=keep_limit, threshold=threshold, trim_rsrc=trim_rsrc, trim_code=trim_code, names=names, ) def _right_strip_data(self, data: memoryview, alignment=1, block_size=_MB) -> int: if not data: return 0 threshold = self.args.threshold data_overhang = len(data) % alignment result = data_overhang if 0 < threshold < 1: def compression_ratio(offset: int): ratio = len(zlib.compress(data[:offset], level=1)) / offset self.log_debug(F'compressing {SizeInt(offset)!r} ratio={ratio:6.4f}') return ratio upper = len(data) lower = result if compression_ratio(upper) <= threshold: while block_size < upper - lower: pivot = (lower + upper) // 2 ratio = compression_ratio(pivot) if ratio > threshold: lower = pivot + 1 continue upper = pivot if abs(ratio - threshold) < 1e-10: break result = upper elif threshold == 0: result = len(data) elif threshold == 1: result = 0 while result > 1 and data[result - 2] == data[result - 1]: result -= 1 result = max(result, data_overhang) if self.args.keep_limit: result = max(result, self.args.size_limit) result = result + (data_overhang - result) % alignment if result > len(data): excess = result - len(data) excess = excess + (-excess % alignment) result = result - excess return result def _adjust_offsets(self, pe: PE, gap_offset: int, gap_size: int): base = pe.OPTIONAL_HEADER.ImageBase alignment = pe.OPTIONAL_HEADER.FileAlignment rva_offset = pe.get_rva_from_offset(gap_offset) tva_offset = rva_offset + base section = pe.get_section_by_offset(gap_offset) new_section_size = section.SizeOfRawData - gap_size if 
new_section_size % alignment != 0: raise RuntimeError( F'trimming 0x{gap_size:X} bytes from section {_ASCII(section.Name)} of size 0x{section.SizeOfRawData:X} ' F'violates required section alignment of 0x{alignment:X} bytes') inside_section_offset = gap_offset - section.PointerToRawData if inside_section_offset > new_section_size: overlap = inside_section_offset - new_section_size raise RuntimeError(F'trimming from section {_ASCII(section.Name)}; data extends {overlap} beyond section') rva_lbound = section.VirtualAddress rva_ubound = section.VirtualAddress + section.Misc_VirtualSize - 1 tva_lbound = rva_lbound + base tva_ubound = rva_ubound + base def adjust_attributes_of_structure( structure: Structure, gap_offset: int, valid_values_lower_bound: Optional[int], valid_values_upper_bound: Optional[int], attributes: Iterable[str] ): for attribute in attributes: old_value = getattr(structure, attribute, 0) if old_value <= gap_offset: continue if valid_values_lower_bound is not None and old_value < valid_values_lower_bound: continue if valid_values_upper_bound is not None and old_value > valid_values_upper_bound: continue new_value = old_value - gap_size if new_value < gap_offset: raise BrokenLink(F'attribute {attribute} points into removed region') self.log_debug(F'adjusting field in {structure.name}: {attribute}') setattr(structure, attribute, new_value) it: Iterable[Structure] = iter(pe.__structures__) remove = [] for index, structure in enumerate(it): old_offset = structure.get_file_offset() new_offset = old_offset - gap_offset if old_offset > gap_offset: if old_offset < gap_offset + gap_size: self.log_debug(F'removing structure {structure.name}; starts inside removed region') remove.append(index) continue if isinstance(structure, SectionStructure) and new_offset % alignment != 0: raise RuntimeError( F'structure {structure.name} would be moved to offset 0x{new_offset:X}, ' F'violating section alignment value 0x{alignment:X}.') structure.set_file_offset(new_offset) try: adjust_attributes_of_structure(structure, rva_offset, rva_lbound, rva_ubound, ( 'OffsetToData', 'AddressOfData', 'VirtualAddress', 'AddressOfNames', 'AddressOfNameOrdinals', 'AddressOfFunctions', 'AddressOfEntryPoint', 'AddressOfRawData', 'BaseOfCode', 'BaseOfData', )) adjust_attributes_of_structure(structure, tva_offset, tva_lbound, tva_ubound, ( 'StartAddressOfRawData', 'EndAddressOfRawData', 'AddressOfIndex', 'AddressOfCallBacks', )) adjust_attributes_of_structure(structure, gap_offset, None, None, ( 'OffsetModuleName', 'PointerToRawData', )) except BrokenLink as error: self.log_debug(F'removing structure {structure.name}; {error!s}') remove.append(index) continue for attribute in ( 'CvHeaderOffset', 'OffsetIn2Qwords', 'OffsetInQwords', 'Offset', 'OffsetLow', 'OffsetHigh' ): if not hasattr(structure, attribute): continue self.log_warn(F'potential offset in structure {structure.name} ignored: {attribute}') while remove: index = remove.pop() pe.__structures__[index:index + 1] = [] section.SizeOfRawData = new_section_size def _trim_sections(self, pe: PE, data: bytearray) -> int: S = self.args.size_limit P = self.args.names trimmed = 0 for section in pe.sections: section: SectionStructure offset = section.PointerToRawData name = _ASCII(section.Name) if not self.args.trim_code and name.lower() in ('.text', '.code'): self.log_debug(F'skipping code section {name}; specify --trim-code to override.') continue if not self.args.trim_rsrc and name.lower() == '.rsrc': self.log_debug(F'skipping rsrc section {name}; specify --trim-rsrc 
to override.') continue old_size = section.SizeOfRawData if old_size <= S and not any(fnmatch(name, p) for p in P): self.log_debug(F'criteria not satisfied for section: {SizeInt(old_size)!r} {name}') continue new_size = self._right_strip_data( memoryview(data)[offset:offset + old_size], pe.OPTIONAL_HEADER.FileAlignment) if new_size == old_size: continue self.log_info(F'stripping section {name} from {TI(old_size)!r} to {TI(new_size)!r}') gap_size = old_size - new_size gap_offset = offset + new_size if gap_size <= 0: continue self._adjust_offsets(pe, gap_offset, gap_size) trimmed += gap_size data[gap_offset:gap_offset + gap_size] = [] return trimmed def _trim_pe_resources(self, pe: PE, data: bytearray) -> int: S = self.args.size_limit P = self.args.names trimmed = 0 def find_bloated_resources(pe: PE, directory, level: int = 0, *path) -> Generator[Structure, None, None]: for entry in directory.entries: name = getattr(entry, 'name') numeric = getattr(entry, 'id') if not name: if level == 0 and numeric in iter(RSRC): name = RSRC(entry.id) elif numeric is not None: name = str(numeric) name = name and str(name) or '?' if entry.struct.DataIsDirectory: yield from find_bloated_resources(pe, entry.directory, level + 1, *path, name) continue struct: Structure = entry.data.struct name = '/'.join((*path, name)) if struct.Size <= S and not any(fnmatch(name, p) for p in P): self.log_debug(F'criteria not satisfied for resource: {SizeInt(struct.Size)!r} {name}') continue yield name, struct RSRC_INDEX = DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_RESOURCE'] pe.parse_data_directories(directories=[RSRC_INDEX]) try: resources = pe.DIRECTORY_ENTRY_RESOURCE except AttributeError: return 0 for name, resource in find_bloated_resources(pe, resources): offset = pe.get_offset_from_rva(resource.OffsetToData) old_size = resource.Size new_size = self._right_strip_data( memoryview(data)[offset:offset + old_size], pe.OPTIONAL_HEADER.FileAlignment) self.log_info(F'stripping resource {name} from {old_size} to {new_size}') gap_size = old_size - new_size gap_offset = offset + new_size if gap_size <= 0: continue resource.Size = new_size self._adjust_offsets(pe, gap_offset, gap_size) trimmed += gap_size data[gap_offset:gap_offset + gap_size] = [] pe.OPTIONAL_HEADER.DATA_DIRECTORY[RSRC_INDEX].Size -= trimmed self.log_info(F'trimming size of resource data directory by {TI(trimmed)!r}') return trimmed def process(self, data: bytearray) -> bytearray: overlay_offset = self._get_size(data) if len(data) - overlay_offset >= self.args.size_limit: view = memoryview(data) overlay_length = self._right_strip_data(view[overlay_offset:]) body_size = overlay_offset + overlay_length try: data[body_size:] = [] except Exception: data = data[:body_size] if not self.args.resources and not self.args.sections: return data pe = PE(data=data, fast_load=True) total = len(data) trimmed = 0 view = pe.__data__ copy = False if not isinstance(view, bytearray): view = memoryview(view) try: view[0] = 0x4D except Exception: copy = True view = bytearray(pe.__data__) if self.args.resources: trimmed += self._trim_pe_resources(pe, view) if self.args.sections: trimmed += self._trim_sections(pe, view) if copy: pe.__data__ = view data = pe.write() end = total - trimmed if end < len(data): self.log_warn(F'output contains {len(data) - end} trailing bytes') return data
class peek (lines=10, all=False, brief=False, decode=0, escape=False, bare=False, meta=0, gray=False, index=False, stdout=False, narrow=False, blocks=1, dense=False, expand=False, width=0)
-
The unit extracts preview information of the input data and displays it on the standard error stream. If the standard output of this unit is connected by a pipe, the incoming data is forwarded. However, if the unit outputs to a terminal, the data is discarded instead.
Expand source code Browse git
class peek(HexViewer): """ The unit extracts preview information of the input data and displays it on the standard error stream. If the standard output of this unit is connected by a pipe, the incoming data is forwarded. However, if the unit outputs to a terminal, the data is discarded instead. """ def __init__( self, lines : Arg.Number('-l', group='SIZE', help='Specify number N of lines in the preview, default is 10.') = 10, all : Arg.Switch('-a', group='SIZE', help='Output all possible preview lines without restriction') = False, brief : Arg.Switch('-b', group='SIZE', help='One line peek, implies --lines=1.') = False, decode : Arg.Counts('-d', group='MODE', help=( 'Attempt to decode and display printable data. Specify twice to enable line wrapping.')) = 0, escape : Arg.Switch('-e', group='MODE', help='Always peek data as string, escape characters if necessary.') = False, bare : Arg.Switch('-r', group='META', help='Only peek the data itself, do not show a metadata preview.') = False, meta : Arg.Counts('-m', group='META', help=( 'Show more auto-derivable metadata. Specify multiple times to populate more variables.')) = 0, gray : Arg.Switch('-g', help='Do not colorize the output.') = False, index : Arg.Switch('-i', help='Display the index of each chunk within the current frame.') = False, stdout : Arg.Switch('-2', help='Print the peek to STDOUT rather than STDERR; the input data is lost.') = False, narrow=False, blocks=1, dense=False, expand=False, width=0 ): if decode and escape: raise ValueError('The decode and esc options are exclusive.') if brief: narrow = True if environment.colorless.value: gray = True lines = 1 if brief else INF if all else lines super(peek, self).__init__( brief=brief, gray=gray, blocks=blocks, decode=decode, dense=dense, index=index, escape=escape, expand=expand, narrow=narrow, lines=lines, meta=meta, bare=bare, width=width, stdout=stdout, ) @HexViewer.Requires('colorama', 'display', 'default', 'extended') def _colorama(): import colorama return colorama def process(self, data): colorize = not self.args.gray and not self.args.stdout lines = self._peeklines(data, colorize) if self.args.stdout: for line in lines: yield line.encode(self.codec) return stderr = sys.stderr if colorize: colorama = self._colorama if os.name == 'nt': stderr = colorama.AnsiToWin32(stderr).stream _erase = ' ' * get_terminal_size() _reset = F'\r{colorama.Style.RESET_ALL}{_erase}\r' else: _reset = '' try: for line in lines: print(line, file=stderr) except BaseException: stderr.write(_reset) raise if not self.isatty: self.log_info('forwarding input to next unit') yield data def _peekmeta(self, linewidth, sep, meta: dict, peek=None) -> Generator[str, None, None]: if not meta and not peek: return width = max((len(name) for name in meta), default=0) separators = iter([sep]) if peek is not None: if len(peek) > linewidth: peek = peek[:linewidth - 3] + '...' yield from separators yield peek for name in sorted(meta): value = meta[name] if value is None: continue if isinstance(value, CustomStringRepresentation): value = repr(value).strip() elif isbuffer(value): value = repr(ByteStringWrapper(value)) elif isinstance(value, int): if value in range(-999, 1000): value = str(value) elif value > 0: value = F'0x{value:X}' else: value = F'-0x{-value:X}' elif isinstance(value, float): value = F'{value:.4f}' metavar = F'{name:>{width + 2}} = {value!s}' if len(metavar) > linewidth: metavar = metavar[:linewidth - 3] + '...' 
yield from separators yield metavar def _trydecode(self, data, codec: Optional[str], width: int, linecount: int) -> str: remaining = linecount result = [] wrap = self.args.decode > 1 if codec is None: from refinery.units.encoding.esc import esc decoded = data[:abs(width * linecount)] decoded = str(decoded | -esc(bare=True)) limit = abs(min(linecount * width, len(decoded))) for k in range(0, limit, width): result.append(decoded[k:k + width]) return result try: import unicodedata unprintable = {'Cc', 'Cf', 'Co', 'Cs'} self.log_info(F'trying to decode as {codec}.') decoded = codecs.decode(data, codec, errors='strict') count = sum(unicodedata.category(c) not in unprintable for c in decoded) ratio = count / len(decoded) except UnicodeDecodeError as DE: self.log_info('decoding failed:', DE.reason) return None except ValueError as V: self.log_info('decoding failed:', V) return None if ratio < 0.8: self.log_info(F'data contains {ratio * 100:.2f}% printable characters, this is too low.') return None decoded = decoded.splitlines(False) if not wrap: for k, line in enumerate(decoded): line = line.replace('\t', '\x20' * 4) if len(line) <= width: continue clipped = line[:width - 3] if self.args.gray: color = '' reset = '' else: colorama = self._colorama color = colorama.Fore.LIGHTRED_EX reset = colorama.Style.RESET_ALL decoded[k] = F'{clipped}{color}...{reset}' return decoded[:abs(linecount)] for paragraph in decoded: if not remaining: break wrapped = [ line for chunk in textwrap.wrap( paragraph, width, break_long_words=True, break_on_hyphens=False, drop_whitespace=False, expand_tabs=True, max_lines=abs(remaining + 1), replace_whitespace=False, tabsize=4, ) for line in chunk.splitlines(keepends=False) ] remaining -= len(wrapped) result.extend(wrapped) return result[:abs(linecount)] def _peeklines(self, data: bytearray, colorize: bool) -> Generator[str, None, None]: meta = metavars(data) codec = None lines = None final = data.temp or False empty = True if not self.args.index: meta.discard('index') index = None else: index = meta.get('index', None) if not self.args.brief: padding = 0 else: padding = SizeInt.width + 2 if index is not None: padding += 6 metrics = self._get_metrics(len(data), self.args.lines, padding) if self.args.brief: metrics.address_width = 0 metrics.fit_to_width(allow_increase=True) sepsize = metrics.hexdump_width txtsize = self.args.width or sepsize if self.args.lines and data: if self.args.escape: lines = self._trydecode(data, None, txtsize, metrics.line_count) if self.args.decode > 0: for codec in ('utf8', 'utf-16le', 'utf-16', 'utf-16be'): lines = self._trydecode(data, codec, txtsize, metrics.line_count) if lines: codec = codec break else: codec = None if lines is None: lines = list(self.hexdump(data, metrics, colorize)) else: sepsize = txtsize def separator(title=None): if title is None or sepsize <= len(title) + 8: return sepsize * '-' return '-' * (sepsize - len(title) - 5) + F'[{title}]---' if self.args.brief: final = False elif not self.args.bare: peek = repr(meta.size) line = separator() if len(data) <= 5_000_000: peek = F'{peek}; {meta.entropy!r} entropy' peek = F'{peek}; {meta.magic!s}' if self.args.lines == 0: peek = None elif not data: peek = None line = separator('empty chunk') if self.args.meta > 0: meta.derive('size') meta.derive('magic') meta.derive('entropy') peek = None if self.args.meta > 1: meta.derive('crc32') meta.derive('sha256') if self.args.meta > 2: for name in meta.derivations: meta[name] for line in self._peekmeta(metrics.hexdump_width, line, meta, 
peek=peek): empty = False yield line if lines: empty = False if not self.args.brief: yield separator(codec or None) yield from lines else: brief = next(iter(lines)) brief = F'{SizeInt(len(data))!r}: {brief}' if index is not None: brief = F'#{index:03d}: {brief}' yield brief if final and (self.args.bare or not empty): yield separator() def filter(self, chunks): try: self._colorama.init(wrap=False) except ImportError: pass discarded = 0 it = iter(chunks) buffer = collections.deque(itertools.islice(it, 0, 2)) buffer.reverse() while buffer: if self.isatty and not buffer[0].visible: buffer.popleft() discarded += 1 else: item = buffer.pop() last = not bool(buffer) item.temp = last if not item.visible and self.isatty: discarded += 1 else: yield item try: buffer.appendleft(next(it)) except StopIteration: pass if discarded: self.log_warn(F'discarded {discarded} invisible chunks to prevent them from leaking into the terminal.')
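For example, one might preview a file with extended metadata like this (an illustrative sketch; sample.exe is a placeholder file name, emitted via the ef unit):
ef sample.exe | peek -mm -l 20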
class pemeta (custom=False, debug=False, dotnet=False, signatures=False, timestamps=0, version=False, header=False, exports=0, imports=0, tabular=False, timeraw=False)
-
Extract metadata from PE files. By default, all information except for imports and exports is extracted.
Expand source code Browse git
class pemeta(Unit): """ Extract metadata from PE files. By default, all information except for imports and exports are extracted. """ def __init__( self, custom : Arg('-c', '--custom', help='Unless enabled, all default categories will be extracted.') = False, debug : Arg.Switch('-D', help='Parse the PDB path from the debug directory.') = False, dotnet : Arg.Switch('-N', help='Parse the .NET header.') = False, signatures : Arg.Switch('-S', help='Parse digital signatures.') = False, timestamps : Arg.Counts('-T', help='Extract time stamps. Specify twice for more detail.') = 0, version : Arg.Switch('-V', help='Parse the VERSION resource.') = False, header : Arg.Switch('-H', help='Parse base data from the PE header.') = False, exports : Arg.Counts('-E', help='List all exported functions. Specify twice to include addresses.') = 0, imports : Arg.Counts('-I', help='List all imported functions. Specify twice to include addresses.') = 0, tabular : Arg.Switch('-t', help='Print information in a table rather than as JSON') = False, timeraw : Arg.Switch('-r', help='Extract time stamps as numbers instead of human-readable format.') = False, ): if not custom and not any((debug, dotnet, signatures, timestamps, version, header)): debug = dotnet = signatures = timestamps = version = header = True super().__init__( debug=debug, dotnet=dotnet, signatures=signatures, timestamps=timestamps, version=version, header=header, imports=imports, exports=exports, timeraw=timeraw, tabular=tabular, ) @classmethod def _ensure_string(cls, x): if not isinstance(x, str): x = repr(x) if not isinstance(x, bytes) else x.decode(cls.codec, 'backslashreplace') return x @classmethod def _parse_pedict(cls, bin): return dict(( cls._ensure_string(key), cls._ensure_string(val) ) for key, val in bin.items() if val) @classmethod def parse_signature(cls, data: bytearray) -> dict: """ Extracts a JSON-serializable and human-readable dictionary with information about time stamp and code signing certificates that are attached to the input PE file. 
""" from refinery.units.formats.pkcs7 import pkcs7 try: signature = data | pkcs7 | json.loads except Exception as E: raise ValueError(F'PKCS7 parser failed with error: {E!s}') info = {} def find_timestamps(entry): if isinstance(entry, dict): if set(entry.keys()) == {'type', 'value'}: if entry['type'] == 'signing_time': return {'Timestamp': entry['value']} for value in entry.values(): result = find_timestamps(value) if result is None: continue with suppress(KeyError): result.setdefault('TimestampIssuer', entry['sid']['issuer']['common_name']) return result elif isinstance(entry, list): for value in entry: result = find_timestamps(value) if result is None: continue return result timestamp_info = find_timestamps(signature) if timestamp_info is not None: info.update(timestamp_info) try: certificates = signature['content']['certificates'] except KeyError: return info if len(certificates) == 1: main_certificate = certificates[0] else: certificates_with_extended_use = [] main_certificate = None for certificate in certificates: with suppress(Exception): crt = certificate['tbs_certificate'] ext = [e for e in crt['extensions'] if e['extn_id'] == 'extended_key_usage' and e['extn_value'] != ['time_stamping']] key = [e for e in crt['extensions'] if e['extn_id'] == 'key_usage'] if ext: certificates_with_extended_use.append(certificate) if any('key_cert_sign' in e['extn_value'] for e in key): continue if any('code_signing' in e['extn_value'] for e in ext): main_certificate = certificate break if main_certificate is None and len(certificates_with_extended_use) == 1: main_certificate = certificates_with_extended_use[0] if main_certificate: crt = main_certificate['tbs_certificate'] serial = crt['serial_number'] if isinstance(serial, int): serial = F'{serial:x}' if len(serial) % 2 != 0: serial = F'0{serial}' assert bytes.fromhex(serial) in data subject = crt['subject'] location = [subject.get(t, '') for t in ('locality_name', 'state_or_province_name', 'country_name')] info.update(Subject=subject['common_name']) if any(location): info.update(SubjectLocation=', '.join(filter(None, location))) for signer_info in signature['content'].get('signer_infos', ()): try: if signer_info['sid']['serial_number'] != crt['serial_number']: continue for attr in signer_info['signed_attrs']: if attr['type'] == 'authenticode_info': info.update(ProgramName=attr['value']['programName']) info.update(MoreInfo=attr['value']['moreInfo']) except KeyError: continue try: valid_from = crt['validity']['not_before'] valid_until = crt['validity']['not_after'] except KeyError: pass else: info.update(ValidFrom=valid_from, ValidUntil=valid_until) info.update( Issuer=crt['issuer']['common_name'], Fingerprint=main_certificate['fingerprint'], Serial=serial) return info return info def _pe_characteristics(self, pe: PE): return {name for name, mask in image_characteristics if pe.FILE_HEADER.Characteristics & mask} def _pe_address_width(self, pe: PE, default=16) -> int: if 'IMAGE_FILE_16BIT_MACHINE' in self._pe_characteristics(pe): return 4 elif MACHINE_TYPE[pe.FILE_HEADER.Machine] in ['IMAGE_FILE_MACHINE_I386']: return 8 elif MACHINE_TYPE[pe.FILE_HEADER.Machine] in [ 'IMAGE_FILE_MACHINE_AMD64', 'IMAGE_FILE_MACHINE_IA64', ]: return 16 else: return default def _vint(self, pe: PE, value: int): if not self.args.tabular: return value aw = self._pe_address_width(pe) return F'0x{value:0{aw}X}' def parse_version(self, pe: PE, data=None) -> dict: """ Extracts a JSON-serializable and human-readable dictionary with information about the version resource of an 
input PE file, if available. """ pe.parse_data_directories(directories=[DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_RESOURCE']]) string_table_entries = [] for FileInfo in pe.FileInfo: for FileInfoEntry in FileInfo: with suppress(AttributeError): for StringTableEntry in FileInfoEntry.StringTable: StringTableEntryParsed = self._parse_pedict(StringTableEntry.entries) with suppress(AttributeError): LangID = StringTableEntry.entries.get('LangID', None) or StringTableEntry.LangID LangID = int(LangID, 0x10) if not isinstance(LangID, int) else LangID LangHi = LangID >> 0x10 LangLo = LangID & 0xFFFF Language = self._LCID.get(LangHi, 'Language Neutral') Charset = self._CHARSET.get(LangLo, 'Unknown Charset') StringTableEntryParsed.update( LangID=F'{LangID:08X}', Charset=Charset, Language=Language ) for key in StringTableEntryParsed: if key.endswith('Version'): value = StringTableEntryParsed[key] separator = ', ' if re.match(F'\\d+({re.escape(separator)}\\d+){{3}}', value): StringTableEntryParsed[key] = '.'.join(value.split(separator)) string_table_entries.append(StringTableEntryParsed) if not string_table_entries: return None elif len(string_table_entries) == 1: return string_table_entries[0] else: return string_table_entries def parse_exports(self, pe: PE, data=None, include_addresses=False) -> list: pe.parse_data_directories(directories=[DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_EXPORT']]) base = pe.OPTIONAL_HEADER.ImageBase info = [] for k, exp in enumerate(pe.DIRECTORY_ENTRY_EXPORT.symbols): if not exp.name: name = F'@{k}' else: name = exp.name.decode('ascii') item = {'Name': name, 'Address': self._vint(pe, exp.address + base)} if include_addresses else name info.append(item) return info def parse_imports(self, pe: PE, data=None, include_addresses=False) -> list: info = {} dirs = [] for name in [ 'DIRECTORY_ENTRY_IMPORT', 'DIRECTORY_ENTRY_DELAY_IMPORT', ]: pe.parse_data_directories(directories=[DIRECTORY_ENTRY[F'IMAGE_{name}']]) with suppress(AttributeError): dirs.append(getattr(pe, name)) self.log_warn(dirs) for idd in itertools.chain(*dirs): dll: bytes = idd.dll dll = dll.decode('ascii') if dll.lower().endswith('.dll'): dll = dll[:~3] imports: list[str] = info.setdefault(dll, []) with suppress(AttributeError): symbols = idd.imports with suppress(AttributeError): symbols = idd.entries try: for imp in symbols: name: bytes = imp.name name = name and name.decode('ascii') or F'@{imp.ordinal}' if not include_addresses: imports.append(name) else: imports.append(dict(Name=name, Address=self._vint(pe, imp.address))) except Exception as e: self.log_warn(F'error parsing {name}: {e!s}') return info def parse_header(self, pe: PE, data=None) -> dict: def format_macro_name(name: str, prefix, convert=True): name = name.split('_')[prefix:] if convert: for k, part in enumerate(name): name[k] = part.upper() if len(part) <= 3 else part.capitalize() return ' '.join(name) major = pe.OPTIONAL_HEADER.MajorOperatingSystemVersion minor = pe.OPTIONAL_HEADER.MinorOperatingSystemVersion version = self._WINVER.get(major, {0: 'Unknown'}) try: MinimumOS = version[minor] except LookupError: MinimumOS = version[0] header_information = { 'Machine': format_macro_name(MACHINE_TYPE[pe.FILE_HEADER.Machine], 3, False), 'Subsystem': format_macro_name(SUBSYSTEM_TYPE[pe.OPTIONAL_HEADER.Subsystem], 2), 'MinimumOS': MinimumOS, } pe.parse_data_directories(directories=[ DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_EXPORT'], ]) try: export_name = pe.DIRECTORY_ENTRY_EXPORT.name if isinstance(export_name, bytes): export_name = export_name.decode('utf8') if 
not export_name.isprintable(): export_name = None except Exception: export_name = None if export_name: header_information['ExportName'] = export_name rich_header = pe.parse_rich_header() rich = [] if rich_header: it = rich_header.get('values', []) if self.args.tabular: cw = max(len(F'{c:d}') for c in it[1::2]) for idv, count in zip(it[0::2], it[1::2]): info = get_rich_info(idv) if not info: continue pid = info.pid.upper() if self.args.tabular: short_pid = get_rich_short_pid(pid) rich.append(F'[{idv:08x}] {count:>0{cw}d} {short_pid!s} {info.ver}') else: rich.append({ 'Counter': count, 'Encoded': F'{idv:08x}', 'Library': pid, 'Product': info.ver, }) header_information['RICH'] = rich characteristics = self._pe_characteristics(pe) for typespec, flag in { 'EXE': 'IMAGE_FILE_EXECUTABLE_IMAGE', 'DLL': 'IMAGE_FILE_DLL', 'SYS': 'IMAGE_FILE_SYSTEM' }.items(): if flag in characteristics: header_information['Type'] = typespec base = pe.OPTIONAL_HEADER.ImageBase header_information['ImageBase'] = self._vint(pe, base) header_information['ImageSize'] = get_pe_size(pe) header_information['Bits'] = 4 * self._pe_address_width(pe, 16) header_information['EntryPoint'] = self._vint(pe, pe.OPTIONAL_HEADER.AddressOfEntryPoint + base) return header_information def parse_time_stamps(self, pe: PE, raw_time_stamps: bool, more_detail: bool) -> dict: """ Extracts time stamps from the PE header (link time), as well as from the imports, exports, debug, and resource directory. The resource time stamp is also parsed as a DOS time stamp and returned as the "Delphi" time stamp. """ if raw_time_stamps: def dt(ts): return ts else: dt = date_from_timestamp pe.parse_data_directories(directories=[ DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_IMPORT'], DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_EXPORT'], DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_BOUND_IMPORT'], DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_DELAY_IMPORT'], DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_DEBUG'], DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_RESOURCE'] ]) info = {} with suppress(AttributeError): info.update(Linker=dt(pe.FILE_HEADER.TimeDateStamp)) for dir_name, _dll, info_key in [ ('DIRECTORY_ENTRY_IMPORT', 'dll', 'Import'), # noqa ('DIRECTORY_ENTRY_DELAY_IMPORT', 'dll', 'Symbol'), # noqa ('DIRECTORY_ENTRY_BOUND_IMPORT', 'name', 'Module'), # noqa ]: impts = {} for entry in getattr(pe, dir_name, []): ts = 0 with suppress(AttributeError): ts = entry.struct.dwTimeDateStamp with suppress(AttributeError): ts = entry.struct.TimeDateStamp if ts == 0 or ts == 0xFFFFFFFF: continue name = getattr(entry, _dll, B'').decode() if name.lower().endswith('.dll'): name = name[:-4] impts[name] = dt(ts) if not impts: continue if not more_detail: dmin = min(impts.values()) dmax = max(impts.values()) small_delta = 2 * 60 * 60 if not raw_time_stamps: small_delta = timedelta(seconds=small_delta) if dmax - dmin < small_delta: impts = dmin info[info_key] = impts with suppress(AttributeError): Export = pe.DIRECTORY_ENTRY_EXPORT.struct.TimeDateStamp if Export: info.update(Export=dt(Export)) with suppress(AttributeError): res_timestamp = pe.DIRECTORY_ENTRY_RESOURCE.struct.TimeDateStamp if res_timestamp: with suppress(ValueError): from refinery.units.misc.datefix import datefix dos = datefix.dostime(res_timestamp) info.update(Delphi=dos) info.update(RsrcTS=dt(res_timestamp)) def norm(value): if isinstance(value, list): return [norm(v) for v in value] if isinstance(value, dict): return {k: norm(v) for k, v in value.items()} if isinstance(value, int): return value return str(value) return {key: norm(value) for key, 
value in info.items()} def parse_dotnet(self, pe: PE, data): """ Extracts a JSON-serializable and human-readable dictionary with information about the .NET metadata of an input PE file. """ header = DotNetHeader(data, pe=pe) tables = header.meta.Streams.Tables info = dict( RuntimeVersion=F'{header.head.MajorRuntimeVersion}.{header.head.MinorRuntimeVersion}', Version=F'{header.meta.MajorVersion}.{header.meta.MinorVersion}', VersionString=header.meta.VersionString ) info['Flags'] = [name for name, check in header.head.KnownFlags.items() if check] if len(tables.Assembly) == 1: assembly = tables.Assembly[0] info.update( AssemblyName=assembly.Name, Release='{}.{}.{}.{}'.format( assembly.MajorVersion, assembly.MinorVersion, assembly.BuildNumber, assembly.RevisionNumber ) ) try: entry = self._vint(pe, header.head.EntryPointToken + pe.OPTIONAL_HEADER.ImageBase) info.update(EntryPoint=entry) except AttributeError: pass if len(tables.Module) == 1: module = tables.Module[0] info.update(ModuleName=module.Name) return info def parse_debug(self, pe: PE, data=None): result = {} pe.parse_data_directories(directories=[ DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_DEBUG']]) for dbg in pe.DIRECTORY_ENTRY_DEBUG: if DEBUG_TYPE.get(dbg.struct.Type, None) != 'IMAGE_DEBUG_TYPE_CODEVIEW': continue with suppress(Exception): pdb = dbg.entry.PdbFileName if 0 in pdb: pdb = pdb[:pdb.index(0)] result.update( PdbPath=pdb.decode(self.codec), PdbAge=dbg.entry.Age ) return result def process(self, data): result = {} pe = PE(data=data, fast_load=True) for switch, resolver, name in [ (self.args.debug, self.parse_debug, 'Debug'), # noqa (self.args.dotnet, self.parse_dotnet, 'DotNet'), # noqa (self.args.header, self.parse_header, 'Header'), # noqa (self.args.version, self.parse_version, 'Version'), # noqa (self.args.imports, self.parse_imports, 'Imports'), # noqa (self.args.exports, self.parse_exports, 'Exports'), # noqa ]: if not switch: continue self.log_debug(F'parsing: {name}') args = pe, data if switch > 1: args = *args, True try: info = resolver(*args) except Exception as E: self.log_info(F'failed to obtain {name}: {E!s}') continue if info: result[name] = info signature = {} if self.args.timestamps or self.args.signatures: with suppress(Exception): from refinery.units.formats.pe.pesig import pesig signature = self.parse_signature(next(data | pesig)) if self.args.timestamps: ts = self.parse_time_stamps(pe, self.args.timeraw, self.args.timestamps > 1) with suppress(KeyError): ts.update(Signed=signature['Timestamp']) result.update(TimeStamp=ts) if signature and self.args.signatures: result['Signature'] = signature if result: yield from ppjson(tabular=self.args.tabular)._pretty_output(result, indent=4, ensure_ascii=False) _LCID = { 0x0C00: 'Default Custom Locale Language', 0x1400: 'Default Custom MUI Locale Language', 0x007F: 'Invariant Locale Language', 0x0000: 'Neutral Locale Language', 0x0800: 'System Default Locale Language', 0x1000: 'Unspecified Custom Locale Language', 0x0400: 'User Default Locale Language', 0x0436: 'Afrikaans-South Africa', 0x041c: 'Albanian-Albania', 0x045e: 'Amharic-Ethiopia', 0x0401: 'Arabic (Saudi Arabia)', 0x1401: 'Arabic (Algeria)', 0x3c01: 'Arabic (Bahrain)', 0x0c01: 'Arabic (Egypt)', 0x0801: 'Arabic (Iraq)', 0x2c01: 'Arabic (Jordan)', 0x3401: 'Arabic (Kuwait)', 0x3001: 'Arabic (Lebanon)', 0x1001: 'Arabic (Libya)', 0x1801: 'Arabic (Morocco)', 0x2001: 'Arabic (Oman)', 0x4001: 'Arabic (Qatar)', 0x2801: 'Arabic (Syria)', 0x1c01: 'Arabic (Tunisia)', 0x3801: 'Arabic (U.A.E.)', 0x2401: 'Arabic (Yemen)', 
0x042b: 'Armenian-Armenia', 0x044d: 'Assamese', 0x082c: 'Azeri (Cyrillic)', 0x042c: 'Azeri (Latin)', 0x042d: 'Basque', 0x0423: 'Belarusian', 0x0445: 'Bengali (India)', 0x0845: 'Bengali (Bangladesh)', 0x141A: 'Bosnian (Bosnia/Herzegovina)', 0x0402: 'Bulgarian', 0x0455: 'Burmese', 0x0403: 'Catalan', 0x045c: 'Cherokee-United States', 0x0804: 'Chinese (People\'s Republic of China)', 0x1004: 'Chinese (Singapore)', 0x0404: 'Chinese (Taiwan)', 0x0c04: 'Chinese (Hong Kong SAR)', 0x1404: 'Chinese (Macao SAR)', 0x041a: 'Croatian', 0x101a: 'Croatian (Bosnia/Herzegovina)', 0x0405: 'Czech', 0x0406: 'Danish', 0x0465: 'Divehi', 0x0413: 'Dutch-Netherlands', 0x0813: 'Dutch-Belgium', 0x0466: 'Edo', 0x0409: 'English (United States)', 0x0809: 'English (United Kingdom)', 0x0c09: 'English (Australia)', 0x2809: 'English (Belize)', 0x1009: 'English (Canada)', 0x2409: 'English (Caribbean)', 0x3c09: 'English (Hong Kong SAR)', 0x4009: 'English (India)', 0x3809: 'English (Indonesia)', 0x1809: 'English (Ireland)', 0x2009: 'English (Jamaica)', 0x4409: 'English (Malaysia)', 0x1409: 'English (New Zealand)', 0x3409: 'English (Philippines)', 0x4809: 'English (Singapore)', 0x1c09: 'English (South Africa)', 0x2c09: 'English (Trinidad)', 0x3009: 'English (Zimbabwe)', 0x0425: 'Estonian', 0x0438: 'Faroese', 0x0429: 'Farsi', 0x0464: 'Filipino', 0x040b: 'Finnish', 0x040c: 'French (France)', 0x080c: 'French (Belgium)', 0x2c0c: 'French (Cameroon)', 0x0c0c: 'French (Canada)', 0x240c: 'French (Democratic Rep. of Congo)', 0x300c: 'French (Cote d\'Ivoire)', 0x3c0c: 'French (Haiti)', 0x140c: 'French (Luxembourg)', 0x340c: 'French (Mali)', 0x180c: 'French (Monaco)', 0x380c: 'French (Morocco)', 0xe40c: 'French (North Africa)', 0x200c: 'French (Reunion)', 0x280c: 'French (Senegal)', 0x100c: 'French (Switzerland)', 0x1c0c: 'French (West Indies)', 0x0462: 'Frisian-Netherlands', 0x0467: 'Fulfulde-Nigeria', 0x042f: 'FYRO Macedonian', 0x083c: 'Gaelic (Ireland)', 0x043c: 'Gaelic (Scotland)', 0x0456: 'Galician', 0x0437: 'Georgian', 0x0407: 'German (Germany)', 0x0c07: 'German (Austria)', 0x1407: 'German (Liechtenstein)', 0x1007: 'German (Luxembourg)', 0x0807: 'German (Switzerland)', 0x0408: 'Greek', 0x0474: 'Guarani-Paraguay', 0x0447: 'Gujarati', 0x0468: 'Hausa-Nigeria', 0x0475: 'Hawaiian (United States)', 0x040d: 'Hebrew', 0x0439: 'Hindi', 0x040e: 'Hungarian', 0x0469: 'Ibibio-Nigeria', 0x040f: 'Icelandic', 0x0470: 'Igbo-Nigeria', 0x0421: 'Indonesian', 0x045d: 'Inuktitut', 0x0410: 'Italian (Italy)', 0x0810: 'Italian (Switzerland)', 0x0411: 'Japanese', 0x044b: 'Kannada', 0x0471: 'Kanuri-Nigeria', 0x0860: 'Kashmiri', 0x0460: 'Kashmiri (Arabic)', 0x043f: 'Kazakh', 0x0453: 'Khmer', 0x0457: 'Konkani', 0x0412: 'Korean', 0x0440: 'Kyrgyz (Cyrillic)', 0x0454: 'Lao', 0x0476: 'Latin', 0x0426: 'Latvian', 0x0427: 'Lithuanian', 0x043e: 'Malay-Malaysia', 0x083e: 'Malay-Brunei Darussalam', 0x044c: 'Malayalam', 0x043a: 'Maltese', 0x0458: 'Manipuri', 0x0481: 'Maori-New Zealand', 0x044e: 'Marathi', 0x0450: 'Mongolian (Cyrillic)', 0x0850: 'Mongolian (Mongolian)', 0x0461: 'Nepali', 0x0861: 'Nepali-India', 0x0414: 'Norwegian (Bokmål)', 0x0814: 'Norwegian (Nynorsk)', 0x0448: 'Oriya', 0x0472: 'Oromo', 0x0479: 'Papiamentu', 0x0463: 'Pashto', 0x0415: 'Polish', 0x0416: 'Portuguese-Brazil', 0x0816: 'Portuguese-Portugal', 0x0446: 'Punjabi', 0x0846: 'Punjabi (Pakistan)', 0x046B: 'Quecha (Bolivia)', 0x086B: 'Quecha (Ecuador)', 0x0C6B: 'Quecha (Peru)', 0x0417: 'Rhaeto-Romanic', 0x0418: 'Romanian', 0x0818: 'Romanian (Moldava)', 0x0419: 'Russian', 0x0819: 'Russian (Moldava)', 
0x043b: 'Sami (Lappish)', 0x044f: 'Sanskrit', 0x046c: 'Sepedi', 0x0c1a: 'Serbian (Cyrillic)', 0x081a: 'Serbian (Latin)', 0x0459: 'Sindhi (India)', 0x0859: 'Sindhi (Pakistan)', 0x045b: 'Sinhalese-Sri Lanka', 0x041b: 'Slovak', 0x0424: 'Slovenian', 0x0477: 'Somali', 0x042e: 'Sorbian', 0x0c0a: 'Spanish (Modern Sort)', 0x040a: 'Spanish (Traditional Sort)', 0x2c0a: 'Spanish (Argentina)', 0x400a: 'Spanish (Bolivia)', 0x340a: 'Spanish (Chile)', 0x240a: 'Spanish (Colombia)', 0x140a: 'Spanish (Costa Rica)', 0x1c0a: 'Spanish (Dominican Republic)', 0x300a: 'Spanish (Ecuador)', 0x440a: 'Spanish (El Salvador)', 0x100a: 'Spanish (Guatemala)', 0x480a: 'Spanish (Honduras)', 0x580a: 'Spanish (Latin America)', 0x080a: 'Spanish (Mexico)', 0x4c0a: 'Spanish (Nicaragua)', 0x180a: 'Spanish (Panama)', 0x3c0a: 'Spanish (Paraguay)', 0x280a: 'Spanish (Peru)', 0x500a: 'Spanish (Puerto Rico)', 0x540a: 'Spanish (United States)', 0x380a: 'Spanish (Uruguay)', 0x200a: 'Spanish (Venezuela)', 0x0430: 'Sutu', 0x0441: 'Swahili', 0x041d: 'Swedish', 0x081d: 'Swedish-Finland', 0x045a: 'Syriac', 0x0428: 'Tajik', 0x045f: 'Tamazight (Arabic)', 0x085f: 'Tamazight (Latin)', 0x0449: 'Tamil', 0x0444: 'Tatar', 0x044a: 'Telugu', 0x041e: 'Thai', 0x0851: 'Tibetan (Bhutan)', 0x0451: 'Tibetan (People\'s Republic of China)', 0x0873: 'Tigrigna (Eritrea)', 0x0473: 'Tigrigna (Ethiopia)', 0x0431: 'Tsonga', 0x0432: 'Tswana', 0x041f: 'Turkish', 0x0442: 'Turkmen', 0x0480: 'Uighur-China', 0x0422: 'Ukrainian', 0x0420: 'Urdu', 0x0820: 'Urdu-India', 0x0843: 'Uzbek (Cyrillic)', 0x0443: 'Uzbek (Latin)', 0x0433: 'Venda', 0x042a: 'Vietnamese', 0x0452: 'Welsh', 0x0434: 'Xhosa', 0x0478: 'Yi', 0x043d: 'Yiddish', 0x046a: 'Yoruba', 0x0435: 'Zulu', 0x04ff: 'HID (Human Interface DeVITe)' } _CHARSET = { 0x0000: '7-bit ASCII', 0x03A4: 'Japan (Shift ? JIS X-0208)', 0x03B5: 'Korea (Shift ? KSC 5601)', 0x03B6: 'Taiwan (Big5)', 0x04B0: 'Unicode', 0x04E2: 'Latin-2 (Eastern European)', 0x04E3: 'Cyrillic', 0x04E4: 'Multilingual', 0x04E5: 'Greek', 0x04E6: 'Turkish', 0x04E7: 'Hebrew', 0x04E8: 'Arabic', } _WINVER = { 3: { 0x00: 'Windows NT 3', 0x0A: 'Windows NT 3.1', 0x32: 'Windows NT 3.5', 0x33: 'Windows NT 3.51', }, 4: { 0x00: 'Windows 95', 0x0A: 'Windows 98', }, 5: { 0x00: 'Windows 2000', 0x5A: 'Windows Me', 0x01: 'Windows XP', 0x02: 'Windows Server 2003', }, 6: { 0x00: 'Windows Vista', 0x01: 'Windows 7', 0x02: 'Windows 8', 0x03: 'Windows 8.1', }, 10: { 0x00: 'Windows 10', } }
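A hedged usage sketch, with sample.exe as a placeholder input file:
ef sample.exe | pemeta -t
This prints the default categories (header, debug, .NET, signature, time stamp, and version information) as a flattened table rather than JSON.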
class peoverlay (certificate=False, directories=False, memdump=False)
-
Returns the overlay of a PE file, i.e. anything that may have been appended to the file. This does not include digital signatures. Use pestrip to obtain only the body of the PE file after removing the overlay.
Expand source code Browse git
class peoverlay(OverlayUnit):
    """
    Returns the overlay of a PE file, i.e. anything that may have been appended to the file.
    This does not include digital signatures. Use `refinery.pestrip` to obtain only the body
    of the PE file after removing the overlay.
    """
    def process(self, data: bytearray) -> bytearray:
        size = self._get_size(data)
        try:
            data[:size] = []
        except Exception:
            return data[size:]
        else:
            return data
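A usage sketch with placeholder file names, writing the extracted overlay to disk with the dump unit:
ef dropper.exe | peoverlay | dump overlay.bin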
class perc (*paths, pretty=False, path=b'path', regex=False, exact=False, fuzzy=0, drop_path=False, join_path=False, list=False)
-
Extract PE file resources.
Expand source code Browse git
class perc(PathExtractorUnit): """ Extract PE file resources. """ def __init__( self, *paths, pretty: Arg.Switch('-p', help='Add missing headers to bitmap and icon resources.') = False, **kwargs ): super().__init__(*paths, pretty=pretty, **kwargs) def _get_icon_dir(self, pe: pefile.PE): try: group, = (e for e in pe.DIRECTORY_ENTRY_RESOURCE.entries if e.id == RSRC.ICON_GROUP.value) group = group.directory.entries[0].directory.entries[0].data.struct return GRPICONDIR(pe.get_data(group.OffsetToData, group.Size)) except Exception: return None def _search(self, pe: pefile.PE, directory, level=0, *parts): if level >= 3: self.log_warn(F'unexpected resource tree level {level + 1:d}') for entry in directory.entries: if entry.name: identifier = str(entry.name) elif level == 0 and entry.id in iter(RSRC): identifier = RSRC(entry.id) elif entry.id is not None: identifier = entry.id else: self.log_warn(F'resource entry has name {entry.name} and id {entry.id} at level {level + 1:d}') continue if entry.struct.DataIsDirectory: yield from self._search(pe, entry.directory, level + 1, *parts, identifier) else: rva = entry.data.struct.OffsetToData size = entry.data.struct.Size path = '/'.join(str(p) for p in (*parts, identifier)) extract = None if self.args.pretty: if parts[0] is RSRC.BITMAP: extract = self._handle_bitmap(pe, rva, size) elif parts[0] is RSRC.ICON: extract = self._handle_icon(pe, parts, rva, size) elif parts[0] is RSRC.STRING: extract = self._handle_strings(pe, parts, rva, size) if extract is None: def extract(pe=pe): return pe.get_data(rva, size) yield UnpackResult( path, extract, offset=pe.get_offset_from_rva(rva), lcid=self._get_lcid(entry.data), ) def _get_lcid(self, node_data) -> Optional[str]: try: pid = node_data.lang or 0 sid = node_data.sublang or 0 except AttributeError: return None try: pid = self._LANG_ID_TO_LCID[pid] except KeyError: return None lcid = pid.get(sid, 0) return pemeta._LCID.get(lcid) def _handle_strings(self, pe: pefile.PE, parts: Tuple[RSRC, int, int], rva: int, size: int): def extract(pe=pe): self.log_debug(parts) base = (parts[1] - 1) << 4 reader = StructReader(pe.get_data(rva, size)) table = {} index = 0 while not reader.eof: string = reader.read_exactly(reader.u16() * 2) if not string: break key = F'{base + index:04X}' table[key] = string.decode('utf-16le') index += 1 return json.dumps(table, indent=4).encode(self.codec) return extract def _handle_bitmap(self, pe: pefile.PE, rva: int, size: int): def extract(pe=pe): bitmap = pe.get_data(rva, size) total = (len(bitmap) + 14).to_bytes(4, 'little') return B'BM' + total + B'\0\0\0\0\x36\0\0\0' + bitmap return extract def _handle_icon(self, pe: pefile.PE, parts: Tuple[RSRC, int, int], rva: int, size: int): try: icondir = self._get_icon_dir(pe) index = int(parts[1]) - 1 info = icondir.entries[index] icon = pe.get_data(rva, size) except Exception: return None if icon.startswith(B'(\0\0\0'): header = struct.pack('<HHHBBBBHHII', 0, 1, 1, info.width, info.height, info.color_count, 0, info.planes, info.bit_count, len(icon), 0x16 ) icon = header + icon return icon def unpack(self, data): pe = pefile.PE(data=data, fast_load=True) pe.parse_data_directories( directories=pefile.DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_RESOURCE']) try: rsrc = pe.DIRECTORY_ENTRY_RESOURCE except AttributeError: pass else: yield from self._search(pe, rsrc) def _mktbl(ids: List[Tuple[int, int, int]]) -> Dict[int, Dict[int, int]]: table = {} for pid, sid, lcid in ids: if pid not in table: table[pid] = {0: lcid} table[pid][sid] = lcid return table 
_LANG_ID_TO_LCID = _mktbl([ (0x00, 0x03, 0x0C00), (0x00, 0x05, 0x1400), (0x7F, 0x00, 0x007F), (0x00, 0x00, 0x0000), (0x02, 0x02, 0x0800), (0x00, 0x04, 0x1000), (0x00, 0x01, 0x0400), (0x36, 0x01, 0x0436), (0x1c, 0x01, 0x041C), (0x84, 0x01, 0x0484), (0x5E, 0x01, 0x045E), (0x01, 0x05, 0x1401), (0x01, 0x0f, 0x3C01), (0x01, 0x03, 0x0C01), (0x01, 0x02, 0x0801), (0x01, 0x0B, 0x2C01), (0x01, 0x0D, 0x3401), (0x01, 0x0C, 0x3001), (0x01, 0x04, 0x1001), (0x01, 0x06, 0x1801), (0x01, 0x08, 0x2001), (0x01, 0x10, 0x4001), (0x01, 0x01, 0x0401), (0x01, 0x0A, 0x2801), (0x01, 0x07, 0x1C01), (0x01, 0x0E, 0x3801), (0x01, 0x09, 0x2401), (0x2B, 0x01, 0x042B), (0x4D, 0x01, 0x044D), (0x2C, 0x02, 0x082C), (0x2C, 0x01, 0x042C), (0x45, 0x02, 0x0445), (0x6D, 0x01, 0x046D), (0x2d, 0x01, 0x042D), (0x23, 0x01, 0x0423), (0x1A, 0x08, 0x201A), (0x1A, 0x05, 0x141A), (0x7E, 0x01, 0x047E), (0x02, 0x01, 0x0402), (0x92, 0x01, 0x0492), (0x5C, 0x01, 0x045C), (0x03, 0x01, 0x0403), (0x04, 0x03, 0x0C04), (0x04, 0x05, 0x1404), (0x04, 0x04, 0x1004), (0x04, 0x02, 0x0004), (0x04, 0x01, 0x7C04), (0x83, 0x01, 0x0483), (0x1A, None, 0x001A), (0x1a, 0x04, 0x101A), (0x1a, 0x01, 0x041A), (0x05, 0x01, 0x0405), (0x06, 0x01, 0x0406), (0x8C, 0x01, 0x048C), (0x65, 0x01, 0x0465), (0x13, 0x02, 0x0813), (0x13, 0x01, 0x0413), (0x09, 0x03, 0x0C09), (0x09, 0x0A, 0x2809), (0x09, 0x04, 0x1009), (0x09, 0x09, 0x2409), (0x09, 0x10, 0x4009), (0x09, 0x06, 0x1809), (0x09, 0x08, 0x2009), (0x09, 0x11, 0x4409), (0x09, 0x05, 0x1409), (0x09, 0x0D, 0x3409), (0x09, 0x12, 0x4809), (0x09, 0x07, 0x1c09), (0x09, 0x0B, 0x2C09), (0x09, 0x02, 0x0809), (0x09, 0x01, 0x0409), (0x09, 0x0C, 0x3009), (0x25, 0x01, 0x0425), (0x38, 0x01, 0x0438), (0x64, 0x01, 0x0464), (0x0B, 0x01, 0x040B), (0x0C, 0x02, 0x080c), (0x0C, 0x03, 0x0C0C), (0x0C, 0x01, 0x040c), (0x0C, 0x05, 0x140C), (0x0C, 0x06, 0x180C), (0x0C, 0x04, 0x100C), (0x62, 0x01, 0x0462), (0x56, 0x01, 0x0456), (0x37, 0x01, 0x0437), (0x07, 0x03, 0x0C07), (0x07, 0x01, 0x0407), (0x07, 0x05, 0x1407), (0x07, 0x04, 0x1007), (0x07, 0x02, 0x0807), (0x08, 0x01, 0x0408), (0x6F, 0x01, 0x046F), (0x47, 0x01, 0x0447), (0x68, 0x01, 0x0468), (0x75, 0x01, 0x0475), (0x0D, 0x01, 0x040D), (0x39, 0x01, 0x0439), (0x0E, 0x01, 0x040E), (0x0F, 0x01, 0x040F), (0x70, 0x01, 0x0470), (0x21, 0x01, 0x0421), (0x5D, 0x02, 0x085D), (0x5D, 0x01, 0x045D), (0x3C, 0x02, 0x083C), (0x34, 0x01, 0x0434), (0x35, 0x01, 0x0435), (0x10, 0x01, 0x0410), (0x10, 0x02, 0x0810), (0x11, 0x01, 0x0411), (0x4B, 0x01, 0x044B), (0x3F, 0x01, 0x043F), (0x53, 0x01, 0x0453), (0x86, 0x01, 0x0486), (0x87, 0x01, 0x0487), (0x57, 0x01, 0x0457), (0x12, 0x01, 0x0412), (0x40, 0x01, 0x0440), (0x54, 0x01, 0x0454), (0x26, 0x01, 0x0426), (0x27, 0x01, 0x0427), (0x2E, 0x02, 0x082E), (0x6E, 0x01, 0x046E), (0x2F, 0x01, 0x042F), (0x3E, 0x02, 0x083E), (0x3E, 0x01, 0x043e), (0x4C, 0x01, 0x044C), (0x3A, 0x01, 0x043A), (0x81, 0x01, 0x0481), (0x7A, 0x01, 0x047A), (0x4E, 0x01, 0x044E), (0x7C, 0x01, 0x047C), (0x50, 0x01, 0x0450), (0x50, 0x02, 0x0850), (0x61, 0x01, 0x0461), (0x14, 0x01, 0x0414), (0x14, 0x02, 0x0814), (0x82, 0x01, 0x0482), (0x48, 0x01, 0x0448), (0x63, 0x01, 0x0463), (0x29, 0x01, 0x0429), (0x15, 0x01, 0x0415), (0x16, 0x01, 0x0416), (0x16, 0x02, 0x0816), (0x67, 0x02, 0x0867), (0x46, 0x01, 0x0446), (0x46, 0x02, 0x0846), (0x6B, 0x01, 0x046B), (0x6B, 0x02, 0x086B), (0x6B, 0x03, 0x0C6B), (0x18, 0x01, 0x0418), (0x17, 0x01, 0x0417), (0x19, 0x01, 0x0419), (0x85, 0x01, 0x0485), (0x3B, 0x09, 0x243B), (0x3B, 0x04, 0x103B), (0x3B, 0x05, 0x143B), (0x3B, 0x03, 0x0C3B), (0x3B, 0x01, 0x043B), (0x3B, 0x02, 0x083B), (0x3B, 
0x08, 0x203B), (0x3B, 0x06, 0x183B), (0x3B, 0x07, 0x1C3B), (0x4F, 0x01, 0x044F), (0x1a, 0x07, 0x1C1A), (0x1a, 0x06, 0x181A), (0x1a, 0x03, 0x0C1A), (0x1a, 0x02, 0x081A), (0x6C, 0x01, 0x046C), (0x32, 0x02, 0x0832), (0x32, 0x01, 0x0432), (0x32, 0x01, 0x0459), (0x32, 0x02, 0x0859), (0x5B, 0x01, 0x045B), (0x1b, 0x01, 0x041B), (0x24, 0x01, 0x0424), (0x0A, 0x0b, 0x2C0A), (0x0A, 0x10, 0x400A), (0x0A, 0x0D, 0x340A), (0x0A, 0x09, 0x240A), (0x0A, 0x05, 0x140A), (0x0A, 0x07, 0x1C0A), (0x0A, 0x0C, 0x300A), (0x0A, 0x11, 0x440A), (0x0A, 0x04, 0x100A), (0x0A, 0x12, 0x480A), (0x0A, 0x02, 0x080A), (0x0A, 0x13, 0x4C0A), (0x0A, 0x06, 0x180A), (0x0A, 0x0F, 0x3C0A), (0x0A, 0x0A, 0x280A), (0x0A, 0x14, 0x500A), (0x0A, 0x03, 0x0C0A), (0x0A, 0x01, 0x040A), (0x0A, 0x15, 0x540A), (0x0A, 0x0E, 0x380A), (0x0A, 0x08, 0x200A), (0x41, 0x01, 0x0441), (0x1D, 0x02, 0x081D), (0x1D, 0x01, 0x041D), (0x5A, 0x01, 0x045A), (0x28, 0x01, 0x0428), (0x5F, 0x02, 0x085F), (0x49, 0x01, 0x0449), (0x49, 0x02, 0x0849), (0x44, 0x01, 0x0444), (0x4A, 0x01, 0x044A), (0x1E, 0x01, 0x041E), (0x51, 0x01, 0x0451), (0x73, 0x02, 0x0873), (0x73, 0x01, 0x0473), (0x1F, 0x01, 0x041F), (0x42, 0x01, 0x0442), (0x22, 0x01, 0x0422), (0x2E, 0x01, 0x042E), (0x20, 0x02, 0x0820), (0x20, 0x01, 0x0420), (0x80, 0x01, 0x0480), (0x43, 0x02, 0x0843), (0x43, 0x01, 0x0443), (0x03, 0x02, 0x0803), (0x2A, 0x01, 0x042A), (0x52, 0x01, 0x0452), (0x88, 0x01, 0x0488), (0x78, 0x01, 0x0478), (0x6A, 0x01, 0x046A), ])
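For example, the following sketch (with a placeholder file name) only lists the resource paths instead of extracting them:
ef sample.exe | perc --list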
class pesig
-
Extracts the contents of the IMAGE_DIRECTORY_ENTRY_SECURITY entry of a PE file, i.e. the digital signatures in DER format.
Expand source code Browse git
class pesig(Unit):
    """
    Extracts the contents of the IMAGE_DIRECTORY_ENTRY_SECURITY entry of a PE file,
    i.e. the digital signatures in DER format.
    """
    _SECDIRID = DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_SECURITY']

    def __init__(self): pass

    def process(self, data: bytearray) -> bytearray:
        pe = PE(data=data, fast_load=True)
        pe.parse_data_directories(directories=[self._SECDIRID])
        security = pe.OPTIONAL_HEADER.DATA_DIRECTORY[self._SECDIRID]
        self.log_info(F'signature offset: 0x{security.VirtualAddress:08X}')
        self.log_info(F'signature length: 0x{security.Size:08X}')
        if security.VirtualAddress == 0 or security.Size == 0:
            raise ValueError(F'IMAGE_DIRECTORY_ENTRY_SECURITY ({self._SECDIRID}) is corrupt.')
        sgnoff = security.VirtualAddress + 8
        sgnend = sgnoff + security.Size
        length, revision, certtype = unpack('<IHH', data[sgnoff - 8:sgnoff])
        signature = data[sgnoff:sgnend]
        if len(signature) + 8 != length:
            raise RefineryPartialResult(
                F'Found {len(signature) + 8} bytes of signature, but length should be {length}.',
                partial=signature)
        return signature
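Since the output is a DER blob, it can be piped into the pkcs7 unit for inspection; a sketch with a placeholder file name:
ef signed.exe | pesig | pkcs7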
class pestrip (certificate=False, directories=False, memdump=False)
-
Removes the overlay of a PE file and returns the main executable. Use peoverlay to extract the overlay.
Expand source code Browse git
class pestrip(OverlayUnit):
    """
    Removes the overlay of a PE file and returns the main executable. Use `refinery.peoverlay`
    to extract the overlay.
    """
    def process(self, data: bytearray) -> bytearray:
        size = self._get_size(data)
        try:
            data[size:] = []
        except Exception:
            # the buffer is immutable; fall back to slicing off the overlay
            data = data[:size]
        return data
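A usage sketch with placeholder file names:
ef bloated.exe | pestrip | dump stripped.exe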
class pick (*bounds)
-
Picks sequences from the array of multiple inputs. For example, pick 0 2: will return all but the second ingested input (which has index 1).
Expand source code Browse git
class pick(Unit):
    """
    Picks sequences from the array of multiple inputs. For example, `pick 0 2:` will return
    all but the second ingested input (which has index `1`).
    """
    def __init__(self, *bounds: Arg.Bounds(nargs='*', default=[0])):
        super().__init__(bounds=[sliceobj(s) for s in bounds])

    def process(self, data: Chunk):
        if not data.visible:
            yield data
            return
        state: _PickState = data.temp
        a = state.accessor
        lower = a.start
        upper = a.stop
        if lower is not None:
            lower -= state.discarded
        if upper is not None:
            upper -= state.discarded
        if state.consumed:
            yield from state.remaining[slice(lower, upper, a.step)]
            return
        while lower:
            try:
                chunk = next(state.chunks)
            except StopIteration:
                upper = None
                break
            if chunk.visible:
                lower -= 1
                upper -= 1
                state.discarded += 1
            else:
                yield chunk
        if upper is None:
            yield from state.chunks
            return
        while upper:
            try:
                chunk = next(state.chunks)
            except StopIteration:
                break
            if chunk.visible:
                upper -= 1
                state.discarded += 1
            yield chunk

    def filter(self, chunks: Iterable[Chunk]):
        chunks = begin(chunks)
        if chunks is None:
            return
        container, chunks = chunks
        if container.scope < 1:
            raise RuntimeError(
                F'{self.__class__.__name__} cannot be used outside a frame; maybe you meant to use snip?')
        container = container.copy()
        container.visible = True
        state = _PickState(deque(self.args.bounds), chunks)
        while state.next():
            if not state.consumed:
                if not state.discardable():
                    self.log_debug(F'consumed input into buffer after {state.discarded} skips')
                    for chunk in state.chunks:
                        if not chunk.visible:
                            yield chunk
                            continue
                        state.remaining.append(chunk)
                    state.consumed = True
            container.temp = state
            yield container
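An illustrative sketch inside a frame; here, pick 0 2: keeps everything except the second chunk, and sep joins the surviving chunks with line breaks:
emit alpha beta gamma [| pick 0 2: | sep ]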
class pkcs7
-
Converts PKCS7 encoded data to a JSON representation.
Expand source code Browse git
class pkcs7(Unit):
    """
    Converts PKCS7 encoded data to a JSON representation.
    """
    @Unit.Requires('asn1crypto', 'default', 'extended')
    def _asn1crypto():
        import asn1crypto
        import asn1crypto.cms
        import asn1crypto.core
        import asn1crypto.x509
        return asn1crypto

    def process(self, data: bytes):
        asn1 = self._asn1crypto.core
        cms = self._asn1crypto.cms
        signature = cms.ContentInfo.load(data)

        def unsign(data):
            if isinstance(data, int):
                size = data.bit_length()
                if data < 0:
                    data = (1 << (size + 1)) - ~data - 1
                if data > 0xFFFFFFFF_FFFFFFFF:
                    size, r = divmod(size, 8)
                    size += bool(r)
                    data = data.to_bytes(size, 'big').hex()
                return data
            elif isinstance(data, dict):
                return {key: unsign(value) for key, value in data.items()}
            elif isinstance(data, list):
                return [unsign(x) for x in data]
            else:
                return data

        class SpcString(asn1.Choice):
            _alternatives = [
                ('unicode', asn1.BMPString, {'implicit': 0}),
                ('ascii', asn1.IA5String, {'implicit': 1})
            ]

        SpcUuid = asn1.OctetString

        class SpcSerializedObject(asn1.Sequence):
            _fields = [
                ('classId', SpcUuid),
                ('serializedData', asn1.OctetString),
            ]

        class SpcLink(asn1.Choice):
            _alternatives = [
                ('url', asn1.IA5String, {'implicit': 0}),
                ('monikier', SpcSerializedObject, {'implicit': 1}),
                ('file', SpcString, {'explicit': 2})
            ]

        class SpcSpOpusInfo(asn1.Sequence):
            _fields = [
                ('programName', SpcString, {'optional': True, 'explicit': 0}),
                ('moreInfo', SpcLink, {'optional': True, 'explicit': 1}),
            ]

        class SetOfInfos(asn1.SetOf):
            _child_spec = SpcSpOpusInfo

        cms.CMSAttributeType._map['1.3.6.1.4.1.311.2.1.12'] = 'authenticode_info'
        cms.CMSAttribute._oid_specs['authenticode_info'] = SetOfInfos

        class ParsedASN1ToJSON(BytesAsArrayEncoder):
            unit = self

            @classmethod
            def _is_keyval(cls, obj):
                return (
                    isinstance(obj, dict)
                    and set(obj.keys()) == {'type', 'values'}
                    and len(obj['values']) == 1
                )

            @classmethod
            def handled(cls, obj) -> bool:
                return BytesAsArrayEncoder.handled(obj) or cls._is_keyval(obj)

            def encode_bytes(self, obj: bytes):
                with suppress(Exception):
                    string = obj.decode('latin1')
                    if string.isprintable():
                        return string
                return super().encode_bytes(obj)

            def default(self, obj):
                if self._is_keyval(obj):
                    return dict(type=obj['type'], value=obj['values'][0])
                with suppress(TypeError):
                    return super().default(obj)
                if isinstance(obj, (set, tuple)):
                    return list(obj)
                if isinstance(obj, datetime):
                    return str(obj)
                dict_result = {}
                list_result = None
                if isinstance(obj, self.unit._asn1crypto.x509.Certificate):
                    dict_result.update(fingerprint=obj.sha1.hex())
                if isinstance(obj, asn1.BitString):
                    return {'bit_string': obj.native}
                with suppress(Exception):
                    list_result = list(obj)
                    if all(isinstance(k, str) for k in list_result):
                        dict_result.update((key, obj[key]) for key in list_result)
                if dict_result:
                    return dict_result
                if list_result is not None:
                    return list_result
                if isinstance(obj, self.unit._asn1crypto.cms.CertificateChoices):
                    return obj.chosen
                if isinstance(obj, asn1.Sequence):
                    children = obj.children
                    if children:
                        return children
                    return obj.dump()
                with suppress(Exception):
                    return obj.native
                if isinstance(obj, asn1.Any):
                    parsed = None
                    with suppress(Exception):
                        parsed = obj.parse()
                    if parsed:
                        return parsed
                    return obj.dump()
                if isinstance(obj, asn1.Asn1Value):
                    return obj.dump()
                raise ValueError(F'Unable to determine JSON encoding of {obj.__class__.__name__} object.')

        with ParsedASN1ToJSON as encoder:
            encoded = encoder.dumps(signature)
        converted = unsign(json.loads(encoded))
        return json.dumps(converted, indent=4).encode(self.codec)
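Because the unit emits JSON text, its output can be consumed directly in Python, mirroring how pemeta uses it internally (signature.der is a placeholder file name):
>>> from refinery.shell import *
>>> import json
>>> info = open('signature.der', 'rb').read() | pkcs7 | json.loads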
class pkcs7sig (tabular=False)
-
Converts PKCS7 encoded signatures into a human-readable JSON representation. This can be used to parse authenticode signatures appended to files that are not PE files to get the same output that is produced by the pemeta unit.
Expand source code Browse git
class pkcs7sig(Unit):
    """
    Converts PKCS7 encoded signatures into a human-readable JSON representation. This can be
    used to parse authenticode signatures appended to files that are not PE files to get the
    same output that is produced by the pemeta unit.
    """
    def __init__(self, tabular: Arg('-t', help='Print information in a table rather than as JSON') = False):
        super().__init__(tabular=tabular)

    def process(self, data: bytes):
        json = pemeta.parse_signature(data)
        yield from ppjson(tabular=self.args.tabular)._pretty_output(json, indent=4, ensure_ascii=False)
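A sketch for a signature blob stored in a separate file (placeholder name):
ef authenticode.p7s | pkcs7sig -t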
class pop (*names)
-
In processing order, remove visible chunks from the current frame and store their contents in the given meta variables. All chunks in the input stream are consequently made visible again. If pop is used at the end of a frame, then variables will be local to the parent frame.
Expand source code Browse git
class pop(Unit):
    """
    In processing order, remove visible chunks from the current frame and store their contents
    in the given meta variables. All chunks in the input stream are consequently made visible
    again. If pop is used at the end of a frame, then variables will be local to the parent frame.
    """
    def __init__(
        self,
        *names: Arg(type=str, metavar=F'[name[:conversion]|count|{_popcount._MERGE_SYMBOL}]', help=(
            R'Specify either the name of a single variable to receive the contents of an input chunk, or '
            R'an integer expression that specifies a number of values to be removed from the input without '
            F'storing them. Additionally, it is possible to specify the symbol "{_popcount._MERGE_SYMBOL}" '
            R'to remove a single chunk from the input and merge its meta data into the following ones. By '
            R'default, a single merge is performed. When a variable name is specified, a sequence of '
            R'transformations can be appended to be applied before storing it. For example, the argument '
            R'k:le:b64 would first decode the chunk using base64, then convert it to an integer in little '
            R'endian format, and store the integer result in the variable `k`. The visual aid is that the '
            R'content is passed from right to left through all conversions, into the variable `k`.'
        ))
    ):
        if not names:
            names = _popcount._MERGE_SYMBOL,
        super().__init__(names=[_popcount(n) for n in names])

    def process(self, data):
        return data

    def filter(self, chunks: Iterable[Chunk]):
        invisible = []
        variables = {}
        remaining: Iterator[_popcount] = iter(self.args.names)
        it = iter(chunks)
        pop = next(remaining).reset()
        done = False

        for chunk in it:
            if not chunk.visible:
                self.log_debug('buffering invisible chunk')
                invisible.append(chunk)
                continue
            try:
                while not pop.into(variables, chunk):
                    pop = next(remaining).reset()
            except StopIteration:
                done = True
                invisible.append(chunk)
                break

        if not done and pop.done:
            try:
                next(remaining)
            except StopIteration:
                done = True
        if not done:
            raise ValueError('Not all variables could be assigned.')

        nesting = self.args.nesting
        for chunk in chain(invisible, it):
            meta = chunk.meta
            meta.update(variables)
            if nesting < 0:
                for name in variables:
                    meta.set_scope(name, chunk.scope + nesting)
            chunk.visible = True
            yield chunk
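For example, this illustrative sketch discards the first chunk by count, stores the second one in the variable x, and formats the remaining chunk with the cfmt unit; the expected output is B/C:
emit A B C [| pop 1 x | cfmt {x}/{} ]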
class ppjscript (indent=4)
-
Pretty-prints JavaScript without any reflection or evaluation.
Expand source code Browse git
class ppjscript(Unit):
    """
    Pretty-prints JavaScript without any reflection or evaluation.
    """
    def __init__(self, indent: Arg.Number('-i', help=(
        'Controls the amount of space characters used for indentation in the output. Default is 4.')) = 4
    ):
        return super().__init__(indent=indent)

    @Unit.Requires('jsbeautifier', 'display', 'extended')
    def _jsb():
        import jsbeautifier
        import jsbeautifier.unpackers.javascriptobfuscator
        # TODO: This is a workaround for the following bug:
        # https://github.com/beautify-web/js-beautify/issues/1350
        jsbeautifier.unpackers.javascriptobfuscator.detect = lambda *_: False
        return jsbeautifier

    @unicoded
    def process(self, data: str) -> str:
        return self._jsb.beautify(data, dict(eval_code=False, indent_size=self.args.indent))
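A usage sketch with a placeholder file name, using two-space indentation:
ef obfuscated.js | ppjscript -i 2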
class ppjson (tabular=False, indent=4)
-
Expects JSON input data and outputs it in a neatly formatted manner. If the indentation is set to zero, the output is minified.
Expand source code Browse git
class ppjson(Unit):
    """
    Expects JSON input data and outputs it in a neatly formatted manner.
    If the indentation is set to zero, the output is minified.
    """
    _TRAILING_COMMA = re.compile(BR',\s*(}|])')

    def __init__(
        self,
        tabular: Arg.Switch('-t', group='OUT', help='Convert JSON input into a flattened table.') = False,
        indent : Arg.Number('-i', group='OUT', help='Number of spaces used for indentation. Default is {default}.') = 4
    ):
        return super().__init__(indent=indent, tabular=tabular)

    def _pretty_output(self, parsed, **kwargs):
        if self.args.tabular:
            table = list(flattened(parsed))
            width = max(len(key) for key, _ in table)
            tsize = get_terminal_size(80) - width - 4
            for key, value in table:
                value = str(value).rstrip()
                value = textwrap.wrap(value, tsize)
                it = iter(value)
                try:
                    item = next(it)
                except StopIteration:
                    continue
                yield F'{key:<{width}} : {item}'.encode(self.codec)
                for wrap in it:
                    yield F'{"":<{width + 3}}{wrap}'.encode(self.codec)
        else:
            yield json.dumps(parsed, **kwargs).encode(self.codec)

    def process(self, data):
        if self._TRAILING_COMMA.search(data):
            def smartfix(match):
                k = match.start()
                return match.group(0 if any(k in s for s in strings) else 1)
            from refinery.lib.patterns import formats
            strings = {range(*m.span()) for m in formats.string.finditer(data)}
            data = self._TRAILING_COMMA.sub(smartfix, data)
        kwargs = {'indent': self.args.indent} if self.args.indent else {'separators': (',', ':')}
        yield from self._pretty_output(json.loads(data), **kwargs)
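Because zero indentation minifies, a quick sketch from the Python shell might look as follows (the output shown is approximate):
>>> from refinery.shell import *
>>> b'{"key": [1, 2, 3,],}' | ppjson('-i', '0') | str
'{"key":[1,2,3]}'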
class ppxml (indent=4, header=False)
-
Expects XML input data and outputs it in a neatly formatted manner.
Expand source code Browse git
class ppxml(Unit):
    """
    Expects XML input data and outputs it in a neatly formatted manner.
    """
    def __init__(
        self,
        indent: Arg.Number('-i', help=(
            'Controls the amount of space characters used for indentation in the output. Default is 4.')) = 4,
        header: Arg.Switch('-x', help='Add an XML header to the formatted output.') = False
    ):
        super().__init__(indent=indent, header=header)

    def process(self, data):
        pad = self.args.indent * ' '
        etm = {}
        try:
            dom = ForgivingParse(data, etm)
        except Exception:
            from refinery.lib.meta import metavars
            msg = 'error parsing as XML, returning original content'
            path = metavars(data).get('path')
            if path:
                msg = F'{msg}: {path}'
            self.log_warn(msg)
            return data

        def indent(element, level=0, more_sibs=False):
            """
            The credit for this one goes to: https://stackoverflow.com/a/12940014
            """
            indentation = '\n'
            if level:
                indentation += (level - 1) * pad
            childcount = len(element)
            if childcount:
                if not element.text or not element.text.strip():
                    element.text = indentation + pad
                    if level:
                        element.text += pad
                for count, child in enumerate(element):
                    indent(child, level + 1, count < childcount - 1)
                if level and (not element.tail or element.tail.isspace()):
                    element.tail = indentation
                    if more_sibs:
                        element.tail += pad
            elif level and (not element.tail or element.tail.isspace()):
                element.tail = indentation
                if more_sibs:
                    element.tail += pad

        indent(dom.getroot())

        with io.BytesIO() as output:
            dom.write(output, encoding=self.codec, xml_declaration=self.args.header)
            result = output.getvalue()
        for uid, key in etm.items():
            entity = F'&{key};'.encode(self.codec)
            needle = uid.encode(self.codec)
            result = result.replace(needle, entity)
        return result
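A usage sketch with a placeholder file name, emitting an XML declaration and two-space indentation:
ef config.xml | ppxml -x -i 2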
class ps1str
-
Escapes and unescapes PowerShell strings.
Expand source code Browse git
class ps1str(Unit):
    """
    Escapes and unescapes PowerShell strings.
    """
    UNESCAPE = {
        '`0': '\0',
        '`a': '\a',
        '`b': '\b',
        '`f': '\f',
        '`n': '\n',
        '`r': '\r',
        '`t': '\t',
        '`v': '\v',
        '``': '`',
        "`'": '\'',
        '`"': '\"',
    }
    ESCAPE = {
        '`' : '``',
        '$' : '`$',
        '\0': '`0',
        '\a': '`a',
        '\b': '`b',
        '\f': '`f',
        '\n': '`n',
        '\r': '`r',
        '\t': '`t',
        '\v': '`v',
        '\'': "`'",
        '\"': '""',
    }

    def __init__(self): pass

    @unicoded
    def process(self, data):
        match = re.fullmatch(R'''@(['"])\s*?[\r\n](.*?)[\r\n]\1@''', data, flags=re.DOTALL)
        if match:
            return match.group(2)
        if data[0] not in '\'\"' or data[-1] != data[0]:
            raise ValueError(
                'No quotes found at beginning of input. To escape a PowerShell string, the '
                'quotes must be included because quote escaping depends on whether a single '
                'or double quote was used.')
        quote, data = data[0], data[1:-1]

        def unescape(match):
            string = match[0]
            return self.UNESCAPE.get(string, string[1:])

        if quote == '"':
            if re.search(R'(?<!`)\$(?=[\w\(\{\$\?\^:])', data):
                self.log_warn('Loss of information: double quoted string contains variable substitutions.')
            data = re.sub('`.', unescape, data)
        return data.replace(quote + quote, quote)

    @unicoded
    def reverse(self, data):
        def escaper(match):
            char = match[0]
            return ps1str.ESCAPE.get(char, char)
        return '"{}"'.format(re.sub(R'''[\x00\x07-\x0D`$'"]''', escaper, data))
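A quick sketch from the Python shell; note that the surrounding quotes must be part of the input:
>>> from refinery.shell import *
>>> b"'It''s a PowerShell string'" | ps1str | str
"It's a PowerShell string"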
class push (data=b'')
-
The unit inserts an additional chunk before each input chunk and moves the original data out of scope. This chunk is considered the "original" data, while the one inserted in front of it is used as an intermediate result. By default, this intermediate data is a copy of the input data. For example:
emit key=value | push [[| rex =(.*)$ {1} | pop v ]| repl var:v censored ]
will output key=censored. The application of rex turns the (duplicated) data into just the value, which is then stored in the variable v. The application of repl replaces this value with the hard-coded string censored.
class push(Unit): """ The unit inserts an additional chunk before each input chunk and moves the original data out of scope. This chunk is considered the "original" data, while the one inserted in front of it is used as an intermediate result. By default, this intermediate data is a copy of the input data. For example: emit key=value | push [[| rex =(.*)$ {1} | pop v ]| repl var:v censored ] will output `key=censored`. The application of `refinery.rex` turns the (duplicated) data into just the value, which is then stored in the variable `v`. The application of `refinery.repl` replaces this value with the hard-coded string `censored`. """ def __init__(self, data: Arg(help='The data to be pushed, by default a copy of the input.') = B''): super().__init__(data=data) def process(self, data: Chunk): src = self.args.data tos = data.copy(meta=True, data=False) tos[:] = src or data if self.args.nesting > 0: data.set_next_scope(False) else: try: data.visible = False except AttributeError: self.log_warn('application has no effect outside frame.') yield data yield tos
class put (name, value=<object object>)
-
Can be used to add a meta variable to the processed chunk. Note that meta variables cease to exist outside a frame.
class put(Unit): """ Can be used to add a meta variable to the processed chunk. Note that meta variables cease to exist outside a frame. """ def __init__( self, name : Arg(help='The name of the variable to be used.', type=str), value: Arg(help='The value for the variable. If no value is given, the entire current chunk is stored.', type=functools.partial(numseq, typecheck=False)) = _EMPTY ): super().__init__(name=check_variable_name(name), value=value) def process(self, data: Chunk): value = self.args.value if value is _EMPTY: value = data if not isinstance(value, (int, float)) and not isbuffer(value): try: len(value) except TypeError: if isinstance(value, itertools.repeat): value = next(value) if not isinstance(value, (int, float)): raise NotImplementedError(F'put does not support {value.__class__.__name__} values.') else: if not isinstance(value, list): value = list(value) self.log_debug(F'storing {typename(value)}:', value, clip=True) data.meta[self.args.name] = value return data
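A hedged sketch of how a stored variable can be consumed later in the same frame. It assumes the ccp unit (which prepends its argument) and is untested: put('t') stores the whole chunk in t, rev() reverses the copy, and ccp prepends the original again.
>>> emit('abc') [ put('t') | rev() | ccp('var:t') ] | str
'abccba'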
class pyc (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
Decompiles Python bytecode (PYC) files back to source code. A known limitation is that it does not work on recent Python versions, but anything below 3.9 should work.
class pyc(ArchiveUnit): """ Decompiles Python bytecode (PYC) files back to source code. A known limitation is that it does not work on recent Python versions, but anything below 3.9 should work. """ def unpack(self, data): input_path = metavars(data).get(self.args.path.decode(self.codec)) for k, code in enumerate(extract_code_from_buffer(bytes(data), input_path)): path = code.container.co_filename or F'__unknown_name_{k:02d}.py' date = datetime.fromtimestamp(code.timestamp) data = decompile_buffer(code) yield self._pack(path, date, data)
class pym
-
Converts Python-Marshaled code objects to the PYC (Python Bytecode) format. If the code targets an older Python version, you can use the pyc unit to then decompile the code, but for more recent versions a separate Python decompiler will be required.
WARNING: This unit will invoke the marshal.loads function, which may be unsafe. Please refer to the official Python documentation for more details.
class pym(Unit): """ Converts Python-Marshaled code objects to the PYC (Python Bytecode) format. If it is an older Python version, you can use the `refinery.pyc` unit to then decompile the code, but for more recent versions a separate Python decompiler will be required. WARNING: This unit will invoke the `marshal.loads` function, which may be unsafe. Please refer to the official Python documentation for more details. """ def reverse(self, data): return marshal.dumps(data) def process(self, data): data = marshal.loads(data) code = (lambda: 0).__code__.__class__ def toblob(data): if isinstance(data, (bytes, bytearray)): self.log_info(U'unmarshalled a byte string, returning as is') return data if isinstance(data, str): self.log_info(F'unmarshalled a string object, encoding as {self.codec}') return data.encode(self.codec) if isinstance(data, code): self.log_info(U'unmarshalled a code object, converting to pyc') import importlib return importlib._bootstrap_external._code_to_timestamp_pyc(data) if isinstance(data, int): self.log_info(U'unmarshalled an integer, returning big endian encoding') q, r = divmod(data.bit_length(), 8) q += int(bool(r)) return data.to_bytes(q, 'big') if isinstance(data, dict): try: import json serialized = json.dumps(data, indent=4) except Exception: pass else: self.log_info(U'unmarshalled a serializable dictionary, returning JSON') return serialized.encode(self.codec) raise NotImplementedError( F'No serialization implemented for object of type {data.__class__.__name__}') if isinstance(data, list): self.log_info('object is a list, converting each item individually') for item in data: yield toblob(item) else: yield toblob(data)
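An illustrative round trip through marshal (untested; it assumes emit accepts a raw byte string argument). Per the integer branch above, 42 is rendered as its big-endian encoding, the single byte 0x2A:
>>> import marshal
>>> emit(marshal.dumps(42)) | pym | str
'*'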
class qb (*data)
-
Short for "queue back": Insert new chunks at the end of the current frame.
class qb(QueueUnit): """ Short for "queue back": Insert new chunks at the end of the current frame. """ def filter(self, chunks: Iterable[Chunk]): yield from self._queue(chunks, False)
class qf (*data)
-
Short for "queue front": Insert new chunks at the beginning of the current frame.
class qf(QueueUnit): """ Short for "queue front": Insert new chunks at the beginning of the current frame. """ def filter(self, chunks: Iterable[Chunk]): yield from self._queue(chunks, True)
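A hedged sketch of queueing a chunk at the front of a frame (untested; assuming qf inserts the given data before the existing chunk, the output is the concatenation of both):
>>> emit('world') [ qf('hello ') ] | str
'hello world'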
class qlz
-
This unit implements QuickLZ decompression levels 1 and 3.
class qlz(Unit): """ This unit implements QuickLZ decompression levels 1 and 3. """ def process(self, data): source = memoryview(data) head = source[0] clvl = (head >> 2) & 0x3 if head & 2: self.log_info('long header detected') size = int.from_bytes(source[5:9], 'little') source = source[9:] else: self.log_info('short header detected') size = source[3] source = source[3:] if head & 1 != 1: self.log_warn('header indicates that data is uncompressed, returning remaining data') return source else: self.log_info(F'compression level {clvl}, decompressed size {SizeInt(size)!r}') def fetchhash(): return int.from_bytes(destination[hashvalue + 1:hashvalue + 4], byteorder='little') codeword = 1 destination = bytearray() hashtable = [0] * _HASH_VALUES hashvalue = -1 last_matchstart = size - _UNCONDITIONAL_MATCHLEN - _UNCOMPRESSED_END - 1 fetch = 0 if clvl == 2: raise ValueError("This version only supports level 1 and 3") while source: if codeword == 1: codeword = int.from_bytes(source[:4], byteorder='little') source = source[4:] if len(destination) <= last_matchstart: c = 3 if clvl == 1 else 4 fetch = int.from_bytes(source[:c], byteorder='little') if codeword & 1: codeword = codeword >> 1 if clvl == 1: hash = (fetch >> 4) & 0xFFF offset = hashtable[hash] if fetch & 0xF: matchlen = (fetch & 0xF) + 2 source = source[2:] else: matchlen = source[2] source = source[3:] else: if (fetch & 3) == 0: delta = (fetch & 0xFF) >> 2 matchlen = 3 source = source[1:] elif (fetch & 2) == 0: delta = (fetch & 0xFFFF) >> 2 matchlen = 3 source = source[2:] elif (fetch & 1) == 0: delta = (fetch & 0xFFFF) >> 6 matchlen = ((fetch >> 2) & 15) + 3 source = source[2:] elif (fetch & 127) != 3: delta = (fetch >> 7) & 0x1FFFF matchlen = ((fetch >> 2) & 0x1F) + 2 source = source[3:] else: delta = fetch >> 15 matchlen = ((fetch >> 7) & 255) + 3 source = source[4:] offset = (len(destination) - delta) & 0xFFFFFFFF for i in range(offset, offset + matchlen): destination.append(destination[i]) if clvl == 1: fetch = fetchhash() while hashvalue < len(destination) - matchlen: hashvalue += 1 hash = ((fetch >> 12) ^ fetch) & _HASH_MASK hashtable[hash] = hashvalue fetch = fetch >> 8 & 0xFFFF try: fetch |= destination[hashvalue + 3] << 16 except IndexError: pass fetch = int.from_bytes(source[:3], byteorder='little') else: fetch = int.from_bytes(source[:4], byteorder='little') hashvalue = len(destination) - 1 else: if len(destination) <= last_matchstart: destination.append(source[0]) source = source[1:] codeword = codeword >> 1 if clvl == 1: while hashvalue < len(destination) - 3: fetch2 = fetchhash() hashvalue += 1 hash = ((fetch2 >> 12) ^ fetch2) & _HASH_MASK hashtable[hash] = hashvalue fetch = fetch >> 8 & 0xFFFF | source[2] << 16 else: fetch = fetch >> 8 & 0xFFFF fetch |= source[2] << 16 fetch |= source[3] << 24 else: while len(destination) <= size - 1: if codeword == 1: source = source[4:] codeword = 0x80000000 destination.append(source[0]) source = source[1:] codeword = codeword >> 1 break if len(destination) != size: raise RefineryPartialResult( F'Header indicates decompressed size 0x{size:X}, but 0x{len(destination):X} bytes ' F'were decompressed.', destination) return destination
class rabbit (key, discard=0, stateful=False, iv=b'')
-
RABBIT encryption and decryption.
class rabbit(StreamCipherUnit): """ RABBIT encryption and decryption. """ key_size = {16} def __init__(self, key, discard=0, stateful=False, iv: Arg('-i', '--iv', help='Optional initialization vector.') = B''): super().__init__(key=key, iv=iv, stateful=stateful, discard=discard) def keystream(self) -> Iterable[int]: if len(self.args.iv) not in (0, 8): raise ValueError('The IV length must be exactly 8 bytes.') return RabbitCipher(self.args.key, self.args.iv)
class rc2 (key, iv=b'', *, eks=1024, derive_eks=False, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)
-
RC2 encryption and decryption.
class rc2(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(ARC2)): """ RC2 encryption and decryption. """ def __init__( self, key, iv=b'', *, eks: Arg.Number('-k', '--eks', group='EKS', help='Set the effective key size. Default is {default}.') = 1024, derive_eks: Arg.Switch('-d', '--dks', group='EKS', help='Act as .NET and derive the effective key size from the key length.') = False, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0, **keywords ): super().__init__( key, iv, eks=eks, derive_eks=derive_eks, padding=padding, mode=mode, raw=raw, little_endian=little_endian, segment_size=segment_size, mac_len=mac_len, assoc_len=assoc_len, **keywords ) def _new_cipher(self, **optionals) -> CipherInterface: eks = len(self.args.key) * 8 if self.args.derive_eks else self.args.eks optionals.update(effective_keylen=eks) return super()._new_cipher(**optionals)
class rc4 (key, discard=0)
-
RC4 encryption and decryption.
class rc4(StandardCipherUnit, cipher=PyCryptoFactoryWrapper(ARC4)): """ RC4 encryption and decryption. """ def __init__( self, key, discard: Arg.Number('-d', help='Discard the first {varname} bytes of the keystream, {default} by default.') = 0, ): super().__init__(key, discard=discard) def _new_cipher(self, **optionals): return super()._new_cipher(drop=self.args.discard, **optionals)
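Since RC4 is a symmetric stream cipher, applying the unit twice with the same key is a round trip; a quick sketch with an arbitrary key:
>>> emit('attack at dawn') | rc4('swordfish') | rc4('swordfish') | str
'attack at dawn'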
class rc4mod (key, stateful=False, discard=0, *, size=256)
-
Implements a modified version of the RC4 stream cipher where the size of the RC4 SBox can be altered.
class rc4mod(StreamCipherUnit): """ Implements a modified version of the RC4 stream cipher where the size of the RC4 SBox can be altered. """ def __init__( self, key, stateful=False, discard=0, *, size: Arg.Number('-t', help='Table size, {default} by default.', bound=(1, None)) = 0x100 ): super().__init__(key=key, stateful=stateful, discard=discard, size=size) def keystream(self): size = self.args.size tablerange = range(max(size, 0x100)) b, table = 0, bytearray(k & 0xFF for k in tablerange) for a, keybyte in zip(tablerange, cycle(self.args.key)): t = table[a] b = (b + keybyte + t) % size table[a] = table[b] table[b] = t self.log_debug(lambda: F'SBOX = {table.hex(" ").upper()}', clip=True) b, a = 0, 0 while True: a = (a + 1) % size t = table[a] b = (b + t) % size table[a] = table[b] table[b] = t yield table[(table[a] + t) % size]
class rc5 (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, rounds=12, word_size=32, assoc_len=0, mac_len=0)
-
RC5 encryption and decryption.
class rc5(StandardBlockCipherUnit, cipher=BlockCipherFactory(RC5)): """ RC5 encryption and decryption. """ def __init__( self, key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, rounds : Arg.Number('-k', help='Number of rounds to use, the default is {default}') = _R, word_size : Arg.Number('-w', help='The word size in bits, {default} by default.') = _W, **more ): super().__init__( key, iv, padding=padding, mode=mode, raw=raw, little_endian=little_endian, segment_size=segment_size, rounds=rounds, word_size=word_size, **more ) @property def block_size(self): return self.args.word_size // 4 def _new_cipher(self, **optionals) -> CipherInterface: return super()._new_cipher( rounds=self.args.rounds, word_size=self.args.word_size, **optionals )
class rc6 (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, rounds=20, word_size=32)
-
RC6 encryption and decryption. The parameter defaults are the RC6 parameters that were chosen for the AES candidacy. Only key sizes of 128, 192, and 256 bits are used for AES candidates, but the unit will allow any key size up to 256 bits.
class rc6(StandardBlockCipherUnit, cipher=BlockCipherFactory(RC6)): """ RC6 encryption and decryption. The parameter defaults are the RC6 parameters that were chosen for the AES candidacy. Only key sizes of 128, 192, and 256 bits are used for AES candidates, but the unit will allow any key size up to 256 bits. """ def __init__( self, key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, rounds : Arg.Number('-k', help='Number of rounds to use, the default is {default}') = _R, word_size : Arg.Number('-w', help='The word size in bits, {default} by default.') = _W, ): super().__init__( key, iv, padding=padding, mode=mode, raw=raw, little_endian=little_endian, segment_size=segment_size, rounds=rounds, word_size=word_size ) @property def block_size(self): return self.args.word_size // 2 def _new_cipher(self, **optionals) -> CipherInterface: return super()._new_cipher( rounds=self.args.rounds, word_size=self.args.word_size, **optionals )
class recode (decode=None, encode='UTF8', decerr=None, encerr=None, errors=None)
-
Expects input string data encoded in the from encoding and encodes it in the to encoding, then outputs the result.
class recode(Unit): """ Expects input string data encoded in the `from` encoding and encodes it in the `to` encoding, then outputs the result. """ def __init__( self, decode: Arg(metavar='decode-as', type=str, help='Input encoding; Guess encoding by default.') = None, encode: Arg(metavar='encode-as', type=str, help=F'Output encoding; The default is {Unit.codec}.') = Unit.codec, decerr: Arg.Option('-d', choices=Handler, help='Specify an error handler for decoding.') = None, encerr: Arg.Option('-e', choices=Handler, help='Specify an error handler for encoding.') = None, errors: Arg.Option('-E', choices=Handler, help=( 'Specify an error handler for both encoding and decoding. ' 'The possible choices are the following: {choices}')) = None, ): super().__init__( decode=decode, encode=encode, decerr=Arg.AsOption(decerr or errors or 'STRICT', Handler).value, encerr=Arg.AsOption(encerr or errors or 'STRICT', Handler).value ) @Unit.Requires('chardet', 'default', 'extended') def _chardet(): import chardet return chardet def _detect(self, data): mv = memoryview(data) if not any(mv[1::2]): return 'utf-16le' if not any(mv[0::2]): return 'utf-16be' detection = self._chardet.detect(data) codec = detection['encoding'] self.log_info(lambda: F'Using input encoding: {codec}, detected with {int(detection["confidence"] * 100)}% confidence.') return codec def _recode(self, enc, dec, encerr, decerr, data): dec = dec or self._detect(data) return codecs.encode(codecs.decode(data, dec, errors=decerr), enc, errors=encerr) def reverse(self, data): return self._recode(self.args.decode, self.args.encode, self.args.decerr, self.args.encerr, data) def process(self, data): return self._recode(self.args.encode, self.args.decode, self.args.encerr, self.args.decerr, data)
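For illustration (hedged, untested): transcoding UTF-8 input to UTF-16LE interleaves each ASCII character with a null byte.
>>> emit('hi') | recode('utf8', 'utf-16le') | str
'h\x00i\x00'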
class reduce (suffix, just=0, temp='t')
-
The reduce unit applies an arbitrary multibin suffix repeatedly to reduce a complete frame to a single chunk. The first chunk in the frame serves as initialization.
class reduce(Unit): """ The reduce unit applies an arbitrary multibin suffix repeatedly to reduce a complete frame to a single chunk. The first chunk in the frame serves as initialization. """ def __init__(self, suffix: Arg(type=str, help=( 'The remaining command line is a multibin suffix. The reduction accumulator is initialized ' 'with the first chunk in the frame. Then, each remaining chunk is processed with the given ' 'suffix and the result is used to overwrite the accumulator.' )), just: Arg.Number('-j', help='Optionally specify a maximum number of chunks to process beyond the first.') = 0, temp: Arg.String('-t', metavar='name', help='The name of the accumulator variable. The default is "{default}".') = 't', ): super().__init__(suffix=suffix, temp=temp, just=just) def filter(self, chunks: Iterable[Chunk]): it = iter(chunks) just = self.args.just name = self.args.temp accu = next(it) if not just: scope = it else: import itertools self.log_info(F'reducing only the next {just} chunks') scope = itertools.islice(it, 0, just) for chunk in scope: chunk.meta[name] = accu accu[:] = DelayedBinaryArgument(self.args.suffix, reverse=True, seed=chunk)(chunk) self.log_debug('reduced:', accu, clip=True) accu.meta.discard(name) yield accu yield from it
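A hedged concatenation sketch: assuming the ccp handler prepends the accumulator variable t to each chunk, the frame reduces to a single joined chunk (untested):
>>> emit('A', 'B', 'C') [ reduce('ccp[var:t]') ] | str
'ABC'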
class rep (count=2, label=None)
-
Duplicates the given input a given number of times. It is also possible to specify an iterable instead of a number, in which case the input will be replicated once for each item in this iterable.
class rep(Unit): """ Duplicates the given input a given number of times. It is also possible to specify an iterable instead of a number, in which case the input will be replicated once for each item in this iterable. """ def __init__( self, count: Arg.NumSeq(help=( 'Defines the number of outputs to generate for each input. The default is {default}. ' 'You can specify any multibin expression that defines an integer iterable here: Each ' 'input chunk will be replicated once for each element of that sequence.')) = 2, label: Arg(type=str, help=( 'If specified, the meta variable with this name will be populated with the index of ' 'the replicated chunk. When the count parameter is an integer, this label will be ' 'equivalent to the index meta variable.')) = None ): super().__init__(count=count, label=label) def process(self, data: bytes): def count(): count = self.args.count if isinstance(count, int): return count return sum(1 for _ in count) if self.args.squeeze or not self._framed: self.log_debug('compressing all repeated items into a single chunk') yield data * count() return self.log_debug('emitting each repeated item as an individual chunk') label = self.args.label if label is None: yield from repeat(data, count()) return meta = {} for counter in self.args.count: meta[label] = counter yield self.labelled(data, **meta)
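A small sketch combining rep with sep inside a frame (same refinery.shell import assumed):
>>> emit('na') [ rep(4) | sep('-') ] | str
'na-na-na-na'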
class repl (search, replace=b'', count=-1)
-
Performs a simple binary string replacement on the input data.
class repl(Unit): """ Performs a simple binary string replacement on the input data. """ def __init__( self, search : Arg(help='This is the search term.'), replace: Arg(help='The substitution string. Leave this empty to remove all occurrences of the search term.') = B'', count : Arg.Number('-n', help='Only replace the given number of occurrences') = -1 ): super().__init__(search=search, replace=replace, count=count) def process(self, data: bytes): return data.replace( self.args.search, self.args.replace, self.args.count )
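A minimal illustrative example:
>>> emit('hello world') | repl('world', 'there') | str
'hello there'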
class resplit (regex=b'\\r?\\n', multiline=False, ignorecase=False, count=0)
-
Splits the data at the given regular expression and returns the sequence of chunks between the separators. By default, the input is split along line breaks.
class resplit(SingleRegexUnit): """ Splits the data at the given regular expression and returns the sequence of chunks between the separators. By default, the input is split along line breaks. """ def __init__( self, regex=RB'\r?\n', multiline=False, ignorecase=False, count=0 ): super().__init__(regex=regex, multiline=multiline, ignorecase=ignorecase, count=count) def process(self, data): view = memoryview(data) cursor = 0 count = self.args.count for k, match in enumerate(self.regex.finditer(view), 2): yield view[cursor:match.start()] cursor = match.end() yield from match.groups() if k > count > 0: break yield view[cursor:]
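A hedged frame example that splits on a dash and rejoins the pieces with a colon (untested):
>>> emit('a-b-c') [ resplit('-') | sep(':') ] | str
'a:b:c'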
class resub (regex='\\s+', subst=b'', multiline=False, ignorecase=False, count=0)
-
A unit for performing substitutions based on a binary regular expression pattern. Besides the syntax {k} to insert the k-th match group, the unit supports processing the contents of match groups with arbitrary refinery units. To do so, use the following F-string-like syntax:
{match-group:handlers}
where :handlers is an optional reverse multibin expression that is used to post-process the binary data from the match. For example, {2:hex:b64} represents the base64-decoding of the hex-decoding of the second match group.
class resub(SingleRegexUnit): """ A unit for performing substitutions based on a binary regular expression pattern. Besides the syntax `{k}` to insert the `k`-th match group, the unit supports processing the contents of match groups with arbitrary refinery units. To do so, use the following F-string-like syntax: {match-group:handlers} where `:handlers` is an optional reverse multibin expression that is used to post-process the binary data from the match. For example, `{2:hex:b64}` represents the base64-decoding of the hex-decoding of the second match group. """ def __init__( self, regex: Arg(help='Regular expression to be searched and replaced. The default is "{default}".') = '\\s+', subst: Arg('subst', help=( 'Substitution value: use {1} for group 1, {0} for entire match. Matches are removed ' '(replaced by an empty string) by default.' )) = B'', multiline=False, ignorecase=False, count=0 ): super().__init__(regex=regex, subst=subst, multiline=multiline, ignorecase=ignorecase, count=count) def process(self, data): def repl(match: Match): return meta.format_bin(spec, self.codec, [match[0], *match.groups()], match.groupdict()) self.log_info('pattern:', getattr(self.regex, 'pattern', self.regex)) self.log_info('replace:', self.args.subst) meta = metavars(data) spec = self.args.subst.decode('ascii', 'backslashreplace') substitute = self.regex.sub if self.args.count: from functools import partial substitute = partial(substitute, count=self.args.count) return substitute(repl, data)
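An untested sketch of the handler syntax: each run of word characters is hex-decoded in place via the hex handler.
>>> emit('4142 4344') | resub('(\\w+)', '{1:hex}') | str
'AB CD'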
class rev (blocksize=None)
-
The blocks of the input data are output in reverse order. If the length of the input data is not a multiple of the block size, the data is truncated.
class rev(UnaryOperation): """ The blocks of the input data are output in reverse order. If the length of the input data is not a multiple of the block size, the data is truncated. """ def __init__(self, blocksize=None): super().__init__(blocksize=blocksize, _truncate=2) def inplace(self, block: ndarray): return self._numpy.flip(block) operate = NotImplemented def process(self, data: bytearray): if self.bytestream: data.reverse() return data try: return self._fastblock(data) except FastBlockError: b = self.blocksize n = len(data) q = n // b m = q * b view = memoryview(data) temp = bytearray(b) for k in range(0, (q // 2) * b, b): lhs = slice(k, k + b) rhs = slice(m - k - b, m - k) temp[:] = view[rhs] data[rhs] = view[lhs] data[lhs] = temp if m < n: del view del temp del data[m:] return data
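With the default block size of one byte, the unit simply reverses the input (illustrative):
>>> emit('ABCDEF') | rev() | str
'FEDCBA'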
class rex (regex, *transformation, unicode=False, unique=False, multiline=False, ignorecase=False, min=1, max=None, len=None, stripspace=False, longest=False, take=None)
-
Short for Regular Expression eXtractor: A binary grep which can apply a transformation to each match. Each match is an individual output. Besides the syntax {k} to insert the k-th match group, the unit supports processing the contents of match groups with arbitrary refinery units. To do so, use the following F-string-like syntax:
{match-group:pipeline}
where :pipeline is an optional pipeline of refinery commands as it would be specified on the command line. The value of the corresponding match is post-processed with this command.
class rex(SingleRegexUnit, PatternExtractor): """ Short for Regular Expression eXtractor: A binary grep which can apply a transformation to each match. Each match is an individual output. Besides the syntax `{k}` to insert the `k`-th match group, the unit supports processing the contents of match groups with arbitrary refinery units. To do so, use the following F-string-like syntax: {match-group:pipeline} where `:pipeline` is an optional pipeline of refinery commands as it would be specified on the command line. The value of the corresponding match is post-processed with this command. """ def __init__( self, regex, # TODO: Use positional only in Python 3.8 # /, *transformation: Arg(type=utf8, help=( 'An optional sequence of transformations to be applied to each match. ' 'Each transformation produces one output in the order in which they ' 'are given. The default transformation is {0}, i.e. the entire match. ' )), unicode: Arg.Switch('-u', help='Also find unicode strings.') = False, unique: Arg.Switch('-q', help='Yield every (transformed) match only once.') = False, multiline=False, ignorecase=False, min=1, max=None, len=None, stripspace=False, longest=False, take=None ): super().__init__( regex=regex, transformation=transformation, unicode=unicode, unique=unique, multiline=multiline, ignorecase=ignorecase, min=min, max=max, len=len, stripspace=stripspace, longest=longest, take=take, utf16=unicode, ascii=True, duplicates=not unique ) def process(self, data): meta = metavars(data) self.log_debug('regular expression:', getattr(self.regex, 'pattern', self.regex)) transformations = [] specs: List[bytes] = list(self.args.transformation) if not specs: specs.append(B'{0}') for spec in specs: def transformation(match: Match, s=spec.decode(self.codec)): symb: dict = match.groupdict() args: list = [match.group(0), *match.groups()] used = set() for key, value in symb.items(): if value is None: symb[key] = B'' item = meta.format(s, self.codec, args, symb, True, True, used) used.update(key for key, value in symb.items() if not value) for variable in used: symb.pop(variable, None) symb.update(offset=match.start()) chunk = Chunk(item) chunk.meta.update(meta) chunk.meta.update(symb) return chunk transformations.append(transformation) yield from self.matches_filtered(memoryview(data), self.regex, *transformations)
class rijndael (key, iv=b'', block_size=16, *, assoc_len=0, mac_len=0, segment_size=0, little_endian=False, raw=False, mode=None, padding=None)
-
Rijndael encryption and decryption. Note that there is also an aes unit which has much better performance because it calls into the PyCryptodome library. You only need this specific Rijndael unit when Rijndael is used with a block size different from 16 bytes; with a 16-byte block size, Rijndael is equivalent to AES.
class rijndael(StandardBlockCipherUnit, cipher=BlockCipherFactory(Rijndael)): """ Rijndael encryption and decryption. Note that there is also a `refinery.aes` unit which has much better performance because it calls into the PyCryptodome library. You would have to use this specific Rijndael unit only if Rijndael is used with a block size that is different from 16 bytes, in which case it is equivalent to AES. """ def __init__( self, key, iv=b'', block_size: Arg.Number('-b', help='Cipher block size, default is {default}. Valid choices are 16, 24, and 32.') = 16, **more ): return super().__init__(key, iv, block_size=block_size, **more) @property def block_size(self): return self.args.block_size def _new_cipher(self, **optionals) -> CipherInterface: return super()._new_cipher(block_size=self.args.block_size, **optionals)
class ripemd128 (text=False)
-
Returns the RIPEMD-128 hash of the input data.
class ripemd128(HashUnit): """ Returns the RIPEMD-128 hash of the input data. """ def _algorithm(self, data): from refinery.lib.ripemd128 import ripemd128 return ripemd128(data)
class ripemd160 (text=False)
-
Returns the RIPEMD-160 hash of the input data.
class rmv (*names)
-
Short for "ReMove Variable": Removes meta variables that were created in the current frame. If no variable names are given, the unit removes all of them. Note that this can recover variables from outer frames that were previously shadowed.
class rmv(Unit): """ Short for "ReMove Variable": Removes meta variables that were created in the current frame. If no variable names are given, the unit removes all of them. Note that this can recover variables from outer frames that were previously shadowed. """ def __init__(self, *names: Arg(type=str, metavar='name', help='Name of a variable to be removed.')): super().__init__(names=names) def process(self, data: Chunk): meta = metavars(data) keys = self.args.names or list(meta.variable_names()) for key in keys: meta.discard(key) return data
class rncrypt (password)
-
Implements encryption and decryption using the RNCryptor specification. See also: https://github.com/RNCryptor
class rncrypt(Unit): """ Implements encryption and decryption using the RNCryptor specification. See also: https://github.com/RNCryptor """ def __init__(self, password: bytearray): super().__init__(password=password) def process(self, data: bytes) -> bytes: encryption_salt = data[2:10] hmac_salt = data[10:18] iv = data[18:34] cipher_text = data[34:-32] hmac_signature = data[-32:] encryption_key = self._pbkdf2(self.args.password, encryption_salt) hmac_key = self._pbkdf2(self.args.password, hmac_salt) if not hmac.compare_digest(self._hmac(hmac_key, data[:-32]), hmac_signature): raise ValueError("Failed to verify signature.") return unpad( self._aes_decrypt(encryption_key, iv, cipher_text), block_size=AES.block_size ) def reverse(self, data: bytes) -> bytes: prng = Random.new() data = pad(data, block_size=AES.block_size) encryption_salt = prng.read(8) encryption_key = self._pbkdf2(self.args.password, encryption_salt) hmac_salt = prng.read(8) hmac_key = self._pbkdf2(self.args.password, hmac_salt) iv = prng.read(AES.block_size) cipher_text = self._aes_encrypt(encryption_key, iv, data) new_data = b'\x03\x01' + encryption_salt + hmac_salt + iv + cipher_text return new_data + self._hmac(hmac_key, new_data) def _aes_encrypt(self, key, iv, text): return AES.new(key, AES.MODE_CBC, iv).encrypt(text) def _aes_decrypt(self, key, iv, text): return AES.new(key, AES.MODE_CBC, iv).decrypt(text) def _hmac(self, key, data): return hmac.new(key, data, hashlib.sha256).digest() def _prf(self, secret, salt): return hmac.new(secret, salt, hashlib.sha1).digest() def _pbkdf2(self, password, salt, iterations=10000, key_length=32): return KDF.PBKDF2(password, salt, dkLen=key_length, count=iterations, prf=self._prf)
class rot (amount=13)
-
Rotate the characters of the alphabet by the given amount. The default amount is 13, providing the common (and weak) string obfuscation method.
class rot(Unit): """ Rotate the characters of the alphabet by the given amount. The default amount is 13, providing the common (and weak) string obfuscation method. """ def __init__(self, amount: Arg.Number(help='Number of letters to rotate by; Default is 13.') = 13): super().__init__(amount=amount) def process(self, data: bytearray): rot = self.args.amount % 26 for index, byte in enumerate(data): for alphabet in _LCASE, _UCASE: if byte in alphabet: zero = alphabet[0] data[index] = zero + (byte - zero + rot) % 26 break return data
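The classic ROT13 round trip as a quick sketch:
>>> emit('Uryyb, Jbeyq!') | rot() | str
'Hello, World!'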
class rotl (argument, bigendian=False, blocksize=None)
-
Rotate the bits of each block left.
class rotl(BinaryOperation): """ Rotate the bits of each block left. """ def operate(self, value, shift): shift %= self.fbits return (value << shift) | (value >> (self.fbits - shift)) def inplace(self, value, shift): shift %= self.fbits lower = value >> (self.fbits - shift) value <<= shift value |= lower
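For reference, the per-block operation in plain Python for an 8-bit block (a standalone sketch, not refinery code; rotr is the mirror image): 0b10000001 rotated left by one bit wraps the high bit around to yield 0x03.
>>> def rotl8(v, s):
...     s %= 8
...     return ((v << s) | (v >> (8 - s))) & 0xFF
>>> hex(rotl8(0b10000001, 1))
'0x3'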
class rotr (argument, bigendian=False, blocksize=None)
-
Rotate the bits of each block right.
class rotr(BinaryOperation): """ Rotate the bits of each block right. """ def operate(self, value, shift): shift %= self.fbits return (value >> shift) | (value << (self.fbits - shift)) def inplace(self, value, shift): shift %= self.fbits lower = value >> shift value <<= self.fbits - shift value |= lower
class rsa (key, swapkeys=False, textbook=False, padding=PAD.AUTO, rsautl=False)
-
Implements single block RSA encryption and decryption. This unit can be used to encrypt and decrypt blocks generated by openssl's rsautl tool when using the mode -verify. When it is executed with a public key for decryption or with a private key for encryption, it will perform a raw RSA operation. The results of these operations are (un)padded using EMSA-PKCS1-v1_5.
class rsa(Unit): """ Implements single block RSA encryption and decryption. This unit can be used to encrypt and decrypt blocks generated by openssl's `rsautl` tool when using the mode `-verify`. When it is executed with a public key for decryption or with a private key for encryption, it will perform a raw RSA operation. The result of these operations are (un)padded using EMSA-PKCS1-v1_5. """ def __init__( self, key: Arg(help='RSA key in PEM, DER, or Microsoft BLOB format.'), swapkeys: Arg.Switch('-s', help='Swap public and private exponent.') = False, textbook: Arg.Switch('-t', group='PAD', help='Equivalent to --padding=NONE.') = False, padding : Arg.Option('-p', group='PAD', choices=PAD, help='Choose one of the following padding modes: {choices}. The default is AUTO.') = PAD.AUTO, rsautl : Arg.Switch('-r', group='PAD', help='Act as rsautl from OpenSSH; This is equivalent to --swapkeys --padding=PKCS10') = False, ): padding = Arg.AsOption(padding, PAD) if textbook: if padding != PAD.AUTO: raise ValueError('Conflicting padding options!') padding = padding.NONE if rsautl: if padding and padding != PAD.PKCS10: raise ValueError('Conflicting padding options!') swapkeys = True padding = PAD.PKCS10 super().__init__(key=key, textbook=textbook, padding=padding, swapkeys=swapkeys) self._key_hash = None self._key_data = None @property def blocksize(self) -> int: return self.key.size_in_bytes() @property def _blocksize_plain(self) -> int: # PKCS#1 v1.5 padding is at least 11 bytes. return self.blocksize - 11 @property def pub(self): return self.key.d if self.args.swapkeys else self.key.e @property def prv(self): return self.key.e if self.args.swapkeys else self.key.d def _get_msg(self, data): msg = int.from_bytes(data, byteorder='big') if msg > self.key.n: raise ValueError(F'This key can only handle messages of size {self.blocksize}.') return msg def _encrypt_raw(self, data): return pow( self._get_msg(data), self.pub, self.key.n ).to_bytes(self.blocksize, byteorder='big') def _decrypt_raw(self, data): return pow( self._get_msg(data), self.prv, self.key.n ).to_bytes(self.blocksize, byteorder='big') def _unpad(self, data, head, padbyte=None): if len(data) > self.blocksize: raise ValueError(F'This key can only handle messages of size {self.blocksize}.') if data.startswith(head): pos = data.find(B'\0', 2) if pos > 0: pad = data[2:pos] if padbyte is None or all(b == padbyte for b in pad): return data[pos + 1:] raise ValueError('Incorrect padding') def _pad(self, data, head, padbyte=None): if len(data) > self._blocksize_plain: raise ValueError(F'This key can only encrypt messages of size at most {self._blocksize_plain}.') pad = self.blocksize - len(data) - len(head) - 1 if padbyte is not None: padding = pad * bytes((padbyte,)) else: padding = bytearray(1) while not all(padding): padding = bytearray(filter(None, padding)) padding.extend(get_random_bytes(pad - len(padding))) return head + padding + B'\0' + data def _unpad_pkcs10(self, data): return self._unpad(data, B'\x00\x01', 0xFF) def _unpad_pkcs15(self, data): return self._unpad(data, B'\x00\x02', None) def _pad_pkcs10(self, data): return self._pad(data, B'\x00\x01', 0xFF) def _pad_pkcs15(self, data): return self._pad(data, B'\x00\x02', None) def _decrypt_block_OAEP(self, data): self.log_debug('Attempting decryption with PyCrypto PKCS1 OAEP.') return PKCS1_OAEP.new(self.key).decrypt(data) def _encrypt_block_OAEP(self, data): self.log_debug('Attempting encryption with PyCrypto PKCS1 OAEP.') return PKCS1_OAEP.new(self.key).encrypt(data) def _decrypt_block(self, data): if self._oaep and self._pads in {PAD.AUTO, PAD.OAEP}: try: return self._decrypt_block_OAEP(data) except ValueError as E: if self._pads: raise self.log_debug(F'{E!s} No longer attempting OAEP.') self._oaep = False data = self._decrypt_raw(data) return self._unpad_per_argument(data) def _unpad_per_argument(self, data): if self._pads == PAD.NONE: return data elif self._pads == PAD.PKCS10: return self._unpad_pkcs10(data) elif self._pads == PAD.PKCS15: return self._unpad_pkcs15(data) elif self._pads == PAD.AUTO: with suppress(ValueError): data = self._unpad_pkcs10(data) self.log_info('Detected PKCS1.0 padding.') self._pads = PAD.PKCS10 return data with suppress(ValueError): data = self._unpad_pkcs15(data) self.log_info('Detected PKCS1.5 padding.') self._pads = PAD.PKCS15 return data raise RefineryPartialResult('No padding worked, returning raw decrypted blocks.', data) else: raise ValueError(F'Invalid padding value: {self._pads!r}') def _encrypt_block(self, data): if self._pads in {PAD.AUTO, PAD.OAEP}: try: return self._encrypt_block_OAEP(data) except ValueError: if self._pads: raise self.log_debug('PyCrypto primitives for OAEP failed, falling back to PKCS1.5.') self._pads = PAD.PKCS15 if self._pads == PAD.PKCS15: data = self._pad_pkcs15(data) elif self._pads == PAD.PKCS10: data = self._pad_pkcs10(data) return self._encrypt_raw(data) @property def key(self) -> RSA.RsaKey: key_blob = self.args.key key_hash = hash(key_blob) if key_hash != self._key_hash: fmt, key_data = normalize_rsa_key(key_blob) self.log_info(F'successfully parsed RSA key as {fmt.value}') self._key_hash = key_hash self._key_data = key_data return self._key_data def process(self, data): self._oaep = True self._pads = self.args.padding if not self.key.has_private(): try: return self._unpad_per_argument(self._encrypt_raw(data)) except Exception as E: raise ValueError(F'A public key was given for decryption and rsautl mode resulted in an error: {E}') from E return B''.join(self._decrypt_block(block) for block in splitchunks(data, self.blocksize)) def reverse(self, data): self._pads = self.args.padding return B''.join(self._encrypt_block(block) for block in splitchunks(data, self._blocksize_plain))
class rsakey (public=False, output=RSAFormat.PEM)
-
Parse RSA keys in various formats; PEM, DER, Microsoft BLOB, and W3C-XKMS (XML) format are supported. The same formats are supported as input, but you can also specify a key in the following format, where both modulus and exponent have to be hex-encoded:
[modulus]:[exponent]
class rsakey(Unit): """ Parse RSA keys in various formats; PEM, DER, Microsoft BLOB, and W3C-XKMS (XML) format are supported. The same formats are supported for the input format, but you can also specify a key in the following format, where both modulus and exponent have to be hex-encoded: `[modulus]:[exponent]` """ def __init__( self, public: Arg.Switch('-p', help='Force public key output even if the input is private.') = False, output: Arg.Option(help='Select an output format ({choices}), default is {default}.', choices=RSAFormat) = RSAFormat.PEM ): super().__init__(public=public, output=Arg.AsOption(output, RSAFormat)) def _xkms_wrap(self, number: int): size, r = divmod(number.bit_length(), 8) size += int(bool(r)) return base64.b64encode(number.to_bytes(size, 'big')) def process(self, data): from refinery.lib.mscrypto import TYPES, ALGORITHMS fmt, key = normalize_rsa_key(data, force_public=self.args.public) self.log_info(F'parsing input as {fmt.value} format') out = self.args.output if out is RSAFormat.PEM: yield key.export_key('PEM') return if out is RSAFormat.DER: yield key.export_key('DER') return if out is RSAFormat.BLOB: def le(v: int, s: int): return v.to_bytes(s, 'little') buffer = bytearray() buffer.append(TYPES.PRIVATEKEYBLOB if key.has_private() else TYPES.PUBLICKEYBLOB) buffer.extend(le(2, 3)) buffer.extend(le(ALGORITHMS.CALG_RSA_KEYX, 4)) buffer.extend(B'RSA2' if key.has_private() else B'RSA1') size = 2 while size < key.n.bit_length(): size <<= 1 self.log_info(F'using bit size {size}') buffer.extend(le(size, 4)) size //= 8 buffer.extend(le(key.e, 4)) buffer.extend(le(key.n, size)) if key.has_private(): exp_1 = key.d % (key.p - 1) exp_2 = key.d % (key.q - 1) coeff = pow(key.q, -1, key.p) half = size // 2 buffer.extend(le(key.p, half)) buffer.extend(le(key.q, half)) buffer.extend(le(exp_1, half)) buffer.extend(le(exp_2, half)) buffer.extend(le(coeff, half)) buffer.extend(le(key.d, size)) yield buffer return components = { 'Modulus' : key.n, 'Exponent': key.e, } if key.has_private(): decoded = DerSequence() decoded.decode(key.export_key('DER')) it = itertools.islice(decoded, 3, None) for v in ('D', 'P', 'Q', 'DP', 'DQ', 'InverseQ'): try: components[v] = next(it) except StopIteration: break if out is RSAFormat.XKMS: for tag in components: components[tag] = base64.b64encode(number.long_to_bytes(components[tag])).decode('ascii') tags = '\n'.join(F'\t<{tag}>{value}</{tag}>' for tag, value in components.items()) yield F'<RSAKeyPair>\n{tags}\n</RSAKeyPair>'.encode(self.codec) return components['BitSize'] = key.n.bit_length() for tag, value in components.items(): if value.bit_length() > 32: components[tag] = F'{value:X}' if out is RSAFormat.JSON: yield json.dumps(components, indent=4).encode(self.codec) return if out is RSAFormat.TEXT: table = list(flattened(components)) for key, value in table: value = F'0x{value}' if isinstance(value, str) else str(value) value = '\n'.join(F'{L}' for L in textwrap.wrap(value, 80)) yield F'-- {key + " ":-<77}\n{value!s}'.encode(self.codec)
class salsa (key, stateful=False, discard=0, nonce=b'REFINERY', magic=b'', offset=0, rounds=20)
-
Salsa encryption and decryption. The nonce must be 8 bytes long. When 64 bytes are provided as the key, this data is interpreted as the initial state box and all other parameters are ignored.
class salsa(LatinCipherUnit): """ Salsa encryption and decryption. The nonce must be 8 bytes long. When 64 bytes are provided as the key, this data is interpreted as the initial state box and all other parameters are ignored. """ def keystream(self) -> Iterable[int]: key = self.args.key if len(key) == 64: it = SalsaCipher.FromState(key) else: it = SalsaCipher( key, self.args.nonce, self.args.magic, self.args.rounds, self.args.offset, ) yield from it
class salsa20 (key, nonce=b'REFINERY')
-
Salsa20 encryption and decryption. This unit is functionally equivalent to salsa with 20 rounds, but it uses the PyCryptodome library C implementation rather than the pure Python implementation used by salsa.
class salsa20(LatinCipherStandardUnit, cipher=PyCryptoFactoryWrapper(Salsa20)): """ Salsa20 encryption and decryption. This unit is functionally equivalent to `refinery.salsa` with 20 rounds, but it uses the PyCryptodome library C implementation rather than the pure Python implementation used by `refinery.salsa`. """ pass
class scope (*slice, visible=True)
-
After using scope within a refinery.lib.frame, all the following operations will be applied only to the selected indices. All remaining chunks still exist; they are just not operated on. When the frame closes or the frame is rescoped by a second application of this unit, they become visible again.
class scope(FrameSlicer): """ After using `refinery.scope` within in a `refinery.lib.frame`, all the following operations will be applied only to the selected indices. All remaining chunks still exist, they are just not operated on. When the frame closes or the frame is being rescoped by a second application of this unit, they become visible again. """ def __init__(self, *slice, visible: Arg.Switch('-n', '--not', off=True, help=( 'Hide the given chunks instead of making them the only ones visible.')) = True ): super().__init__(*slice, visible=visible) # Sort any slices with negative arguments to the back so we check # them last. This delays potential consumption of the chunks iterator # as much as possible. self.args.slice.sort( key=lambda s: (s.start or 0, s.stop or 0), reverse=True) def filter(self, chunks): it = iter(chunks) consumed = None size = None def buffered() -> Generator[Chunk, None, None]: yield from it while consumed: yield consumed.popleft() def shift(offset, default): nonlocal consumed, it, size if offset is None: return default if offset >= 0: return offset if consumed is None: from collections import deque self.log_info(F'consuming iterator to compute negative offset {offset}.') consumed = deque(it) size = len(consumed) + k + 1 return max(0, offset + size) for k, chunk in enumerate(buffered()): for s in self.args.slice: if k in range(shift(s.start, 0), shift(s.stop, k + 1), s.step or 1): chunk.visible = self.args.visible break else: chunk.visible = not self.args.visible self.log_debug(chunk) yield chunk
class seal (key, discard=0, stateful=False)
-
SEAL encryption and decryption.
class seal(StreamCipherUnit): """ SEAL encryption and decryption. """ key_size = {20} def keystream(self) -> Iterable[bytes]: return SEAL_Cipher(self.args.key)
class secstr (key=b'\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10', iv=None)
-
Implements the AES-based encryption scheme used by the PowerShell commands ConvertFrom-SecureString and ConvertTo-SecureString.
class secstr(Unit): """ Implements the AES-based encryption scheme used by the PowerShell commands `ConvertFrom-SecureString` and `ConvertTo-SecureString`. """ # This is a magic header value used for PowerShell secure strings. _MAGIC = bytes(( 0xEF, 0xAE, 0x3D, 0xD9, 0xDD, 0x75, 0xD7, 0xAE, 0xF8, 0xDD, 0xFD, 0x38, 0xDB, 0x7E, 0x35, 0xDD, 0xBD, 0x7A, 0xD3, 0x9D, 0x1A, 0xE7, 0x7E, 0x39)) # Secure strings include a decimal number formatted as a string directly # following the header. Presumably, this is the PowerShell version. _PSVER = 2 def __init__( self, key: Arg( help='Secure string encryption 16-byte AES key; the default are the bytes from 1 to 16.' ) = bytes(range(1, 17)), iv: Arg('-i', help='Optionally specify an IV to use for encryption.') = None ): super().__init__(key=key, iv=iv) @property def key(self): key = self.args.key if len(key) not in (0x10, 0x18, 0x20): raise ValueError('The encryption key has to be 16 bytes long.') return key @property def iv(self): iv = self.args.iv if iv is not None and len(iv) != 0x10: raise ValueError('The IV has to be 16 bytes long.') return iv def reverse(self, data): ivec = self.iv or urandom(0x10) if len(ivec) != 0x10: raise ValueError(self._IVERR) cipher = AES.new(self.key, AES.MODE_CBC, ivec) data = data.decode('latin-1').encode('utf-16LE') data = cipher.encrypt(pad(data, block_size=0x10)) data = base64.b16encode(data).lower().decode('ascii') ivec = base64.b64encode(ivec).decode('ascii') data = '|'.join(('%d' % self._PSVER, ivec, data)).encode('utf-16LE') return base64.b64encode(self._MAGIC + data) def process(self, data): head, ivec, data = base64.b64decode(data).split(b'|\0') self.log_info('head:', head.hex()) ivec = base64.b64decode(ivec.decode('utf-16LE')) self.log_info('ivec:', ivec.hex()) data = base64.b16decode(data.decode('utf-16LE'), casefold=True) if len(data) % 0x10 != 0: self.log_info('data not block-aligned, padding with zeros') data += B'\0' * (0x10 - len(data) % 0x10) cipher = AES.new(self.key, AES.MODE_CBC, ivec) data = cipher.decrypt(data) try: data = unpad(data, block_size=0x10) except Exception: self.log_warn('decrypted data does not have PKCS7 padding') for p in range(0x10): try: return data[-p:].decode('utf-16LE').encode('latin-1') except UnicodeDecodeError: pass except UnicodeEncodeError: pass self.log_warn('result is not a padded unicode string, key is likely wrong') return data
class sep (separator=b'\n', scoped=False)
-
Multiple inputs are joined along a specified separator. If any of the input Chunks is currently out of scope, sep makes them visible by default. This can be prevented by using the -s flag.
class sep(Unit): """ Multiple inputs are joined along a specified separator. If any of the input `refinery.lib.frame.Chunk`s is currently out of scope, `refinery.sep` turns makes them visible by default. This can be prevented by using the `-s` flag. """ def __init__( self, separator: Arg(help='Separator; the default is a line break.') = B'\n', scoped: Arg.Switch('-s', help=( 'Maintain chunk scope; i.e. do not turn all input chunks visible.')) = False ): super().__init__(separator=separator, scoped=scoped) self.separate = False def filter(self, chunks): it = iter(chunks) try: chunk = next(it) except StopIteration: return self.separate = True for upcoming in it: if not self.args.scoped: chunk.visible = True yield chunk chunk = upcoming self.separate = False yield chunk def process(self, data): yield data if self.separate: yield self.args.separator
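A minimal sketch (untested) of joining three chunks with a custom separator:
>>> emit('foo', 'bar', 'baz') | sep('.') | str
'foo.bar.baz'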
class serpent (key, iv=b'', padding=None, mode=None, raw=False, swap=False)
-
Serpent encryption and decryption. Some Serpent implementations read the bytes of each block in one direction, some in the other. When decryption results with this unit do not yield the expected result, try using the --swap (or -s) option to swap the bytes in each block. Furthermore, it is sometimes necessary to swap the bytes of the input key, which can be done by prefixing the input key with the multibin handler snip[::-1].
class serpent(StandardBlockCipherUnit, cipher=BlockCipherFactory(Serpent)): """ Serpent encryption and decryption. Some Serpent implementations read the bytes of each block in one direction, some in the other. When decryption results with this unit do not yield the expected result, try using the `--swap` (or `-s`) option to swap the bytes in each block. Furthermore, it is sometimes necessary to swap the bytes of the input key, which can be done by prefixing the input key by the multibin handler `snip[::-1]`. """ def __init__( self, key, iv=b'', padding=None, mode=None, raw=False, swap: Arg.Switch('-s', help='Read the bytes in each block in reverse order.') = False ): super().__init__(key, iv, padding=padding, mode=mode, raw=raw, swap=swap) def _new_cipher(self, **optionals) -> CipherInterface: instance: Serpent = super()._new_cipher() instance.swap = self.args.swap return instance
class sha1 (text=False)
-
Returns the SHA1 hash of the input data.
class sha224 (text=False)
-
Returns the SHA224 hash of the input data.
class sha256 (text=False)
-
Returns the SHA256 hash of the input data.
class sha384 (text=False)
-
Returns the SHA384 hash of the input data.
class sha3_224 (text=False)
-
Returns the SHA3-224 hash of the input data.
class sha3_256 (text=False)
-
Returns the SHA3-256 hash of the input data.
class sha3_384 (text=False)
-
Returns the SHA3-384 hash of the input data.
class sha3_512 (text=False)
-
Returns the SHA3-512 hash of the input data.
class sha512 (text=False)
-
Returns the SHA512 hash of the input data.
class shl (argument, bigendian=False, blocksize=None)
-
Shift the bits of each block left, filling with zero bits.
class shl(BinaryOperation): """ Shift the bits of each block left, filling with zero bits. """ @staticmethod def operate(a, b): return a << b @staticmethod def inplace(a, b): a <<= b
class shr (argument, bigendian=False, blocksize=None)
-
Shift the bits of each block right, filling with zero bits.
class shr(BinaryOperation): """ Shift the bits of each block right, filling with zero bits. """ @staticmethod def operate(a, b): return a >> b @staticmethod def inplace(a, b): a >>= b
class sm4 (key, iv=b'', *, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, mac_len=0, assoc_len=0)
-
The SM4 symmetric block cipher algorithm published as GB/T 32907-2016 by the State Cryptography Administration of China (SCA).
class sm4(StandardBlockCipherUnit, cipher=BlockCipherFactory(SM4)): """ The SM4 symmetric blockcipher algorithm published as GB/T 32907-2016 by the State Cryptography Administration of China (SCA). """ pass
class snip (slices=[slice(None, None, None)], length=False, stream=False, remove=False)
-
Snips the input data based on a Python slice expression. For example, the initialization slice 0::2 1::2 would yield a unit that first extracts every byte at an even position and then every byte at an odd position. In this case, multiple outputs are produced. The unit can be used in reverse mode, in which case the specified ranges are deleted sequentially from the input.
class snip(Unit): """ Snips the input data based on a Python slice expression. For example, the initialization `slice 0::1 1::1` would yield a unit that first extracts every byte at an even position and then, every byte at an odd position. In this case, multiple outputs are produced. The unit can be used in reverse mode, in which case the specified ranges are deleted sequentially from the input. """ def __init__( self, slices: Arg(help='Specify start:stop:step in Python slice syntax.') = [slice(None, None)], length: Arg.Switch('-l', help=( 'Interpret the end of a slice as a length rather than as an offset.')) = False, stream: Arg.Switch('-s', help=( 'After each slice, consider only the data that follows after it for subsequent ' 'slicing.')) = False, remove: Arg.Switch('-r', help=( 'Remove the slices from the input rather than selecting them.')) = False, ): super().__init__(slices=slices, length=length, stream=stream, remove=remove) def process(self, data: bytearray): slices: list[slice] = list(self.args.slices) stream = self.args.stream remove = self.args.remove length = self.args.length cursor = 0 view = memoryview(data) for k, bounds in enumerate(slices): upper = bounds.stop lower = bounds.start or 0 if upper is None: upper = len(data) else: upper += cursor if length: upper += lower bounds = slice( lower + cursor, upper, bounds.step) if stream: cursor = upper if not remove: temp = view[bounds] else: if k + 1 >= len(slices): view.release() del view temp = data else: temp = bytearray(data) del temp[bounds] yield temp
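Two illustrative slices (hedged; the slice strings use Python syntax):
>>> emit('refinery') | snip('0:3') | str
'ref'
>>> emit('refinery') | snip('3:') | str
'inery'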
class sorted (key=None, ascending=False)
-
Sorts all elements of the input refinery.lib.frame lexicographically. This unit is a nop on single inputs.
class sorted(Unit): """ Sorts all elements of the input `refinery.lib.frame` lexicographically. This unit is a `refinery.nop` on single inputs. """ def __init__( self, key: Arg('key', type=str, help='A meta variable expression to sort by instead of sorting the content.') = None, ascending: Arg.Switch('-a', help='Sort in ascending order, the default is descending.') = False ): super().__init__(key=key, ascending=ascending) def filter(self, chunks): sortbuffer = [] invisibles = [] key = self.args.key rev = not self.args.ascending if key is not None: def _key(chunk): return expression(metavars(chunk)), chunk expression = PythonExpression(key, all_variables_allowed=True) key = _key def sorted(): if not sortbuffer: return sortbuffer.sort(key=key, reverse=rev) yield from sortbuffer sortbuffer.clear() for chunk in chunks: if chunk.visible: yield from invisibles invisibles.clear() sortbuffer.append(chunk) else: yield from sorted() invisibles.append(chunk) yield from invisibles yield from sorted()
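A hedged frame example (untested); the -a switch selects ascending order:
>>> emit('b', 'c', 'a') [ sorted('-a') | sep(':') ] | str
'a:b:c'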
class sosemanuk (key, stateful=False, discard=0, nonce=b'')
-
Implements the Sosemanuk stream cipher.
Expand source code Browse git
class sosemanuk(StreamCipherUnit):
    def __init__(
        self, key, stateful=False, discard=0,
        nonce: Arg(help='The nonce. Default is empty, which is equivalent to 16 null bytes.') = B'',
    ):
        super().__init__(key=key, nonce=nonce, stateful=stateful, discard=discard)

    def keystream(self):
        yield from Sosemanuk(self.args.key, self.args.nonce)
class stego (transpose, split=False, parts='RGB')
-
Decodes the RGBA (red/green/blue/alpha) values of the pixels of a given image file and outputs these values as bytes. By default, the pixels are converted left to right, top to bottom.
Expand source code Browse git
class stego(Unit): """ Decodes the RGBA (red/green/blue/alpha) values of the pixels of a given image file and outputs these values as bytes. By default, the pixels are converted left to right, top to bottom. """ def __init__( self, transpose: Arg.Switch('-t', help='Return the columns of the image rather than the rows.'), split: Arg.Switch('-m', help='Emit the individual rows or columns as separate outputs.') = False, parts: Arg('parts', nargs='?', type=str, help=( 'A string containing any ordering of the letters R, G, B, and A (case-insensitive). ' 'These pixel components will be extracted from every pixel in the given order. The ' 'default value is {default}.' )) = 'RGB' ): super().__init__( transpose=transpose, split=split, parts=tuple(Arg.AsOption(p, PIXEL_PART) for p in parts) ) @Unit.Requires('Pillow', 'formats') def _image(): from PIL import Image return Image def process(self, data): split = self.args.split parts = self.args.parts image = self._image.open(MemoryFile(data)) if self.args.transpose: image = image.transpose(self._image.Transpose.ROTATE_90) width, height = image.size chunk_size = len(parts) output = MemoryFile() buffer = bytearray(chunk_size * width) for y in range(height): offset = 0 for x in range(width): pixel = image.getpixel((x, y)) next_offset = offset + chunk_size buffer[offset:next_offset] = (pixel[p] for p in parts) offset = next_offset if split: yield buffer else: output.write(buffer) if not split: yield output.getvalue()
class stretch (*count)
-
Stretch the input data by repeating every byte a number of times.
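For example, the following sketch (untested) uses a stretching factor of 3:
>>> from refinery.shell import *
>>> emit('AB') | stretch('3') | str
'AAABBB'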
Expand source code Browse git
class stretch(Unit):
    """
    Stretch the input data by repeating every byte a number of times.
    """
    def __init__(self, *count: Arg.Number(metavar='count', help=(
        'The number of times every byte should be repeated. By default, '
        'every byte is repeated once.'
    ))):
        count = count or (2,)
        if any(k <= 0 for k in count):
            raise ValueError('You can not use a stretching factor of less than 1.')
        super().__init__(count=count or (2,))

    def process(self, data):
        def stretched(it):
            factor = cycle(self.args.count)
            for byte in it:
                yield from repeat(byte, next(factor))
        return bytearray(stretched(iter(data)))

    def reverse(self, data):
        # one-sided inverse
        def clinched(it):
            factor = cycle(self.args.count)
            while True:
                try:
                    take = islice(it, next(factor))
                    yield next(take)
                    for _ in take:
                        pass
                except StopIteration:
                    break
        return bytearray(clinched(iter(data)))
class struct (spec, *outputs, multi=False, count=∞, until=None, more=False)
-
Read structured data from the beginning of a chunk and store the extracted fields in chunk meta variables. The structure format is specified in extended Python struct format, and all remaining arguments to this unit are the names of the variables that receive the values from this struct. The extended struct format supports all field types supported by Python, as well as the following:
- a for null-terminated ASCII strings,
- u to read encoded, null-terminated UTF16 strings,
- w to read decoded, null-terminated UTF16 strings,
- g to read Microsoft GUID values,
- E to read 7-bit encoded integers.
For example, the string LLxxHaa will read two unsigned 32bit integers, then skip two bytes, then read one unsigned 16bit integer, then two null-terminated ASCII strings. The unit defaults to using native byte order with no alignment. The spec parameter may additionally contain format expressions of the following form:
{name[!alignment]:format}
The alignment parameter is optional. It must be an expression that evaluates to an integer value. The current data pointer is aligned to a multiple of this value before reading the field. The format can either be an integer expression specifying a number of bytes to read, or any format string. If name is specified for an extracted field, its value is made available as a meta variable under the given name. For example, the expression LLxxH{foo:a}{bar:a} would be parsed in the same way as the previous example, but the two ASCII strings would also be stored in meta variables under the names foo and bar, respectively. The format string of a named field is itself parsed as a format string expression, where all the previously parsed fields are already available. For example, I{:{}} reads a single 32-bit integer length prefix and then reads as many bytes as that prefix specifies.
A second format string expression is used to specify the output format. For example, the format string LLxxH{foo:a}{bar:a} together with the output format {foo}/{bar} would parse data as before, but the output body would be the concatenation of the field foo, a forward slash, and the field bar. Variables used in the output expression are not included as meta variables. As format fields in the output expression, one can also use {1}, {2} or {-1} to access extracted fields by index. The value {0} represents the entire chunk of structured data. By default, the output format {#} is used, which represents either the last byte string field that was extracted, or the entire chunk of structured data if none of the fields were extracted.
Reverse multibin() expressions can be used to post-process the fields included in any output format. For example, {F:b64:zl} will be the base64-decoded and inflate-decompressed contents of the data that was read as field F.
Finally, it is possible to specify a byte alignment by using the syntax {field!T:a:b:c} where the letter T is either a single digit specifying the alignment, or a single letter variable that holds the byte alignment value in the current metadata.
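As an illustrative sketch of the length-prefix pattern described above (untested; the h: prefix is the multibin handler for hexadecimal input): the input consists of the byte 03 followed by the ASCII strings FOO and BAR, and the spec B{:{}} reads the prefix and then that many bytes, which become the output via the default output format {#}:
>>> from refinery.shell import *
>>> emit('h:03464F4F424152') | struct('B{:{}}') | str
'FOO'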
Expand source code Browse git
class struct(Unit): """ Read structured data from the beginning of a chunk and store the extracted fields in chunk meta variables. The structure format is specified in extended Python struct format, and all remaining arguments to this unit are the names of the variables that receive the values from this struct. The extended struct format supports all field types supported by Python, as well as the following: - `a` for null-terminated ASCII strings, - `u` to read encoded, null-terminated UTF16 strings, - `w` to read decoded, null-terminated UTF16 strings, - `g` to read Microsoft GUID values, - `E` to read 7-bit encoded integers. For example, the string `LLxxHaa` will read two unsigned 32bit integers, then skip two bytes, then read one unsigned 16bit integer, then two null-terminated ASCII strings. The unit defaults to using native byte order with no alignment. The `spec` parameter may additionally contain format expressions of the following form: {name[!alignment]:format} The `alignment` parameter is optional. It must be an expression that evaluates to an integer value. The current data pointer is aligned to a multiple of this value before reading the field. The `format` can either be an integer expression specifying a number of bytes to read, or any format string. If `name` is specified for an extracted field, its value is made available as a meta variable under the given name. For example, the expression `LLxxH{foo:a}{bar:a}` would be parsed in the same way as the previous example, but the two ASCII strings would also be stored in meta variables under the names `foo` and `bar`, respectively. The `format` string of a named field is itself parsed as a foramt string expression, where all the previously parsed fields are already available. For example, `I{:{}}` reads a single 32-bit integer length prefix and then reads as many bytes as that prefix specifies. A second format string expression is used to specify the output format. For example, the format string `LLxxH{foo:a}{bar:a}` together with the output format `{foo}/{bar}` would parse data as before, but the output body would be the concatnation of the field `foo`, a forward slash, and the field `bar`. Variables used in the output expression are not included as meta variables. As format fields in the output expression, one can also use `{1}`, `{2}` or `{-1}` to access extracted fields by index. The value `{0}` represents the entire chunk of structured data. By default, the output format `{#}` is used, which represents either the last byte string field that was extracted, or the entire chunk of structured data if none of the fields were extracted. Reverse `refinery.lib.argformats.multibin` expressions can be used to post-process the fields included in any output format. For example, `{F:b64:zl}` will be the base64-decoded and inflate- decompressed contents of the data that was read as field `F`. Finally, it is possible to specify a byte alignment by using the syntax `{field!T:a:b:c}` where the letter `T` is either a single digit specifying the alignment, or a single letter variable that holds the byte alignment value in the current metadata. 
""" def __init__( self, spec: Arg(type=str, help='Structure format as explained above.'), *outputs: Arg(metavar='output', type=str, help='Output format as explained above.'), multi: Arg.Switch('-m', help=( 'Read as many pieces of structured data as possible intead of just one.')) = False, count: Arg.Number('-n', help=( 'A limit on the number of chunks to read in multi mode; default is {default}.')) = INF, until: Arg('-u', metavar='E', type=str, help=( 'An expression evaluated on each chunk in multi mode. New chunks will be parsed ' 'only if the result is nonzero.')) = None, more : Arg.Switch('-M', help=( 'After parsing the struct, emit one chunk that contains the data that was left ' 'over in the buffer. If no data was left over, this chunk will be empty.')) = False ): outputs = outputs or [F'{{{_SHARP}}}'] super().__init__(spec=spec, outputs=outputs, until=until, count=count, multi=multi, more=more) def process(self, data: Chunk): formatter = string.Formatter() until = self.args.until until = until and PythonExpression(until, all_variables_allowed=True) reader = StructReader(memoryview(data)) checkpoint = 0 mainspec = self.args.spec byteorder = mainspec[:1] if byteorder in '<@=!>': mainspec = mainspec[1:] else: byteorder = '=' def fixorder(spec): if spec[0] not in '<@=!>': spec = byteorder + spec return spec previously_existing_variables = set(metavars(data).variable_names()) it = itertools.count() if self.args.multi else (0,) for index in it: checkpoint = reader.tell() if reader.eof: break if index >= self.args.count: break meta = metavars(data) meta.ghost = True meta.update_index(index) args = [] last = None self.log_debug(F'starting new read at: 0x{checkpoint:08X}') try: for prefix, name, spec, conversion in formatter.parse(mainspec): name: str spec: str = spec and spec.strip() if prefix: args.extend(reader.read_struct(fixorder(prefix))) if name is None: continue if name and not name.isdecimal(): check_variable_name(name) if conversion: _aa = reader.tell() reader.byte_align(PythonExpression.Evaluate(conversion, meta)) _ab = reader.tell() if _aa != _ab: self.log_info(F'aligned from 0x{_aa:X} to 0x{_ab:X}') spec, _, pipeline = spec.partition(':') if spec: spec = meta.format_str(spec, self.codec, args) if spec: try: _exp = PythonExpression.Evaluate(spec, meta) except ParserError: pass else: spec = _exp if spec == '': last = value = reader.read() elif isinstance(spec, int): if spec < 0: spec += reader.remaining_bytes if spec < 0: raise ValueError(F'The specified negative read offset is {-spec} beyond the cursor.') last = value = reader.read_bytes(spec) else: value = reader.read_struct(fixorder(spec)) if not value: self.log_debug(F'field {name} was empty, ignoring.') continue if len(value) > 1: self.log_info(F'parsing field {name} produced {len(value)} items reading a tuple') else: value = value[0] if pipeline: value = numseq(pipeline, reverse=True, seed=value) args.append(value) if name == _SHARP: raise ValueError('Extracting a field with name # is forbidden.') elif name.isdecimal(): index = int(name) limit = len(args) - 1 if index > limit: self.log_warn(F'cannot assign index field {name}, the highest index is {limit}') else: args[index] = value continue elif name: meta[name] = value if until and until(meta): self.log_info(F'the expression ({until}) evaluated to true; aborting.') break with StreamDetour(reader, checkpoint) as detour: full = reader.read(detour.cursor - checkpoint) if last is None: last = full outputs = [] symbols = dict(meta) symbols[_SHARP] = last for template in 
self.args.outputs: used = set() outputs.append(meta.format(template, self.codec, [full, *args], symbols, True, used=used)) for key in used: if key in previously_existing_variables: continue meta.discard(key) for output in outputs: chunk = Chunk(output) chunk.meta.update(meta) chunk.set_next_batch(index) yield chunk except EOFError: break leftover = len(reader) - checkpoint if not leftover: return elif self.args.more: reader.seekset(checkpoint) yield reader.read() else: leftover = repr(SizeInt(leftover)).strip() self.log_info(F'discarding {leftover} left in buffer')
class sub (argument, bigendian=False, blocksize=None)
-
Subtract the given argument from each block.
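For example, with the default block size of one byte, the following sketch (untested) subtracts 1 from every byte:
>>> from refinery.shell import *
>>> emit('IBM') | sub('1') | str
'HAL'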
Expand source code Browse git
class sub(BinaryOperationWithAutoBlockAdjustment):
    """
    Subtract the given argument from each block.
    """
    @staticmethod
    def operate(a, b):
        return a - b

    @staticmethod
    def inplace(a, b):
        a -= b
class subfiles (memdump=False, recursive=False)
-
Deploys carvers for ZIP, 7-Zip, PE-File, Windows Shortcuts (LNK files), JSON and XML documents against the input data and generates one output chunk for each successfully carved subfile.
Expand source code Browse git
class subfiles(Unit): """ Deploys carvers for ZIP, 7-Zip, PE-File, Windows Shortcuts (LNK files), JSON and XML documents against the input data and generates one output chunk for each successfully carved subfile. """ _MINLENGTH = { 'json': 300, 'xml' : 300, 'rtf' : 100, } def __init__( self, memdump : Unit.Arg.Switch('-m', help='Assume that the input is a memdump for PE file carving.') = False, recursive: Unit.Arg.Switch('-r', help='Extract files that are subfiles of other extracted files as separate chunks.') = False, ): super().__init__(memdump=memdump, recursive=recursive) def process(self, data: bytearray): carvers = { 'zip' : carve_zip(), '7z' : carve_7z(), 'pe' : carve_pe(memdump=self.args.memdump, fileinfo=True, recursive=True, keep_root=True), 'lnk' : carve_lnk(), 'json' : carve_json(dictonly=True), 'xml' : carve_xml(), 'rtf' : carve_rtf(), } covered = [] for extension, unit in carvers.items(): self.log_info(F'carving {extension} files') for chunk in data | unit: if len(chunk) < self._MINLENGTH.get(extension, 1): continue start = chunk['offset'] end = start + len(chunk) if any(start > left and end < right for left, right in covered): continue if not self.args.recursive: covered.append((start, end)) yield chunk
class swap (src, dst=None)
-
Swap the contents of an existing variable with the contents of the chunk or with another meta variable. When swapping with the chunk, the variable has to contain a binary string. When swapping with a variable that does not exist, the original variable is cleared, essentially renaming the variable.
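As an illustrative sketch (untested), the following pipeline uses the put unit to create the variable bar and then swaps it with the chunk body:
>>> from refinery.shell import *
>>> emit('FOO') [ put('bar', 'BAR') | swap('bar') ] | str
'BAR'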
Expand source code Browse git
class swap(Unit):
    """
    Swap the contents of an existing variable with the contents of the chunk or with another meta
    variable. When swapping with the chunk, the variable has to contain a binary string. When
    swapping with a variable that does not exist, the original variable is cleared, essentially
    renaming the variable.
    """
    def __init__(
        self,
        src: Arg(type=str, help='The meta variable name.'),
        dst: Arg(type=str, help='Optional name of the second meta variable.') = None
    ):
        super().__init__(
            src=check_variable_name(src),
            dst=check_variable_name(dst)
        )

    def filter(self, chunks: Iterable[Chunk]):
        src = self.args.src
        dst = self.args.dst
        for chunk in chunks:
            if not chunk.visible:
                pass
            elif dst is None:
                try:
                    value = chunk.meta[src]
                except KeyError:
                    value = bytearray()
                if isinstance(value, str):
                    value = value.encode(self.codec)
                elif not isbuffer(value):
                    raise ValueError(F'Unable to swap data with variable {src} because it has type {type(value).__name__}.')
                if not chunk:
                    chunk.meta.discard(src)
                else:
                    chunk.meta[src] = bytes(chunk)
                chunk[:] = value
            else:
                try:
                    value = chunk.meta.pop(src)
                except KeyError:
                    raise KeyError(F'The variable {src} does not exist.')
                try:
                    swap = chunk.meta.pop(dst)
                except KeyError:
                    chunk.meta[dst] = value
                else:
                    chunk.meta[src], chunk.meta[dst] = swap, value
            yield chunk
class szdd
-
Extract files from SZDD archives.
Expand source code Browse git
class szdd(Unit): """ Extract files from SZDD archives. """ def process(self, data): with StructReader(data) as archive: if archive.read(8) != b'SZDD\x88\xF0\x27\x33': if not self.args.lenient: raise ValueError('signature missing') self.log_warn('the header signature is invalid, this is likely not an SZDD archive') if archive.read_byte() != 0x41: raise ValueError('Unsupported compression mode') # ignore the missing file extension letter: archive.seekrel(1) output_len = archive.u32() window_pos = 0x1000 - 0x10 output_pos = 0 output = bytearray(output_len) window = bytearray(0x1000) for k in range(len(window)): window[k] = 0x20 while not archive.eof: control = archive.read_byte() for cb in (0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80): if archive.eof: break if control & cb: output[output_pos] = window[window_pos] = archive.read_byte() output_pos += 1 window_pos += 1 window_pos &= 0xFFF else: match_pos = archive.read_byte() match_len = archive.read_byte() match_pos |= (match_len & 0xF0) << 4 match_len = (match_len & 0x0F) + 3 match_pos &= 0xFFF for _ in range(match_len): window[window_pos] = window[match_pos] output[output_pos] = window[window_pos] output_pos += 1 window_pos += 1 match_pos += 1 window_pos &= 0xFFF match_pos &= 0xFFF return output @classmethod def handles(self, data: bytearray): return data[:4] == B'SZDD'
class tea (key, iv=b'', padding=None, mode=None, raw=False, swap=False)
-
TEA encryption and decryption.
Expand source code Browse git
class tea(TEAUnit, cipher=BlockCipherFactory(TEA)):
    """
    TEA encryption and decryption.
    """
class termfit (width=0, delta=0, tight=False)
-
Reformat incoming text data to fit a certain width.
Expand source code Browse git
class termfit(Unit):
    """
    Reformat incoming text data to fit a certain width.
    """
    def __init__(
        self,
        width: Arg('width', help='Optionally specify the width, by default the current terminal width is used.') = 0,
        delta: Arg.Number('-d', help='Subtract this number from the calculated width (0 by default).') = 0,
        tight: Arg.Switch('-t', help='Separate paragraphs by a single line break instead of two.') = False,
    ):
        super().__init__(width=width, delta=delta, tight=tight)

    @unicoded
    def process(self, data: str) -> str:
        parsep = '\n' if self.args.tight else '\n\n'
        return terminalfit(data, self.args.delta, self.args.width, parsep)
class terminate (sentinel=b'\x00', blocksize=None, bigendian=False)
-
The unit reads data from the incoming chunk in blocks of any given size until the sentinel value is encountered. The output of the unit is all data that was read, excluding the sentinel. The default block size is one and the default sentinel value is zero, which corresponds to reading a null-terminated string from the input. If the sentinel value is not found anywhere in the incoming data, the complete input is returned as output.
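For example, with the default settings, the unit reads a null-terminated string; this is an untested sketch where the h: prefix denotes the multibin handler for hexadecimal input:
>>> from refinery.shell import *
>>> emit('h:41424300444546') | terminate() | str
'ABC'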
Expand source code Browse git
class terminate(BlockTransformationBase):
    """
    The unit reads data from the incoming chunk in blocks of any given size until the sentinel
    value is encountered. The output of the unit is all data that was read, excluding the
    sentinel. The default block size is one and the default sentinel value is zero, which
    corresponds to reading a null-terminated string from the input. If the sentinel value is
    not found anywhere in the incoming data, the complete input is returned as output.
    """
    def __init__(
        self,
        sentinel: Arg(help='sentinel value to look for; default is {default}') = B'\0',
        blocksize=None,
        bigendian=False
    ):
        super().__init__(blocksize=blocksize, bigendian=bigendian, sentinel=sentinel)

    def process(self, data: bytearray):
        sentinel = self.args.sentinel
        position = 0
        blocksize = self.blocksize
        self.log_info('blocksize:', blocksize)
        self.log_debug('separator:', sentinel)
        while position >= 0:
            position = data.find(sentinel, position)
            if position < 0:
                self.log_info(F'The sentinel value {sentinel} was not found.')
                break
            q, r = divmod(position, blocksize)
            if r:
                position = (q + 1) * blocksize
                continue
            else:
                data[position:] = []
                break
        return data

    def reverse(self, data: bytearray):
        sentinel = self.args.sentinel
        position = 0
        while True:
            position = data.find(sentinel, position)
            if position < 0:
                data.extend(sentinel)
                break
            if position % self.blocksize == 0:
                self.log_warn('input string already contains the termination character; returning unmodified input')
                break
            position += 1
        return data
class tnetmtm (headers_as_meta_vars, list_header_names, header_filter)
-
Parses payloads out of tnetstring files generated by mitmproxy. The unit can also populate HTTP headers as meta variables or emit header values instead of the actual payloads.
Expand source code Browse git
class tnetmtm(Unit): """ Parses out payloads from tnetstring files generated by mitmproxy. The unit is also able to populate HTTP headers as meta variables or emitting header values instead of actual payloads. """ def __init__( self, headers_as_meta_vars: Arg.Switch('--populate-headers', '-p'), list_header_names: Arg.Switch('--list-header-names', '-l'), header_filter: Arg('--header-filter', '-f'), ): ... @Unit.Requires('mitmproxy', 'all') def _tnetstring(): from mitmproxy.io import tnetstring return tnetstring @staticmethod def _generate_errors(log_line: Dict) -> Iterator[str]: def _extract_error(d: Optional[Dict]) -> Optional[str]: return ((d or {}).get('error') or {}).get('msg') proxy_error = _extract_error(log_line.get('client_conn')) if proxy_error: yield proxy_error error = _extract_error(log_line) if error: yield error return error def _default_meta_vars(self, log_line, request: Dict, response: Dict) -> Dict[str, Union[str, int]]: ret = { 'request_method': request.get('method').decode('utf-8'), 'request_scheme': request.get('scheme').decode('utf-8'), 'request_host': request.get('host'), 'request_query_string': request.get('path').decode('utf-8'), 'request_header_count': len(request.get('headers', [])), 'response_status_code': response.get('status_code'), 'response_header_count': len(response.get('headers', [])), } for num, error in enumerate(self._generate_errors(log_line)): ret[f'error_{num}'] = error request_http_version = request.get('http_version') if request_http_version: ret['request_http_version'] = request_http_version.decode('utf-8') response_http_version = response.get('http_version') if response_http_version: ret['response_http_version'] = response_http_version.decode('utf-8') return ret @staticmethod def _output_type(args) -> OutputType: if args.list_header_names: return OutputType.header_names if args.header_filter: return OutputType.header_value return OutputType.payloads def process(self, data: bytearray): args = self.args tnetstring = self._tnetstring output_type = self._output_type(args) with io.BytesIO(data) as fp: while True: try: log_line = tnetstring.load(fp) request = log_line.get('request') or {} response = log_line.get('response') or {} labels = {} if args.headers_as_meta_vars else self._default_meta_vars(log_line, request, response) for header_name, header_value in request.get('headers', []) + response.get('headers', []): if output_type == OutputType.header_names: yield header_name if output_type == OutputType.header_value: if header_name == args.header_filter: yield header_value if args.headers_as_meta_vars: labels[header_name.decode('utf-8').replace('-', '')] = header_value if output_type == OutputType.payloads: yield self.labelled(response.get('content'), **labels) except ValueError: break
class transpose (padding=b'')
-
Interprets the chunks in the current frame as rows of a matrix and yields the columns of that matrix. When the chunks are not all of the same length, the matrix is considered to have empty entries in some positions. Optionally, a padding sequence can be provided to pad all rows to the same length.
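As an illustrative sketch (untested): the rows abc and def have the columns ad, be, and cf, and the pick unit selects the first of them:
>>> from refinery.shell import *
>>> emit('abc', 'def') [ transpose() | pick('0') ] | str
'ad'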
Expand source code Browse git
class transpose(Unit): """ Interprets the chunks in the current frame as rows of a matrix and yields the columns of that matrix. When chunks are not of even length, the matrix is considered to have empty entries in some positions. Optionally, a padding sequence can be provided to pad all rows to the same length. """ @Unit.Requires('numpy', 'speed', 'default', 'extended') def _numpy(): import numpy return numpy def __init__( self, padding: Arg(help='Optional byte sequence to use as padding for incomplete rows.') = B'', ): super().__init__(bigendian=False, padding=padding) def filter(self, chunks: Iterable[Chunk]): rows = [] for chunk in chunks: if not chunk.visible: yield chunk continue rows.append(chunk) if not rows: return matrix = rows[0] matrix.temp = rows yield matrix def process(self, data: Chunk): chunks: List[Chunk] = data.temp if not chunks: return length = [len(chunk) for chunk in chunks] n = min(length) m = max(length) pad = self.args.padding if pad: for chunk in chunks: while len(chunk) < m: chunk.extend(pad) del chunk[m:] if n > 0: try: np = self._numpy except ImportError: pass else: t = [chunk[n:] for chunk in chunks if len(chunk) > n] for chunk in chunks: del chunk[n:] a = np.array(chunks, dtype=np.uint8).transpose() for row in a: yield row.tobytes('C') m = m - n chunks = t for i in range(m): yield bytes(chunk[i] for chunk in chunks if len(chunk) > i)
class trim (*junk, unpad=False, left=True, right=True, nocase=False)
-
Removes byte sequences at beginning and end of input data.
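For example, the following sketch (untested) removes a junk character from both ends of the input:
>>> from refinery.shell import *
>>> emit('..data..') | trim('.') | str
'data'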
Expand source code Browse git
class trim(Unit): """ Removes byte sequences at beginning and end of input data. """ def __init__( self, *junk: Arg(help='Binary strings to be removed, default are all whitespace characters.'), unpad: Arg.Switch('-u', help='Also trim partial occurrences of the junk string.') = False, left: Arg.Switch('-r', '--right-only', group='SIDE', help='Do not trim left.') = True, right: Arg.Switch('-l', '--left-only', group='SIDE', help='Do not trim right.') = True, nocase: Arg.Switch('-i', help='Ignore capitalization for alphabetic characters.') = False, ): super().__init__(junk=junk, left=left, right=right, unpad=unpad, nocase=nocase) def _trimfast(self, view: memoryview, *junks: bytes, right=False) -> Tuple[bool, memoryview]: done = False pos = 0 while not done: done = True for junk in junks: temp = junk size = len(junk) if right and self.args.unpad: for k in range(size): n = size - k if view[pos:pos + n] == junk[k:]: pos += n done = False break if view[pos:pos + size] == temp: m = len(temp) while True: mm = m << 1 if view[pos + m:pos + mm] != temp: break temp += temp m = mm temp = memoryview(temp) while m >= size: if view[pos:pos + m] == temp[:m]: done = False pos += m m //= 2 if right or not self.args.unpad: continue while size > 0: if view[pos:pos + size] == temp[:size]: done = False pos += size break size -= 1 return pos def process(self, data: bytearray): junk = list(self.args.junk) if not junk: import string space = string.whitespace.encode('ascii') junk = [space[k - 1:k] for k in range(1, len(space))] lpos = 0 rpos = 0 if self.args.nocase: work = data.lower() junk = [j.lower() for j in junk] else: work = data if self.args.left: lpos = self._trimfast(memoryview(work), *junk) if self.args.right: work.reverse() junk = [bytes(reversed(j)) for j in junk] rpos = self._trimfast(memoryview(work), *junk, right=True) work.reverse() view = memoryview(data) if lpos: view = view[+lpos:] if rpos: view = view[:-rpos] return view
class u16
-
Encodes and decodes UTF-16 encoded string data.
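For example, the following sketch (untested) decodes the UTF-16LE encoding of the string Hello, provided here via the hexadecimal multibin handler:
>>> from refinery.shell import *
>>> emit('h:480065006C006C006F00') | u16() | str
'Hello'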
Expand source code Browse git
class u16(Unit):
    """
    Encodes and decodes UTF-16 encoded string data.
    """
    def reverse(self, data):
        return data.decode(self.codec).encode('utf-16LE')

    def process(self, data):
        return data.decode('utf-16').encode(self.codec)
class ucrypt (size=13, salt=b'AA')
-
Implements the classic Unix crypt algorithm.
Expand source code Browse git
class ucrypt(KeyDerivation):
    """
    Implements the classic Unix crypt algorithm.
    """
    def __init__(
        self,
        size: Arg(help='The number of bytes to generate, default is 13.') = 13,
        salt: Arg(help='Salt for the derivation, the default is "AA".') = B'AA'
    ):
        super().__init__(size=size, salt=salt)

    def process(self, data):
        crypted = bytes(UnixCrypt(data, salt=self.args.salt))
        if len(crypted) < self.args.size:
            raise RefineryPartialResult(
                F'unix crypt only provided {len(crypted)} bytes, but {self.args.size} '
                F'were requested.', partial=crypted)
        return crypted[:self.args.size]
class url (plus=False, hex=False)
-
Decodes and encodes URL-encoding, which preserves only alphanumeric characters and the following symbols: _, ., -, ~, \, /. Every other character is escaped by hex-encoding it and prefixing it with a percent symbol.
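For example, the following sketch (untested) decodes a percent-encoded string:
>>> from refinery.shell import *
>>> emit('Hello%20World%21') | url() | str
'Hello World!'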
Expand source code Browse git
class url(Unit):
    """
    Decodes and encodes URL-encoding, which preserves only alphanumeric characters and the
    following symbols: `_`, `.`, `-`, `~`, `\\`, `/`. Every other character is escaped by
    hex-encoding it and prefixing it with a percent symbol.
    """
    def __init__(
        self,
        plus: Arg.Switch('-p', help='also replace plus signs by spaces') = False,
        hex : Arg.Switch('-x', help='hex encode every character in reverse mode') = False
    ):
        super().__init__(plus=plus, hex=hex)

    def process(self, data):
        if self.args.plus:
            data = data.replace(B'+', B' ')
        data = unquote_to_bytes(bytes(data))
        data = re.sub(
            B'%[uU]([0-9a-fA-F]{4})',
            lambda m: int(m[1], 16).to_bytes(2, 'little'),
            data)
        return data

    def reverse(self, data):
        if self.args.hex:
            result = bytearray(len(data) * 3)
            offset = 0
            for byte in data:
                result[offset + 0] = 0x25
                offset += 1
                result[offset:offset + 2] = B'%02X' % byte
                offset += 2
            return result
        elif self.args.plus:
            def replace(m):
                c = m[0][0]
                return b'+' if c == 0x20 else B'%%%02X' % c
        else:
            def replace(m):
                return B'%%%02X' % m[0][0]
        return re.sub(B'[^a-zA-Z0-9_.-~\\/]', replace, data)
class urlfix (meta=False, keep=0)
-
Removes fragments, query strings, and parameters from input URLs. It also correctly escapes all characters in the URL path component and normalizes the network location part to lowercase. Note that URLs without a scheme will not be recognized as valid URLs; chunks that do not look like a URL will be swallowed and not return any output.
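For example, the following sketch (untested) shows the default behavior of dropping the query string and fragment while lowercasing the host name:
>>> from refinery.shell import *
>>> emit('https://Example.COM/some%20path?q=1#top') | urlfix() | str
'https://example.com/some%20path'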
Expand source code Browse git
class urlfix(Unit):
    """
    Removes fragments, query strings, and parameters from input URLs. It also correctly escapes
    all characters in the URL path component and normalizes the network location part to
    lowercase. Note that URLs without a scheme will not be recognized as valid URLs; chunks
    that do not look like a URL will be swallowed and not return any output.
    """
    def __init__(
        self,
        meta: Arg.Switch('-m', help='Extract the query string parameters as metadata.') = False,
        keep: Arg.Counts('-k', help=(
            'If specified once, it keeps the URL params and query string. If specified twice, '
            'it keeps the URL fragment as well. At this level, the unit still filters out '
            'anything that does not parse as a URL.')) = 0
    ):
        super().__init__(keep=keep, meta=meta)

    def process(self, data):
        def fix(string):
            return quote(unquote(string))
        keep = self.args.keep
        meta = self.args.meta
        parsed = urlparse(data.decode(self.codec))
        if not parsed.scheme or not parsed.netloc:
            return None
        query_dict = {key: unquote(value) for key, value in parse_qsl(parsed.query)}
        query_string = '&'.join(F'{key}={quote(value)}' for key, value in query_dict.items())
        replacements = dict(
            netloc=parsed.netloc.lower(),
            params=fix(parsed.params),
            path=fix(parsed.path),
            query=query_string,
            fragment=fix(parsed.fragment),
        )
        if keep < 2:
            replacements.update(fragment='')
        if keep < 1:
            replacements.update(params='', query='')
        url = urlunparse(parsed._replace(**replacements))
        url = url.encode(self.codec)
        if meta:
            url = self.labelled(url, **query_dict)
        return url
class urlguards
-
Restores the original URLs from their 'protected' versions as generated by Outlook safe links, Proofpoint URL Defense, and Trend Micro click-time protection.
Expand source code Browse git
class urlguards(Unit): """ Restores the original URLs from their 'protected' versions as generated by Outlook protection and ProofPoint. """ _PP3RLENC = { letter: rl for rl, letter in enumerate( 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' '0123456789-_', 2 ) } @unguard(r'https?://urldefense(?:\.proofpoint)?\.com/v([12])/url\?([:;/_=!?#&.,\w\%\-\+|]+)') def _proofpointV2(self, match): version = int(match[1]) self.log_info('proofpoint match:', version) argmatch = re.match( R'^u=(.+?)&(?:amp;)?{}='.format('k' if version == 1 else '[dc]'), match[2], flags=re.DOTALL ) if not argmatch: self.log_warn('not able to translate unexpected proofpoint format:', match) return match[0] encoded = argmatch[1] if match[1] == '2': encoded = encoded.translate(str.maketrans('-_', '%/')) return unescape(unquote(encoded)) @unguard(r'https?://urldefense(?:\.proofpoint)?\.com/v3/__(.+?)__;(.*?)![-\w!?$]+') def _proofpointV3(self, match): data = unquote(match[1]) cmap = match[2] + '=' * (-len(match[2]) % 4) cmap = urlsafe_b64decode(cmap).decode('UTF-8') cursor = 0 result = '' for k in range(len(cmap)): ast = data.find('*', cursor) if ast < 0: break result += data[cursor:ast] if data[ast + 1] == '*': end = self._PP3RLENC[data[ast + 2]] result += cmap[k:end] ast += 2 else: result += cmap[k] cursor = ast + 1 self.log_debug(result) self.log_debug(data[cursor:]) return result + data[cursor:] @unguard(r'https?://\w+.safelinks\.protection\.outlook\.com/([:;/_=!?#&.,\w\%\-\+|]+)') def _outlook(self, match): result = match[0] self.log_info('outlook match:', result) parsed = urlparse(result) params = parse_qs(parsed.query) try: result = unquote(params['url'][0]) except Exception: pass return result @unguard(r'https?://outlook.office.com/actions/ei\?u=([:;/_=!?#&.,\w\%\-\+|]+)') def _outlook_image_proxy(self, match): return unquote(match[1]) @unguard(r'https?://(?:[\w-]+\.)?trendmicro.com(?::\d+)?/wis/clicktime/v[12]/(?:query|clickthrough)[:;/_=!?#&.,\w\%\-\+|]+') def _trendmicro(self, match): result = match[0] self.log_info('trendmicro match:', result) parsed = urlparse(result) params = parse_qs(parsed.query) try: result = unquote(params['url'][0]) except Exception: pass return result @unicoded def process(self, data: str) -> str: newsize, size = 0, len(data) while newsize != size: for handler in ( self._proofpointV2, self._proofpointV3, self._outlook, self._outlook_image_proxy, self._trendmicro ): data = handler(data) size = newsize newsize = len(data) return data
class urn (size='N:N', keep=False, sort=False)
-
Treat the chunks in the current frame as items in an urn and produce every possible sequence that could occur as a sequence of draws. For example, selecting both -k and -s is equivalent to generating all possible permutations of these chunks.
Expand source code Browse git
class urn(Unit): """ Treat the chunks in the current frame as items in an urn and produce every possible sequence that could occur as a sequence of draws. For example, selecting both -k and -s is equivalent to generating all possible permutations of these chunks. """ def __init__(self, size: Arg.String(metavar='a:b', help=( 'Generate sequences of length x, where x is in [a:b]. The default value is {default}, ' 'where N is the number of chunks in the current frame.')) = 'N:N', keep: Arg.Switch('-k', help=( 'Chunks are not returned back to the urn after being drawn.')) = False, sort: Arg.Switch('-s', help=( 'The order of items does not matter; for the output, chunks are sorted according to ' 'their original position in the frame.')) = False ): super().__init__(size=size, keep=keep, sort=sort) def process(self, data: Chunk): yield from data.temp def filter(self, chunks: Iterable[Chunk]): it = iter(chunks) head = next(it) buffer = [bytes(head)] buffer.extend(bytes(c) for c in it) head = head.copy(meta=True, data=False) head.meta['N'] = len(buffer) size = sliceobj(self.args.size, head) a = size.start or 1 b = size.stop or len(buffer) b = max(b, a + 1) c = size.step or 1 self.log_debug(F'using size [{a}:{b}:{c}]') s = 1 if self.args.sort else 0 k = 1 if self.args.keep else 0 m = (s << 1) | k method = { 0b00: lambda i, r: product(i, repeat=r), 0b01: combinations, 0b10: combinations_with_replacement, 0b11: permutations }[m] self.log_info(F'choosing {method.__name__}') for n in range(a, b, c): self.log_debug(F'generating sequences of length {n}') for head.temp in method(buffer, n): yield head
class uuenc
-
Encodes and decodes data in the uuencode format.
Expand source code Browse git
class uuenc(Unit): """ Unit for uuencode. """ def process(self, data): header = re.search( B'^begin ([0-7]{3}) (.*?)$', data, flags=re.M) if header is None: raise ValueError('invalid uu header') output = bytearray() view = memoryview(data) breaks = [m.end() for m in iter(re.finditer(B'^', data, flags=re.M))] eol = False for k, br in enumerate(itertools.islice(breaks, 1, None)): if eol and view[br:br + 3] == b'end': path = header[2] if path != B'-': output = self.labelled(output, path=path) return output count = view[br] - 0x20 if count not in range(0x41): raise ValueError(F'Invalid length encoding 0x{view[br]:02X} in line {k}.') count %= 0x40 cursor = len(output) q, r = divmod(count, 3) q += int(bool(r)) end = br + 1 + q * 4 for b in range(br + 1, end, 4): chunk = 0 for j in range(4): character = view[b + j] if character not in range(0x21, 0x61): raise ValueError(F'Invalid character 0x{character:02X} in line {k}.') chunk = ((character - 0x20) % 0x40) | (chunk << 6) output.extend(chunk.to_bytes(3, 'big')) del output[cursor + count:] eol = count == 0 if len(output) < cursor + count: break raise RefineryPartialResult(F'Data truncated in line {k}', output) def reverse(self, data): meta = metavars(data) path = meta.get('path', None) name = path and pathlib.Path(path).name or '-' view = memoryview(data) with MemoryFile() as stream: stream.write(B'begin 666 ') stream.write(name.encode(self.codec)) for k in range(0, len(view), 45): slice = view[k:k + 45] stream.write_byte(0x0A) stream.write_byte(0x20 + len(slice)) for chunk in chunks.unpack(slice, 3, bigendian=True, pad=True): for j in range(3, -1, -1): stream.write_byte(0x20 + (((chunk >> j * 6) & 0x3F) or 0x40)) stream.write(B'\n`\nend\n') return stream.getvalue()
class vaddr (*name, base=None)
-
Converts a metadata variable holding a file offset to a virtual address. This unit only works when the chunk body contains a PE, ELF, or MachO executable. The variable will be substituted in place. If you would like to retain the original value, it is recommended to use the put unit first to create a copy of an already existing variable, and then convert the copy.
Expand source code Browse git
class vaddr(Unit): """ Converts a metadata variable holding a file offset to a virtual address. This unit only works when the chunk body contains a PE, ELF, or MachO executable. The variable will be substituted in place. If you would like to retain the original value, it is recommended to use the `refinery.put` unit first to create a copy of an already existing variable, and then convert the copy. """ def __init__( self, *name: Arg(type=str, help='The name of a metadata variable holding an integer.'), base : Arg.Number('-b', metavar='ADDR', help='Optionally specify a custom base address B.') = None ): return super().__init__(names=name, base=base) def process(self, data): try: exe = Executable.Load(data, self.args.base) except Exception: self.log_warn('unable to parse input as executable; no variable conversion was performed') return data meta = metavars(data) for name in self.args.names: value = meta[name] meta[name] = exe.location_from_offset(value).virtual.position return data def reverse(self, data): try: exe = Executable.Load(data, self.args.base) except Exception: self.log_warn('unable to parse input as executable; no variable conversion was performed') return data meta = metavars(data) for name in self.args.names: value = meta[name] meta[name] = exe.location_from_address(value).physical.position return data
class vbapc (raw=False)
-
Extract VBA macro p-code from Office documents. By default, the unit also uses pcode2code to decompile the disassembled p-code. This unit is specifically useful for macro documents that use VBA code stomping, i.e. the embedded macro source code is stomped and does not represent the p-code functionality that the document will actually execute.
Expand source code Browse git
class vbapc(Unit): """ Extract VBA macro p-code from Office documents. By default, the unit also uses pcode2code to decompile the disassembled p-code. This unit is specifically useful for macro documents that use VBA code stomping, i.e. the embedded macro source code is stomped and does not represent the p-code functionality that the document will actually execute. """ def __init__(self, raw: Unit.Arg.Switch('-r', help='Return disassembled p-code, do not try to decompile.') = False): super().__init__(raw=raw) @Unit.Requires('oletools', 'formats', 'office', 'extended') def _pcodedmp(): with NoLogging(): import pcodedmp.pcodedmp return pcodedmp.pcodedmp def process(self, data): class args: disasmOnly = True verbose = False with io.StringIO() as output: with VirtualFileSystem() as vfs: vf = vfs.new(data) self._pcodedmp.processFile(vf, args, output) code = output.getvalue() if not self.args.raw: from refinery.lib.thirdparty.pcode2code import Parser parser = Parser(code) parser.parseInput() parser.processInput(False) code = parser.getOutput() code = re.sub(R'(?m)^((?:Sub|Function).*?)$(?!\n[^\s])', r'\n\1', code) return code.encode(self.codec)
class vbastr (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
Extract VBA macro variables from Office documents. The items are extracted in a directory hierarchy that specifies their corresponding OLE stream. The stem of their file name is the same as the variable's name. The variable can define a caption, a control tip text, and a value; the unit extracts these with the synthesized file extension "cap", "tip", and "val", respectively.
Expand source code Browse git
class vbastr(PathExtractorUnit): """ Extract VBA macro variables from Office documents. The items are extracted in a directory hierarchy that specifies their corresponding OLE stream. The stem of their file name is the same as the variable's name. The variable can define a caption, a control tip text, and a value; the unit extracts these with the synthesized file extension "cap", "tip", and "val", respectively. """ @PathExtractorUnit.Requires('oletools', 'formats', 'office') def _olevba(): from oletools import olevba return olevba def unpack(self, value): try: parser = self._olevba.VBA_Parser('.', data=bytes(value), relaxed=True) except self._olevba.FileOpenError: raise ValueError('Input data not recognized by VBA parser') try: for path, name, vars in parser.extract_form_strings_extended(): if not vars: continue name = _txt(vars['name']) for ext, key in { 'cap': 'caption', 'tip': 'control_tip_text', 'val': 'value', }.items(): value = _bin(vars.get(key)) if not value: continue yield UnpackResult(F'{path!s}/{name!s}/{name}.{ext}', value) except self._olevba.oleform.OleFormParsingError as error: from collections import Counter self.log_debug(str(error)) self.log_info('extended form extraction failed with error; falling back to simple method') form_strings = list(parser.extract_form_strings()) name_counter = Counter(name for _, name, _ in form_strings) dedup = Counter() for path, name, string in form_strings: if string is None: continue if name_counter[name] > 1: dedup[name] += 1 name = F'{name!s}.v{dedup[name]}' yield UnpackResult(F'{path!s}/{name!s}.val', _bin(string))
class vigenere (key, alphabet=b'abcdefghijklmnopqrstuvwxyz', operator='add', case_sensitive=False, ignore_unknown=False)
-
Encryption and decryption using the Vigenère-Bellaso polyalphabetic cipher.
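Since the unit's processing direction decrypts, the following sketch (untested) recovers the plaintext hello from its classic encryption under the key key:
>>> from refinery.shell import *
>>> emit('rijvs') | vigenere('key') | str
'hello'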
Expand source code Browse git
class vigenere(Unit): """ Encryption and decryption using the Vigenère-Bellaso polyalphabetic cipher. """ def __init__( self, key: Arg(help='The encryption key'), alphabet: Arg( help='The alphabet, by default the Latin one is used: "{default}"' ) = b'abcdefghijklmnopqrstuvwxyz', operator: Arg.Choice('-:', choices=['add', 'sub', 'xor'], metavar='OP', help=( 'Choose the vigenere block operation. The default is {default}, and the available options are: {choices}')) = 'add', case_sensitive: Arg.Switch('-c', help=( 'Unless this option is set, the key will be case insensitive. Uppercase letters from the input are transformed ' 'using the same shift as would be the lowercase variant, but case is retained.')) = False, ignore_unknown: Arg.Switch('-i', help=( 'Unless this option is set, the key stream will be iterated even ' 'for letters that are not contained in the alphabet.' )) = False ): if not callable(operator): operator = { 'add': __add__, 'sub': __sub__, 'xor': __xor__, }.get(operator.lower(), None) if operator is None: raise ValueError(F'The value {operator!r} is not valid as an operator.') self.superinit(super(), **vars()) def _tabula_recta(self, data, reverse=True): key: str = self.args.key.decode(self.codec) alphabet: str = self.args.alphabet.decode(self.codec) operator = self.args.operator case_sensitive: bool = self.args.case_sensitive ignore_unknown: bool = self.args.ignore_unknown if not case_sensitive: key = key.lower() alphabet = alphabet.lower() if len(set(alphabet)) != len(alphabet): raise ValueError('Duplicate entries detected in alphabet.') if not set(key) <= set(alphabet): diff = set(key) - set(alphabet) diff = ', '.join(diff) raise ValueError(F'key contains letters which are not from the given alphabet: {diff}') self.log_info(F'using key {key} and alphabet {alphabet}') keystream = cycle(key) alph_size = len(alphabet) if reverse: operator = _opeator_inverse[operator] for letter in data: uppercase = not case_sensitive and letter.isupper() if uppercase: letter = letter.lower() try: position = alphabet.index(letter) except ValueError: yield letter if not ignore_unknown: next(keystream) continue shift = alphabet.index(next(keystream)) result = alphabet[operator(position, shift) % alph_size] yield result.upper() if uppercase else result @unicoded def process(self, data): return ''.join(self._tabula_recta(data, True)) @unicoded def reverse(self, data): return ''.join(self._tabula_recta(data, False))
class vmemref (address, base=None)
-
The unit expects an executable as input (PE/ELF/MachO) and scans a function at a given virtual address for memory references. For each memory reference, the unit looks up the corresponding section and file offset for the reference. It then returns all data from that section starting at the given offset.
Expand source code Browse git
class vmemref(Unit): """ The unit expects an executable as input (PE/ELF/MachO) and scans a function at a given virtual address for memory references. For each memory reference, the unit looks up the corresponding section and file offset for the reference. It then returns all data from that section starting at the given offset. """ @Unit.Requires('angr', 'all') def _angr(): import angr import angr.project import angr.engines return angr def _memory_references( self, function: Function, memory: Clemory, functions: Container[int], pointer_size: int, max_dereference: int = 1 ): pointer_size //= 8 references = [] code = set() for block in function.blocks: code.update(block.instruction_addrs) try: constants = function.code_constants except Exception: pass else: def is_valid_data_address(address): if not isinstance(address, int): return False if address not in memory: return False if address in code: return False if address in functions: return False return True def dereference(address): data = bytes(memory[k] for k in range(address, address + pointer_size)) return int.from_bytes(data, 'little') for address in constants: try: address = int(address) except Exception: continue times_dereferenced = 0 while is_valid_data_address(address) and address not in references: references.append(address) times_dereferenced += 1 if max_dereference and max_dereference > 0 and times_dereferenced > max_dereference: break try: address = dereference(address) except Exception: break return references def __init__( self, address: Arg.Number(metavar='ADDR', help='Specify the address of a function to scan.'), base: Arg.Number('-b', metavar='ADDR', help='Optionally specify a custom base address B.') = None, ): super().__init__(address=address, base=base) def process(self, data): address = self.args.address executable = Executable.Load(data, self.args.base) code = executable.location_from_address(address).virtual.box self.log_info(R'loading project into angr') with NoLogging(): project: Project = self._angr.Project(MemoryFile(data), load_options={'auto_load_libs': False}) self.log_info(F'scanning function at 0x{address:X}') with NoLogging(): cfg: CFGEmulated = project.analyses.CFGEmulated( call_depth=0, starts=[address], enable_symbolic_back_traversal=True, address_whitelist=code.range(), ) function = cfg.functions[address] code_addresses = cfg.functions if executable.type is ET.PE: code_addresses = code self.log_info(R'extracting memory references from lifted function') for ref in self._memory_references( function, project.loader.memory, code_addresses, executable.pointer_size ): try: yield executable[ref:] except CompartmentNotFound: self.log_info(F'memory reference could not be resolved: 0x{ref:0{executable.pointer_size // 4}X}')
class vsect (*paths, meta=False, synthetic=False, path=b'path', regex=False, exact=False, fuzzy=0, drop_path=False, join_path=False, list=False)
-
Extract sections/segments from PE, ELF, and MachO executables.
Expand source code Browse git
class vsect(PathExtractorUnit): """ Extract sections/segments from PE, ELF, and MachO executables. """ def __init__( self, *paths, meta: Arg.Switch('-m', help=( 'Populates the metadata variables vaddr and vsize containing the virtual address and size ' 'of each section, respectively.')) = False, synthetic: Arg.Switch('-s', help=( 'Include synthesized sections: These represent data regions that are outside the sections ' 'as listed by the executable metadata, such as headers and overlays.')) = False, **keywords ): super().__init__(*paths, meta=meta, synthetic=synthetic, **keywords) def unpack(self, data): exe = Executable.Load(data) mv = memoryview(data) for k, section in enumerate(exe.sections()): if section.synthetic and not self.args.synthetic: continue start = section.physical.lower end = section.physical.upper va = section.virtual.lower vs = len(section.virtual) kwargs = {'offset': start} if self.args.meta: if va is not None: kwargs['vaddr'] = va if vs is not None: kwargs['vsize'] = vs name = section.name if not name: addr = F'{section.virtual.lower:0{exe.pointer_size // 4}X}' self.log_warn(F'section {k} had no name, synthesizing name from virtual address 0x{addr}') name = F'.{addr}' yield UnpackResult(name, mv[start:end], **kwargs)
class vsnip (*addresses, ascii=False, utf16=False, until=b'', base=None)
-
Extract data from PE, ELF, and MachO files based on virtual offsets.
Expand source code Browse git
class vsnip(Unit): """ Extract data from PE, ELF, and MachO files based on virtual offsets. """ def __init__( self, *addresses: Arg.Bounds(metavar='start:count:align', help=( 'Use Python slice syntax to describe an area of virtual memory to read. If a chunksize is ' 'specified, then the unit will always read a multiple of that number of bytes')), ascii: Arg.Switch('-a', group='END', help='Read ASCII strings; equivalent to -th:00') = False, utf16: Arg.Switch('-u', group='END', help='Read UTF16 strings; equivalent to -th:0000 (also sets chunksize to 2)') = False, until: Arg.Binary('-t', group='END', help='Read until sequence {varname} is read.') = B'', base : Arg.Number('-b', metavar='ADDR', help='Optionally specify a custom base address B.') = None, ): if sum(1 for t in (until, utf16, ascii) if t) > 1: raise ValueError('Only one of utf16, ascii, and until can be specified.') return super().__init__(addresses=addresses, utf16=utf16, ascii=ascii, until=until, base=base) def process(self, data: bytearray): until = self.args.until addrs = self.args.addresses if self.args.ascii: until = B'\0' if self.args.utf16: until = B'\0\0' addrs = (slice(a.start, a.stop, 2) for a in addrs) exe = Executable.Load(data, self.args.base) for addr in addrs: area = MemoryArea(addr) location = exe.location_from_address(area.start) offset = location.physical.position max_offset = location.physical.box.upper if not until: end = max_offset else: end = offset - 1 align = area.align while True: end = data.find(until, end + 1) if end not in range(offset, max_offset): raise EndOfStringNotFound if (end - offset) % align == 0: break if area.count: end = min(end, offset + area.count) yield self.labelled(data[offset:end], offset=offset)
class vstack (*address, stop=None, base=None, arch=Arch.X32, meta_registers=False, timeout=None, patch_range=slice(5, None, None), write_range=slice(1, None, None), wait=20, wait_calls=False, skip_calls=0, stack_size=65536, block_size=4096, max_visits=4096, log_writes_in_calls=False, log_stack_addresses=False, log_other_addresses=False, log_zero_overwrites=False, log_stack_cookies=False)
-
The unit emulates instructions at a given address in the input executable (PE/ELF/MachO) and extracts data patches that are written to the stack during emulation. Emulation is halted as soon as a given number of consecutive instructions have executed without writing to memory, or when an error occurs. By default, most registers are set to the current location in the emulated stack. However, if you want to initialize certain registers differently, you can use the -r switch and set a meta variable with the name of the register to the desired value.
Expand source code Browse git
class vstack(Unit): """ The unit emulates instructions at a given address in the input executable (PE/ELF/MachO) and extracts data patches that are written to the stack during emulation. Emulation is halted as soon as a certain number of instructions has not performed any memory writes, or when an error occurs. By default, most registers are set to the current location in the emulated stack. However, if you want to initialize certain registers differently, you can set an environment variable to the desired value. """ @Unit.Requires('intervaltree', 'default', 'extended') def _intervaltree(): import intervaltree return intervaltree @Unit.Requires('unicorn==2.0.1.post1', 'default', 'extended') def _unicorn(): with NoLogging(): import unicorn import unicorn.x86_const import unicorn.arm64_const import unicorn.mips_const import unicorn.sparc_const try: import unicorn.ppc_const except ImportError: pass return unicorn @Unit.Requires('capstone', 'default', 'extended') def _capstone(): import capstone return capstone def __init__( self, *address: Arg.Number(metavar='start', help='Specify the (virtual) addresses of a stack string instruction sequences.'), stop: Arg.Number('-s', metavar='stop', help='Optional: Stop when reaching this address.') = None, base: Arg.Number('-b', metavar='Addr', help='Optionally specify a custom base address B.') = None, arch: Arg.Option('-a', help='Specify for blob inputs: {choices}', choices=Arch) = Arch.X32, meta_registers: Arg.Switch('-r', help='Consume register initialization values from the chunk\'s metadata.') = False, timeout: Arg.Number('-t', help='Optionally stop emulating after a given number of instructions.') = None, patch_range: Arg.Bounds('-p', metavar='MIN:MAX', help='Extract only patches that are in the given range, default is {default}.') = slice(5, None), write_range: Arg.Bounds('-n', metavar='MIN:MAX', help='Log only writes whose size is in the given range, default is {default}.') = slice(1, None), wait: Arg.Number('-w', help=( 'When this many instructions did not write to memory, emulation is halted. The default is {default}.')) = 20, wait_calls: Arg.Switch('-c', group='CALL', help='Wait indefinitely when inside a function call.') = False, skip_calls: Arg.Counts('-C', group='CALL', help='Skip function calls entirely. Use twice to treat each call as allocating memory.') = 0, stack_size: Arg.Number('-S', help='Optionally specify the stack size. The default is 0x{default:X}.') = 0x10000, block_size: Arg.Number('-B', help='Standard memory block size for the emulator, 0x{default:X} by default.') = 0x1000, max_visits: Arg.Number('-V', help='Maximum number of times a code address is visited. 
Default is {default}.') = 0x1000, log_writes_in_calls: Arg.Switch('-W', help='Log writes of values that occur in functions calls.') = False, log_stack_addresses: Arg.Switch('-X', help='Log writes of values that are stack addresses.') = False, log_other_addresses: Arg.Switch('-Y', help='Log writes of values that are addresses to mapped segments.') = False, log_zero_overwrites: Arg.Switch('-Z', help='Log writes of zeros to memory that contained nonzero values.') = False, log_stack_cookies : Arg.Switch('-E', help='Log writes that look like stack cookies.') = False, ): super().__init__( address=address or [0], stop=stop, base=base, arch=Arg.AsOption(arch, Arch), meta_registers=meta_registers, timeout=timeout, patch_range=patch_range, write_range=write_range, wait=wait, stack_size=stack_size, wait_calls=wait_calls, skip_calls=skip_calls, block_size=block_size, max_visits=max_visits, log_writes_in_calls=log_writes_in_calls, log_stack_addresses=log_stack_addresses, log_other_addresses=log_other_addresses, log_zero_overwrites=log_zero_overwrites, log_stack_cookies=log_stack_cookies ) def _find_stack_location(self, exe: Executable): stack_size = self.args.stack_size memory_max = 1 << exe.pointer_size space = exe.image_defined_address_space() aligned = align(stack_size, space.upper) if aligned + stack_size < memory_max: return aligned aligned = align(stack_size, space.lower - stack_size, down=True) if aligned > 0: return aligned raise RuntimeError('The primitive method used to map stack memory has failed.') def process(self, data): uc = self._unicorn blob = False try: exe = Executable.Load(data, self.args.base) except ValueError: exe = ExecutableCodeBlob(data, self.args.base, self.args.arch) blob = True arch = exe.arch() width = exe.pointer_size // 4 block_size = self.args.block_size stack_size = self.args.stack_size stack_addr = self._find_stack_location(exe) self.log_info(F'mapping {SizeInt(stack_size)!r} of stack at 0x{stack_addr:X}') image = memoryview(data) disassembler = self._capstone.Cs(*self._cs_arch(arch, exe.byte_order())) register_values = {} if arch in (Arch.PPC32, Arch.PPC64): try: sp = uc.ppc_const.UC_PPC_REG_1 rv = uc.ppc_const.UC_PPC_REG_3 ip = uc.ppc_const.UC_PPC_REG_PC except AttributeError: raise RuntimeError('The installed unicorn version does not support the PPC architecture.') else: sp, ip, rv = { Arch.X32 : ( uc.x86_const.UC_X86_REG_ESP, uc.x86_const.UC_X86_REG_EIP, uc.x86_const.UC_X86_REG_EAX, ), Arch.X64 : ( uc.x86_const.UC_X86_REG_RSP, uc.x86_const.UC_X86_REG_RIP, uc.x86_const.UC_X86_REG_RAX, ), Arch.ARM32 : ( uc.arm_const.UC_ARM_REG_SP, uc.arm_const.UC_ARM_REG_IP, uc.arm_const.UC_ARM_REG_R0, ), Arch.ARM64 : ( uc.arm_const.UC_ARM_REG_SP, uc.arm_const.UC_ARM_REG_IP, uc.arm_const.UC_ARM_REG_R0, ), Arch.MIPS16 : ( uc.mips_const.UC_MIPS_REG_SP, uc.mips_const.UC_MIPS_REG_PC, uc.mips_const.UC_MIPS_REG_0, ), Arch.MIPS32 : ( uc.mips_const.UC_MIPS_REG_SP, uc.mips_const.UC_MIPS_REG_PC, uc.mips_const.UC_MIPS_REG_V0, ), Arch.MIPS64 : ( uc.mips_const.UC_MIPS_REG_SP, uc.mips_const.UC_MIPS_REG_PC, uc.mips_const.UC_MIPS_REG_V0, ), Arch.SPARC32 : ( uc.sparc_const.UC_SPARC_REG_SP, uc.sparc_const.UC_SPARC_REG_PC, uc.sparc_const.UC_SPARC_REG_O0, ), Arch.SPARC64 : ( uc.sparc_const.UC_SPARC_REG_SP, uc.sparc_const.UC_SPARC_REG_PC, uc.sparc_const.UC_SPARC_REG_O0, ), }[arch] if self.args.meta_registers: from refinery.lib.meta import metavars meta = metavars(data) for module in [uc.x86_const, uc.arm_const, uc.mips_const, uc.sparc_const]: md: Dict[str, Any] = module.__dict__ for name, register in 
md.items(): try: u, *_, kind, name = name.split('_') except Exception: continue if kind != 'REG' or u != 'UC': continue for var, value in list(meta.items()): if var.upper() != name: continue meta.discard(var) register_values[register] = value break for address in self.args.address: emulator = uc.Uc(*self._uc_arch(arch, exe.byte_order())) stack = Range(stack_addr, stack_addr + 3 * stack_size) emulator.mem_map(stack.lower, len(stack)) emulator.reg_write(sp, stack.lower + 2 * len(stack) // 3) if arch is Arch.X32: for reg in [ uc.x86_const.UC_X86_REG_EAX, uc.x86_const.UC_X86_REG_EBX, uc.x86_const.UC_X86_REG_ECX, uc.x86_const.UC_X86_REG_EDX, uc.x86_const.UC_X86_REG_ESI, uc.x86_const.UC_X86_REG_EDI, uc.x86_const.UC_X86_REG_EBP, ]: emulator.reg_write(reg, stack_addr + stack_size) if arch is Arch.X64: for reg in [ uc.x86_const.UC_X86_REG_RAX, uc.x86_const.UC_X86_REG_RBX, uc.x86_const.UC_X86_REG_RCX, uc.x86_const.UC_X86_REG_RDX, uc.x86_const.UC_X86_REG_RSI, uc.x86_const.UC_X86_REG_RDI, uc.x86_const.UC_X86_REG_RBP, uc.x86_const.UC_X86_REG_R8, uc.x86_const.UC_X86_REG_R9, uc.x86_const.UC_X86_REG_R10, uc.x86_const.UC_X86_REG_R11, uc.x86_const.UC_X86_REG_R12, uc.x86_const.UC_X86_REG_R13, uc.x86_const.UC_X86_REG_R14, uc.x86_const.UC_X86_REG_R15, ]: emulator.reg_write(reg, stack_addr + stack_size) for reg, value in register_values.items(): emulator.reg_write(reg, value) for segment in exe.segments(): pmem = segment.physical vmem = segment.virtual try: emulator.mem_map(vmem.lower, align(block_size, len(vmem))) emulator.mem_write(vmem.lower, bytes(image[pmem.slice()])) except KeyboardInterrupt: raise except Exception as error: if address in vmem: raise self.log_info(F'error mapping segment [{vmem.lower:0{width}X}-{vmem.upper:0{width}X}]: {error!s}') tree = self._intervaltree.IntervalTree() state = EmuState( exe, tree, address, stack, blob, disassembler, stop=self.args.stop, sp_register=sp, ip_register=ip, rv_register=rv, allocations=[stack], max_wait=self.args.wait, max_loop=self.args.max_visits, ) timeout = self.args.timeout if timeout is not None: self.log_info(F'setting timeout of {timeout} steps') state.ticks = timeout emulator.hook_add(uc.UC_HOOK_CODE, self._hook_code, user_data=state) emulator.hook_add(uc.UC_HOOK_MEM_WRITE, self._hook_mem_write, user_data=state, ) emulator.hook_add(uc.UC_HOOK_MEM_READ_AFTER, self._hook_mem_read, user_data=state, ) emulator.hook_add(uc.UC_HOOK_INSN_INVALID, self._hook_insn_error, user_data=state) emulator.hook_add(uc.UC_HOOK_MEM_INVALID, self._hook_mem_error, user_data=state) end_of_code = exe.location_from_address(address).virtual.box.upper try: emulator.emu_start(address, end_of_code) except uc.UcError: pass it: Iterator[Interval] = iter(tree) for interval in it: size = interval.end - interval.begin - 1 if size not in bounds[self.args.patch_range]: continue try: patch = emulator.mem_read(interval.begin, size) except uc.UcError as error: self.log_info(F'error reading 0x{interval.begin:0{width}X}:{size}: {error!s}') continue if not any(patch): continue self.log_info(F'memory patch at {state.fmt(interval.begin)} of size {size}') yield patch def _hook_mem_read(self, emu: Uc, access: int, address: int, size: int, value: int, state: EmuState): mask = (1 << (size * 8)) - 1 state.last_read = value & mask def _hook_mem_write(self, emu: Uc, access: int, address: int, size: int, value: int, state: EmuState): try: mask = (1 << (size * 8)) - 1 unsigned_value = value & mask if unsigned_value == state.expected_address: callstack = state.callstack state.retaddr = unsigned_value if 
not self.args.skip_calls: if not callstack: state.callstack_ceiling = emu.reg_read(state.sp_register) callstack.append(unsigned_value) return else: state.retaddr = None skipped = False if ( not self.args.log_stack_cookies and emu.reg_read(state.sp_register) ^ unsigned_value == state.last_read ): skipped = 'stack cookie' elif size not in bounds[self.args.write_range]: skipped = 'size excluded' elif ( state.callstack_ceiling > 0 and not self.args.log_writes_in_calls and address in range(state.callstack_ceiling - 0x200, state.callstack_ceiling) ): skipped = 'inside call' elif not self.args.log_stack_addresses and unsigned_value in state.stack: skipped = 'stack address' elif not self.args.log_other_addresses and not state.blob: for s in state.executable.sections(): if address in s.virtual: skipped = F'write to section {s.name}' break if ( not skipped and unsigned_value == 0 and state.writes.at(address) is not None and self.args.log_zero_overwrites is False ): try: if any(emu.mem_read(address, size)): skipped = 'zero overwrite' except Exception: pass if not skipped: state.writes.addi(address, address + size + 1) state.writes.merge_overlaps() state.waiting = 0 def info(): data = unsigned_value.to_bytes(size, state.executable.byte_order().value) ph = state.executable.pointer_size // 4 pt = state.executable.pointer_size // 8 h = data.hex().upper() t = re.sub('[^!-~]', '.', data.decode('latin1')) msg = state.log(F'{state.fmt(address)} <- {h:_<{ph}} {t:_<{pt}}') if skipped: msg = F'{msg} (ignored: {skipped})' return msg self.log_info(info) except KeyboardInterrupt: emu.emu_stop() return False def _hook_insn_error(self, emu: Uc, state: EmuState): self.log_debug('aborting emulation; instruction error') emu.emu_stop() return False def _hook_mem_error(self, emu: Uc, access: int, address: int, size: int, value: int, state: EmuState): bs = self.args.block_size try: emu.mem_map(align(bs, address, down=True), 2 * bs) except Exception: self.log_info(state.log(F'{state.fmt(address)} :: MEMORY ERROR')) return False else: return True def _hook_code(self, emu: Uc, address: int, size: int, state: EmuState): try: state.ticks -= 1 state.visits[address] += 1 if state.visits[address] > state.max_loop > 0: self.log_info( F'aborting emulation: 0x{address:0{state.executable.pointer_size // 8}X}' F' was visited more than {state.max_loop} times.') emu.emu_stop() return False if address == state.stop or state.ticks == 0: emu.emu_stop() return False waiting = state.waiting callstack = state.callstack depth = len(callstack) state.previous_address = address retaddr = state.retaddr state.retaddr = None if address != state.expected_address: if retaddr is not None and self.args.skip_calls: if self.args.skip_calls > 1: stack_size = self.args.stack_size block_size = self.args.block_size rv = state.rv_register alloc_addr = align(block_size, state.allocations[-1].upper) state.allocations.append(Range(alloc_addr, alloc_addr + stack_size)) emu.mem_map(alloc_addr, stack_size) emu.reg_write(rv, alloc_addr) ip = state.ip_register sp = state.sp_register ps = state.executable.pointer_size // 8 emu.reg_write(ip, retaddr) emu.reg_write(sp, emu.reg_read(sp) + ps) return if depth and address == callstack[-1]: depth -= 1 state.callstack.pop() if depth == 0: state.callstack_ceiling = 0 state.expected_address = address elif retaddr is not None and not self.args.skip_calls: # The present address was moved to the stack but we did not branch. # This is not quite accurate, of course: We could be calling the # next instruction. 
However, that sort of code is usually not really # a function call anyway, but rather a way to get the IP. callstack.pop() if waiting > self.args.wait: emu.emu_stop() return False if not depth or not self.args.wait_calls: state.waiting += 1 state.expected_address += size instruction = state.disassemble(address, size) if instruction: instruction = F'{instruction.mnemonic} {instruction.op_str}' self.log_debug(state.log(instruction)) else: self.log_debug(state.log('unrecognized instruction, aborting')) emu.emu_stop() except KeyboardInterrupt: emu.emu_stop() return False def _uc_arch(self, arch: Arch, bo: Optional[BO] = None) -> Tuple[int, int]: uc = self._unicorn arch, mode = { Arch.X32 : (uc.UC_ARCH_X86, uc.UC_MODE_32), # noqa Arch.X64 : (uc.UC_ARCH_X86, uc.UC_MODE_64), # noqa Arch.ARM32 : (uc.UC_ARCH_ARM, uc.UC_MODE_ARM), # noqa Arch.ARM64 : (uc.UC_ARCH_ARM, uc.UC_MODE_THUMB), # noqa Arch.MIPS16 : (uc.UC_ARCH_MIPS, uc.UC_MODE_16), # noqa Arch.MIPS32 : (uc.UC_ARCH_MIPS, uc.UC_MODE_32), # noqa Arch.MIPS64 : (uc.UC_ARCH_MIPS, uc.UC_MODE_64), # noqa Arch.PPC32 : (uc.UC_ARCH_PPC, uc.UC_MODE_32), # noqa Arch.PPC64 : (uc.UC_ARCH_PPC, uc.UC_MODE_64), # noqa Arch.SPARC32 : (uc.UC_ARCH_SPARC, uc.UC_MODE_32), # noqa Arch.SPARC64 : (uc.UC_ARCH_SPARC, uc.UC_MODE_V9), # noqa }[arch] if bo is not None: mode |= { BO.BE: uc.UC_MODE_BIG_ENDIAN, BO.LE: uc.UC_MODE_LITTLE_ENDIAN, }[bo] return arch, mode def _cs_arch(self, arch: Arch, bo: Optional[BO] = None) -> Tuple[int, int]: cs = self._capstone arch, mode = { Arch.X32 : (cs.CS_ARCH_X86, cs.CS_MODE_32), # noqa Arch.X64 : (cs.CS_ARCH_X86, cs.CS_MODE_64), # noqa Arch.ARM32 : (cs.CS_ARCH_ARM, cs.CS_MODE_ARM), # noqa Arch.ARM64 : (cs.CS_ARCH_ARM, cs.CS_MODE_THUMB), # noqa Arch.MIPS16 : (cs.CS_ARCH_MIPS, cs.CS_MODE_16), # noqa Arch.MIPS32 : (cs.CS_ARCH_MIPS, cs.CS_MODE_32), # noqa Arch.MIPS64 : (cs.CS_ARCH_MIPS, cs.CS_MODE_64), # noqa Arch.PPC32 : (cs.CS_ARCH_PPC, cs.CS_MODE_32), # noqa Arch.PPC64 : (cs.CS_ARCH_PPC, cs.CS_MODE_64), # noqa Arch.SPARC32 : (cs.CS_ARCH_SPARC, cs.CS_MODE_32), # noqa Arch.SPARC64 : (cs.CS_ARCH_SPARC, cs.CS_MODE_V9), # noqa }[arch] if bo is not None: mode |= { BO.BE: cs.CS_MODE_BIG_ENDIAN, BO.LE: cs.CS_MODE_LITTLE_ENDIAN, }[bo] return arch, mode
class winreg (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
Extract values from a Windows registry hive or from a registry export (.reg file).
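For example, a hypothetical invocation that extracts all values from a registry export (the file name is a placeholder):
>>> from refinery.shell import *
>>> open('export.reg', 'rb').read() | winreg() | str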
Expand source code Browse git
class winreg(PathExtractorUnit): """ Extract values from a Windows registry hive or from a registry export (.reg file). """ @PathExtractorUnit.Requires('python-registry', 'formats') def _registry(): import Registry import Registry.Registry import Registry.RegistryParse return Registry @staticmethod def _walk(patterns: List[PathPattern], key: RegistryKey, *path: str): here = '/'.join(path) if not any(p.reach(here) for p in patterns): winreg.log_debug(F'pruning search at {here}') return for value in key.values(): def raw(v: RegistryValue = value): return v.raw_data() vpath = F'{here}/{value.name()}' yield UnpackResult(vpath, raw) for subkey in key.subkeys(): yield from winreg._walk(patterns, subkey, *path, subkey.name()) def _unpack_hive(self, data: bytearray): try: with MemoryFile(data) as stream: root = self._registry.Registry.Registry(stream).root() yield from self._walk(self._patterns, root, root.name()) except self._registry.RegistryParse.ParseException: raise ParseException def _decode_registry_export(self, data: str): def REG_BINARY(data: str) -> bytes: return bytes.fromhex(re.sub('[^a-f0-9]+', '', data)) def REG_SZ(data: str) -> bytes: return data.encode(self.codec) | esc(quoted=True) | bytes def REG_EXPAND_SZ(data: str): return REG_BINARY(data).decode('UTF-16LE').rstrip('\0').encode(self.codec) def REG_MULTI_SZ(data: str): data = REG_BINARY(data).decode('UTF-16LE').split('\0') for string in data: if string: yield string.encode(self.codec) def REG_DWORD(data: str): value = int(data, 16) return F'0x{value:X}'.encode(self.codec) def REG_QWORD(data: str): value = int.from_bytes(REG_BINARY(data), 'little') return F'0x{value:X}'.encode(self.codec) class Missing: def __init__(self, name: str): self.name = name def __str__(self): return self.name REG_NONE = REG_EXPAND_SZ REG_DWORD_BIG_ENDIAN = Missing('REG_DWORD_BIG_ENDIAN') REG_LINK = Missing('REG_LINK') REG_RESOURCE_LIST = Missing('REG_RESOURCE_LIST') REG_FULL_RESOURCE_DESCRIPTOR = Missing('REG_FULL_RESOURCE_DESCRIPTOR') REG_RESOURCE_REQUIREMENTS_LIST = Missing('REG_RESOURCE_REQUIREMENTS_LIST') prefix, _, encoded = data.partition(':') try: decoder = { 'hex(0)' : REG_NONE, 'hex(1)' : REG_SZ, 'hex(2)' : REG_EXPAND_SZ, 'hex(3)' : REG_BINARY, 'hex' : REG_BINARY, 'hex(4)' : REG_DWORD, 'dword' : REG_DWORD, 'hex(5)' : REG_DWORD_BIG_ENDIAN, 'hex(6)' : REG_LINK, 'hex(7)' : REG_MULTI_SZ, 'hex(8)' : REG_RESOURCE_LIST, 'hex(9)' : REG_FULL_RESOURCE_DESCRIPTOR, 'hex(a)' : REG_RESOURCE_REQUIREMENTS_LIST, 'hex(b)' : REG_QWORD, }[prefix] except KeyError: decoder = REG_SZ encoded = data if isinstance(decoder, Missing): self.log_warn(F'Found registry type {decoder!s}; no decoder implemented.') return self.log_debug(F'decoding as {decoder.__name__}: {encoded}') it = decoder(encoded) if not inspect.isgenerator(it): it = (it,) yield from it def _unpack_file(self, data: bytearray): for codec in ('utf16', 'utf-16le', 'utf8'): try: reg = data.decode(codec).splitlines(keepends=True) except UnicodeError: continue if reg[0].startswith('Windows Registry Editor'): break else: raise ParseException config = WinRegFileParser() config.read_string(''.join(reg[1:])) for key in config.sections(): self.log_debug(key) for value in config[key]: name = next(iter(shlex.split(value))) path = Path(key) / Path(name) data = config[key][value] decoded = list(self._decode_registry_export(data)) if len(decoded) == 1: yield UnpackResult(str(path), decoded[0]) continue for k, d in enumerate(decoded): yield UnpackResult(F'{path!s}.{k}', d) def unpack(self, data): with 
contextlib.suppress(ParseException): yield from self._unpack_hive(data) return yield from self._unpack_file(data)
class wshenc (marker=True)
-
Windows Scripting Host encoding and decoding of VBScript (VBS/VBE) and JScript (JS/JSE).
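For example, decoding an encoded script might look as follows (the file name is a placeholder):
>>> from refinery.shell import *
>>> open('payload.vbe', 'rb').read() | wshenc() | str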
Expand source code Browse git
class wshenc(Unit): """ Windows Scripting Host encoding and decoding of VBScript (VBS/VBE) and JScript (JS/JSE). """ _MARKER_INIT = RB'#@~^BINREF==' _MARKER_STOP = RB'BINREF==^#~@' _CHUNKS = ( 0x57, 0x6E, 0x7B, 0x4A, 0x4C, 0x41, 0x0B, 0x0B, 0x0B, 0x0C, 0x0C, 0x0C, 0x4A, 0x4C, 0x41, 0x0E, 0x0E, 0x0E, 0x0F, 0x0F, 0x0F, 0x10, 0x10, 0x10, 0x11, 0x11, 0x11, 0x12, 0x12, 0x12, 0x13, 0x13, 0x13, 0x14, 0x14, 0x14, 0x15, 0x15, 0x15, 0x16, 0x16, 0x16, 0x17, 0x17, 0x17, 0x18, 0x18, 0x18, 0x19, 0x19, 0x19, 0x1A, 0x1A, 0x1A, 0x1B, 0x1B, 0x1B, 0x1C, 0x1C, 0x1C, 0x1D, 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F, 0x2E, 0x2D, 0x32, 0x47, 0x75, 0x30, 0x7A, 0x52, 0x21, 0x56, 0x60, 0x29, 0x42, 0x71, 0x5B, 0x6A, 0x5E, 0x38, 0x2F, 0x49, 0x33, 0x26, 0x5C, 0x3D, 0x49, 0x62, 0x58, 0x41, 0x7D, 0x3A, 0x34, 0x29, 0x35, 0x32, 0x36, 0x65, 0x5B, 0x20, 0x39, 0x76, 0x7C, 0x5C, 0x72, 0x7A, 0x56, 0x43, 0x7F, 0x73, 0x38, 0x6B, 0x66, 0x39, 0x63, 0x4E, 0x70, 0x33, 0x45, 0x45, 0x2B, 0x6B, 0x68, 0x68, 0x62, 0x71, 0x51, 0x59, 0x4F, 0x66, 0x78, 0x09, 0x76, 0x5E, 0x62, 0x31, 0x7D, 0x44, 0x64, 0x4A, 0x23, 0x54, 0x6D, 0x75, 0x43, 0x71, 0x4A, 0x4C, 0x41, 0x7E, 0x3A, 0x60, 0x4A, 0x4C, 0x41, 0x5E, 0x7E, 0x53, 0x40, 0x4C, 0x40, 0x77, 0x45, 0x42, 0x4A, 0x2C, 0x27, 0x61, 0x2A, 0x48, 0x5D, 0x74, 0x72, 0x22, 0x27, 0x75, 0x4B, 0x37, 0x31, 0x6F, 0x44, 0x37, 0x4E, 0x79, 0x4D, 0x3B, 0x59, 0x52, 0x4C, 0x2F, 0x22, 0x50, 0x6F, 0x54, 0x67, 0x26, 0x6A, 0x2A, 0x72, 0x47, 0x7D, 0x6A, 0x64, 0x74, 0x39, 0x2D, 0x54, 0x7B, 0x20, 0x2B, 0x3F, 0x7F, 0x2D, 0x38, 0x2E, 0x2C, 0x77, 0x4C, 0x30, 0x67, 0x5D, 0x6E, 0x53, 0x7E, 0x6B, 0x47, 0x6C, 0x66, 0x34, 0x6F, 0x35, 0x78, 0x79, 0x25, 0x5D, 0x74, 0x21, 0x30, 0x43, 0x64, 0x23, 0x26, 0x4D, 0x5A, 0x76, 0x52, 0x5B, 0x25, 0x63, 0x6C, 0x24, 0x3F, 0x48, 0x2B, 0x7B, 0x55, 0x28, 0x78, 0x70, 0x23, 0x29, 0x69, 0x41, 0x28, 0x2E, 0x34, 0x73, 0x4C, 0x09, 0x59, 0x21, 0x2A, 0x33, 0x24, 0x44, 0x7F, 0x4E, 0x3F, 0x6D, 0x50, 0x77, 0x55, 0x09, 0x3B, 0x53, 0x56, 0x55, 0x7C, 0x73, 0x69, 0x3A, 0x35, 0x61, 0x5F, 0x61, 0x63, 0x65, 0x4B, 0x50, 0x46, 0x58, 0x67, 0x58, 0x3B, 0x51, 0x31, 0x57, 0x49, 0x69, 0x22, 0x4F, 0x6C, 0x6D, 0x46, 0x5A, 0x4D, 0x68, 0x48, 0x25, 0x7C, 0x27, 0x28, 0x36, 0x5C, 0x46, 0x70, 0x3D, 0x4A, 0x6E, 0x24, 0x32, 0x7A, 0x79, 0x41, 0x2F, 0x37, 0x3D, 0x5F, 0x60, 0x5F, 0x4B, 0x51, 0x4F, 0x5A, 0x20, 0x42, 0x2C, 0x36, 0x65, 0x57) _OFFSETS = ( 0, 1, 2, 0, 1, 2, 1, 2, 2, 1, 2, 1, 0, 2, 1, 2, 0, 2, 1, 2, 0, 0, 1, 2, 2, 1, 0, 2, 1, 2, 2, 1, 0, 0, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 2, 0, 2, 1, 0, 2, 1, 2, 0, 0, 1, 2, 2, 0, 0, 1, 2, 0, 2, 1) _ENCODER = { 0x09 : [0x37, 0x69, 0x64], 0x0B : [0x0B, 0x0B, 0x0B], 0x0C : [0x0C, 0x0C, 0x0C], 0x0E : [0x0E, 0x0E, 0x0E], 0x0F : [0x0F, 0x0F, 0x0F], 0x10 : [0x10, 0x10, 0x10], 0x11 : [0x11, 0x11, 0x11], 0x12 : [0x12, 0x12, 0x12], 0x13 : [0x13, 0x13, 0x13], 0x14 : [0x14, 0x14, 0x14], 0x15 : [0x15, 0x15, 0x15], 0x16 : [0x16, 0x16, 0x16], 0x17 : [0x17, 0x17, 0x17], 0x18 : [0x18, 0x18, 0x18], 0x19 : [0x19, 0x19, 0x19], 0x1A : [0x1A, 0x1A, 0x1A], 0x1B : [0x1B, 0x1B, 0x1B], 0x1C : [0x1C, 0x1C, 0x1C], 0x1D : [0x1D, 0x1D, 0x1D], 0x1E : [0x1E, 0x1E, 0x1E], 0x1F : [0x1F, 0x1F, 0x1F], 0x20 : [0x7E, 0x2C, 0x50], 0x21 : [0x5A, 0x65, 0x22], 0x22 : [0x45, 0x72, 0x4A], 0x23 : [0x3A, 0x5B, 0x61], 0x24 : [0x79, 0x66, 0x5E], 0x25 : [0x59, 0x75, 0x5D], 0x26 : [0x27, 0x4C, 0x5B], 0x27 : [0x76, 0x45, 0x42], 0x28 : [0x63, 0x76, 0x60], 0x29 : [0x62, 0x2A, 0x23], 0x2A : [0x4D, 0x43, 0x65], 0x2B : [0x51, 0x33, 0x5F], 0x2C : [0x53, 0x42, 0x7E], 0x2D : [0x52, 0x20, 0x4F], 0x2E : [0x20, 0x63, 0x52], 0x2F : [0x26, 0x4A, 0x7A], 
0x30 : [0x54, 0x5A, 0x21], 0x31 : [0x71, 0x38, 0x46], 0x32 : [0x2B, 0x79, 0x20], 0x33 : [0x66, 0x32, 0x26], 0x34 : [0x2A, 0x57, 0x63], 0x35 : [0x58, 0x6C, 0x2A], 0x36 : [0x7F, 0x2B, 0x76], 0x37 : [0x7B, 0x46, 0x47], 0x38 : [0x30, 0x52, 0x25], 0x39 : [0x31, 0x4F, 0x2C], 0x3A : [0x6C, 0x3D, 0x29], 0x3B : [0x49, 0x70, 0x69], 0x3D : [0x78, 0x7B, 0x27], 0x3F : [0x5F, 0x51, 0x67], 0x40 : [0x40, None, 0x40], 0x41 : [0x29, 0x7A, 0x62], 0x42 : [0x24, 0x7E, 0x41], 0x43 : [0x2F, 0x3B, 0x5A], 0x44 : [0x39, 0x47, 0x66], 0x45 : [0x33, 0x41, 0x32], 0x46 : [0x6F, 0x77, 0x73], 0x47 : [0x21, 0x56, 0x4D], 0x48 : [0x75, 0x5F, 0x43], 0x49 : [0x28, 0x26, 0x71], 0x4A : [0x42, 0x78, 0x39], 0x4B : [0x46, 0x6E, 0x7C], 0x4C : [0x4A, 0x64, 0x53], 0x4D : [0x5C, 0x74, 0x48], 0x4E : [0x48, 0x67, 0x31], 0x4F : [0x36, 0x7D, 0x72], 0x50 : [0x4B, 0x68, 0x6E], 0x51 : [0x7D, 0x35, 0x70], 0x52 : [0x5D, 0x22, 0x49], 0x53 : [0x6A, 0x55, 0x3F], 0x54 : [0x50, 0x3A, 0x4B], 0x55 : [0x69, 0x60, 0x6A], 0x56 : [0x23, 0x6A, 0x2E], 0x57 : [0x09, 0x71, 0x7F], 0x58 : [0x70, 0x6F, 0x28], 0x59 : [0x65, 0x49, 0x35], 0x5A : [0x74, 0x5C, 0x7D], 0x5B : [0x2C, 0x5D, 0x24], 0x5C : [0x77, 0x27, 0x2D], 0x5D : [0x44, 0x59, 0x54], 0x5E : [0x3F, 0x25, 0x37], 0x5F : [0x6D, 0x7C, 0x7B], 0x60 : [0x7C, 0x23, 0x3D], 0x61 : [0x43, 0x6D, 0x6C], 0x62 : [0x38, 0x28, 0x34], 0x63 : [0x5E, 0x31, 0x6D], 0x64 : [0x5B, 0x39, 0x4E], 0x65 : [0x6E, 0x7F, 0x2B], 0x66 : [0x57, 0x36, 0x30], 0x67 : [0x4C, 0x54, 0x6F], 0x68 : [0x34, 0x34, 0x74], 0x69 : [0x72, 0x62, 0x6B], 0x6A : [0x25, 0x4E, 0x4C], 0x6B : [0x56, 0x30, 0x33], 0x6C : [0x73, 0x5E, 0x56], 0x6D : [0x68, 0x73, 0x3A], 0x6E : [0x55, 0x09, 0x78], 0x6F : [0x47, 0x4B, 0x57], 0x70 : [0x32, 0x61, 0x77], 0x71 : [0x35, 0x24, 0x3B], 0x72 : [0x2E, 0x4D, 0x44], 0x73 : [0x64, 0x6B, 0x2F], 0x74 : [0x4F, 0x44, 0x59], 0x75 : [0x3B, 0x21, 0x45], 0x76 : [0x2D, 0x37, 0x5C], 0x77 : [0x41, 0x53, 0x68], 0x78 : [0x61, 0x58, 0x36], 0x79 : [0x7A, 0x48, 0x58], 0x7A : [0x22, 0x2E, 0x79], 0x7B : [0x60, 0x50, 0x09], 0x7C : [0x6B, 0x2D, 0x75], 0x7D : [0x4E, 0x29, 0x38], 0x7E : [0x3D, 0x3F, 0x55], 0x7F : [0x67, 0x2F, 0x51] } _ESCAPE = { 0x40: B'@$', 0x3C: B'@!', 0x3E: B'@*', 0x0D: B'@#', 0x0A: B'@&', } _UNESCAPE = { B'@$': B'@', B'@!': B'<', B'@*': B'>', B'@#': B'\r', B'@&': B'\n', } def __init__( self, marker: Arg.Switch('-m', '--no-marker', off=True, help=( 'Do not require magic marker when encoding and do not search for ' 'marker when decoding.') ) = True ): super().__init__(marker=marker) @classmethod def _chunk(cls, byte, index): k = byte - 9 c = cls._CHUNKS[k * 3 : k * 3 + 3] return c[cls._OFFSETS[index % 64]] def _escape(self, iterable): escapes = bytes(self._ESCAPE) if self.args.marker: yield from self._MARKER_INIT for byte in iterable: if byte in escapes: yield from self._ESCAPE[byte] else: yield byte if self.args.marker: yield from self._MARKER_STOP def _unescape(self, data): def unescaper(m): return self._UNESCAPE[m[0]] return re.sub(RB'@[$!*#&]', unescaper, data) @classmethod def _decoded(cls, data): index = -1 for byte in data: if byte < 128: index += 1 if (byte == 9 or 31 < byte < 128) and byte != 60 and byte != 62 and byte != 64: byte = cls._chunk(byte, index) yield byte @classmethod def _encoded(cls, data): for i, byte in enumerate(data): try: sequence = cls._ENCODER[byte] except KeyError: yield byte else: offset = cls._OFFSETS[i % 0x40] yield sequence[offset] def reverse(self, data): return bytearray(self._escape(self._encoded(data))) def process(self, data): if self.args.marker: match = formats.wshenc.search(data) if not 
match: raise ValueError('Encoded script marker was not found.') data = match[0][12:-12] return bytearray(self._decoded(self._unescape(data)))
class xchacha (key, stateful=False, discard=0, nonce=b'REFINERY', magic=b'', offset=0, rounds=20)
-
XChaCha encryption and decryption. The nonce must be 24 bytes long.
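As a sketch, assuming keyword arguments are accepted as given in the signature above; the key and nonce values are placeholders, with the nonce being the required 24 bytes:
>>> from refinery.shell import *
>>> open('encrypted.bin', 'rb').read() | xchacha('h:' + 'AA' * 32, nonce='h:' + 'BB' * 24) | bytes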
Expand source code Browse git
class xchacha(LatinCipherUnit): """ XChaCha encryption and decryption. The nonce must be 24 bytes long. """ def keystream(self) -> Iterable[int]: kdp, kdn, nonce = struct.unpack('<Q8s8s', self.args.nonce) yield from LatinX( ChaChaCipher, (0, 1, 2, 3, 12, 13, 14, 15), self.args.key, kdn, kdp, nonce, self.args.magic, self.args.rounds, self.args.offset, )
class xfcc (variable='count', relative=False)
-
The cross frame chunk count unit! It computes the number of times a chunk occurs across several frames of input. It consumes all frames in the current frame tree and counts the number of times each item occurs. It converts a frame tree of depth 2 into a new frame tree of depth 2 where the parent of every leaf has this leaf as its only child. The leaves of this tree are enriched with a meta variable containing the number of times the corresponding chunk occurred in the input frame tree.
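For example, a sketch that counts how often each 2-byte block occurs across two inputs; chop is used to create the required nesting depth, assuming nested framing works in shell mode as it does on the command line. Each output chunk then carries the count meta variable:
>>> from refinery.shell import *
>>> emit('ABAB', 'ABCD') [ chop(2) [ xfcc() ] ] | str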
Expand source code Browse git
class xfcc(Unit): """ The cross frame chunk count unit! It computes the number of times a chunk occurs across several frames of input. It consumes all frames in the current and counts the number of times each item occurs. It converts a frame tree of depth 2 into a new frame tree of depth 2 where the parent of every leaf has this leaf as its only child. The leaves of this tree have been enriched with a meta variable containing the number of times the corresponding chunk has occurred in the input frame tree. """ def __init__( self, variable: Arg(help='The variable which is used as the accumulator') = 'count', relative: Arg.Switch('-r', help='Normalize the accumulator to a number between 0 and 1.') = False ): super().__init__(variable=variable, relative=relative) self._trunk = None self._store: Dict[Chunk, int] = defaultdict(int) def finish(self): vn = self.args.variable rc = self.args.relative if rc and self._store: maximum = max(self._store.values()) for index, (chunk, count) in enumerate(self._store.items()): if rc: count /= maximum chunk.path[-2] = 0 chunk.path[-1] = index chunk.meta[vn] = count yield chunk self._store.clear() def _getcount(self, chunk): try: count = int(chunk.meta[self.args.variable]) except (AttributeError, KeyError, TypeError): return 1 else: return count def filter(self, chunks: Iterable[Chunk]): it = iter(chunks) try: head = next(it) except StopIteration: return if len(head.path) < 2: self.log_warn(F'the current frame is nested {len(head.path)} layers deep, at least two layers are required.') yield head yield from it return trunk = head.path[:-2] store = self._store if trunk != self._trunk: yield from self.finish() self._trunk = trunk store[head] += self._getcount(head) for chunk in it: store[chunk] += self._getcount(chunk)
class xj0 (key=None, raw=False, all=False)
-
Extracts a single field from a JSON document at depth 0. By default, the unit applies a heuristic to extract the remaining fields as metadata: String values are extracted only if they do not exceed 80 characters in length and do not contain any line breaks. Floating-point, integer, and boolean values are also extracted, as are lists containing only such values.
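For example, extracting a base64-encoded field (the field names are illustrative):
>>> from refinery.shell import *
>>> emit('{"data": "aGk=", "note": "hi there"}') | xj0('data') | b64() | str
'hi'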
Expand source code Browse git
class xj0(Unit): """ Extracts a single field from a JSON document at depth 0. By default, the unit applies a heuristic to extract remaining fields as metadata: String values are extracted only if they do not exceed 80 characters in length and do not contain any line breaks. Floating-point, integer, boolean values, and lists of the latter are also extracted. """ def __init__( self, key: Unit.Arg.Binary(help='Optional key of a value to become the main body of the chunk.') = None, raw: Unit.Arg.Switch('-r', group='META', help='Do not extract any other fields as metadata.') = False, all: Unit.Arg.Switch('-a', group='META', help='Extract all other fields as metadata.') = False ): super().__init__(key=key, raw=raw, all=all) def process(self, data): def acceptable(key, value, inside_list=False): if not is_valid_variable_name(key): return False if isinstance(value, dict): return False if isinstance(value, (float, int, bool)): return True if inside_list: return False if isinstance(value, list): return all(acceptable(key, t, True) for t in value) if isinstance(value, str): if self.args.all: return True return len(value) in range(1, 80) and '\n' not in value doc: dict = json.loads(data) if not isinstance(doc, dict): raise ValueError('The input must be a JSON dictionary.') key = self.args.key result = key and doc.pop(key.decode(self.codec), '').encode(self.codec) if self.args.raw: return result else: return self.labelled(result, **{ key: value for key, value in doc.items() if acceptable(key, value) })
class xjl
-
Returns all JSON elements from a JSON iterable as individual outputs.
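For example, splitting a JSON list into individual chunks (a sketch; each element becomes a separate output):
>>> from refinery.shell import *
>>> emit('[{"a": 1}, {"b": 2}]') | xjl() | str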
Expand source code Browse git
class xjl(Unit): """ Returns all JSON elements from a JSON iterable as individual outputs. """ def process(self, data): try: doc: Union[list, dict] = json.loads(data) except Exception: from refinery.units.pattern.carve_json import carve_json doc = data | carve_json | json.loads try: it = doc.values() except AttributeError: it = doc for item in it: yield json.dumps(item, indent=4).encode(self.codec)
class xkey (range=slice(1, 32, None))
-
The unit expects input that was encrypted byte-wise with a polyalphabetic key, where the plaintext has one letter that occurs with overwhelming frequency. This is often the case for the zero byte in binary formats such as PE files, and for the space character in text files. Based on this assumption, the unit computes the most likely key. This can be useful to decrypt PE files and uncompressed text files that were encrypted byte-wise using a short key.
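For example, a hypothetical sketch that computes the likely key of a byte-wise encrypted sample (the file name is a placeholder; for text inputs, the guess reflects the dominant plaintext byte rather than the key itself):
>>> from refinery.shell import *
>>> open('encrypted.bin', 'rb').read() | xkey() | bytes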
Expand source code Browse git
class xkey(Unit): """ The unit expects encrypted input which was encrypted byte-wise with a polyalphabetic key, and where the plaintext also has one letter that occurs with overwhelming frequency. This is often the case for the zero byte in binary formats such as PE files, and the space character in text files. Based on this assumption, the unit computes the most likely key. This can be useful to decrypt PE and uncompressed text files that were encrypted byte-wise using a short key. """ def __init__( self, range: Arg.Bounds(help='range of length values to try in Python slice syntax, the default is {default}.') = slice(1, 32), ): super().__init__(range=range) def process(self, data: bytearray): score = 0 guess = None bounds: slice = self.args.range view = memoryview(data) n = len(view) if n <= 1: return view start = bounds.start or 1 stop = min(bounds.stop or n, n) if bounds.step is not None: step = bounds.step if bounds.start is None: start *= step else: step = 1 self.log_debug(F'received input range [{bounds.start}:{bounds.stop}:{bounds.step}], using [{start}:{stop}:{step}]') for _count in range(start, stop + 1, step): _guess = [Counter(view[j::_count]).most_common(1)[0] for j in range(_count)] _score = sum(letter_count for _, letter_count in _guess) / n # This scaling accounts for the smaller probability of larger keys. No proper statistical analysis has been # conducted to derive it; there might be plenty of room for improvement here. _score = _score * ((n - _count) / (n - 1)) ** _count logmsg = F'got score {_score * 100:5.2f}% for length {_count}' if _score > score: self.log_info(logmsg) score = _score guess = bytearray(value for value, _ in _guess) else: self.log_debug(logmsg) return guess
class xlmdeobf (extract_only=False, sort_formulas=False, with_ms_excel=False, day=-1, output_formula_format='CELL:[[CELL-ADDR]], [[STATUS]], [[INT-FORMULA]]', extract_formula_format='CELL:[[CELL-ADDR]], [[CELL-FORMULA]], [[CELL-VALUE]]', no_indent=False, start_point='', password='', output_level=0, timeout=0)
-
Wrapper around XLMMacroDeobfuscator to decode obfuscated Excel v4.0 (XLM) macros.
Expand source code Browse git
class xlmdeobf(Unit): """ Wrapper around XLMMacroDeobfuscator to decode obfuscated Excel v4.0 (XLM) macros. """ def __init__( self, extract_only: Unit.Arg.Switch( '-x', help='Only extract cells without any emulation.' ) = False, sort_formulas: Unit.Arg.Switch( '-s', '--sort-formulas', help='Sort extracted formulas based on their cell address (implies -x).', ) = False, with_ms_excel: Unit.Arg.Switch( '-X', '--with-ms-excel', help='Use MS Excel to process XLS files.' ) = False, day: Unit.Arg.Number( '-d', '--day', help='Specify the day of month', ) = -1, output_formula_format: Unit.Arg( '-O', '--output-format', type=str, metavar='FMT', help='Specify the format for output formulas (using [[CELL-ADDR]], [[INT-FORMULA]], and [[STATUS]])', ) = 'CELL:[[CELL-ADDR]], [[STATUS]], [[INT-FORMULA]]', extract_formula_format: Unit.Arg( '-E', '--extract-format', metavar='FMT', type=str, help='Specify the format for extracted formulas (using [[CELL-ADDR]], [[CELL-FORMULA]], and [[CELL-VALUE]])', ) = 'CELL:[[CELL-ADDR]], [[CELL-FORMULA]], [[CELL-VALUE]]', no_indent: Unit.Arg.Switch( '-I', '--no-indent', help='Do not show indent before formulas', ) = False, start_point: Unit.Arg( '-c', '--start-point', type=str, help='Start interpretation from a specific cell address', metavar='CELL', ) = '', password: Unit.Arg( '-p', '--password', type=str, help='Password to decrypt the protected document', ) = '', output_level: Unit.Arg.Number( '-o', '--output-level', help=( 'Set the level of details to be shown (0:all commands, 1: commands no jump 2:important ' 'commands 3:strings in important commands).' ), ) = 0, timeout: Unit.Arg.Number( '-t', '--timeout', help='Stop emulation after N seconds (0: not interruption N>0: stop emulation after N seconds)', ) = 0, ): extract_only = sort_formulas or extract_only self.superinit(super(), **vars()) @Unit.Requires('XLMMacroDeobfuscator', 'formats', 'office') def _process_file(): from XLMMacroDeobfuscator.configs import settings settings.SILENT = True from XLMMacroDeobfuscator.deobfuscator import process_file return process_file def process(self, data: bytearray): with VirtualFileSystem() as vfs, NoLogging(): result = self._process_file( file=vfs.new(data), noninteractive=True, return_deobfuscated=True, extract_only=self.args.extract_only, silent=True, sort_formulas=self.args.sort_formulas, defined_names=False, with_ms_excel=self.args.with_ms_excel, start_with_shell=False, day=self.args.day, output_formula_format=self.args.output_formula_format, extract_formula_format=self.args.extract_formula_format, no_indent=self.args.no_indent, start_point=self.args.start_point, password=self.args.password, output_level=self.args.output_level, timeout=self.args.timeout, ) return '\n'.join(result).encode(self.codec)
class xlxtr (*references)
-
Extract data from Microsoft Excel documents, both Legacy and new XML type documents. A sheet reference is of the form `B1` or `1.2`, both specifying the first cell of the second column. A cell range can be specified as `B1:C12`, `1.2:C12`, or `1.2:12.3`. The unit will always refer to the first sheet in the document; to change this, specify the sheet name or index separated by a hashtag, i.e. `sheet#B1:C12` or `1#B1:C12`. Note that indices are 1-based. To get all elements of one sheet, use `sheet#`. If parsing a sheet reference fails, the unit will assume that the given reference specifies a sheet.
Expand source code Browse git
class xlxtr(_ExcelUnit): """ Extract data from Microsoft Excel documents, both Legacy and new XML type documents. A sheet reference is of the form `B1` or `1.2`, both specifying the first cell of the second column. A cell range can be specified as `B1:C12`, or `1.2:C12`, or `1.2:12.3`. Finally, the unit will always refer to the first sheet in the document and to change this, specify the sheet name or index separated by a hashtag, i.e. `sheet#B1:C12` or `1#B1:C12`. Note that indices are 1-based. To get all elements of one sheet, use `sheet#`. The unit If parsing a sheet reference fails, the script will assume that the given reference specifies a sheet. """ def __init__(self, *references: Arg(metavar='reference', type=SheetReference, help=( 'A sheet reference to be extracted. ' 'If no sheet references are given, the unit lists all sheet names.' ))): if not references: references = [SheetReference('*')] super().__init__(references=references) def process(self, data): wb = Workbook(data, self) for ref in self.args.references: ref: SheetReference for k, name in enumerate(wb.sheets()): if not ref.match(k, name): continue for r, row in enumerate(wb.get_sheet_data(name), 1): for c, value in enumerate(row, 1): if (r, c) not in ref: continue if value is None: continue yield self.labelled( str(value).encode(self.codec), row=r, col=c, ref=_rc2ref(r, c), sheet=name )
class xor (argument, bigendian=False, blocksize=None)
-
Form the exclusive or of the input data with the given argument.
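For example, using the multibin h: handler to pass the key as a hex byte that is cycled over the input:
>>> from refinery.shell import *
>>> B'hello' | xor('h:20') | str
'HELLO'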
Expand source code Browse git
class xor(BinaryOperationWithAutoBlockAdjustment): """ Form the exclusive or of the input data with the given argument. """ @staticmethod def operate(a, b): return a ^ b @staticmethod def inplace(a, b): a ^= b def _fastblock(self, data): try: return super()._fastblock(data) except FastBlockError as E: try: from Cryptodome.Util.strxor import strxor except ModuleNotFoundError: raise E else: from itertools import islice size = len(data) arg0 = self._normalize_argument(*self._argument_parse_hook(self.args.argument[0])) take = len(data) // self.blocksize + 1 argb = self.unchunk(islice(arg0, take)) del argb[size:] return strxor(data, argb)
class xsalsa (key, stateful=False, discard=0, nonce=b'REFINERY', magic=b'', offset=0, rounds=20)
-
XSalsa encryption and decryption. The nonce must be 24 bytes long.
Expand source code Browse git
class xsalsa(LatinCipherUnit): """ XSalsa encryption and decryption. The nonce must be 24 bytes long. """ def keystream(self) -> Iterable[int]: kdn, kdp, nonce = struct.unpack('<8sQ8s', self.args.nonce) yield from LatinX( SalsaCipher, (0, 5, 10, 15, 6, 7, 8, 9), self.args.key, kdn, kdp, nonce, self.args.magic, self.args.rounds, self.args.offset, )
class xt (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
Extract files from archives. The unit tries to identify the archive format and use the correct extractor.
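For example, a hypothetical invocation that extracts one file from an arbitrary archive (the names are placeholders):
>>> from refinery.shell import *
>>> open('archive.bin', 'rb').read() | xt('setup.exe') | bytes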
Expand source code Browse git
class xt(ArchiveUnit): """ Extract files from archives. The unit tries to identify the archive format and use the correct extractor. """ @classmethod def handles(cls, data: bytearray) -> Optional[bool]: out = False for engine in cls.handlers(): engine_verdict = engine.handles(data) if engine_verdict is True: return True if engine_verdict is None: out = None return out @staticmethod def handlers(): """ Returns all archive handlers supported by the unit. """ from refinery.units.formats.office.xtone import xtone yield xtone from refinery.units.formats.archive.xtgz import xtgz yield xtgz from refinery.units.formats.email import xtmail yield xtmail from refinery.units.formats.pdf import xtpdf yield xtpdf from refinery.units.formats.archive.xtasar import xtasar yield xtasar from refinery.units.formats.office.xtrtf import xtrtf yield xtrtf from refinery.units.formats.archive.xtzpaq import xtzpaq yield xtzpaq from refinery.units.formats.pe.dotnet.dnsfx import dnsfx yield dnsfx from refinery.units.formats.archive.xtnsis import xtnsis yield xtnsis from refinery.units.formats.archive.xtnode import xtnode yield xtnode from refinery.units.formats.archive.xtace import xtace yield xtace from refinery.units.formats.archive.xtcab import xtcab yield xtcab from refinery.units.formats.archive.xtcpio import xtcpio yield xtcpio from refinery.units.formats.archive.xtiso import xtiso yield xtiso from refinery.units.formats.archive.xtpyi import xtpyi yield xtpyi from refinery.units.formats.archive.xttar import xttar yield xttar from refinery.units.formats.archive.xtiss import xtiss yield xtiss from refinery.units.formats.archive.xtzip import xtzip yield xtzip from refinery.units.formats.archive.xt7z import xt7z yield xt7z from refinery.units.formats.msi import xtmsi yield xtmsi from refinery.units.formats.archive.xtmacho import xtmacho yield xtmacho from refinery.units.formats.archive.xtnuitka import xtnuitka yield xtnuitka from refinery.units.formats.office.xtdoc import xtdoc yield xtdoc from refinery.units.formats.json import xtjson yield xtjson from refinery.units.formats.exe.vsect import vsect yield vsect def unpack(self, data): fallback: List[Type[ArchiveUnit]] = [] errors = {} pos_args = self.args.paths key_args = dict( list=self.args.list, path=self.args.path, date=self.args.date, join_path=self.args.join, drop_path=self.args.drop, ) if self.args.pwd: key_args.update(pwd=self.args.pwd) if self.args.regex: key_args.update(regex=self.args.regex) class unpacker: unit = self def __init__(self, handler: Type[ArchiveUnit], fallback: bool): self.success = False self.handler = handler self.fallback = fallback def __iter__(self): handler = self.handler if self.fallback: verdict = True else: verdict = handler.handles(data) if verdict is False: self.unit.log_info(F'rejected: {handler.name}') elif verdict is True: if not self.fallback: self.unit.log_info(F'accepted: {handler.name}') try: unit = handler(*pos_args, **key_args) unit.args.lenient = self.unit.args.lenient unit.args.quiet = self.unit.args.quiet except TypeError as error: self.unit.log_debug('handler construction failed:', error) return try: for item in unit.unpack(data): item.get_data() yield item except Exception as error: if not self.fallback: errors[handler.name] = error if isinstance(error, MultipleArchives): self.unit.log_warn(error) else: self.unit.log_debug('handler unpacking failed:', error) else: self.success = True elif verdict is None: fallback.append(handler) for handler in self.handlers(): self._custom_path_separator = 
handler._custom_path_separator it = unpacker(handler, fallback=False) yield from it if it.success: return self.log_debug('fallback order:', lambda: ', '.join(h.name for h in fallback)) for handler in fallback: it = unpacker(handler, fallback=True) yield from it if it.success: return if not errors: raise ValueError('input data did not match any known archive format') for name, error in errors.items(): self.log_info(F'error when trying to unpack with {name}:', error) raise RefineryException('none of the available unpackers could handle this data')
class xt7z (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
Extract files from a 7zip archive.
Expand source code Browse git
class xt7z(ArchiveUnit): """ Extract files from a 7zip archive. """ @ArchiveUnit.Requires('py7zr', 'arc', 'default', 'extended') def _py7zr(): import py7zr import py7zr.exceptions return py7zr def unpack(self, data: bytearray): for match in re.finditer(re.escape(B'7z\xBC\xAF\x27\x1C'), data): start = match.start() if start != 0: self.log_info(F'found a header at offset 0x{start:X}, trying to extract from there.') try: yield from self._unpack_from(data, start) except self._py7zr.Bad7zFile: continue else: break def _unpack_from(self, data: bytearray, zp: int = 0): def mk7z(**keywords): return self._py7zr.SevenZipFile(MemoryFile(mv[zp:]), **keywords) pwd = self.args.pwd mv = memoryview(data) if pwd: try: archive = mk7z(password=pwd.decode(self.codec)) except self._py7zr.Bad7zFile: raise ValueError('corrupt archive; the password is likely invalid.') else: def passwords(): yield None yield from self._COMMON_PASSWORDS for pwd in passwords(): try: archive = mk7z(password=pwd) problem = archive.testzip() except self._py7zr.PasswordRequired: problem = True except self._py7zr.UnsupportedCompressionMethodError as E: raise ValueError(E.message) except self._py7zr.exceptions.InternalError: # ignore internal errors during testzip break except SystemError: problem = True except Exception: if pwd is None: raise problem = True if not problem: break if pwd is not None: self.log_debug(F'trying password: {pwd}') else: raise ValueError('a password is required and none of the default passwords worked.') for info in archive.list(): def extract(archive: SevenZipFile = archive, info: FileInfo = info): archive.reset() return archive.read([info.filename]).get(info.filename).read() if info.is_directory: continue yield self._pack(info.filename, info.creationtime, extract, crc32=info.crc32) @classmethod def handles(cls, data: bytearray) -> bool: return B'7z\xBC\xAF\x27\x1C' in data
class xtace (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
Extract files from an ACE archive.
Expand source code Browse git
class xtace(ArchiveUnit): """ Extract files from an ACE archive. """ def unpack(self, data): ace = acefile.open(MemoryFile(data, read_as_bytes=True)) for member in ace.getmembers(): member: acefile.AceMember comment = {} if not member.comment else {'comment': member.comment} yield self._pack( member.filename, member.datetime, lambda a=ace, m=member: a.read(m, pwd=self.args.pwd), **comment ) @classmethod def handles(cls, data: bytearray) -> bool: return b'**ACE**' in data[:0x100]
class xtasar (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
Extract files from an ASAR archive.
Expand source code Browse git
class xtasar(ArchiveUnit): """ Extract files from a ASAR archive. """ def unpack(self, data: bytearray): def _unpack(dir: JSONDict, *path): for name, listing in dir.get('files', {}).items(): yield from _unpack(listing, *path, name) try: offset = dir['offset'] size = dir['size'] except KeyError: return try: offset = int(offset) + header.base end = int(size) + offset except TypeError: self.log_warn(F'unable to convert offset "{offset}" and size "{size}" to integers') return if not path: self.log_warn(F'not processing item at root with offset {offset} and size {size}') return yield UnpackResult( '/'.join(path), lambda a=offset, b=end: data[a:b], offset=offset ) header = AsarHeader(data) self.log_debug(F'header read successfully, base offset is {header.base}.') yield from _unpack(header.directory) @classmethod def handles(cls, data: bytearray) -> Optional[bool]: return data.startswith(b'\04\0\0\0') and data[0x10:0x18] == B'{"files"'
class xtcab (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
Extract files from CAB (cabinet) archives.
Expand source code Browse git
class xtcab(ArchiveUnit): """ Extract files from CAB (cabinet) archives. """ @ArchiveUnit.Requires('cabarchive', 'arc', 'default', 'extended') def _cabarchive(): import cabarchive return cabarchive def unpack(self, data: bytearray): arc = self._cabarchive.CabArchive(data) for item in arc.find_files('*'): yield self._pack(item.filename, datetime.combine(item.date, item.time), item.buf) @classmethod def handles(cls, data: bytearray): return data.startswith(B'MSCF')
class xtcpio (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
Extract files from a CPIO archive.
Expand source code Browse git
class xtcpio(ArchiveUnit): """ Extract files from a CPIO archive. """ def unpack(self, data): def cpio(): with suppress(EOF): return CPIOEntry(reader) reader = StructReader(memoryview(data)) for entry in iter(cpio, None): if entry.name == 'TRAILER!!!': break yield self._pack(entry.name, entry.mtime, entry.data) @classmethod def handles(cls, data: bytearray) -> bool: for signature in (B'\x71\xC7', B'\xC7\x71', B'0707'): if data.startswith(signature): if B'TRAILER!!' in data: return True else: return None return False
class xtdoc (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
Extract files from an OLE document such as a Microsoft Word DOCX file.
Expand source code Browse git
class xtdoc(PathExtractorUnit): """ Extract files from an OLE document such as a Microsoft Word DOCX file. """ @PathExtractorUnit.Requires('olefile', 'formats', 'office', 'extended') def _olefile(): import olefile return olefile def unpack(self, data): with MemoryFile(data) as stream: try: oledoc = self._olefile.OleFileIO(stream) except OSError as error: self.log_info(F'error, {error}, treating input as zip file') yield from xtzip().unpack(data) return for item in oledoc.listdir(): if not item or not item[-1]: continue path = '/'.join(item) olestream = oledoc.openstream(path) c0 = ord(item[-1][:1]) if c0 < 20: item[-1] = F'[{c0:d}]{item[-1][1:]}' path = '/'.join(item) path = convert_msi_name(path) self.log_debug('exploring:', path) yield UnpackResult(path, olestream.read()) @classmethod def handles(self, data: bytearray) -> Optional[bool]: if data.startswith(B'\xD0\xCF\x11\xE0'): return True if xtzip.handles(data): return sum(1 for marker in [ B'[Content_Types].xml', B'word/document.xml', B'docProps/core.xml', ] if marker in data) >= 2
class xtea (key, iv=b'', padding=None, mode=None, raw=False, swap=False)
-
XTEA encryption and decryption.
Expand source code Browse git
class xtea(TEAUnit, cipher=BlockCipherFactory(XTEA)): """ XTEA encryption and decryption. """ pass
class xtgz (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
Extract a file from a GZip archive.
Expand source code Browse git
class xtgz(ArchiveUnit): """ Extract a file from a GZip archive. """ def unpack(self, data: bytearray): archive = GzipHeader(data) path = archive.name date = archive.mtime date = date and datetime.fromtimestamp(date) or None if path is None: try: meta = metavars(data) path = Path(meta['path']) except KeyError: path = 'ungz' else: self.log_warn(path) suffix = path.suffix if suffix.lower() == '.gz': path = path.with_suffix('') else: path = path.with_suffix(F'{suffix}.ungz') path = path.as_posix() yield self._pack(path, date, archive.data) @classmethod def handles(cls, data: bytearray) -> bool: return data.startswith(B'\x1F\x8B')
class xthtml (*paths, outer=False, attributes=False, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
The unit processes an HTML document and extracts the contents of all elements in the DOM that match the given tag paths. The main purpose is to extract scripts from HTML documents.
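For example, a sketch that extracts script elements from a document (the file name and path pattern are illustrative):
>>> from refinery.shell import *
>>> open('page.html', 'rb').read() | xthtml('*/script') | str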
Expand source code Browse git
class xthtml(XMLToPathExtractorUnit): """ The unit processes an HTML document and extracts the contents of all elemnts in the DOM of the given tag. The main purpose is to extract scripts from HTML documents. """ def __init__( self, *paths, outer: Arg.Switch('-o', help='Include the HTML tags for an extracted element.') = False, attributes: Arg.Switch('-a', help='Populate chunk metadata with HTML tag attributes.') = False, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path' ): super().__init__( *paths, outer=outer, attributes=attributes, format='{tag}', path=path, list=list, join_path=join_path, drop_path=drop_path, fuzzy=fuzzy, exact=exact, regex=regex, ) def unpack(self, data): html = HTMLTreeParser() html.feed(data.decode(self.codec)) root = html.tos root.reindex() meta = metavars(data) path = self._make_path_builder(meta, root) while root.parent: self.log_info(F'tag was not closed: {root.tag}') root = root.parent while len(root.children) == 1: child, = root.children if child.tag != root.tag: break root = child def tree(root: HTMLNode, *parts: str): def outer(root: HTMLNode = root): return root.recover(inner=False).encode(self.codec) def inner(root: HTMLNode = root): return root.recover().encode(self.codec) tagpath = '/'.join(parts) meta = {} if self.args.attributes: meta.update(root.attributes) if root.root: yield UnpackResult(tagpath, inner, **meta) elif self.args.outer: yield UnpackResult(tagpath, outer, **meta) else: yield UnpackResult(tagpath, inner, **meta) for child in root.children: if child.textual: continue yield from tree(child, *parts, path(child)) yield from tree(root, path(root)) @classmethod def handles(self, data: bytearray): from refinery.lib import mime info = mime.get_cached_file_magic_info(data) if info.extension == 'html': return True if info.mime.endswith('html'): return True return False
class xtiso (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', fs='auto')
-
Extract files from an ISO archive.
Expand source code Browse git
class xtiso(ArchiveUnit): """ Extract files from a ISO archive. """ def __init__( self, *paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', fs: Arg.Choice('-s', metavar='TYPE', choices=_ISO_FILE_SYSTEMS, help=( 'Specify a file system ({choices}) extension to use. The default setting {default} will automatically ' 'detect the first of the other available options and use it.')) = 'auto' ): if fs not in _ISO_FILE_SYSTEMS: raise ValueError(F'invalid file system {fs}: must be udf, joliet, rr, iso, or auto.') super().__init__( *paths, list=list, join_path=join_path, drop_path=drop_path, fuzzy=fuzzy, exact=exact, regex=regex, path=path, date=date, fs=fs ) @ArchiveUnit.Requires('pycdlib', 'arc', 'default', 'extended') def _pycdlib(): import pycdlib import pycdlib.dates def fixed_parse(self, datestr): datestr = datestr[:-3] + b'00\0' return original_parse(self, datestr) original_parse = pycdlib.dates.VolumeDescriptorDate.parse pycdlib.dates.VolumeDescriptorDate.parse = fixed_parse return pycdlib @staticmethod def _strip_revision(name: str): base, split, revision = name.partition(';') return base if split and revision.isdigit() else name def unpack(self, data): if not self.handles(data): self.log_warn('The data does not look like an ISO file.') with MemoryFile(data, read_as_bytes=True) as stream: iso = self._pycdlib.PyCdlib() iso.open_fp(stream) fs = self.args.fs if fs != 'auto': mkfacade = { 'iso' : iso.get_iso9660_facade, 'udf' : iso.get_udf_facade, 'joliet' : iso.get_joliet_facade, 'rr' : iso.get_rock_ridge_facade, } facade = mkfacade[fs]() elif iso.has_udf(): self.log_info('using format: udf') facade = iso.get_udf_facade() elif iso.has_joliet(): self.log_info('using format: joliet') facade = iso.get_joliet_facade() elif iso.has_rock_ridge(): self.log_info('using format: rr') facade = iso.get_rock_ridge_facade() else: self.log_info('using format: iso') facade = iso.get_iso9660_facade() for root, _, files in facade.walk('/'): root = root.rstrip('/') for name in files: name = name.lstrip('/') path = F'{root}/{name}' try: info = facade.get_record(path) date = info.date except Exception: info = None date = None else: date = datetime.datetime( date.years_since_1900 + 1900, date.month, date.day_of_month, date.hour, date.minute, date.second, tzinfo=datetime.timezone(datetime.timedelta(minutes=15 * date.gmtoffset)) ) def extract(info=info, path=path): if info: buffer = MemoryFile(bytearray(info.data_length)) else: buffer = MemoryFile(bytearray()) facade.get_file_from_iso_fp(buffer, path) return buffer.getvalue() yield self._pack(self._strip_revision(path), date, extract) @classmethod def handles(cls, data: bytearray) -> bool: return any(data[k] == B'CD001' for k in ( slice(0x8001, 0x8006), slice(0x8801, 0x8806), slice(0x9001, 0x9006), ))
class xtiss (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
Extracts files from InstallShield setup files.
Expand source code Browse git
class xtiss(ArchiveUnit): """ Extracts files from Install Shield Setup files. """ def unpack(self, data: bytearray): offset = max(data.rfind(magic) for magic in ISSReader.MAGIC) if offset < 0: raise ValueError('ISS magic not found.') data[:offset] = [] reader = ISSReader(data) count = reader.iss_archive_header() self.log_info(F'archive contains {count} files according to header') for _ in range(count): name, data = reader.iss_file() yield self._pack(name, None, data) @classmethod def handles(cls, data: bytearray) -> Optional[bool]: return data.startswith(B'MZ') and any(data.find(m) > 0 for m in ISSReader.MAGIC)
class xtjson (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
Extract values from a JSON document.
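For example, a sketch that extracts a single nested value (the path pattern is illustrative):
>>> from refinery.shell import *
>>> emit('{"a": {"b": [1, 2, 3]}}') | xtjson('a/b/2') | str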
Expand source code Browse git
class xtjson(PathExtractorUnit): """ Extract values from a JSON document. """ _custom_path_separator = '.' def unpack(self, data): def crawl(path, cursor): if isinstance(cursor, dict): for key, value in cursor.items(): yield from crawl(F'{path}/{key}', value) elif isinstance(cursor, list): for key, value in enumerate(cursor): yield from crawl(F'{path}/{key:d}', value) if path: yield path, cursor, cursor.__class__.__name__ for path, item, typename in crawl('', json.loads(data)): def extract(item=item): if isinstance(item, (list, dict)): dumped = json.dumps(item, indent=4) else: dumped = str(item) return dumped.encode(self.codec) yield UnpackResult(path, extract, type=typename) @classmethod def handles(self, data: bytearray) -> Optional[bool]: return bool(checks.json.fullmatch(data))
class xtmacho (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
Extract the individual executables from a MachO universal binary (sometimes called a MachO fat file).
Expand source code Browse git
class xtmacho(ArchiveUnit): """ Extract the individual executables from a MachO universal binary (sometimes called a MachO fat file). """ _SIGNATURE_BE = B'\xCA\xFE\xBA\xBE' _SIGNATURE_LE = B'\xBE\xBA\xFE\xCA' def unpack(self, data: bytearray): view = memoryview(data) signature = bytes(view[:4]) try: reader = StructReader(view, bigendian={ self._SIGNATURE_BE: True, self._SIGNATURE_LE: False, }[signature]) except KeyError as KE: raise ValueError('Not a MachO universal binary; invalid magic header bytes.') from KE else: reader.seekset(4) count = reader.u32() self.log_info(F'reading {count} embedded executables') while count > 0: fa = FatArch(reader) self.log_info(F'reading item of size 0x{len(fa.data):08X}, arch {fa.cputype.name}') yield self._pack(fa.cputype.name, None, fa.data) count -= 1 @classmethod def handles(cls, data: bytearray): return data[:4] in ( cls._SIGNATURE_BE, cls._SIGNATURE_LE, )
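For orientation, a sketch that splits a universal binary into one output per architecture; fat.bin is a hypothetical input file. As the unpack method above shows, each output is labeled with the CPU type name:
>>> from refinery.shell import *
>>> data = open('fat.bin', 'rb').read()  # hypothetical universal binary
>>> for arch in data | xtmacho(list=True):
...     print(arch)  # one CPU type name per embedded executable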
class xtmagtape
-
Extract files from SIMH magtape files.
Expand source code Browse git
class xtmagtape(Unit): """ Extract files from SIMH magtape files. """ def process(self, data: bytearray): reader = StructReader(data) for r in itertools.count(): buffer = MemoryFile() for k in itertools.count(): try: head = reader.peek(4) size = reader.read_integer(24) mark = reader.read_byte() except EOFError: self.log_info('end of file while reading chunk header, terminating') return if not any(head): if k == 0: return break if mark != 0: self.log_warn(F'error code 0x{mark:02X} in record {r}.{k}') buffer.write(reader.read(size)) if reader.peek(4) != head: if reader.tell() % 2 and reader.peek(5)[1:] == head: padding = reader.read_byte() if padding != 0: self.log_info(F'nonzero padding byte in record {r}.{k}') else: raise ValueError('Invalid footer, data is corrupted.') reader.seekrel(4) yield buffer.getbuffer()
class xtmail (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
Extract files and body from email messages. The unit supports both the Outlook message format and regular MIME documents.
Expand source code Browse git
class xtmail(PathExtractorUnit): """ Extract files and body from EMail messages. The unit supports both the Outlook message format and regular MIME documents. """ def _get_headparts(self, head): mw = mimewords() mw = partial(mw.process.__wrapped__.__wrapped__, mw) jh = defaultdict(list) for key, value in head: jh[key].append(mw(''.join(t.lstrip() for t in value.splitlines(False)))) jh = {k: v[0] if len(v) == 1 else [t for t in v if t] for k, v in jh.items()} yield UnpackResult('headers.txt', lambda h=head: '\n'.join(F'{k}: {v}' for k, v in h).encode(self.codec)) yield UnpackResult('headers.json', lambda jsn=jh: json.dumps(jsn, indent=4).encode(self.codec)) @PathExtractorUnit.Requires('extract-msg<=0.41.0', 'formats', 'office', 'default', 'extended') def _extract_msg(): import extract_msg.message import extract_msg.enums return extract_msg def _get_parts_outlook(self, data): def ensure_bytes(data): return data if isinstance(data, bytes) else data.encode(self.codec) def make_message(name, msg): with NoLogging(): try: htm = msg.htmlBody except Exception: htm = None try: txt = msg.body except Exception: txt = None if txt: yield UnpackResult(F'{name}.txt', ensure_bytes(txt)) if htm: yield UnpackResult(F'{name}.htm', ensure_bytes(htm)) msgcount = 0 with NoLogging(): class ForgivingMessage(self._extract_msg.message.Message): """ If parsing the input bytes fails early, the "__open" private attribute may not yet exist. This hack prevents an exception to occur in the destructor. """ def __getattr__(self, key: str): if key.endswith('_open'): return False raise AttributeError(key) msg = ForgivingMessage(bytes(data)) yield from self._get_headparts(msg.header.items()) yield from make_message('body', msg) def attachments(msg): for attachment in getattr(msg, 'attachments', ()): yield attachment if attachment.type == 'data': continue yield from attachments(attachment.data) for attachment in attachments(msg): at = attachment.type if at is self._extract_msg.enums.AttachmentType.MSG: msgcount += 1 yield from make_message(F'attachments/msg_{msgcount:d}', attachment.data) continue if not isbuffer(attachment.data): self.log_warn(F'unknown attachment of type {at}, please report this!') continue path = attachment.longFilename or attachment.shortFilename yield UnpackResult(F'attachments/{path}', attachment.data) @PathExtractorUnit.Requires('chardet', 'default', 'extended') def _chardet(): import chardet return chardet def _get_parts_regular(self, data: bytes): try: info = self._chardet.detect(data) msg = data.decode(info['encoding']) except UnicodeDecodeError: raise ValueError('This is not a plaintext email message.') else: msg = Parser().parsestr(msg) yield from self._get_headparts(msg.items()) for k, part in enumerate(msg.walk()): path = part.get_filename() elog = None if path is None: extension = file_extension(part.get_content_type(), 'txt') path = F'body.{extension}' else: path = path | mimewords | str path = F'attachments/{path}' try: data = part.get_payload(decode=True) except Exception as E: try: data = part.get_payload(decode=False) except Exception as E: elog = str(E) data = None else: from refinery import carve self.log_warn(F'manually decoding part {k}, data might be corrupted: {path}') if isinstance(data, str): data = data.encode('latin1') if isbuffer(data): data = next(data | carve('b64', stripspace=True, single=True, decode=True)) else: elog = str(E) data = None if not data: if elog is not None: self.log_warn(F'could not get content of message part {k}: {elog!s}') continue yield UnpackResult(path, 
data) def unpack(self, data): try: yield from self._get_parts_outlook(data) except Exception: self.log_debug('failed parsing input as Outlook message') yield from self._get_parts_regular(data) @classmethod def handles(cls, data: bytearray) -> bool: markers = [ b'\nReceived:\x20from', b'\nSubject:\x20', b'\nTo:\x20', b'\nBcc:\x20', b'\nContent-Transfer-Encoding:\x20', b'\nContent-Type:\x20', b'\nReturn-Path:\x20', ] if data.startswith(B'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'): markers = [marker.decode('latin1').encode('utf-16le') for marker in markers] return sum(1 for marker in markers if marker in data) >= 3
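By way of example, a sketch that lists the virtual paths produced for a message; message.eml is a hypothetical input file. Bodies surface as body.txt or body.htm, headers as headers.txt and headers.json, and attachments below attachments/, as implemented above:
>>> from refinery.shell import *
>>> data = open('message.eml', 'rb').read()  # hypothetical email message
>>> for item in data | xtmail(list=True):
...     print(item)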
class xtmsi (*paths, list=False, path=b'path', join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, nocab=False)
-
Extract files and metadata from Microsoft Installer (MSI) archives. The synthetic file MsiTables.json contains parsed MSI table information, similar to the output of the Orca tool. Binary streams are placed in a virtual folder called "Binary", and scripts extracted from custom actions are placed separately in a virtual folder named "Action".
Expand source code Browse git
class xtmsi(xtdoc): """ Extract files and metadata from Microsoft Installer (MSI) archives. The synthetic file {FN} contains parsed MSI table information, similar to the output of the Orca tool. Binary streams are placed in a virtual folder called "Binary", and extracted scripts from custom actions are separately extracted in a virtual folder named "Action". """ _SYNTHETIC_STREAMS_FILENAME = 'MsiTables.json' _SYNTHETIC_STREAMS_TOPLEVEL = 'MsiTables' # https://learn.microsoft.com/en-us/windows/win32/msi/summary-list-of-all-custom-action-types _CUSTOM_ACTION_TYPES = { 0x01: 'DLL file stored in a Binary table stream.', 0x02: 'EXE file stored in a Binary table stream.', 0x05: 'JScript file stored in a Binary table stream.', 0x06: 'VBScript file stored in a Binary table stream.', 0x11: 'DLL file that is installed with a product.', 0x12: 'EXE file that is installed with a product.', 0x13: 'Displays a specified error message and returns failure, terminating the installation.', 0x15: 'JScript file that is installed with a product.', 0x16: 'VBScript file that is installed with a product.', 0x22: 'EXE file having a path referencing a directory.', 0x23: 'Directory set with formatted text.', 0x25: 'JScript text stored in this sequence table.', 0x26: 'VBScript text stored in this sequence table.', 0x32: 'EXE file having a path specified by a property value.', 0x33: 'Property set with formatted text.', 0x35: 'JScript text specified by a property value.', 0x36: 'VBScript text specified by a property value.', } def __init__( self, *paths, list=False, path=b'path', join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, nocab: Arg.Switch('-N', help='Do not list and extract embedded CAB archives.') = False, **kw, ): super().__init__( *paths, list=list, path=path, join_path=join_path, drop_path=drop_path, nocab=nocab, fuzzy=fuzzy, exact=exact, regex=regex, **kw, ) def unpack(self, data): streams = {result.path: result for result in super().unpack(data)} def stream(name: str): return streams.pop(name).get_data() def column_formats(table: Dict[str, MSITableColumnInfo]) -> str: return ''.join(v.struct_format for v in table.values()) def stream_to_rows(data: ByteStr, row_format: str): row_size = struct.calcsize(F'<{row_format}') row_count = int(len(data) / row_size) reader = StructReader(data) columns = [reader.read_struct(F'<{sc * row_count}') for sc in row_format] for i in range(row_count): yield [c[i] for c in columns] tables: Dict[str, Dict[str, MSITableColumnInfo]] = collections.defaultdict(collections.OrderedDict) strings = MSIStringData(stream('!_StringData'), stream('!_StringPool')) for tbl_name_id, col_number, col_name_id, col_attributes in stream_to_rows(stream('!_Columns'), 'HHHH'): tbl_name = strings.ref(tbl_name_id) col_name = strings.ref(col_name_id) tables[tbl_name][col_name] = MSITableColumnInfo(col_number, col_attributes) table_names_given = {strings.ref(k) for k in chunks.unpack(stream('!_Tables'), 2, False)} table_names_known = set(tables) for name in table_names_known - table_names_given: self.log_warn(F'table name known but not given: {name}') for name in table_names_given - table_names_known: self.log_warn(F'table name given but not known: {name}') class ScriptItem(NamedTuple): row_index: int extension: Optional[str] processed_table_data: Dict[str, List[Dict[str, str]]] = {} tbl_properties: Dict[str, str] = {} tbl_files: Dict[str, str] = {} tbl_components: Dict[str, str] = {} postprocessing: List[ScriptItem] = [] def format_string(string: str): # 
https://learn.microsoft.com/en-us/windows/win32/msi/formatted def _replace(match: re.Match[str]): _replace.done = False prefix, name = match.groups() if not prefix: tbl = tbl_properties elif prefix in '%': name = name.rstrip('%').upper() return F'%{name}%' elif prefix in '!#': tbl = tbl_files elif prefix in '$': tbl = tbl_components else: raise ValueError return tbl.get(name, '') while True: _replace.done = True string = re.sub(R'''(?x) \[ # open square bracket (?![~\\]) # not followed by escapes ([%$!#]?) # any of the valid prefix characters ([^[\]{}]+) # no brackets or braces \]''', _replace, string) if _replace.done: break string = re.sub(r'\[\\(.)\]', r'\1', string) string = string.replace('[~]', '\0') return string for table_name, table in tables.items(): stream_name = F'!{table_name}' if stream_name not in streams: continue processed = [] info = list(table.values()) for r, row in enumerate(stream_to_rows(stream(stream_name), column_formats(table))): values = [] for index, value in enumerate(row): vt = info[index].type if vt is MsiType.Long: if value != 0: value -= 0x80000000 elif vt is MsiType.Short: if value != 0: value -= 0x8000 elif value in strings: value = strings.ref(value) elif not info[index].is_integer: value = '' values.append(value) if table_name == 'Property': tbl_properties[values[0]] = values[1] if table_name == 'File': tbl_files[values[0]] = values[2] if table_name == 'Component': tbl_components[values[0]] = F'%{values[2]}%' entry = dict(zip(table, values)) einfo = {t: i for t, i in zip(table, info)} if table_name == 'MsiFileHash': entry['Hash'] = struct.pack( '<IIII', row[2] ^ 0x80000000, row[3] ^ 0x80000000, row[4] ^ 0x80000000, row[5] ^ 0x80000000, ).hex() if table_name == 'CustomAction': code = row[1] & 0x3F try: entry['Comment'] = self._CUSTOM_ACTION_TYPES[code] except LookupError: pass t = einfo.get('Target') c = {0x25: 'js', 0x26: 'vbs', 0x33: None} if code in c and t and not t.is_integer: postprocessing.append(ScriptItem(r, c[code])) processed.append(entry) if processed: processed_table_data[table_name] = processed ca = processed_table_data.get('CustomAction', None) for item in postprocessing: entry = ca[item.row_index] try: path: str = entry['Action'] data: str = entry['Target'] except KeyError: continue root = F'Action/{path}' if item.extension: path = F'{root}.{item.extension}' streams[path] = UnpackResult(path, data.encode(self.codec)) continue data = format_string(data) parts = [part.partition('\x02') for part in data.split('\x01')] if not all(part[1] == '\x02' for part in parts): continue for name, _, script in parts: if not name.lower().startswith('script'): continue if not script: continue path = F'{root}.{name}' streams[path] = UnpackResult(path, script.encode(self.codec)) for ignored_stream in [ '[5]SummaryInformation', '[5]DocumentSummaryInformation', '[5]DigitalSignature', '[5]MsiDigitalSignatureEx' ]: streams.pop(ignored_stream, None) inconsistencies = 0 for k in range(len(strings)): c = strings.computed_ref_count[k] p = strings.provided_ref_count[k] if c != p and not self.log_debug(F'string reference count computed={c} provided={p}:', strings.ref(k + 1, False)): inconsistencies += 1 if inconsistencies: self.log_info(F'found {inconsistencies} incorrect string reference counts') def fix_msi_path(path: str): prefix, dot, name = path.partition('.') if dot == '.'
and prefix in processed_table_data: path = F'{prefix}/{name}' return path if self.args.nocab: cabs = {} else: def _iscab(path): return media_info and any(item.get('Cabinet', '') == F'#{path}' for item in media_info) media_info: List[JSONDict] = processed_table_data.get('Media', []) cabs: Dict[str, UnpackResult] = { path: item for path, item in streams.items() if _iscab(path)} for cab in cabs: self.log_info(F'found cab file: {cab}') if cabs: from refinery.units.formats.archive.xtcab import xtcab file_names: Dict[str, JSONDict] = {} for file_info in processed_table_data.get('File', []): try: src_name = file_info['File'] dst_name = file_info['FileName'] except KeyError: continue _, _, long = dst_name.partition('|') dst_name = long or dst_name file_names[src_name] = dst_name for path, cab in cabs.items(): try: unpacked: List[UnpackResult] = list(xtcab().unpack(cab.get_data())) except Exception as e: self.log_info(F'unable to extract embedded cab file: {e!s}') continue base, dot, ext = path.rpartition('.') if dot == '.' and ext.lower() == 'cab': path = base else: del streams[path] cab.path = F'{path}.cab' streams[cab.path] = cab for result in unpacked: sub_path = file_names.get(result.path, result.path) sub_path = self._custom_path_separator.join((path, sub_path)) streams[sub_path] = result streams = {fix_msi_path(path): item for path, item in streams.items()} ds = UnpackResult(self._SYNTHETIC_STREAMS_FILENAME, json.dumps(processed_table_data, indent=4).encode(self.codec)) streams[ds.path] = ds converter = csv() for key, data in processed_table_data.items(): sk = key.strip('_') if sk not in processed_table_data: key = sk try: tbl = UnpackResult(F'{self._SYNTHETIC_STREAMS_TOPLEVEL}/{key}.csv', converter.json_to_csv(data)) except Exception: continue streams[tbl.path] = tbl for path in sorted(streams): streams[path].path = path yield streams[path] @classmethod def handles(self, data: bytearray): if not data.startswith(B'\xD0\xCF\x11\xE0'): return False return FileMagicInfo(data).extension == 'msi'
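As a usage sketch, the synthetic table dump can be requested by path like any other archive member; sample.msi is a hypothetical input file:
>>> from refinery.shell import *
>>> data = open('sample.msi', 'rb').read()  # hypothetical MSI installer
>>> tables = next(data | xtmsi('MsiTables.json'))  # parsed tables as JSON
Passing nocab=True (or -N) skips listing and extracting embedded CAB archives.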
class xtnode (*paths, entry=False, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date')
-
Extracts and decompiles files from compiled Node.Js applications. Supports both nexe and pkg, two utilities that are commonly used to generate stand-alone executables.
Expand source code Browse git
class xtnode(ArchiveUnit): """ Extracts and decompiles files from compiled Node.Js applications. Supports both nexe and pkg, two utilities that are commonly used to generate stand-alone executables. """ _NEXE_SENTINEL = B'<nexe~~sentinel>' _PKG_PAYLOAD_P = B'PAYLOAD_POSITION' _PKG_PAYLOAD_S = B'PAYLOAD_SIZE' _PKG_PRELUDE_P = B'PRELUDE_POSITION' _PKG_PRELUDE_S = B'PRELUDE_SIZE' _PKG_COMMON_JS = B'sourceMappingURL=common.js.map' def __init__( self, *paths, entry: Arg.Switch('-u', help='Only extract the entry point.') = False, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', ): super().__init__(*paths, entry=entry, list=list, join_path=join_path, drop_path=drop_path, fuzzy=fuzzy, exact=exact, regex=regex, path=path, date=date) def unpack(self, data: ByteStr) -> Iterable[UnpackResult]: if self._is_nexe(data): self.log_info('unpacking as nexe') yield from self._unpack_nexe(data) return if self._is_pkg(data): self.log_info('unpacking as pkg') yield from self._unpack_pkg(data) return def _unpack_nexe(self, data: ByteStr): try: ep = re.compile( RB"entry\s*=\s*path\.resolve\(path\.dirname\(process\.execPath\),\s*(%s)\)" % formats.string) ep, = ep.finditer(data) except Exception: ep = None self.log_info('could not identify entry point') else: ep = ep.group(1) | esc(quoted=True) | str self.log_info(F'entry point: {ep}') view = memoryview(data) for marker in re.finditer(re.escape(self._NEXE_SENTINEL), data): end = marker.end() + 16 sizes = data[marker.end():end] if sizes.startswith(b"')"): continue reader = StructReader(sizes) code_size = int(reader.f64()) blob_size = int(reader.f64()) start = marker.start() - code_size - blob_size try: reader = StructReader(view[start:end]) code = reader.read_exactly(code_size) blob = reader.read_exactly(blob_size) except EOFError: self.log_debug(F'found marker at 0x{marker.start():X}, but failed to read data') continue else: self.log_debug(F'found marker at 0x{marker.start():X}, data start at {start:X}') for rsrc in re.finditer(RB'process\.__nexe\s*=', code): rsrc = JSONReader(code[rsrc.end():]) rsrc = rsrc.read_json() if len(rsrc) == 1: _, rsrc = rsrc.popitem() for path, (offset, length) in rsrc.items(): end = offset + length if ep and self.args.entry and path != ep: continue yield UnpackResult(path, blob[offset:end]) def _unpack_pkg(self, data: ByteStr): def _extract_coordinates(*v: bytes): for name in v: pattern = name + BR'''\s{0,3}=\s{0,3}(['"])([\s\d]+)\1''' value, = re.finditer(pattern, data) yield int(value.group(2).decode('utf8').strip(), 0) def _extract_data(*v: bytes): try: offset, length = _extract_coordinates(*v) except Exception: return None return data[offset:offset + length] payload = _extract_data(self._PKG_PAYLOAD_P, self._PKG_PAYLOAD_S) if not payload: raise ValueError('unable to extract payload') prelude = _extract_data(self._PKG_PRELUDE_P, self._PKG_PRELUDE_S) if not prelude: raise ValueError('unable to extract prelude') mapping = re.search(re.escape(self._PKG_COMMON_JS) + BR'\s*\},\s*\{', prelude) if not mapping: raise ValueError('unable to find common.js mapping') reader = JSONReader(prelude[mapping.end() - 1:]) files: Dict[str, dict] = reader.read_json() if files is None: raise ValueError('failed to read file list') entry = reader.skip_comma().read_string() links = reader.skip_comma().read_json() # _unknown1 = reader.skip_comma().read_json() # _unknown2 = reader.skip_comma().read_terminated_array(B')').strip() root = next(iter(files)) skip = 0 view = memoryview(payload) for k in 
range(len(root) + 1): test = root[:k].rstrip('/').rstrip('\\') if not all(path.startswith(test) for path in files): root = test[:-1] skip = k - 1 break entry = entry[skip:] self.log_info(F'detected root directory {root}, entry point is {entry}') for src, dst in links.items(): new_files = {} self.log_info('link src:', src[skip:]) self.log_info('link dst:', dst[skip:]) for path, location in files.items(): if not path.startswith(src): continue new_path = dst + path[len(src):] new_files[new_path] = location self.log_debug('synthesizing linked file:', new_path) files.update(new_files) for path, location in files.items(): path = path[skip:] if entry and self.args.entry and path != entry: continue data = None for kind, (offset, length) in location.items(): stop = offset + length if kind == '3': # metadata continue if kind == '2': # unknown continue if kind in '01': data = view[offset:stop] if data is not None: yield UnpackResult(path, data) @classmethod def _is_nexe(cls, data: ByteStr) -> bool: return cls._NEXE_SENTINEL in data @classmethod def _is_pkg(cls, data: ByteStr) -> bool: if cls._PKG_PAYLOAD_P not in data: return False if cls._PKG_PAYLOAD_S not in data: return False if cls._PKG_PRELUDE_P not in data: return False if cls._PKG_PRELUDE_S not in data: return False if cls._PKG_COMMON_JS not in data: return False return True @classmethod def handles(cls, data: ByteStr) -> Optional[bool]: return cls._is_nexe(data) or cls._is_pkg(data)
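To sketch a common use, extracting only the application entry point of a compiled binary; app.exe is a hypothetical nexe- or pkg-compiled input:
>>> from refinery.shell import *
>>> data = open('app.exe', 'rb').read()  # hypothetical compiled Node.js binary
>>> code = next(data | xtnode(entry=True))  # only the entry point module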
class xtnsis (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
Extract files from NSIS archives.
Expand source code Browse git
class xtnsis(ArchiveUnit): """ Extract files from NSIS archives. """ @classmethod def _find_archive_offset(cls, data: bytearray, before: int = -1, flawmax=2): def signatures(*magics): for changes in range(flawmax + 1): for magic in magics: if not changes: yield 0, magic continue for positions in itertools.permutations(range(len(magic)), r=changes): signature = bytearray(magic) for p in positions: signature[p] = 0x2E yield changes, bytes(signature) best_guess = None search_space = memoryview(data) for flaws, sig in signatures(*NSArchive.MAGICS): if flaws > 1: search_space = search_space[:0x20_000] matches = [m.start() - 4 for m in re.finditer(sig, search_space, flags=re.DOTALL)] if before >= 0: matches = [match for match in matches if match < before] matches.reverse() archive = None for match in matches: if match % 0x200 == 0: archive = match break if not archive: if matches and not best_guess: best_guess = matches[-1] else: msg = F'Archive signature was found at offset 0x{archive:X}' if flaws > 0: msg = F'{msg}; it has {flaws} imperfections and was likely modified' cls.log_info(F'{msg}.') return archive if best_guess: cls.log_info(F'A signature was found at offset 0x{best_guess:08X}; it is not properly aligned.') return best_guess return None def unpack(self, data): memory = memoryview(data) before = -1 _error = None while True: offset = self._find_archive_offset(data, before) if offset is None: _error = _error or ValueError('Unable to find an NSIS archive marker.') raise _error try: arc = NSArchive(memory[offset:]) except Exception as e: _error = e before = offset else: break def info(): yield F'{arc.header.type.name} archive' yield F'compression type {arc.method.value}' yield F'mystery value 0x{arc.header.unknown_value:X}' yield 'solid archive' if arc.solid else 'fragmented archive' yield '64-bit header' if arc.header.is64bit else '32-bit header' yield 'unicode' if arc.header.unicode else 'ascii' self.log_info(', '.join(info())) for item in arc.header.items: yield self._pack(item.path, item.mtime, lambda i=item: arc._extract_item(i).data) yield self._pack('setup.bin', None, arc.header_data) yield self._pack('setup.nsis', None, arc.script.encode(self.codec)) @classmethod def handles(cls, data: bytearray) -> bool: return any(magic in data for magic in NSArchive.MAGICS)
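Note that the unit always emits two synthetic members besides the archived files: setup.bin (the raw header data) and setup.nsis (the reconstructed script). A short sketch with a hypothetical input file:
>>> from refinery.shell import *
>>> data = open('installer.exe', 'rb').read()  # hypothetical NSIS installer
>>> script = next(data | xtnsis('setup.nsis'))  # just the NSIS script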
class xtnuitka (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
Extracts files packed by Nuitka using the --onefile option.
Expand source code Browse git
class xtnuitka(PathExtractorUnit): """ Extracts files packed by Nuitka using the --onefile option. """ _MAGIC = B'KA' @PathExtractorUnit.Requires('pyzstd', 'arc') def _pyzstd(): import pyzstd return pyzstd def unpack(self, data: ByteStr) -> Iterable[UnpackResult]: class NuitkaData(Struct): unit = self def __init__(self, reader: StructReader): self.magic = reader.read_exactly(2) self.compression_flag = reader.read_exactly(1) if self.compressed: zd = self.unit._pyzstd.ZstdDecompressor() reader = StructReader(zd.decompress(reader.read())) self.files = {} self.truncated = False while not reader.eof: path = reader.read_w_string('utf-16') if not path: break size = reader.u64() data = reader.read(size) if len(data) == size: self.files[path] = data else: self.truncated = True @property def compressed(self): return self.compression_flag == b'Y' if data.startswith(b'MZ'): arcs = list(self._pe_candidates(data)) else: arcs = [data] for arc in arcs: archive = NuitkaData(arc) if archive.truncated: self.log_warn('the archive is truncated') if archive.magic != self._MAGIC: self.log_warn('the archive data does not start with the correct magic sequence') for path, data in archive.files.items(): yield UnpackResult(path, data) @classmethod def handles(cls, data: ByteStr) -> Optional[bool]: if data.startswith(b'MZ'): try: next(cls._pe_candidates(data)) except StopIteration: return False else: return data.startswith(cls._MAGIC) @classmethod def _pe_candidates(cls, data: ByteStr): from refinery.units.formats.pe.peoverlay import peoverlay blob = data | peoverlay | bytearray if blob.startswith(cls._MAGIC): yield blob from refinery.units.formats.pe.perc import perc for blob in data | perc: if blob.startswith(cls._MAGIC): yield blob
class xtone (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
Extract embedded files from OneNote documents.
Expand source code Browse git
class xtone(PathExtractorUnit): """ Extract embedded files from OneNote documents. """ @PathExtractorUnit.Requires('pyonenote', 'formats', 'office', 'extended') def _pyOneNote(): import pyOneNote import pyOneNote.OneDocument return pyOneNote.OneDocument def unpack(self, data: bytearray): with MemoryFile(memoryview(data)) as stream: one = self._pyOneNote.OneDocment(stream) for guid, file in one.get_files().items(): chunk = file['content'] try: extension = file['extension'] except KeyError: extension = F'.{get_cached_file_magic_info(chunk).extension}' yield UnpackResult(F'{guid}{extension}', chunk) @classmethod def handles(cls, data: bytearray) -> Optional[bool]: return UUID('e4525c7b-8cd8-a74d-aeb1-5378d02996d3').bytes in data
class xtp (*pattern, filter=0, min=1, max=None, len=None, stripspace=False, duplicates=False, longest=False, take=None)
-
Extract Patterns: Uses regular expressions to extract indicators from the input data and optionally filters these results heuristically. The unit is designed to extract indicators such as domain names and IP addresses; see below for a complete list. To extract data formats such as hex-encoded data, use carve.
Expand source code Browse git
class xtp(PatternExtractor): """ Extract Patterns: Uses regular expressions to extract indicators from the input data and optionally filters these results heuristically. The unit is designed to extract indicators such as domain names and IP addresses, see below for a complete list. To extract data formats such as hex-encoded data, use `refinery.carve`. """ def __init__( self, *pattern: Arg('pattern', type=str, default=( indicators.hostname.name, indicators.url.name, indicators.email.name, ), help=( 'Choose the pattern to extract. The unit uses {{default}} by default. Use an ' 'asterix character to select all available patterns. The available patterns ' 'are: {}'.format(', '.join(p.display for p in indicators)) ) ), filter: Arg('-f', dest='filter', action='count', help=( 'If this setting is enabled, the xtp unit will attempt to reduce the number ' 'of false positives by certain crude heuristics. Specify multiple times to ' 'make the filtering more aggressive.' ) ) = 0, min=1, max=None, len=None, stripspace=False, duplicates=False, longest=False, take=None ): self.superinit(super(), **vars(), ascii=True, utf16=True) patterns = { p for name in pattern for p in indicators if fnmatch(p.display, name) } # if indicators.hostname in patterns: # patterns.remove(indicators.hostname) # patterns.add(indicators.ipv4) # patterns.add(indicators.domain) patterns = [F'(?P<{p.name}>{p.value})' for p in patterns] if not patterns: raise RefineryCriticalException('The given mask does not match any known indicator pattern.') pattern = '|'.join(patterns) self.args.pattern = re.compile(pattern.encode(self.codec), flags=re.DOTALL) self.args.filter = filter _ALPHABETIC = ascii_letters.encode('ASCII') _LEGITIMATE_HOSTS = { 'acm.org' : 1, 'adobe.com' : 1, 'aka.ms' : 1, 'android.com' : 1, 'apache.org' : 1, 'apple.com' : 1, 'archive.org' : 2, 'azure.com' : 1, 'baidu.com' : 2, 'bootstrapcdn.com' : 2, 'cdnjs.cloudflare.com' : 4, 'comodo.net' : 1, 'comodoca.com' : 1, 'curl.haxx.se' : 1, 'curl.se' : 1, 'digicert.com' : 1, 'dublincore.org' : 1, 'facebook.com' : 4, 'fontawesome.com' : 1, 'github.com' : 3, 'globalsign.com' : 1, 'globalsign.net' : 1, 'godaddy.com' : 1, 'google.com' : 4, 'googleapis.com' : 5, 'googleusercontent.com' : 5, 'gov' : 2, 'gstatic.com' : 2, 'iana.org' : 1, 'intel.com' : 1, 'jquery.com' : 1, 'jsdelivr.net' : 2, 'live.com' : 1, 'microsoft.com' : 1, 'msdn.com' : 1, 'msn.com' : 1, 'newtonsoft.com' : 3, # json.net 'nuget.org' : 3, 'office.com' : 1, 'office365.com' : 2, 'openssl.org' : 1, 'openxmlformats.org' : 1, 'oracle.com' : 1, 'purl.org' : 1, 'python.org' : 1, 'schema.org' : 2, 'sectigo.com' : 1, 'skype.com' : 1, 'sourceforge.net' : 4, 'stackoverflow.com' : 1, 'sun.com' : 1, 'sway-cdn.com' : 1, 'sway-extensions.com' : 1, 'symantec.com' : 1, 'symauth.com' : 1, 'symcb.com' : 1, 'symcd.com' : 1, 'sysinternals.com' : 3, 'thawte.com' : 1, 'unicode.org' : 2, 'usertrust.com' : 1, 'verisign.com' : 1, 'w3.org' : 1, 'wikipedia.org' : 1, 'wolfram.com' : 1, 'xml.org' : 1, 'xmlsoap.org' : 1, 'yahoo.com' : 1, } for _ext in [ 'build', 'data', 'do', 'help', 'java', 'md', 'mov', 'name', 'py', 'so', 'sys', 'zip', ]: _LEGITIMATE_HOSTS[_ext] = 4 _DOMAIN_WHITELIST = [ 'system.net', 'wscript.shell', ] _BRACKETING = { B"'"[0]: B"'", B'"'[0]: B'"', B'('[0]: B')', B'{'[0]: B'}', B'['[0]: B']', B'<'[0]: B'>', } def _check_match(self, data: Union[memoryview, bytearray], pos: int, name: str, value: bytes): term = self._BRACKETING.get(data[pos - 1], None) if term: pos = value.find(term) if pos > 0: value = value[:pos] if not 
self.args.filter: return value if name == indicators.hostname.name: if all(part.isdigit() for part in value.split(B'.')): name = indicators.ipv4.name elif B'.' not in value: name = indicators.ipv6.name else: name = indicators.domain.name if name == indicators.ipv4.name: ocets = [int(x) for x in value.split(B'.')] if ocets.count(0) >= 3: return None if self.args.filter > 2 and sum(ocets) < 10: return None for area in ( bytes(data[pos - 20 : pos + 20]), bytes(data[pos * 2 - 40 : pos * 2 + 40 : 2]), bytes(data[pos * 2 - 41 : pos * 2 + 39 : 2]), ): if B'version' in area.lower(): return None ip = ip_address(value.decode(self.codec)) if not ip.is_global: if self.args.filter >= 3 or not ip.is_private: return None elif name in { indicators.url.name, indicators.socket.name, indicators.hostname.name, indicators.domain.name, indicators.subdomain.name }: if self.args.filter >= 2: if LetterWeights.IOC(value) < 0.6: self.log_info(F'excluding indicator because with low score: {value}', clip=True) return None if name != indicators.url.name and len(value) > 0x100: self.log_info(F'excluding indicator because it is too long: {value}', clip=True) return None ioc = value.decode(self.codec) if '://' not in ioc: ioc = F'tcp://{ioc}' parts = urlparse(ioc) host, _, _ = parts.netloc.partition(':') hl = host.lower() for white, level in self._LEGITIMATE_HOSTS.items(): if self.args.filter >= level and (hl == white or hl.endswith(F'.{white}')): self.log_info(F'excluding indicator because domain {hl} is whitelisted via {white}: {value}', clip=True) self.log_debug(F'reduce level below {level} to allow, current level is {self.args.filter}') return None if name == indicators.url.name: scheme = parts.scheme.lower() for p in ('http', 'https', 'ftp', 'file', 'mailto'): if scheme.endswith(p): pos = scheme.find(p) value = value[pos:] break if any(hl == w for w in self._DOMAIN_WHITELIST): self.log_info(F'excluding indicator because domain {hl} is whitelisted: {value}') return None if name in { indicators.hostname.name, indicators.domain.name, indicators.subdomain.name }: if data[pos - 1] in b'/\\' and self.args.filter >= 2: return None hostparts = host.split('.') if self.args.filter >= 2: if not all(p.isdigit() for p in hostparts) and all(len(p) < 4 for p in hostparts): self.log_info(F'excluding host with too many short parts: {value}') return None if self.args.filter >= 3: if len(hostparts) <= sum(3 for p in hostparts if p != p.lower() and p != p.upper()): self.log_info(F'excluding host with too many mixed case parts: {value}') return None # These heuristics attempt to filter out member access to variables in # scripts which can be mistaken for domains because of the TLD inflation # we've had. 
uppercase = sum(1 for c in host if c.isalpha() and c.upper() == c) lowercase = sum(1 for c in host if c.isalpha() and c.lower() == c) if lowercase and uppercase: caseratio = uppercase / lowercase if 0.1 < caseratio < 0.9: self.log_info(F'excluding indicator with too much uppercase letters: {value}') return None if all(x.isidentifier() for x in hostparts): if len(hostparts) == 2 and hostparts[0] in ('this', 'self'): self.log_info(F'excluding host that looks like a code snippet: {value}') return None if len(hostparts[-2]) < 3: self.log_info(F'excluding host with too short root domain name: {value}') return None if any(x.startswith('_') for x in hostparts): self.log_info(F'excluding host with underscores: {value}') return None if len(hostparts[-1]) > 3: prefix = '.'.join(hostparts[:-1]) seen_before = len(set(re.findall( R'{}(?:\.\w+)+'.format(prefix).encode('ascii'), data))) if seen_before > 2: self.log_debug(F'excluding indicator that was already seen: {value}') return None elif name == indicators.email.name: at = value.find(B'@') ix = 0 while value[ix] not in self._ALPHABETIC: ix += 1 return None if at - ix < 3 else value[ix:] elif name in ( indicators.path.name, indicators.winpath.name, indicators.nixpath.name, ): if len(value) < 8: self.log_info(F'excluding path because it is too short: {value}') return None if len(value) > 16 and len(re.findall(RB'\\x\d\d', value)) > len(value) // 10: self.log_info(F'excluding long path containign hex: {value}', clip=True) return None try: path_string = value.decode(self.codec) except Exception: self.log_debug(F'excluding path which did not decode: {value!r}', clip=True) return None try: path = Path(path_string) except Exception as E: self.log_debug(F'error parsing path "{path}": {E!s}') return None path_likeness = sum(v for v, x in [ (1, path.suffix), (1, path_string.startswith('/')), (2, path_string.startswith('%')), (2, path_string.startswith('\\\\')), (2, path_string[1:3] == ':\\'), ] if x) if 2 + path_likeness < min(self.args.filter, 2): self.log_info(F'excluding long path because it has no characteristic parts: {value}') return None bad_parts = 0 all_parts = len(path.parts) if self.args.filter >= 1: date_likeness = sum(1 for t in ['yyyy', 'yy', 'mm', 'dd', 'hh', 'ss'] if t in path.parts or t.upper() in path.parts) if len(value) < 20 and date_likeness >= all_parts - 1: self.log_info(F'excluding path that looks like a date format: {value}', clip=True) return None if self.args.filter >= 2: for k, part in enumerate(path.parts): if not k: drive, colon, slash = part.partition(':') if colon and len(drive) == 1 and len(slash) <= 1: continue if part[0] == part[~0] == '%': continue if len(part) == 1: continue if ( LetterWeights.Path(part) < 0.5 + (min(self.args.filter, 4) * 0.1) or (self.args.filter >= 2 and LetterWeights.Path(part[:1]) < 0.5) ): bad_parts += 1 self.log_debug(F'bad part {k + 1} in path: {part}') for filter_limit in (2, 3, 4): bad_ratio = 2 ** (filter_limit - 1) if self.args.filter >= filter_limit and bad_parts * bad_ratio >= all_parts: self.log_info(F'excluding path with bad parts: {value}', clip=True) return None return value def process(self, data): whitelist = set() def check(match: re.Match): for name, value in match.groupdict().items(): if value is not None: break else: raise RefineryCriticalException('Received empty match.') if value in whitelist: return None result = self._check_match(match.string, match.start(), name, value) if result is not None: return self.labelled(result, pattern=name) whitelist.add(value) transforms = [check] 
yield from self.matches_filtered(memoryview(data), self.args.pattern, *transforms)
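For illustration, a sketch that extracts URLs and IPv4 addresses from a made-up input buffer:
>>> from refinery.shell import *
>>> data = B'beacon to http://malware.example/stage2 or 10.13.37.1'
>>> for ioc in data | xtp('url', 'ipv4', '-f'):
...     print(ioc)
Each additional -f raises the filter level, which makes the heuristics above more aggressive; for instance, hosts from _LEGITIMATE_HOSTS are dropped once the level reaches their listed threshold.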
class xtpdf (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
Extract objects from PDF documents.
Expand source code Browse git
class xtpdf(PathExtractorUnit): """ Extract objects from PDF documents. """ @PathExtractorUnit.Requires('pypdf>=3.1.0', 'formats', 'default', 'extended') def _pypdf2(): import pypdf import pypdf.generic return pypdf def _walk(self, blob, memo: Optional[Set[int]] = None, *path): while isinstance(blob, self._pypdf2.generic.IndirectObject): try: blob = blob.get_object() except Exception: break if memo is None: memo = {id(blob)} elif id(blob) in memo: return else: memo.add(id(blob)) try: name = blob['/F'] blob = blob['/EF']['/F'] except Exception: pass else: path = *path[:-1], F'/{name}' try: def extract(): with NoLogging(): return get_data() if TYPE_CHECKING: blob = cast(EncodedStreamObject, blob) get_data = blob.get_data except AttributeError: pass else: yield UnpackResult(''.join(path), extract, kind='object') return if isinstance(blob, self._pypdf2.generic.ByteStringObject): yield UnpackResult(''.join(path), blob, kind='bytes') return if isinstance(blob, self._pypdf2.generic.TextStringObject): yield UnpackResult(''.join(path), blob.encode(self.codec), kind='string') return if isinstance(blob, ( self._pypdf2.generic.BooleanObject, self._pypdf2.generic.ByteStringObject, self._pypdf2.generic.FloatObject, self._pypdf2.generic.NameObject, self._pypdf2.generic.NullObject, self._pypdf2.generic.NumberObject, self._pypdf2.generic.RectangleObject, )): # unhandled PDF objects return if isinstance(blob, self._pypdf2.generic.TreeObject): blob = list(blob) pdf = self._pypdf2.generic.PdfObject if isinstance(blob, list): if ( len(blob) % 2 == 0 and all(isinstance(key, str) for key in islice(iter(blob), 0, None, 2)) and all(isinstance(key, pdf) for key in islice(iter(blob), 1, None, 2)) ): blob = dict(zip(*([iter(blob)] * 2))) else: for key, value in enumerate(blob): yield from self._walk(value, memo, *path, F'/{key}') return if not isdict(blob): return for key, value in blob.items(): if not isinstance(key, str): continue if not key.startswith('/'): key = F'/{key}' yield from self._walk(value, memo, *path, key) def unpack(self, data): with MemoryFile(data, read_as_bytes=True) as stream: with NoLogging(): pdf = self._pypdf2.PdfReader(stream) catalog = pdf.trailer['/Root'] yield from self._walk(catalog) @classmethod def handles(self, data: bytearray) -> Optional[bool]: return data.startswith(B'%PDF-')
class xtpyi (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', decompile=False, user_code=False, unmarshal=0)
-
Extracts and decompiles files from a Python Installer (aka PyInstaller) archive.
Expand source code Browse git
class xtpyi(ArchiveUnit): """ Extracts and decompiles files from a Python Installer (aka PyInstaller) archive. """ def __init__( self, *paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', decompile: Arg.Switch('-c', help='Attempt to decompile PYC files.'), user_code: Arg.Switch('-u', group='FILTER', help=( 'Extract only source code files from the root of the archive. These usually implement ' 'the actual domain logic. This implies the --decompile option.')) = False, unmarshal: Arg('-y', action='count', group='FILTER', help=( '(DANGEROUS) Unmarshal embedded PYZ archives. Warning: Maliciously crafted packages can ' 'potentially exploit this to execute code. It is advised to only use this option inside ' 'an isolated environment. Specify twice to decompile unmarshalled Python bytecode.' )) = 0 ): super().__init__( *paths, list=list, join_path=join_path, drop_path=drop_path, fuzzy=fuzzy, exact=exact, regex=regex, path=path, date=date, decompile=decompile, unmarshal=unmarshal, user_code=user_code, ) @ArchiveUnit.Requires('xdis', 'arc', 'python', 'extended') def _xdis(): import xdis.load import xdis.magics import xdis.marsh import xdis.op_imports import xdis.version_info import xdis A, B, C, *_ = sys.version_info version = F'{A}.{B}.{C}' canonic = F'{A}.{B}' if version not in xdis.magics.canonic_python_version: class opcode_dummy: version = float(canonic) def __init__(self, name): self.name = name def __getattr__(self, key): return opcode_dummy(F'{self.name}.{key}') def __call__(self, *a, **k): return None def __str__(self): return self.name def __repr__(self): return self.name import importlib magic = importlib.util.MAGIC_NUMBER xdis.magics.add_magic_from_int(xdis.magics.magic2int(magic), version) xdis.magics.by_magic.setdefault(magic, set()).add(version) xdis.magics.by_version[version] = magic xdis.magics.magics[canonic] = magic xdis.magics.canonic_python_version[canonic] = canonic xdis.magics.add_canonic_versions(version, canonic) xdis.op_imports.op_imports.setdefault(canonic, opcode_dummy('dummy')) del A, B, C, version import xdis.std return xdis @ArchiveUnit.Requires('uncompyle6', 'arc', 'python', 'extended') def _uncompyle6(): import uncompyle6 import uncompyle6.main return uncompyle6 @ArchiveUnit.Requires('decompyle3', 'arc', 'python') def _decompyle3(): import decompyle3 import decompyle3.main return decompyle3 def unpack(self, data): view = memoryview(data) positions = [m.start() for m in re.finditer(re.escape(PyInstallerArchiveEpilogue.MagicSignature), view)] mode = Unmarshal(min(2, int(self.args.unmarshal))) self.log_debug(F'unmarshal mode: {mode.name}') if not positions: raise LookupError('unable to find PyInstaller signature') if len(positions) > 2: # first position is expected to be the sentinel value in the unpacker stub width = max(len(F'{p:X}') for p in positions) for position in positions: self.log_info(F'magic signature found at offset 0x{position:0{width}X}') self.log_warn(F'found {len(positions) - 1} potential PyInstaller epilogue markers; using last one.') decompile = self.args.decompile uc_target = PiType.USERCODE if decompile else PiType.SOURCE archive = PyInstallerArchiveEpilogue(view, positions[-1], mode, decompile) for name, file in archive.files.items(): if self.args.user_code: if file.type != uc_target: continue if name.startswith('pyiboot'): continue yield self._pack(name, None, file.data, type=file.type.name) @classmethod def handles(cls, data: ByteStr) -> Optional[bool]: return 
PyInstallerArchiveEpilogue.MagicSignature in data
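As a sketch of typical usage, extracting only the user code from a PyInstaller-built executable; packed.exe is a hypothetical input file:
>>> from refinery.shell import *
>>> data = open('packed.exe', 'rb').read()  # hypothetical PyInstaller binary
>>> for item in data | xtpyi(user_code=True, list=True):
...     print(item)  # source files from the archive root
As the --unmarshal help text warns, unmarshalling embedded PYZ archives can execute code from a malicious sample and should only be done in an isolated environment.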
class xtrtf (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
Extract embedded objects in RTF documents.
Expand source code Browse git
class xtrtf(PathExtractorUnit): """ Extract embedded objects in RTF documents. """ @PathExtractorUnit.Requires('oletools', 'formats', 'office', 'extended') def _oletools(): import oletools import oletools.rtfobj import oletools.oleobj return oletools def unpack(self, data): parser = self._oletools.rtfobj.RtfObjParser(data) parser.parse() width = len(str(len(parser.objects))) for k, item in enumerate(parser.objects): item: RtfObject path = item.filename or F'carve{k:0{width}}.bin' data = item.rawdata meta = {} if item.is_ole: if item.format_id == self._oletools.oleobj.OleObject.TYPE_EMBEDDED: meta['ole_type'] = 'EMBEDDED' elif item.format_id == self._oletools.oleobj.OleObject.TYPE_LINKED: meta['ole_type'] = 'LINKED' if item.is_package: meta['src_path'] = item.src_path meta['tmp_path'] = item.temp_path if item.clsid is not None: meta['ole_info'] = item.clsid_desc meta['ole_guid'] = item.clsid meta['ole_name'] = item.class_name if item.oledata: data = item.oledata pos = item.rawdata.find(data) if pos > 0: meta['raw_header'] = item.rawdata[:pos] if item.olepkgdata: data = item.olepkgdata pos = item.oledata.find(data) if pos >= 0: meta['ole_header'] = item.oledata[:pos] yield UnpackResult(path, data, **meta) @classmethod def handles(self, data: bytearray) -> bool: return data[:500].lower().lstrip().startswith(b'{\\rtf')
class xttar (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
Extract files from a Tar archive.
Expand source code Browse git
class xttar(ArchiveUnit): """ Extract files from a Tar archive. """ def unpack(self, data: bytearray): with MemoryFile(data) as stream: try: archive = tarfile.open(fileobj=stream) except Exception: ustar = data.find(B'ustar') if ustar < 257: raise stream.seek(ustar - 257) archive = tarfile.open(fileobj=stream) for info in archive.getmembers(): if not info.isfile(): continue extractor = archive.extractfile(info) if extractor is None: continue date = datetime.datetime.fromtimestamp(info.mtime) yield self._pack(info.name, date, lambda e=extractor: e.read()) @classmethod def handles(cls, data: bytearray) -> bool: ustar = data.find(B'ustar') if ustar >= 0: return ustar == 257 or data[ustar:ustar + 3] in (B'\x00\x30\x30', B'\x20\x20\x00') return False
class xtvba (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
Extract VBA macro code from Office documents.
Expand source code Browse git
class xtvba(PathExtractorUnit): """ Extract VBA macro code from Office documents. """ @PathExtractorUnit.Requires('oletools', 'formats', 'office', 'extended') def _olevba(): from oletools import olevba return olevba def unpack(self, data): sentinel = uuid4() try: parser = self._olevba.VBA_Parser(sentinel, data=bytes(data), relaxed=True) except self._olevba.FileOpenError: raise ValueError('Input data not recognized by VBA parser') for p1, stream_path, p2, code in parser.extract_all_macros(): if not stream_path: if p1 == sentinel: continue if p2 == sentinel: continue yield UnpackResult(stream_path, code.encode(self.codec))
class xtw (stripspace=False, duplicates=False, longest=False, take=None)
-
Extract Wallets: Extracts anything that looks like a cryptocurrency wallet address. This works similarly to the xtp unit.
Expand source code Browse git
class xtw(PatternExtractor): """ Extract Wallets: Extracts anything that looks like a cryptocurrency wallet address. This works similar to the `refinery.xtp` unit. """ def __init__(self, stripspace=False, duplicates=False, longest=False, take=None): self.superinit(super(), **vars(), ascii=True, utf16=True) def process(self, data): pattern = '|'.join(FR'(?P<{p.name}>\b{p.value}\b)' for p in wallets) pattern = FR'\b{pattern}\b'.encode('latin1') def check(match: re.Match[bytes]): for name, value in match.groupdict().items(): if value is not None: break else: raise RefineryCriticalException('Received empty match.') return self.labelled(value, kind=name) yield from self.matches_filtered(memoryview(data), pattern, check)
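A self-contained sketch; the address below is the well-known Bitcoin genesis block address and serves purely as test data:
>>> from refinery.shell import *
>>> data = B'please donate: 1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa'
>>> for wallet in data | xtw():
...     print(wallet)  # labeled with the matching wallet kind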
class xtxml (*paths, format=None, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')
-
Extract values from an XML document.
Expand source code Browse git
class xtxml(XMLToPathExtractorUnit): """ Extract values from an XML document. """ def unpack(self, data): root = xml.parse(data.strip()) meta = metavars(data) path = self._make_path_builder(meta, root) def walk(node: xml.XMLNode, *parts: str): def extract(node: xml.XMLNode = node): if not node.children: return node.content.encode(self.codec) with MemoryFile() as stream: node.write(stream) return bytes(stream.getbuffer() | ppxml) yield UnpackResult('/'.join(parts), extract, **node.attributes) for child in node.children: yield from walk(child, *parts, path(child)) yield from walk(root, path(root))
class xtzip (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')
-
Extract files from a Zip archive.
Expand source code Browse git
class xtzip(ArchiveUnit): """ Extract files from a Zip archive. """ @ArchiveUnit.Requires('chardet', 'default', 'extended') def _chardet(): import chardet return chardet @ArchiveUnit.Requires('pyzipper', 'arc', 'default', 'extended') def _pyzipper(): import pyzipper return pyzipper @classmethod def _carver(cls): return carve_zip def unpack(self, data: bytearray): from zipfile import ZipFile, ZipInfo password = bytes(self.args.pwd) archive = ZipFile(MemoryFile(data)) if password: archive.setpassword(password) else: def password_invalid(pwd: Optional[str], pyzipper=False): nonlocal archive if pwd is not None: archive.setpassword(pwd.encode(self.codec)) try: archive.testzip() except NotImplementedError: if pyzipper: raise self.log_debug('compression method unsupported, switching to pyzipper') archive = self._pyzipper.AESZipFile(MemoryFile(data)) return password_invalid(pwd, True) except RuntimeError as E: if 'password' not in str(E): raise return True else: if pwd: self.log_debug('using password:', pwd) return False for pwd in [None, *self._COMMON_PASSWORDS]: if not password_invalid(pwd): break else: raise RuntimeError('Archive is password-protected.') for info in archive.infolist(): def xt(archive: ZipFile = archive, info: ZipInfo = info): try: return archive.read(info.filename) except RuntimeError as E: if 'password' not in str(E): raise if not password: raise RuntimeError('archive is password-protected') else: raise RuntimeError(F'invalid password: {password.decode(self.codec)}') from E if info.filename: if info.is_dir(): continue # courtesy of https://stackoverflow.com/a/37773438/9130824 filename = info.filename if info.flag_bits & ZIP_FILENAME_UTF8_FLAG == 0: filename_bytes = filename.encode('437') try: guessed_encoding = self._chardet.detect(filename_bytes)['encoding'] except ImportError: guessed_encoding = None guessed_encoding = guessed_encoding or 'cp1252' filename = filename_bytes.decode(guessed_encoding, 'replace') try: date = datetime(*info.date_time) except Exception as e: self.log_info(F'{e!s} - unable to determine date from tuple {info.date_time} for: {filename}') date = None yield self._pack(filename, date, xt) @classmethod def handles(cls, data: bytearray) -> Optional[bool]: return data.rfind(ZipEndOfCentralDirectory.SIGNATURE) > 0
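For illustration, a sketch of unpacking a password-protected archive; the file name and password are hypothetical. When no password is given, the unit first tries an empty password and then a list of common ones, as implemented above:
>>> from refinery.shell import *
>>> data = open('archive.zip', 'rb').read()  # hypothetical archive
>>> for item in data | xtzip(pwd='letmein'):
...     print(item)  # file contents; add list=True for names only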
class xtzpaq (*paths, index=False, pwd=b'', date=b'date', path=b'path', regex=False, exact=False, fuzzy=0, drop_path=False, join_path=False, list=False)
-
Extract files from a ZPAQ archive.
Expand source code Browse git
class xtzpaq(ArchiveUnit): """ Extract files from a ZPAQ archive. """ _MAGIC = B'\x37\x6B\x53\x74\xA0\x31\x83\xD3\x8C\xB2\x28\xB0\xD3\x7A\x50\x51' def __init__( self, *paths, index: Arg.Switch('-i', help='Archive is an index (no d-blocks).') = False, **more ): for _code, _size in { _TCU32: 4, _TCI32: 4, _TCU16: 2, _TCI16: 2, }.items(): _item_size = array(_code).itemsize if _item_size == _size: continue raise RuntimeError( F'Expected array type "{_code}" to have entries of size {_size}, but the API ' F'reports a size of {_item_size}.') super().__init__(*paths, index=index, **more) @classmethod def handles(cls, data: bytearray) -> Optional[bool]: return cls._MAGIC in data def unpack(self, archive: bytearray): def mkdate(date) -> datetime: date = int(date) year = date // 1000000 // 10000 month = date // 100000000 % 100 day = date // 1000000 % 100 hour = date // 10000 % 100 minute = date // 100 % 100 second = date % 100 return datetime(year, month, day, hour, minute, second, 0) @dataclass class DT: date: int = 0 attr: int = 0 name: str = "" frag: List[int] = field(default_factory=list) @property def dt(self) -> Optional[datetime]: if self.date > 0: return mkdate(self.date) # TODO: implement password-protected archives # key = self.args.pwd index = self.args.index bsize: Dict[int, int] = {} # frag ID -> d block compressed size dt: Dict[str, DT] = {} # filename -> date, attr, frags frag: List[bytes] = [] # ID -> hash[20] size[4] data csize = 0 # expected offset of next non d block streaming = False journaling = False done = False dc = Decompressor() src = dc.set_input(archive) while not done and dc.read_block(): while not done: filename = dc.read_filename() if filename is None: break self.log_info('reading file', filename) comment = dc.read_comment() jsize = 0 if len(comment) >= 4 and comment[-4:] == "jDC\x01": num = re.search('^\\d+', comment) if not num: raise RuntimeError('missing size in comment') jsize = int(num[0]) if streaming: raise RuntimeError('journaling block after streaming one') journaling = True self.log_info('archive type is journaling') else: if journaling: raise RuntimeError('streaming block after journaling one') if index: raise RuntimeError('streaming block in index') streaming = True self.log_info('archive type is streaming') # Test journaling filename. The format must be # jDC[YYYYMMDDHHMMSS][t][NNNNNNNNNN] # where YYYYMMDDHHMMSS is the date, t is the type {c,d,h,i}, and # NNNNNNNNNN is the 10 digit first fragment ID for types c,d,h. # They must be in ascending lexicographical order. 
frag_id = 0 block_type = None if journaling: if len(filename) != 28: raise RuntimeError('filename size not 28') if filename[:3] != 'jDC': raise RuntimeError('filename not jDC') block_type = filename[17] if block_type not in 'cdhi': raise RuntimeError('type not c,d,h,i') try: mkdate(filename[3:17]) except Exception as E: raise RuntimeError('invalid date') from E frag_id = int(filename[18:28]) if not 1 <= frag_id <= 4294967295: raise RuntimeError('fragment ID out of range') seg = MemoryFile(size_limit=jsize) dc.set_output(seg) sha1 = hashlib.sha1() dc.set_hasher(sha1) dc.decompress_data() if journaling and len(seg) != jsize: raise RuntimeError('incomplete output') checksum = dc.read_segment_end() if checksum is None: self.log_debug('no checksum') elif checksum != sha1.digest(): raise RuntimeError('SHA1 mismatch') # check csize at first non-d block if csize and block_type in 'chi': if csize != offset: raise RuntimeError(F'csize={csize} does not point to offset={offset}') csize = 0 # get csize from c block seglen = len(seg) seg = StructReader(seg.getbuffer()) if block_type == 'c': if seglen < 8: raise RuntimeError("c block too small") csize = seg.u64() offset = src.tell() + 1 self.log_debug(F'csize={csize} at offset={offset}') if csize >> 63: self.log_warn('incomplete transaction at end of archive') done = True elif index and csize != 0: raise RuntimeError('nonzero csize in index') # Set csize to expected offset of first non d block # assuming 1 more byte for unread end of block marker. csize += offset if block_type == 'd': if index: raise RuntimeError('d block in index') bsize[frag_id] = src.tell() + 1 - offset # compressed size self.log_debug(F' {bsize[frag_id]} -> {len(seg)}') # Test frag size list at end. The format is f[id..id+n-1] fid n # where fid may be id or 0. sizes must sum to the rest of block. if seglen < 8: raise RuntimeError('d block too small') seg.seekset(-8) fid = seg.u32() or frag_id n = seg.u32() if fid != frag_id: raise RuntimeError('missing ID') if n > (seglen - 8) // 4: raise RuntimeError('frag list too big') fragsum = 0 # computed sum of frag sizes seg.seekset(-4 * (n + 2)) for _ in range(n): fragsum += seg.u32() if fragsum + n * 4 + 8 != seglen: raise RuntimeError('bad frag size list') # Save frag hashes and sizes. For output, save data too. seg.seekset(fragsum) data = memoryview(seg.getbuffer()) assert seg.remaining_bytes == n * 4 + 8 for i in range(n): while len(frag) <= frag_id + i: frag.append(B'') if frag[frag_id + i]: raise RuntimeError('duplicate frag ID') f = seg.u32() h = hashlib.sha1(data[:f]).digest() frag[frag_id + i] = h + f.to_bytes(4, 'little') + data[:f] data = data[f:] assert len(data) == n * 4 + 8 assert seg.remaining_bytes == 8 # Test and save h block. Format is: bsize (sha1[20] size)... # where bsize is the compressed size of the d block with the same id, # and each size corresonds to a fragment in that block. The list # must match the list in the d block if present. 
if block_type == 'h': if seglen % 24 != 4: raise RuntimeError('bad h block size') b = seg.u32() self.log_debug(F'[{frag_id}..{frag_id + seglen // 24}[ {b}') fragsum = 0 # uncompressed size of all frags for i in range(seglen // 24): fd = seg.read(24) if index: while len(frag) <= frag_id + i: frag.append(B'') if frag[frag_id + i]: raise RuntimeError('data in index') frag[frag_id + i] = fd elif frag_id + i >= len(frag) or len(frag[frag_id + i]) < 24: raise RuntimeError('no matching d block') elif frag[frag_id + i][:24] != fd: raise RuntimeError('frag size or hash mismatch') fragsum += int.from_bytes(fd[20:24], 'little') # Test i blocks and save files to extract. Format is: # date filename 0 na attr[0..na) ni ptr[0..ni) (to update) # 0 filename (to delete) # Date is 64 bits in YYYYMMDDHHMMSS format. if block_type == 'i': while not seg.eof: f = DT(seg.u64()) f.name = seg.read_c_string('utf8') if f.date > 0: na = seg.u32() if na > 65535: raise ValueError('attr size > 65535') f.attr = seg.read_integer(na * 8) ni = seg.u32() for i in range(ni): a = seg.u32() f.frag.append(a) if index: continue elif not 1 <= a < len(frag): raise RuntimeError('frag ID out of range') elif not frag[a]: raise LookupError('missing frag data') dt[f.name] = f if streaming: yield self._pack(filename, None, seg.getvalue()) offset = src.tell() self.log_debug(F'{offset} bytes of archive tested') if not journaling: return for name, f in dt.items(): if not f.date: continue size = sum( int.from_bytes(frag[fp][20:24], 'little') for fp in f.frag if 0 < fp < len(frag) and len(frag[fp]) >= 24 ) out = MemoryFile() for fp in f.frag: if fp < len(frag): out.write(memoryview(frag[fp])[24:]) if len(out) != size: self.log_warn('invalid size during unpacking') yield self._pack(name, f.dt, out.getvalue())
class xxh (seed=0, text=False)
-
Implements the xxHash hashing algorithm.
class xxh(HashUnit):
    """
    Implements the xxHash hashing algorithm.
    """
    def __init__(
        self,
        seed: HashUnit.Arg.Number(metavar='seed', help='specify the seed value; the default is {default}') = 0,
        text=False
    ):
        super().__init__(text, seed=seed)

    def _algorithm(self, data):
        return xxhash(data, self.args.seed)
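As with every unit in this module, xxh accepts command-line style string arguments. A minimal sketch of shell-style usage, assuming the hex unit in reverse mode for encoding; the seed value is arbitrary and the resulting digest is not shown:
>>> from refinery.shell import *
>>> digest = emit('binary refinery') | xxh('0x1337') | hex('-R') | str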
class xxtea (key, iv=b'', padding=None, mode=None, raw=False, swap=False, block_size=1)
-
XXTEA encryption and decryption.
class xxtea(TEAUnit, cipher=BlockCipherFactory(XXTEA)):

    block_size: int = 8

    def __init__(
        self, key, iv=b'', padding=None, mode=None, raw=False, swap=False,
        block_size: Arg.Number('-b', help=(
            'Cipher block size in 32-bit words. The default value {default} implies that the input '
            'is treated as a single block, which is common behaviour of many implementations.')) = 1
    ):
        super().__init__(key, iv, padding, mode, raw, swap=swap, block_size=block_size)

    def _prepare_block(self, data: bytes):
        if self.args.block_size < 2:
            blocks, remainder = divmod(len(data), 4)
            if remainder:
                blocks += 1
            self.block_size = blocks * 4
        else:
            self.block_size = self.args.block_size * 4

    def encrypt(self, data: bytes) -> bytes:
        self._prepare_block(data)
        return super().encrypt(data)

    def decrypt(self, data: bytes) -> bytes:
        self._prepare_block(data)
        return super().decrypt(data)

    def _new_cipher(self, **optionals) -> CipherInterface:
        return super()._new_cipher(block_size=self.block_size, **optionals)
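With the default block size of 1, _prepare_block rounds the input length up to whole 32-bit words and treats the entire buffer as a single XXTEA block, matching many reference implementations; with -b N, data is processed in blocks of N words instead. A round-trip sketch in shell style, where the key value is made up and -R selects the reverse operation of the unit, i.e. encryption:
>>> from refinery.shell import *
>>> key = 'sixteenbytekey16'  # hypothetical 128-bit key
>>> emit('attack at dawn') | xxtea(key, '-R') | xxtea(key) | str
'attack at dawn'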
class zl (level=9, window=15, zlib_header=False, gzip_header=False)
-
ZLib compression and decompression.
class zl(Unit):
    """
    ZLib compression and decompression.
    """
    def __init__(
        self,
        level      : Arg.Number('-l', bound=(0, 0X9), help='Specify a compression level between 0 and 9.') = 9,
        window     : Arg.Number('-w', bound=(8, 0XF), help='Manually specify the window size between 8 and 15.') = 15,
        zlib_header: Arg.Switch('-z', group='MODE', help='Use a ZLIB header.') = False,
        gzip_header: Arg.Switch('-g', group='MODE', help='Use a GZIP header.') = False
    ):
        if zlib_header and gzip_header:
            raise ValueError('You can only specify one header type (ZLIB or GZIP).')
        return super().__init__(level=level, window=window, zlib_header=zlib_header, gzip_header=gzip_header)

    def _decompress_data(self, data, mode: int, step: int):
        zl = zlib.decompressobj(mode)
        memory = memoryview(data)
        result = bytearray()
        while not zl.eof:
            read = min(step, len(memory))
            try:
                chunk = zl.decompress(memory[:read])
            except zlib.error as e:
                raise RefineryPartialResult(exception_to_string(e), result) from e
            else:
                result.extend(chunk)
            consumed = read - len(zl.unused_data)
            if not memory or consumed == 0:
                break
            memory = memory[consumed:]
        return result, memory

    def process(self, data):
        if data[0] == 0x78 or data[0:2] == B'\x1F\x8B' or self.args.zlib_header or self.args.gzip_header:
            modes = [self.args.window | 0x20, -self.args.window]
        else:
            modes = [-self.args.window, self.args.window | 0x20]
        modes.extend([0x10 | self.args.window, 0])
        view = memoryview(data)
        step = 32 if self.leniency > 0 else len(data)
        for k in itertools.count(1):
            error = None
            rest = view
            for mode in modes:
                try:
                    out, rest = self._decompress_data(view, mode, step)
                except Exception as e:
                    error = error or e
                else:
                    self.log_info(F'used mode {mode} to decompress chunk {k}')
                    yield out
                    error = None
                    break
            if error:
                raise error
            if not rest:
                break
            if len(rest) == len(view):
                break
            if len(rest) > len(view):
                raise RuntimeError('Decompressor returned more tail data than input data.')
            view = rest
        if k <= 0:
            raise ValueError('Could not detect any zlib stream.')

    def reverse(self, data):
        mode = -self.args.window
        if self.args.zlib_header:
            mode = -mode
        if self.args.gzip_header:
            mode = -mode | 0x10
        self.log_debug(F'using mode {mode:+2d} for compression')
        zl = zlib.compressobj(self.args.level, zlib.DEFLATED, mode)
        zz = zl.compress(data)
        return zz + zl.flush(zlib.Z_FINISH)

    @classmethod
    def handles(cls, data: bytearray):
        for sig in (
            B'\x1F\x8B',  # gzip header
            B'\x78\x01',  # zlib low compression
            B'\x78\x9C',  # zlib medium compression
            B'\x78\xDA',  # zlib high compression
        ):
            if data[:2] == sig:
                return True
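The mode values tried by process are standard zlib wbits parameters: a positive window expects a ZLIB header, a negative window decodes a raw DEFLATE stream, window | 0x10 expects a GZIP header, and window | 0x20 auto-detects either header. A minimal sketch using only the standard library illustrates the convention:
import zlib

data = b'binary refinery'
raw = zlib.compress(data)[2:-4]  # strip 2-byte zlib header and 4-byte adler32 checksum
assert zlib.decompress(raw, wbits=-15) == data                        # raw deflate
assert zlib.decompress(zlib.compress(data), wbits=15 | 0x20) == data  # auto-detect header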
class zstd
-
ZStandard (ZSTD) compression and decompression.
class zstd(Unit):
    """
    ZStandard (ZSTD) compression and decompression.
    """
    @Unit.Requires('pyzstd', 'all')
    def _pyzstd():
        import pyzstd
        return pyzstd

    def process(self, data):
        return self._pyzstd.ZstdDecompressor().decompress(data)

    def reverse(self, data):
        zc = self._pyzstd.ZstdCompressor()
        return zc.compress(data) + zc.flush()
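Note that pyzstd's streaming compressor buffers internally: compress may return only part of the output and flush returns the remainder, so both return values must be concatenated to obtain a complete stream. A minimal round-trip sketch, assuming pyzstd is installed:
import pyzstd

data = b'binary refinery'
zc = pyzstd.ZstdCompressor()
blob = zc.compress(data) + zc.flush()  # keep both output chunks
assert pyzstd.ZstdDecompressor().decompress(blob) == data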