Module `refinery.shell`

Shell-Like Unit Interface

Any unit from the refinery module can also be imported from this module. When imported from here, the units are initialized differently: They can be given string arguments as they would receive on the command line. For example:

>>> from refinery.shell import *
>>> emit('ABC', 'DEF') [ pop('t') | xor('var:t') | pack('-R') ] | str
'575'

This especially gives easier access to the powerful refinery.lib.meta variables and the entire multibin format expressions, see refinery.lib.argformats.

Expand source code Browse git

"""
# Shell-Like Unit Interface

Any unit from the `refinery` module can also be imported from this module. When imported from here,
the units are initialized differently: They can be given string arguments as they would receive on
the command line. For example:

    >>> from refinery.shell import *
    >>> emit('ABC', 'DEF') [ pop('t') | xor('var:t') | pack('-R') ] | str
    '575'

This especially gives easier access to the powerful `refinery.lib.meta` variables and the entire
multibin format expressions, see `refinery.lib.argformats`.
"""
from functools import wraps
from refinery import __unit_loader__

with __unit_loader__:
    __all__ = sorted(__unit_loader__.units, key=lambda x: x.lower())


class __pdoc2__:
    def __class_getitem__(cls, *_):
        return ''


def __getattr__(name):
    with __unit_loader__:
        unit = __unit_loader__.resolve(name)

    if unit is None:
        raise AttributeError(name)

    class _unit(unit):
        def __new__(cls, *args, **kwargs):
            return unit.assemble(*args, **kwargs)

    return wraps(unit, updated=[])(_unit)


def __dir__():
    return __all__

Units

class a3x (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

Extracts embedded resources from compiled AutoIt scripts and decompiles the embedded script bytecode. The unit also works on compiled AutoIt executables.

Expand source code Browse git

class a3x(PathExtractorUnit):
    """
    Extracts embedded resources from compiled AutoIt scripts and decompiles the embedded script
    bytecode. The unit also works on compiled AutoIt executables.
    """

    def unpack(self, data: bytearray):
        view = memoryview(data)
        cursor = 0
        errors: Dict[int, Exception] = {}
        script_count = 0
        truncated: Set[A3xRecord] = set()
        intact: Set[A3xRecord] = set()

        def _package(records: Iterable[A3xRecord]) -> Generator[UnpackResult, None, None]:
            for k, record in enumerate(records, 1):
                self.log_info(F'record {k} type:', record.type)
                self.log_info(F'record {k} path:', record.src_path)
                if record.path is None:
                    continue
                yield UnpackResult(
                    record.path,
                    record.extract,
                    srcpath=record.src_path,
                    created=record.created.isoformat(' ', 'seconds'),
                    written=record.written.isoformat(' ', 'seconds'),
                )

        while cursor < len(view):
            self.log_debug(F'searching at offset 0x{cursor:08X}')
            nc = data.find(A3xScript.MAGIC, cursor)
            if nc >= 0:
                cursor = nc
            else:
                rp = data.find(A3xRecord.MAGIC, cursor) - A3xScript.WIDTH
                if rp <= cursor:
                    break
                cursor = rp
            try:
                script = A3xScript(view[cursor:])
            except Exception as E:
                errors[cursor] = E
                cursor += 1
                continue
            else:
                valid = script.has_valid_magic()
                if valid:
                    _m = 'correct'
                else:
                    _m = 'invalid'
                if not script.body:
                    cursor += A3xScript.WIDTH
                    if not script.has_valid_magic():
                        cursor += len(A3xRecord.MAGIC)
                    continue
                if script.truncated:
                    _a = 'truncated'
                    truncated.update(script.body)
                else:
                    script_count += 1
                    _a = 'intact'
                    intact.update(script.body)
                self.log_info(
                    F'{_a} script of type', script.type,
                    F'and length 0x{len(script):08X}',
                    F'with {len(script.body)} records and {_m} magic:',
                    script.magic
                )
                cursor += len(script)
                if script.truncated:
                    if not script.has_valid_magic():
                        cursor += len(A3xRecord.MAGIC)
                    continue

            yield from _package(script.body)

        remaining = truncated - intact
        if remaining:
            self.log_warn('emitting records from truncated scripts')
            yield from _package(remaining)
            return
        elif truncated:
            self.log_debug('good news: intact scripts contained all records from truncated scripts')
        if script_count == 0:
            error = None
            for offset, error in errors.items():
                self.log_warn(F'error at offset 0x{offset:08X}:', error)
            if error:
                raise error

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        return A3xScript.MAGIC in data or A3xRecord.MAGIC in data

class a85

Ascii85 encoding and decoding, the predecessor variant of Base85 with a different alphabet.

Expand source code Browse git

class a85(Unit):
    """
    Ascii85 encoding and decoding, the predecessor variant of Base85 with a different alphabet.
    """
    def reverse(self, data):
        return base64.a85encode(data)

    def process(self, data):
        if re.search(BR'\s', data) is not None:
            data = re.sub(BR'\s+', B'', data)
        return base64.a85decode(data)

    @classmethod
    def handles(self, data: bytearray):
        from refinery.lib.patterns import formats
        return formats.spaced_a85.value.fullmatch(data)

class add (*argument, bigendian=False, blocksize=None)

Add the given argument to each block.

Expand source code Browse git

class add(BinaryOperationWithAutoBlockAdjustment):
    """
    Add the given argument to each block.
    """
    @staticmethod
    def operate(a, b): return a + b
    @staticmethod
    def inplace(a, b): a += b

class adler32 (reps=1, text=False)

Returns the Adler32 hash of the input data.

Expand source code Browse git

class adler32(HashUnit):
    """
    Returns the Adler32 hash of the input data.
    """
    def _algorithm(self, data: bytes) -> bytes:
        return zlib.adler32(data).to_bytes(4, 'big')

class aes (key, *, iv=b'', padding=None, mode=None, raw=False, little_endian=False, segment_size=0, tag=None, aad=None)

AES encryption and decryption.

Expand source code Browse git

class aes(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(AES)):
    """
    AES encryption and decryption.
    """
    pass

class alu (operator, *argument, seed=0, prologue=None, epilogue=None, inc=False, dec=False, cbc=False, ctr=False, bigendian=False, blocksize=None, precision=None)

The arithmetic-logical unit. It allows you to specify a custom Python expression where the following variables are allowed:

the variable A: same as V[0]
the variable B: current block
the variable E: block value of encoded input (not changed after update)
the variable N: number of bytes in the input
the variable K: current index in the input
the variable S: the internal state value
the variable V: the vector of arguments
the variable I: function that casts to a signed int in current precision
the variable U: function that casts to unsigned int in current precision
the variable R: function; R(x,4) rotates x by 4 to the right
the variable L: function; L(x,4) rotates x by 4 to the left
the variable M: function; M(x,8) picks the lower 8 bits of x
the variable X: function that negates the bits of the input

(The rotation operations are interpreted as shifts when arbitrary precision is used.)

Each block of the input is replaced by the value of this expression. Additionally, it is possible to specify prologue and epilogue expressions which are used to update the state variable S before and after the update of each block, respectively.

Expand source code Browse git

class alu(ArithmeticUnit):
    """
    The arithmetic-logical unit. It allows you to specify a custom Python expression where the following
    variables are allowed:

    - the variable `A`: same as `V[0]`
    - the variable `B`: current block
    - the variable `E`: block value of encoded input (not changed after update)
    - the variable `N`: number of bytes in the input
    - the variable `K`: current index in the input
    - the variable `S`: the internal state value
    - the variable `V`: the vector of arguments
    - the variable `I`: function that casts to a signed int in current precision
    - the variable `U`: function that casts to unsigned int in current precision
    - the variable `R`: function; `R(x,4)` rotates x by 4 to the right
    - the variable `L`: function; `L(x,4)` rotates x by 4 to the left
    - the variable `M`: function; `M(x,8)` picks the lower 8 bits of x
    - the variable `X`: function that negates the bits of the input

    (The rotation operations are interpreted as shifts when arbitrary precision is used.)

    Each block of the input is replaced by the value of this expression. Additionally, it is possible to
    specify prologue and epilogue expressions which are used to update the state variable `S` before and
    after the update of each block, respectively.
    """

    @staticmethod
    def _parse_op(definition, default=None):
        definition = definition or default
        if not definition:
            raise ValueError('No definition given')
        return definition

    def __init__(
        self, operator: Arg(type=str, help='A Python expression defining the operation.'), *argument,
        seed: Arg('-s', type=str, help=(
            'Optional seed value for the state variable S. The default is zero. This can be an expression '
            'involving the variable N.')) = 0,
        prologue: Arg('-p', type=str, metavar='E', help=(
            'Optional expression with which the state variable S is updated before a block is operated on.')) = None,
        epilogue: Arg('-e', type=str, metavar='E', group='EPI', help=(
            'Optional expression with which the state variable S is updated after a block was operated on.')) = None,
        inc: Arg('-I', group='EPI', help='equivalent to --epilogue=S+1') = False,
        dec: Arg('-D', group='EPI', help='equivalent to --epilogue=S-1') = False,
        cbc: Arg('-X', group='EPI', help='equivalent to --epilogue=(B)') = False,
        ctr: Arg('-T', group='EPI', help='equivalent to --epilogue=S+B') = False,
        bigendian=False, blocksize=None, precision=None
    ):
        for flag, flag_is_set, expression in [
            ('--cbc', cbc, '(B)'),
            ('--inc', inc, 'S+1'),
            ('--dec', dec, 'S-1'),
            ('--ctr', ctr, 'S+B'),
        ]:
            if flag_is_set:
                if epilogue is not None:
                    raise ValueError(
                        F'Ambiguous specification; epilogue was already set to {epilogue} '
                        F'when {flag} was parsed.'
                    )
                epilogue = expression

        self._index = IndexCounter()

        super().__init__(
            self._index,
            *argument,
            bigendian=bigendian,
            blocksize=blocksize,
            precision=precision,
            seed=seed,
            operator=self._parse_op(operator),
            prologue=self._parse_op(prologue, 'S'),
            epilogue=self._parse_op(epilogue, 'S'),
        )

    @property
    def _is_ecb(self):
        return not self.args.epilogue and not self.args.prologue

    def _fastblock(self, _):
        raise FastBlockError

    def process(self, data):
        context = dict(metavars(data))
        seed = self.args.seed
        fbits = self.fbits
        fmask = self.fmask

        self._index.init(self.fmask)

        def _expression(definition: str):
            return PythonExpression(definition, *'IBEASMNVRLX', all_variables_allowed=True, mask=fmask)

        prologue = _expression(self.args.prologue).expression
        epilogue = _expression(self.args.epilogue).expression
        operator = _expression(self.args.operator).expression

        def cast_unsigned(n) -> int:
            return int(n) & fmask

        def cast_signed(n) -> int:
            n = int(n) & fmask
            if n >> (fbits - 1):
                return -((~n + 1) & fmask)
            else:
                return n

        if fbits is INF:
            def rotate_r(n, k): return n >> k
            def rotate_l(n, k): return n << k
        else:
            def rotate_r(n, k): return (n >> k) | (n << (fbits - k)) & fmask
            def rotate_l(n, k): return (n << k) | (n >> (fbits - k)) & fmask

        def negate_bits(n):
            return n ^ fmask

        def mask_to_bits(x, b):
            return x & ((1 << b) - 1)

        context.update(
            N=len(data),
            I=cast_signed,
            U=cast_unsigned,
            R=rotate_r,
            L=rotate_l,
            X=negate_bits,
            M=mask_to_bits,
        )
        args = [
            self._infinitize_argument(*self._argument_parse_hook(a))
            for a in self.args.argument]
        if args:
            args = [next(iter(a)) for a in args]
            context['A'] = args[0]
            context['V'] = args

        if isinstance(seed, str):
            seed = PythonExpression(seed, 'IAMNVRLX', constants=context, mask=fmask)
        if callable(seed):
            seed = seed(context, N=len(data))

        self._index.init(self.fmask)
        context.update(S=seed)

        def operate(block, index, *args):
            context.update(K=index, B=block, E=block, V=args)
            if args:
                context['A'] = args[0]
            context['S'] = eval(prologue, None, context)
            context['B'] = eval(operator, None, context)
            context['S'] = eval(epilogue, None, context)
            return context['B']

        placeholder = self.operate
        self.operate = operate

        try:
            result = super().process(data)
        finally:
            self.operate = placeholder

        return result

    @staticmethod
    def operate(block, index, *args):
        raise RuntimeError('This operate method cannot be called.')

    def inplace(self, block, *args) -> None:
        super().inplace(block, *args)

class aplib

APLib compression and decompression.

Expand source code Browse git

class aplib(Unit):
    """
    APLib compression and decompression.
    """

    def reverse(self, buf):
        return compressor(buf).compress()

    def process(self, buf):
        view = memoryview(buf)
        size = 0
        if view[:4] == B'AP32':
            size = int.from_bytes(buf[4:8], 'little')
            if size > 0x80:
                size = 0
            else:
                self.log_info(F'detected aPLib header of size {size}')
        return decompressor(view[size:]).decompress()

    @classmethod
    def handles(self, data: bytearray):
        if data[:4] == B'AP32':
            return True
        return None

class argon2id (size, salt, iter=1, jobs=1, cost=None, skey=None, more=None)

Implements Argon2id-based key derivation.

Expand source code Browse git

class argon2id(Unit):
    """
    Implements Argon2id-based key derivation.
    """

    def __init__(
        self,
        size: Arg.Number(metavar='n', help='number of bytes to generate'),
        salt: Arg.Binary(metavar='S', help='salt bytes'),
        iter: Arg.Number(metavar='t', help='number of iterations, defaults to {default}') = 1,
        jobs: Arg.Number(metavar='p', help='parallelism, defaults to {default}') = 1,
        cost: Arg.Number(metavar='m', help='memory cost in kibibytes, defaults to the minimum of 8192 per job.') = None,
        skey: Arg.Binary(metavar='K', help='optional secret key') = None,
        more: Arg.Binary(metavar='X', help='optional additional data') = None,
    ):
        super().__init__(size=size, salt=salt, iter=iter, skey=skey, jobs=jobs, cost=cost, more=more)

    @Unit.Requires('cryptography', 'default', 'extended')
    def _argon2id():
        from cryptography.hazmat.primitives.kdf.argon2 import Argon2id
        return Argon2id

    def process(self, data):
        m = self.args.cost
        p = self.args.jobs
        S = self.args.salt
        K = self.args.skey
        n = self.args.size
        X = self.args.more
        t = self.args.iter
        K = K and bytes(K) or None
        X = X and bytes(X) or None
        S = bytes(S)
        m = m or 8192 * p
        a2id = self._argon2id(
            salt=S, length=n, iterations=t, lanes=p, memory_cost=m, ad=X, secret=K)
        return a2id.derive(data)

class asm (mode='x32', *, count=None, until=None, no_address=False, no_hexdump=False)

Disassembles the input data using capstone and produces a human-readable disassembly listing. It internally uses the opc unit for this, which is an alternative option if you are looking for more programmatic disassembly.

Expand source code Browse git

class asm(opc):
    """
    Disassembles the input data using capstone and produces a human-readable disassembly listing.
    It internally uses the `refinery.opc` unit for this, which is an alternative option if you are
    looking for more programmatic disassembly.
    """
    def __init__(
        self, mode='x32', *, count=None, until=None,
        no_address: Arg.Switch('-A', help='Disable address display.') = False,
        no_hexdump: Arg.Switch('-H', help='Disable opcodes hexdump.') = False,
    ):
        super().__init__(
            mode=mode,
            nvar='_name',
            avar='_addr',
            ovar='_arg',
            count=count,
            until=until,
            no_address=no_address,
            no_hexdump=no_hexdump,
        )

    def process(self, data):
        insns = list(super().process(data))
        if not insns:
            return

        no_address = self.args.no_address
        no_hexdump = self.args.no_hexdump

        def _hl(x): return len(hex(x))

        args_width = max(len(insn['_args']) for insn in insns)
        memo_width = max(len(insn['_name']) for insn in insns)
        addr_width = max(_hl(insn['_addr']) for insn in insns)

        if no_address:
            addr_width = 0
            memo_width = memo_width + 2

        max_data_bytes_count = max(len(c) for c in insns)

        padding = addr_width + memo_width + args_width + 10
        metrics_opc = HexDumpMetrics(max_data_bytes_count, padding=padding)

        for insn in insns:
            hd = one(hexdump(insn, metrics_opc))
            name = insn.meta.pop('_name')
            args = insn.meta.pop('_args')
            addr = insn.meta.pop('_addr')
            msg = F' {name:<{memo_width}}  {args:<{args_width}}'
            if not no_hexdump:
                msg = F'{msg}  ; {hd}'
            if not no_address:
                msg = F'{addr:0{addr_width}X}: {msg}'
            yield msg.encode(self.codec)

class atbash

https://en.wikipedia.org/wiki/Atbash Atbash encoding and decoding. Fairly useless in the 21st century, except for picking out crypto nerds.

Expand source code Browse git

class atbash(Unit):
    """
    https://en.wikipedia.org/wiki/Atbash
    Atbash encoding and decoding. Fairly useless in the 21st century, except
    for picking out crypto nerds.
    """

    def process(self, data: bytearray):
        uc = range(B'A'[0], B'Z'[0] + 1)
        lc = range(B'a'[0], B'z'[0] + 1)
        for k, letter in enumerate(data):
            if letter in uc:
                data[k] = uc[~uc.index(letter)]
                continue
            if letter in lc:
                data[k] = lc[~lc.index(letter)]
                continue
        return data

    reverse = process

class autoxor (range=slice(1, 32, None), plaintext=None, searchpos=slice(0, None, None), alph=False, crib=False, freq=False)

Assumes input that was encrypted with a polyalphabetic block cipher, like XOR-ing each byte with successive bytes from a key or by subtracting the respective key byte value from each input byte. It uses the xkey unit to attack the cipher and attempts to recover the plaintext automatically.

The unit expects encrypted input which was encrypted byte-wise with a polyalphabetic key. For both bit-wise and byte-wise addition, it can attempt do determine this key by three methods:

Known plaintext cribs: The unit contains a library of file signatures that are expected to occur at specific offsets. It uses these to attempt a known-plaintext attack against the input. If a key is found that is at most half the size of such a crib, it is returned.
Known alphabets: For each given key length, the input is split into slices that would have been encrypted with a single byte for keys of that length. Each such slice undergoes a character frequency analysis. If the histogram indicates that an alphabet of a small size was used (i.e. base64), then the unit attempts to determine the key based on this.
Known high frequency glyph: Works if the plaintext contains one letter that occurs with very high frequency, i.e. zero padding in PE or ELF files, and the space character in text. Based on this assumption, the unit computes the most likely key. This method will work best on uncompressed files that were encrypted with a short key.

When no option is set, the unit uses all the above methods by default. When at least one of the methods is selected, it will attempt only selected methods. When a custom plaintext is given, the other methods are disabled by default.

Expand source code Browse git

class autoxor(xkey, docs='{0}{p}{1}'):
    """
    Assumes input that was encrypted with a polyalphabetic block cipher, like XOR-ing each byte
    with successive bytes from a key or by subtracting the respective key byte value from each
    input byte. It uses the `refinery.xkey` unit to attack the cipher and attempts to recover the
    plaintext automatically.
    """
    def process(self, data: bytearray):
        fallback: tuple[str, bytes, bytearray, bool] | None = None

        try:
            result = next(self._attack(data))
        except StopIteration:
            pass
        else:
            key = result.key
            names = []

            if result.xor is not False:
                names.append('xor')
            if result.xor is not True:
                names.append('sub')

            for name in names:

                unit = get_entry_point(name)
                bin = data | unit(key) | bytearray
                space = B'\0' | unit(0x20) | bytes

                for k in range(4):
                    b = bin[k:k + 0x1000]
                    m = magic(b)
                    if not m.blob:
                        self.log_info(F'method {name} resulted in non-blob data ({m.mime}) at offset {k}; returning buffer')
                        return self.labelled(bin, key=key, method=name)

                if fallback is None:
                    fallback = name, key, bin, m.blob

                if not any(bin):
                    continue

                as_text = bin | unit(space) | bytearray

                try:
                    decoded = as_text.decode('utf8')
                except UnicodeDecodeError:
                    is_text = False
                else:
                    import re
                    is_text = bool(re.fullmatch(r'[\s\w!-~]+', decoded))

                if is_text:
                    self.log_info('detected likely text input; automatically shifting towards space character')
                    key = (b'\x20' * len(key)) | unit(key) | bytes
                    return self.labelled(as_text, key=key, method=name)

        if fallback is None:
            self.log_warn('No key was found; returning original data.')
            return data
        else:
            name, key, bin, is_blob = fallback
            if is_blob and result.how == self._rt.freq:
                self.log_warn('unrecognized format and no confirmed crib; the output is likely junk')
            return self.labelled(bin, key=key)

class b2f

Short for "back to front". This unit is a shortcut for pick with argument ::-1: It will reorder the chunks in the current frame in reverse order.

Expand source code Browse git

class b2f(pick):
    """
    Short for "back to front". This unit is a shortcut for `refinery.pick` with argument `::-1`:
    It will reorder the chunks in the current frame in reverse order.
    """
    def __init__(self):
        super().__init__(slice(None, None, -1))

class b32

Base32 encoding and decoding.

Expand source code Browse git

class b32(Unit):
    """
    Base32 encoding and decoding.
    """
    def reverse(self, data):
        return base64.b32encode(data)

    def process(self, data: bytearray):
        before_padding = 0
        for before_padding in range(len(data), 0, -1):
            if data[before_padding - 1:before_padding] != B'=':
                break
        padding_size = -before_padding % 8
        missing = before_padding + padding_size - len(data)
        if missing > 0:
            self.log_info(F'detected incorrect padding: added {missing} padding characters')
            data.extend(B'=' * missing)
        if missing < 0:
            self.log_info(F'detected incorrect padding: removed {-missing} padding characters')
            data[padding_size + before_padding:] = []
        return base64.b32decode(data, casefold=True)

    @classmethod
    def handles(cls, data):
        from refinery.lib.patterns import formats
        if not formats.b32.value.fullmatch(data):
            return False
        return not formats.hex.value.fullmatch(data)

class b58

Base58 encoding and decoding. It is famously used as an encoding in Bitcoin addresses because the alphabet omits digits and letters that look similar.

Expand source code Browse git

class b58(base):
    """
    Base58 encoding and decoding. It is famously used as an encoding in Bitcoin addresses
    because the alphabet omits digits and letters that look similar.
    """
    def __init__(self):
        super().__init__(b'123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz')

    @classmethod
    def handles(cls, data):
        from refinery.lib.patterns import formats
        return (
            formats.b58.value.fullmatch(data)
            and not formats.hex.value.fullmatch(data)
            and not formats.b32.value.fullmatch(data)
        )

class b62

Base62 encoding and decoding.

Expand source code Browse git

class b62(base):
    """
    Base62 encoding and decoding.
    """
    def __init__(self):
        super().__init__(b'0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz')

    @classmethod
    def handles(cls, data):
        from refinery.lib.patterns import formats
        return (
            formats.b62.value.fullmatch(data)
            and not formats.hex.value.fullmatch(data)
            and not formats.b32.value.fullmatch(data)
        )

class b64 (urlsafe=False)

Base64 encoding and decoding.

Expand source code Browse git

class b64(Unit):
    """
    Base64 encoding and decoding.
    """
    def __init__(self, urlsafe: Arg.Switch('-u', help='use URL-safe alphabet') = False):
        super().__init__(urlsafe=urlsafe)

    def reverse(self, data):
        altchars = None
        if self.args.urlsafe:
            altchars = B'-_'
        return base64.b64encode(data, altchars=altchars)

    def process(self, data: bytearray):
        if not data:
            return data
        if len(data) == 1:
            raise ValueError('single byte can not be base64-decoded.')
        data.extend(B'===')
        altchars = None
        if (B'-' in data or B'_' in data) and (B'+' not in data and B'/' not in data) or self.args.urlsafe:
            altchars = B'-_'
        return base64.b64decode(data, altchars=altchars)

    @Unit.Requires('numpy', 'speed', 'default', 'extended')
    def _numpy():
        import numpy
        return numpy

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        from refinery.lib.patterns import formats
        if not formats.spaced_b64.value.fullmatch(data):
            return False
        try:
            np = cls._numpy
        except ImportError:
            histogram = set()
            lcase_count = 0
            ucase_count = 0
            digit_count = 0
            other_count = 0
            total_count = len(data)
            for byte in data:
                histogram.add(byte)
                if len(histogram) > 60:
                    return True
                elif byte in range(0x61, 0x7B):
                    lcase_count += 1
                elif byte in range(0x41, 0x5B):
                    ucase_count += 1
                elif byte in range(0x30, 0x40):
                    digit_count += 1
                elif byte in B'\v\f\t\r\n\x20':
                    total_count -= 1
                else:
                    other_count += 1
        else:
            hist = np.histogram(
                np.frombuffer(memoryview(data), np.uint8), range(0x101))[0]
            lcase_count = sum(hist[k] for k in range(0x61, 0x7B))
            ucase_count = sum(hist[k] for k in range(0x41, 0x5B))
            digit_count = sum(hist[k] for k in range(0x30, 0x40))
            space_count = sum(hist[k] for k in B'\v\f\t\r\n\x20')
            total_count = len(data) - space_count
            other_count = total_count - (digit_count + ucase_count + lcase_count)

        if any(c < total_count // 64 for c in (lcase_count, ucase_count, digit_count)):
            return False
        if other_count * 2 > total_count:
            return False

        return True

class b65536

Base65536 encoding and decoding. A relatively esoteric encoding scheme utilizing the UTF-16 / UTF-32 character set.

Expand source code Browse git

class b65536(Unit):
    """
    Base65536 encoding and decoding.
    A relatively esoteric encoding scheme utilizing the UTF-16 / UTF-32 character set.
    """
    def reverse(self, data):
        if not data:
            return B''

        output = MemoryFile()
        length = len(data)
        for x in range(0, length, 2):
            b1 = data[x]
            b2 = data[x + 1] if x + 1 < length else -1
            code_point = _BLOCK_START[b2] + b1
            output.write(chr(code_point).encode())
        return output.getvalue()

    def process(self, data):
        if not data:
            return B''

        done = False
        output = MemoryFile()
        for ch in data.decode():
            code_point = ord(ch)
            b1 = code_point & ((1 << 8) - 1)
            try:
                b2 = _B2[code_point - b1]
            except KeyError:
                self.log_info('Invalid base65536 code point: %d, skipping' % code_point)
                continue
            b = b1.to_bytes(1, "little") if b2 == -1 else b1.to_bytes(1, "little") + b2.to_bytes(1, "little")
            if len(b) == 1:
                if done:
                    raise ValueError('base65536 sequence continued after final byte')
                done = True
            output.write(b)
        return output.getvalue()

class b85

Base85 encoding and decoding.

Expand source code Browse git

class b85(Unit):
    """
    Base85 encoding and decoding.
    """
    def reverse(self, data):
        return base64.b85encode(data)

    def process(self, data):
        if re.search(BR'\s', data) is not None:
            data = re.sub(BR'\s+', B'', data)
        return base64.b85decode(data)

    @classmethod
    def handles(self, data: bytearray):
        from refinery.lib.patterns import formats
        return formats.spaced_b85.value.fullmatch(data)

class b92

Base92 encoding and decoding.

Expand source code Browse git

class b92(Unit):
    """
    Base92 encoding and decoding.
    """
    def reverse(self, data):
        if not data:
            return B'~'

        reader = StructReader(data, bigendian=True)
        output = MemoryFile()
        while reader.remaining_bits > 0:
            try:
                block = reader.read_integer(13)
            except EOFError:
                count = reader.remaining_bits
                block = reader.read_integer(count)
                self.log_debug(F'reading {count} remaining bits: {block:0{count}b}')
                shift = 6 - count
                if shift >= 0:
                    block <<= shift
                    self.log_debug(F'encoding block: {block:06b}')
                    output.write_byte(_B92_ALPHABET[block])
                    break
                block <<= 13 - count
            self.log_debug(F'encoding block: {block:013b}')
            hi, lo = divmod(block, 91)
            output.write_byte(_B92_ALPHABET[hi])
            output.write_byte(_B92_ALPHABET[lo])
        return output.getvalue()

    def process(self, data):
        if data == B'~':
            return B''

        output = MemoryFile()
        buffer = 0
        length = 0

        view = memoryview(data)
        q, r = divmod(len(view), 2)

        if r > 0:
            bits = 6
            tail = _B92_DECODING[data[~0]]
        else:
            bits = 13
            tail = _B92_DECODING[data[~1]] * 91 + _B92_DECODING[data[~0]]
            view = view[:(q - 1) * 2]

        it = iter(view)

        for a, b in zip(it, it):
            block = _B92_DECODING[a] * 91 + _B92_DECODING[b]
            assert length < 8
            buffer <<= 13
            buffer |= block
            length += 13
            size, length = divmod(length, 8)
            assert size > 0
            output.write((buffer >> length).to_bytes(size, 'big'))
            buffer &= (1 << length) - 1

        missing = 8 - length
        shift = bits - missing

        if shift < 8:
            bytecount = 1
        else:
            bytecount = 2
            shift -= 8
            missing += 8

        if shift < 0:
            raise RefineryPartialResult(
                F'Invalid padding, missing {-shift} bits.',
                output.getvalue())

        buffer <<= missing
        buffer |= tail >> shift
        length += missing
        output.write(buffer.to_bytes(bytecount, 'big'))

        if tail & ((1 << shift) - 1) != 0:
            raise RefineryPartialResult(
                F'Invalid padding, lower {shift} bits of {tail:0{bits}b} are not zero.',
                output.getvalue())

        return output.getvalue()

    @classmethod
    def handles(self, data: bytearray):
        from refinery.lib.patterns import formats
        return formats.b92.value.fullmatch(data)

class base (base=0, strip_padding=False, little_endian=False, strict_digits=False)

Encodes and decodes integers in arbitrary base.

Expand source code Browse git

class base(Unit):
    """
    Encodes and decodes integers in arbitrary base.
    """
    def __init__(
        self,
        base: Arg(type=numseq, metavar='base|alphabet', help=(
            R'Either the base to be used or an alphabet. If an explicit alphabet is given, its length '
            R'determines the base. The default base 0 treats the input as a Python integer literal. If '
            F'a numeric base is given, digits from the alphabet "{_DEFAULT_ALPH_STR}" are used. ')) = 0,
        strip_padding: Arg.Switch('-s', help='Do not add leading zeros to the output.') = False,
        little_endian: Arg.Switch('-e', help='Use little endian byte order instead of big endian.') = False,
        strict_digits: Arg.Switch('-d', help='Check that all input digits are part of the alphabet.') = False,
    ):
        super().__init__(
            base=base,
            strip_padding=strip_padding,
            little_endian=little_endian,
            strict_digits=strict_digits,
        )

    @property
    def _args(self):
        base = self.args.base
        if isinstance(base, int):
            if not base:
                return 0, B''
            if base in _LARGER_ALPHABETS:
                return base, _LARGER_ALPHABETS[base]
            if base not in range(2, len(_DEFAULT_ALPHABET) + 1):
                raise ValueError(F'base may only be an integer between 2 and {len(_DEFAULT_ALPHABET)}')
            return base, _DEFAULT_ALPHABET[:base]
        if len(set(base)) != len(base):
            raise ValueError('the given alphabet contains duplicate letters')
        return len(base), bytearray(base)

    @property
    def byteorder(self):
        return 'little' if self.args.little_endian else 'big'

    def reverse(self, data):
        base, alphabet = self._args
        self.log_info('using byte order', self.byteorder)
        number = int.from_bytes(data, byteorder=self.byteorder)

        if base == 0:
            return B'0x%X' % number
        if base > len(alphabet):
            raise ValueError(F'Only {len(alphabet)} available; not enough to encode base {base}')

        data_bits = len(data) * 8
        base_bits = math.log2(base)
        result = bytearray()

        while data_bits >= 1:
            number, k = divmod(number, base)
            result.append(alphabet[k])
            if not number and self.args.strip_padding:
                break
            data_bits -= base_bits

        result.reverse()
        return result

    def process(self, data: bytearray):
        base, alphabet = self._args
        be_lenient = not self.args.strict_digits
        if be_lenient and alphabet.upper() == alphabet:
            lcased = (c + 0x20 if 0x41 <= c <= 0x5a else c for c in data)
            if all(x == y for x, y in zip(data, lcased)):
                data = data.upper()
        if base and base != 64 and be_lenient:
            check = set(alphabet)
            index = 0
            it = iter(data)
            for b in it:
                if b not in check:
                    break
                index += 1
            for b in it:
                if b in check:
                    data[index] = b
                    index += 1
            self.log_info(F'stripped {len(data) - index} invalid digits from input data')
            del data[index:]
        if len(alphabet) <= len(_DEFAULT_ALPHABET):
            defaults = _DEFAULT_ALPHABET[:base]
            if alphabet != defaults:
                self.log_info('translating input data to a default alphabet for faster conversion')
                data_translated = data.translate(bytes.maketrans(alphabet, defaults))
                result = int(data_translated, base)
            else:
                result = int(data, base)
        elif len(alphabet) == 64:
            import base64
            _b64_alphabet = _LARGER_ALPHABETS[64]
            if alphabet != _b64_alphabet:
                data = data.translate(bytes.maketrans(alphabet, _b64_alphabet))
            return base64.b64decode(data + b'===', validate=self.args.strict_digits)
        elif len(alphabet) == 85:
            import base64
            _b85_alphabet = _LARGER_ALPHABETS[85]
            if alphabet != _b85_alphabet:
                data = data.translate(bytes.maketrans(alphabet, _b85_alphabet))
            return base64.b85decode(data)
        else:
            self.log_warn('very long alphabet, unable to use built-ins; reverting to (slow) fallback.')
            result = 0
            lookup = {digit: k for k, digit in enumerate(alphabet)}
            for digit in data:
                result *= base
                result += lookup[digit]
        if not base or self.args.strip_padding:
            bits = result.bit_length()
        else:
            bits = (len(data) - 1) * math.log2(base) + math.log2(max(alphabet.index(data[0]), 1))
            bits = math.ceil(bits)
        size, rest = divmod(bits, 8)
        size += int(bool(rest))
        return result.to_bytes(size, byteorder=self.byteorder)

class bat (keep_all=False, keep_comment=False, keep_definitions=False, keep_echo=False)

Deobfuscates batch files, based on the batch deobfuscator by DissectMalware. The input script is interpreted, variables are substituted for previously defined values, including commonly defined operating system environment variables. Variable definitions that are later evaluated are removed from the script, as are all echo commands and comments.

Expand source code Browse git

class bat(Unit):
    """
    Deobfuscates batch files, based on the batch deobfuscator by DissectMalware. The input script
    is interpreted, variables are substituted for previously defined values, including commonly
    defined operating system environment variables. Variable definitions that are later evaluated
    are removed from the script, as are all echo commands and comments.
    """
    def __init__(
        self,
        keep_all         : Unit.Arg.Switch('-a', help='Do not strip anything after deobfuscation.') = False,
        keep_comment     : Unit.Arg.Switch('-c', help='Do not strip comments from the script.') = False,
        keep_definitions : Unit.Arg.Switch('-d', help='Do not strip variable definitions.') = False,
        keep_echo        : Unit.Arg.Switch('-e', help='Do not strip echo calls in the script.') = False,
    ): ...

    @unicoded
    def process(self, data: str) -> str:
        mode = STRIP.ALL
        if self.args.keep_all:
            mode = STRIP.NONE
        elif self.args.keep_comment:
            mode ^= STRIP.COMMENT
        elif self.args.keep_definitions:
            mode ^= STRIP.DEFINITION
        elif self.args.keep_echo:
            mode ^= STRIP.ECHO
        return BatchDeobfuscator().deobfuscate(data, mode)

class bitrev (bigendian=False, blocksize=None)

Reverse the bits of every block. Any excess bytes at the end of the input that are not an integer multiple of the block size are ignored.

Unreadable bit reversal operations due to: https://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64BitsDiv https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel

Expand source code Browse git

class bitrev(UnaryOperation):
    """
    Reverse the bits of every block. Any excess bytes at the end of the input that are not
    an integer multiple of the block size are ignored.
    """
    @staticmethod
    def operate(arg):
        raise RuntimeError('operate was called before the unit was initialized')

    def __init__(self, bigendian=False, blocksize=None):
        """
        Unreadable bit reversal operations due to:
        https://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64BitsDiv
        https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
        """
        super().__init__(bigendian=bigendian, blocksize=blocksize, _truncate=1)

        if self.bytestream:
            def operate(v):
                return ((v * 0x202020202) & 0x10884422010) % 1023
        elif self.blocksize in (2, 4, 8):
            def operate(v):
                s = self.fbits
                m = self.fmask
                w = v
                while s > 1:
                    s >>= 1
                    m = m ^ (m << s)
                    w = ((w << s) & ~m) | ((w >> s) & m)
                return w
        else:
            def operate(v):
                w = v & 0
                for s in range(self.fbits):
                    w |= ((v >> s) & 1) << (self.fbits - s - 1)
                return w
        self.operate = operate

class bitsnip (slices=[slice(0, 1, None)], bigendian=False, blocksize=None)

Pick a certain range of bits from each block of the input. The extracted ranges of bits are concatenated. Leftover bits that do not form at least one full byte are discarded. Bits are indexed from least significant at index 0 to most significant in each block. When the unit operates in big endian mode, the internal bit buffer is shifted left in each step and new bits are inserted as the least significant portion. Conversely, in default (little endian) mode, newly extracted bits are added as the now most significant ones. After concatenating all bit slices into a large integer, this integer is converted into a byte string according to the given byte ordering.

Expand source code Browse git

class bitsnip(BlockTransformationBase):
    """
    Pick a certain range of bits from each block of the input. The extracted ranges of bits are
    concatenated. Leftover bits that do not form at least one full byte are discarded. Bits are
    indexed from least significant at index 0 to most significant in each block. When the unit
    operates in big endian mode, the internal bit buffer is shifted left in each step and new bits
    are inserted as the least significant portion. Conversely, in default (little endian) mode,
    newly extracted bits are added as the now most significant ones. After concatenating all bit
    slices into a large integer, this integer is converted into a byte string according to the
    given byte ordering.
    """
    def __init__(
        self, slices: Arg(help=(
            'Specify start:stop:size, where size can be used to pad or truncate the extracted '
            'bits. If size is omitted, it defaults to (stop-start). If no slice is specified, '
            'it defaults to 0, which corresponds to 0:1:1, i.e. extracting the lowest bit.')
        ) = [slice(0, 1)],
        bigendian=False, blocksize=None
    ):
        super().__init__(slices=slices, bigendian=bigendian, blocksize=blocksize)

    def process(self, data: bytearray):
        bitsnip_data = 0
        bitsnip_size = 0
        slices: List[Tuple[int, int, int]] = []
        maxbits = 8 * self.blocksize
        args: Iterable[slice] = iter(self.args.slices)
        bigendian: bool = self.args.bigendian

        for s in args:
            start = s.start
            stop = s.stop
            if start is None:
                start = 0
            if stop is None:
                stop = maxbits
            elif stop > maxbits:
                raise ValueError(F'the selection {start}:{stop} is out of bounds for the block size {self.blocksize}')
            if start >= stop:
                continue
            size = stop - start
            mask = (1 << size) - 1
            size = s.step or size
            slices.append((start, mask, size))

        for item in self.chunk(data):
            for shift, mask, size in slices:
                bits = (item >> shift) & mask
                if bigendian:
                    bitsnip_data <<= size
                    bitsnip_data |= bits
                else:
                    bitsnip_data |= bits << bitsnip_size
                bitsnip_size += size

        length, remainder = divmod(bitsnip_size, 8)

        if remainder != 0:
            self.log_info(F'discarding {bitsnip_size % 8} bits')
            if bigendian:
                bitsnip_data >>= remainder
            else:
                bitsnip_data &= (1 << (8 * length)) - 1

        if bigendian:
            return bitsnip_data.to_bytes(length, 'big')
        else:
            return bitsnip_data.to_bytes(length, 'little')

class blabla (key, nonce=b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', rounds=10, discard=0, stateful=False)

Implements the BlaBla cipher, a 256-bit stream cipher designed by Jean-Philippe Aumasson. It is similar to ChaCha in design but operates on 64-bit blocks.

Expand source code Browse git

class blabla(StreamCipherUnit):
    """
    Implements the BlaBla cipher, a 256-bit stream cipher designed by Jean-Philippe Aumasson. It
    is similar to ChaCha in design but operates on 64-bit blocks.
    """
    key_size = {32}

    def __init__(
        self, key,
        nonce: Arg(help='The 16-byte nonce. The default are 16 null bytes.') = bytes(16),
        rounds: Arg.Number('-r', help='The number of rounds, default is {default}.') = 10,
        discard=0, stateful=False
    ):
        super().__init__(key=key, nonce=nonce, rounds=rounds, discard=discard, stateful=stateful)

    def keystream(self):
        r = self.args.rounds
        n = self.args.nonce
        k = struct.unpack('<4Q', self.args.key)

        try:
            n = struct.unpack('<2Q', n)
        except Exception:
            raise ValueError(F'The given nonce has invalid length of {len(n)}, it must be 16 bytes in size.')

        q = [
            0x6170786593810fab,  # 0x0
            0x3320646ec7398aee,  # 0x1
            0x79622d3217318274,  # 0x2
            0x6b206574babadada,  # 0x3
            *k,                  # 0x4 .. 0x7
            0x2ae36e593e46ad5f,  # 0x8
            0xb68f143029225fc9,  # 0x9
            0x8da1e08468303aa6,  # 0xA
            0xa48a209acd50a4a7,  # 0xB
            0x7fdc12f23f90778c,  # 0xC
            1,                   # 0xD
            *n                   # 0xE .. 0xF
        ]
        while True:
            v = [*q]
            for _ in range(r):
                for a, b, c, d in [
                    (0x0, 0x4, 0x8, 0xC),
                    (0x1, 0x5, 0x9, 0xD),
                    (0x2, 0x6, 0xA, 0xE),
                    (0x3, 0x7, 0xB, 0xF),
                    (0x0, 0x5, 0xA, 0xF),
                    (0x1, 0x6, 0xB, 0xC),
                    (0x2, 0x7, 0x8, 0xD),
                    (0x3, 0x4, 0x9, 0xE),
                ]:
                    v[a] = v[a] + v[b] & _M64
                    v[d] = rotr64(v[d] ^ v[a], 32)
                    v[c] = v[c] + v[d] & _M64
                    v[b] = rotr64(v[b] ^ v[c], 24)
                    v[a] = v[a] + v[b] & _M64
                    v[d] = rotr64(v[d] ^ v[a], 16)
                    v[c] = v[c] + v[d] & _M64
                    v[b] = rotr64(v[b] ^ v[c], 63)
            v = [x + y & _M64 for x, y in zip(q, v)]
            q[0xD] += 1
            yield from struct.pack('<16Q', *v)

class blk224 (reps=1, text=False)

Returns the BLK224 hash of the input data.

class blk256 (reps=1, text=False)

Returns the BLK256 hash of the input data.

class blk384 (reps=1, text=False)

Returns the BLK384 hash of the input data.

class blk512 (reps=1, text=False)

Returns the BLK512 hash of the input data.

class blowfish (key, *, iv=b'', padding=None, mode=None, raw=False, little_endian=False, segment_size=0, tag=None, aad=None)

Blowfish encryption and decryption.

Expand source code Browse git

class blowfish(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(Blowfish)):
    """
    Blowfish encryption and decryption.
    """
    pass

class blz

BriefLZ compression and decompression. The compression algorithm uses a pure Python suffix tree implementation: It requires a lot of time & memory.

Expand source code Browse git

class blz(Unit):
    """
    BriefLZ compression and decompression. The compression algorithm uses a pure Python suffix tree
    implementation: It requires a lot of time & memory.
    """
    def _begin(self, data):
        self._src = StructReader(memoryview(data))
        self._dst = MemoryFile(bytearray())
        return self

    def _reset(self):
        self._src.seek(0)
        self._dst.seek(0)
        self._dst.truncate()
        return self

    def _decompress(self):
        (
            signature,
            version,
            src_count,
            src_crc32,
            dst_count,
            dst_crc32,
        ) = self._src.read_struct('>6L')
        if signature != 0x626C7A1A:
            raise ValueError(F'Invalid BriefLZ signature: {signature:08X}, should be 626C7A1A.')
        if version > 10:
            raise ValueError(F'Invalid version number {version}, should be less than 10.')
        self.log_debug(F'signature: 0x{signature:08X} V{version}')
        self.log_debug(F'src count: 0x{src_count:08X}')
        self.log_debug(F'src crc32: 0x{src_crc32:08X}')
        self.log_debug(F'dst count: 0x{dst_count:08X}')
        self.log_debug(F'dst crc32: 0x{dst_crc32:08X}')
        src = self._src.getvalue()
        src = src[24:24 + src_count]
        if len(src) < src_count:
            self.log_warn(F'Only {len(src)} bytes in buffer, but header annoucned a length of {src_count}.')
        if src_crc32:
            check = zlib.crc32(src)
            if check != src_crc32:
                self.log_warn(F'Invalid source data CRC {check:08X}, should be {src_crc32:08X}.')
        dst = self._decompress_chunk(dst_count)
        if not dst_crc32:
            return dst
        check = zlib.crc32(dst)
        if check != dst_crc32:
            self.log_warn(F'Invalid result data CRC {check:08X}, should be {dst_crc32:08X}.')
        return dst

    def _decompress_modded(self):
        self._src.seekrel(8)
        total_size = self._src.u64()
        chunk_size = self._src.u64()
        remaining = total_size
        self.log_debug(F'total size: 0x{total_size:016X}')
        self.log_debug(F'chunk size: 0x{chunk_size:016X}')
        while remaining > chunk_size:
            self._decompress_chunk(chunk_size)
            remaining -= chunk_size
        return self._decompress_chunk(remaining)

    def _decompress_chunk(self, size=None):
        bitcount = 0
        bitstore = 0
        decompressed = 1

        def readbit():
            nonlocal bitcount, bitstore
            if not bitcount:
                bitstore = int.from_bytes(self._src.read_exactly(2), 'little')
                bitcount = 0xF
            else:
                bitcount = bitcount - 1
            return (bitstore >> bitcount) & 1

        def readint():
            result = 2 + readbit()
            while readbit():
                result <<= 1
                result += readbit()
            return result

        self._dst.write(self._src.read_exactly(1))

        try:
            while not size or decompressed < size:
                if readbit():
                    length = readint() + 2
                    sector = readint() - 2
                    offset = self._src.read_byte() + 1
                    delta = offset + 0x100 * sector
                    available = self._dst.tell()
                    if delta not in range(available + 1):
                        raise RefineryPartialResult(
                            F'Requested rewind by 0x{delta:08X} bytes with only 0x{available:08X} bytes in output buffer.',
                            partial=self._dst.getvalue())
                    quotient, remainder = divmod(length, delta)
                    replay = memoryview(self._dst.getvalue())
                    replay = bytes(replay[-delta:] if quotient else replay[-delta:length - delta])
                    replay = quotient * replay + replay[:remainder]
                    self._dst.write(replay)
                    decompressed += length
                else:
                    self._dst.write(self._src.read_exactly(1))
                    decompressed += 1
        except EOFError as E:
            raise RefineryPartialResult(str(E), partial=self._dst.getvalue())
        dst = self._dst.getvalue()
        if decompressed < size:
            raise RefineryPartialResult(
                F'Attempted to decompress {size} bytes, got only {len(dst)}.', dst)
        if decompressed > size:
            raise RuntimeError('Decompressed buffer contained more bytes than expected.')
        return dst

    def _compress(self):
        from refinery.lib.suffixtree import SuffixTree

        try:
            self.log_info('computing suffix tree')
            tree = SuffixTree(self._src.getvalue())
        except Exception:
            raise

        bitstore = 0  # The bit stream to be written
        bitcount = 0  # The number of bits in the bit stream
        buffer = MemoryFile(bytearray())

        # Write empty header and first byte of source
        self._dst.write(bytearray(24))
        self._dst.write(self._src.read_exactly(1))

        def writeint(n: int) -> None:
            """
            Write an integer to the bit stream.
            """
            nonlocal bitstore, bitcount
            nbits = n.bit_length()
            if nbits < 2:
                raise ValueError
            # The highest bit is implicitly assumed:
            n ^= 1 << (nbits - 1)
            remaining = nbits - 2
            while remaining:
                remaining -= 1
                bitstore <<= 2
                bitcount += 2
                bitstore |= ((n >> remaining) & 3) | 1
            bitstore <<= 2
            bitcount += 2
            bitstore |= (n & 1) << 1

        src = self._src.getvalue()
        remaining = len(src) - 1
        self.log_info('compressing data')

        while True:
            cursor = len(src) - remaining
            rest = src[cursor:]
            if bitcount >= 0x10:
                block_count, bitcount = divmod(bitcount, 0x10)
                info_channel = bitstore >> bitcount
                bitstore = info_channel << bitcount ^ bitstore
                # The decompressor will read bits from top to bottom, and each 16 bit block has to be
                # little-endian encoded. The bit stream is encoded top to bottom bit in the bitstore
                # variable, and by encoding it as a big endian integer, the stream is in the correct
                # order. However, we need to swap adjacent bytes to achieve little endian encoding for
                # each of the blocks:
                info_channel = bytearray(info_channel.to_bytes(block_count * 2, 'big'))
                for k in range(block_count):
                    k0 = 2 * k + 0
                    k1 = 2 * k + 1
                    info_channel[k0], info_channel[k1] = info_channel[k1], info_channel[k0]
                info_channel = memoryview(info_channel)
                data_channel = memoryview(buffer.getvalue())
                self._dst.write(info_channel[:2])
                self._dst.write(data_channel[:-1])
                self._dst.write(info_channel[2:])
                data_channel = bytes(data_channel[-1:])
                buffer.truncate(0)
                store = buffer if bitcount else self._dst
                store.write(data_channel)
            if remaining + bitcount < 0x10:
                buffer = buffer.getvalue()
                if rest or buffer:
                    bitstore <<= 0x10 - bitcount
                    self._dst.write(bitstore.to_bytes(2, 'little'))
                    self._dst.write(buffer)
                    self._dst.write(rest)
                elif bitcount:
                    raise RuntimeError('Bitbuffer Overflow')
                break
            node = tree.root
            length = 0
            offset = 0
            sector = None
            while node.children and length < len(rest):
                for child in node.children.values():
                    if tree.data[child.start] == rest[length]:
                        node = child
                        break
                if node.start >= cursor:
                    break
                offset = node.start - length
                length = node.end + 1 - offset
            length = min(remaining, length)
            if length >= 4:
                sector, offset = divmod(cursor - offset - 1, 0x100)
            bitcount += 1
            bitstore <<= 1
            if sector is None:
                buffer.write(rest[:1])
                remaining -= 1
                continue
            bitstore |= 1
            buffer.write(bytes((offset,)))
            writeint(length - 2)
            writeint(sector + 2)
            remaining -= length

        self._dst.seek(24)
        dst = self._dst.peek()
        self._dst.seek(0)
        self._dst.write(struct.pack('>6L', 0x626C7A1A, 1, len(dst), zlib.crc32(dst), len(src), zlib.crc32(src)))
        return self._dst.getvalue()

    def process(self, data):
        self._begin(data)
        partial = None
        try:
            return self._decompress()
        except ValueError as error:
            if isinstance(error, RefineryPartialResult):
                partial = error
            self.log_warn(F'Reverting to modified BriefLZ after decompression error: {error!s}')
            self._reset()

        try:
            return self._decompress_modded()
        except RefineryPartialResult:
            raise
        except Exception as error:
            if not partial:
                raise
            raise partial from error

    def reverse(self, data):
        return self._begin(data)._compress()

class brotli

Brotli compression and decompression.

Expand source code Browse git

class brotli(Unit):
    """
    Brotli compression and decompression.
    """

    @Unit.Requires('brotlipy', 'all')
    def _brotli():
        import brotli
        return brotli

    def process(self, data):
        return self._brotli.decompress(bytes(data))

    def reverse(self, data):
        return self._brotli.compress(bytes(data))

class bruteforce (name, length=slice(1, None, None), format=None, alphabet=None, pattern=None, printable=False, digits=False, identifier=False, letters=False)

Generates all possible combinations of letters in a given alphabet. For each generated string, one copy of each input chunk is generated and populated with a meta variable containing that string. This can be used for simple brute forcing checks.

Expand source code Browse git

class bruteforce(Unit):
    """
    Generates all possible combinations of letters in a given alphabet. For each generated string,
    one copy of each input chunk is generated and populated with a meta variable containing that
    string. This can be used for simple brute forcing checks.
    """
    def __init__(
        self,
        name  : Arg.String(help='Name of the meta variable to be populated.'),
        length: Arg.Bounds(metavar='length', help=(
            'Specifies the range of characters to brute force, default is {default}.'
        )) = slice(1, None),
        format: Arg.String(help=(
            'Optional format expression for the output string. The format sequence "{0}" is the '
            'current brute force string, the sequence "{1}" represents the input data.'
        )) = None,
        alphabet  : Arg.Binary('-a', group='ALPH', help=(
            'The alphabet from which to choose the letters. Entire byte range by default.'
        )) = None,
        pattern   : Arg.RegExp('-r', group='ALPH',
            help='Provide a regular expression pattern to define the alphabet.') = None,
        printable : Arg.Switch('-p', group='ALPH',
            help='Equivalent to --pattern=[\\s\\x20-\\x7E]') = False,
        digits    : Arg.Switch('-d', group='ALPH',
            help='Equivalent to --pattern=\\d') = False,
        identifier: Arg.Switch('-i', group='ALPH',
            help='Equivalent to --pattern=\\w') = False,
        letters   : Arg.Switch('-l', group='ALPH',
            help='Equivalent to --pattern=[a-zA-Z]') = False,
    ):
        options = sum(1 for x in [printable, digits, identifier, letters] if x)

        if options > 1 or options and pattern:
            raise ValueError('Invalid selection.')

        if printable:
            pattern = b'[\\s\\x20-\\x7E]'
        if digits:
            pattern = b'\\d'
        if identifier:
            pattern = b'\\w'
        if letters:
            pattern = b'[a-zA-Z]'

        super().__init__(
            name=name,
            length=length,
            format=format,
            alphabet=alphabet,
            pattern=pattern,
        )

    def _alphabet(self) -> bytes:
        alphabet = self.args.alphabet
        if alphabet:
            return alphabet
        alphabet = bytes(range(0x100))
        pattern = self.args.pattern
        if not pattern:
            return alphabet
        alphabet = B''.join(re.findall(pattern, alphabet, flags=re.DOTALL))
        if alphabet:
            return alphabet
        raise ValueError(F'Invalid regular expression: {pattern}')

    def process(self, data: bytearray):
        format_spec: str = self.args.format
        meta = metavars(data)
        name = self.args.name
        kwargs = {name: None}

        for length in integers_of_slice(self.args.length):
            self.log_info(F'generating {length} digits')
            if not isinstance(length, int) or length < 0:
                raise ValueError(F'Unable to brute force {length} characters.')
            for string in itertools.product(self._alphabet(), repeat=length):
                string = bytes(string)
                if format_spec:
                    string = meta.format_bin(format_spec, self.codec, [string, data])
                kwargs[name] = string
                yield self.labelled(data, **kwargs)

class byteswap (size=4)

Reverses the order of bytes in each block. Excess bytes that are not an integer multiple of the block size are discarded.

Expand source code Browse git

class byteswap(UnaryOperation):
    """
    Reverses the order of bytes in each block. Excess bytes that are not an integer multiple of the block
    size are discarded.
    """
    def __init__(self, size: Arg.Number(help='the block size in bytes; the default is {default}.') = 4):
        super().__init__(blocksize=size, _truncate=2)

    def inplace(self, block: ndarray) -> None:
        block.byteswap(True)

    operate = NotImplemented

    def process(self, data):
        try:
            return self._fastblock(data)
        except FastBlockError:
            b = self.blocksize
            n = len(data)
            m = n - n % b
            v = memoryview(data)
            if b == 1:
                self.log_warn('running this unit with a block size of 1 does not have any effect')
                return data
            for k in range(0, m, b):
                _end = k and k - 1 or None
                data[k : k + b] = v[k + b - 1:_end:-1]
            if m < n:
                del v
                del data[m:]
            return data

class bz2 (level=9)

BZip2 compression and decompression.

Expand source code Browse git

class bz2(Unit):
    """
    BZip2 compression and decompression.
    """
    def __init__(self, level: Arg('-l', type=number[1:9], help='compression level preset between 1 and 9') = 9):
        super().__init__(level=level)

    def process(self, data):
        return bz2_.decompress(data)

    def reverse(self, data):
        return bz2_.compress(data, self.args.level)

    @classmethod
    def handles(self, data: bytearray):
        return data[:3] == B'BZh'

class camellia (key, *, iv=b'', padding=None, mode=None, raw=False, little_endian=False, segment_size=0, tag=None, aad=None)

Camellia encryption and decryption.

Expand source code Browse git

class camellia(StandardBlockCipherUnit, cipher=BlockCipherFactory(Camellia)):
    """
    Camellia encryption and decryption.
    """
    pass

class carve (format, unique=False, decode=False, single=False, min=1, max=None, len=None, stripspace=False, longest=False, take=None, utf16=True, ascii=True)

Extracts patches of data in particular formats from the input.

Expand source code Browse git

class carve(PatternExtractor):
    """
    Extracts patches of data in particular formats from the input.
    """
    def __init__(
        self, format: Arg.Choice(choices=[p.display for p in formats], metavar='format',
            help='Specify one of the following formats: {choices}'),
        unique: Arg.Switch('-q', help='Yield every match only once.') = False,
        decode: Arg.Switch('-d', help='Automatically decode known patterns.') = False,
        single: Arg.Switch('-s', help='Only get the biggest match; equivalent to -qlt1') = False,
        min=1, max=None, len=None,
        stripspace=False, longest=False, take=None, utf16=True, ascii=True
    ):
        if single:
            take = 1
            longest = True
            unique = True
        super().__init__(
            min=min,
            max=max,
            len=len,
            stripspace=stripspace,
            duplicates=not unique,
            longest=longest,
            take=take,
            ascii=ascii,
            utf16=utf16,
            format=formats.from_dashname(format)
        )
        if not decode:
            decoder = NotImplemented
        elif self.args.format in (formats.multiline_string, formats.string):
            from ..encoding.esc import esc
            decoder = esc(unicode=True, quoted=True)
        elif self.args.format is formats.integer:
            from ..encoding.base import base
            decoder = base()
        elif self.args.format in (formats.uppercase_hex, formats.spaced_hex, formats.hex):
            from ..encoding.hex import hex
            decoder = hex()
        elif self.args.format is formats.hexdump:
            from ..formats.hexload import hexload
            decoder = hexload()
        elif self.args.format is formats.intarray:
            from ..blockwise.pack import pack
            decoder = pack()
        elif self.args.format in (formats.b64, formats.b64any, formats.spaced_b64):
            from ..encoding.b64 import b64
            decoder = b64()
        elif self.args.format in (formats.b85, formats.spaced_b85):
            from ..encoding.b85 import b85
            decoder = b85()
        elif self.args.format is formats.b64url:
            from ..encoding.b64 import b64
            decoder = b64(urlsafe=True)
        elif self.args.format is formats.b32:
            from ..encoding.b32 import b32
            decoder = b32()
        elif self.args.format is formats.ps1str:
            from ..encoding.escps import escps
            decoder = escps()
        elif self.args.format is formats.vbastr:
            from ..encoding.escps import escps
            decoder = escps()
        elif self.args.format is formats.hexarray:
            from ..blockwise.pack import pack
            decoder = pack(0x10)
        elif self.args.format is formats.wshenc:
            from ..encoding.wshenc import wshenc
            decoder = wshenc()
        elif self.args.format is formats.uuencode:
            from ..encoding.uuenc import uuenc
            decoder = uuenc()
        elif self.args.format in (
            formats.urlquote,
            formats.urlquote_coarse,
            formats.urlquote_narrow,
        ):
            from ..encoding.url import url
            decoder = url()
        else:
            decoder = NotImplemented
        self.decoder = decoder

    def process(self, data):
        it = iter(self.matches_filtered(memoryview(data), self.args.format.value.bin_compiled))
        if self.decoder is NotImplemented:
            yield from it
        for chunk in it:
            try:
                yield self.decoder(chunk)
            except Exception as E:
                self.log_info(F'decoder failure: {E!s}')

class carve_7z

Extracts anything from the input data that looks like a 7zip archive file.

Expand source code Browse git

class carve_7z(Unit):
    """
    Extracts anything from the input data that looks like a 7zip archive file.
    """
    @Unit.Requires('py7zr', 'arc', 'default', 'extended')
    def _py7zr():
        import py7zr
        return py7zr

    HEADER_SIGNATURE = B'7z\xBC\xAF\x27\x1C'

    def process(self, data: bytearray):
        cursor = 0
        mv = memoryview(data)
        while True:
            start = data.find(self.HEADER_SIGNATURE, cursor)
            if start < cursor:
                break
            self.log_debug(F'found header at offset: 0x{start:08X}')
            try:
                mf = MemoryFileRecorder(mv[start:])
                self.log_debug('attempting to read archive')
                archive = self._py7zr.SevenZipFile(mf)
                self.log_debug('attempting to test archive')
                success = archive.test() is not False
            except ImportError:
                raise
            except Exception as error:
                self.log_debug('parsing archive failed:', error)
                success = False
            if success:
                self.log_info(F'identified archive of size 0x{mf.max_cursor:08X} at offset 0x{start:08X}')
                cursor = start + mf.max_cursor
                yield self.labelled(mv[start:cursor], offset=start)
            else:
                cursor = start + 5

class carve_der

Extracts anything from the input data that looks like a DER sequence. The carving can be very slow: The unit will attempt to parse an ASN1 sequence at every offset where a byte with value 0x30 is found, since this can indicate the start of an ASN1 SEQUENCE. It will only consider the next 10KB of data at this offset, but it nevertheless remains a poor heuristic.

Expand source code Browse git

class carve_der(Unit):
    """
    Extracts anything from the input data that looks like a DER sequence. The carving can be very
    slow: The unit will attempt to parse an ASN1 sequence at every offset where a byte with value
    0x30 is found, since this can indicate the start of an ASN1 SEQUENCE. It will only consider the
    next 10KB of data at this offset, but it nevertheless remains a poor heuristic.
    """
    @Unit.Requires('pyasn1', 'default', 'extended')
    def _pyasn1parsers():
        from pyasn1.codec.der.decoder import decode
        from pyasn1.codec.der.encoder import encode
        return encode, decode

    def process(self, data: bytearray):
        cursor = 0
        encode, decode = self._pyasn1parsers
        while True:
            try:
                pos = data.index(0x30, cursor)
            except Exception:
                break
            else:
                cursor += 1
            if pos + 1 < len(data) and data[pos + 1] == 0:
                continue
            try:
                sequence = decode(bytes(data[pos:pos + 10_000]))
            except Exception:
                continue
            if not (der := sequence[0]):
                self.log_info(F'0x{pos:08X}: parser returned nothing')
                continue
            if len(der) < 2:
                self.log_info(F'0x{pos:08X}: parser returned empty sequence')
                continue
            der = encode(der)
            cursor = pos + len(der)
            yield self.labelled(der, offset=pos)

class carve_json (all=False)

Extracts anything from the input data that looks like JSON.

Expand source code Browse git

class carve_json(Unit):
    """
    Extracts anything from the input data that looks like JSON.
    """
    def __init__(
        self, all: Arg.Switch('-a', help=(
            'By default, only dictionaries are carved. Specify this flag to also carve lists.'
        )) = False
    ):
        super().__init__(all=all)

    def process(self, data):
        for start, chunk in JSONCarver(data, dictonly=not self.args.all):
            yield self.labelled(chunk, offset=start)

class carve_lnk

Extracts anything from the input data that looks like a Windows shortcut (i.e. an LNK file)

Expand source code Browse git

class carve_lnk(Unit):
    """
    Extracts anything from the input data that looks like a Windows shortcut (i.e. an LNK file)
    """

    @Unit.Requires('LnkParse3>=1.4.0', 'formats', 'extended')
    def _LnkParse3():
        import LnkParse3
        import LnkParse3.extra_factory
        return LnkParse3

    def process(self, data: bytearray):
        pos = 0
        mem = memoryview(data)
        sig = B'\x4C\x00\x00\x00\x01\x14\x02\x00'
        lnk = self._LnkParse3

        while True:
            pos = data.find(sig, pos)
            if pos < 0:
                break
            try:
                parsed = lnk.lnk_file(indata=mem[pos:])
            except Exception:
                pos += 1
                continue
            end = pos + parsed.header.size() + parsed.string_data.size()
            if parsed.has_target_id_list():
                end += parsed.targets.size()
            if parsed.has_link_info() and not parsed.force_no_link_info():
                with suppress(AttributeError):
                    end += parsed.info.size()
            with NoLogging():
                while end < len(mem):
                    extra = lnk.extra_factory.ExtraFactory(mem[end:])
                    try:
                        ec = extra.extra_class()
                    except Exception:
                        break
                    if ec is None:
                        break
                    if 'UNKNOWN' in ec().name():
                        break
                    end += extra.item_size()

            terminal_block = mem[end:end + 4]
            if terminal_block != B'\0\0\0\0':
                self.log_warn(F'detected LNK at offset 0x{pos:X}, but size calculation did not end on a terminal block')
                continue
            else:
                end += 4
            yield self.labelled(mem[pos:end], offset=pos)
            pos = end

class carve_pe (*paths, list=False, join_path=False, drop_path=False, path=b'name', recursive=False, keep_root=False, memdump=False, fileinfo=False)

Extracts anything from the input data that looks like a Portable Executable (PE) file.

Expand source code Browse git

class carve_pe(PathExtractorUnit):
    """
    Extracts anything from the input data that looks like a Portable
    Executable (PE) file.
    """
    def __init__(
        self, *paths, list=False, join_path=False, drop_path=False, path=b'name',
        recursive: Arg.Switch('-r', help='Extract PE files that are contained in already extracted PEs.') = False,
        keep_root: Arg.Switch('-k', help='If the input chunk is itself a PE, include it as an output chunk.') = False,
        memdump  : Arg.Switch('-m', help='Use the virtual memory layout of a PE file to calculate its size.') = False,
        fileinfo : Arg.Switch('-f', help='Use the PE meta information to deduce a file name meta variable.') = False
    ):
        super().__init__(
            *paths,
            list=list,
            join_path=join_path,
            drop_path=drop_path,
            path=path,
            recursive=recursive,
            keep_root=keep_root,
            memdump=memdump,
            fileinfo=fileinfo,
        )

    def unpack(self, data):
        cursor = 0
        mv = memoryview(data)

        while True:
            offset = data.find(B'MZ', cursor)
            if offset < cursor: break
            cursor = offset + 2
            ntoffset = mv[offset + 0x3C:offset + 0x3E]
            if len(ntoffset) < 2:
                return
            ntoffset, = unpack('H', ntoffset)
            if mv[offset + ntoffset:offset + ntoffset + 2] != B'PE':
                self.log_debug(F'invalid NT header signature for candidate at 0x{offset:08X}')
                continue
            try:
                pe = lief.load_pe_fast(mv[offset:])
            except Exception as err:
                self.log_debug(F'parsing of PE header at 0x{offset:08X} failed:', err)
                continue

            pesize = get_pe_size(pe, memdump=self.args.memdump)
            pedata = mv[offset:offset + pesize]
            info = {}
            if self.args.fileinfo:
                pe_meta_parser = pemeta()
                try:
                    info = pe_meta_parser.parse_version(pe) or {}
                except Exception as error:
                    self.log_warn(F'Unable to obtain file information: {error!s}')
                try:
                    info.update(pe_meta_parser.parse_header(pe) or {})
                except Exception:
                    pass
            try:
                path = info['OriginalFilename']
            except KeyError:
                try:
                    path = info['ExportName']
                except KeyError:
                    path = F'carve-0x{offset:08X}.{magic(pedata).extension}'

            if offset > 0 or self.args.keep_root:
                yield UnpackResult(path, pedata, offset=offset)
                self.log_info(F'extracted PE file of size 0x{pesize:08X} from 0x{offset:08X}')
            else:
                self.log_info(F'ignored root file of size 0x{pesize:08X} from 0x{offset:08X}')
                continue

            if not offset or self.args.recursive:
                cursor += pe.optional_header.sizeof_headers
            else:
                cursor += pesize - 2

class carve_rtf

Extracts anything from the input data that looks like an RTF document.

Expand source code Browse git

class carve_rtf(Unit):
    """
    Extracts anything from the input data that looks like an RTF document.
    """

    def process(self, data: bytearray):
        pos = 0
        mem = memoryview(data)
        sig = re.escape(b'{\\rtf')

        while True:
            match = re.search(sig, mem[pos:], flags=re.IGNORECASE)
            if match is None:
                break
            pos = pos + match.start()
            end = pos + 1
            depth = 1
            while depth and end < len(mem):
                if mem[end] == 0x7B:  # {
                    depth += 1
                if mem[end] == 0x7D:  # }
                    depth -= 1
                end += 1
            if depth > 0:
                break
            yield self.labelled(mem[pos:end], offset=pos)
            pos = end

class carve_xml

Extracts anything from the input data that looks like XML.

Expand source code Browse git

class carve_xml(Unit):
    """
    Extracts anything from the input data that looks like XML.
    """

    def process(self, data):
        for offset, chunk in XMLCarver(data):
            yield self.labelled(chunk, offset=offset)

class carve_zip

Extracts anything from the input data that looks like a zip archive file.

Expand source code Browse git

class carve_zip(Unit):
    """
    Extracts anything from the input data that looks like a zip archive file.
    """

    def process(self, data: bytearray):
        end = len(data)
        mem = memoryview(data)
        rev = []
        while True:
            end = data.rfind(ZipEndOfCentralDirectory.SIGNATURE, 0, end)
            if end < 0:
                break
            try:
                end_marker = ZipEndOfCentralDirectory(mem[end:])
            except ValueError as e:
                self.log_info(F'error parsing end of central directory at 0x{end:X}: {e!s}')
                continue
            else:
                self.log_info(F'successfully parsed end of central directory at 0x{end:X}')
            start = end - end_marker.directory_size
            shift = start - end_marker.directory_offset
            if start < 0:
                self.log_debug('end of central directory size is invalid')
                continue
            try:
                central_directory = ZipCentralDirectory(mem[start:])
            except ValueError:
                self.log_debug('computed location of central directory is invalid')
                end = end - len(ZipEndOfCentralDirectory.SIGNATURE)
                continue
            start = central_directory.header_offset + shift
            if mem[start:start + 4] not in (B'PK\x03\x04', B'\0\0\0\0'):
                # SFX payloads seem to have a nulled header, so we permit this.
                self.log_debug('computed start of ZIP archive does not have the correct signature bytes')
                continue
            rev.append((start, end + len(end_marker)))
            end = start
        for start, end in reversed(rev):
            zip = mem[start:end]
            yield self.labelled(zip, offset=start)

class cast (key, *, iv=b'', padding=None, mode=None, raw=False, little_endian=False, segment_size=0, tag=None, aad=None)

CAST encryption and decryption.

Expand source code Browse git

class cast(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(CAST)):
    """
    CAST encryption and decryption.
    """
    pass

class cca (data)

Short for ConCatAppend: This unit concatenates the input data with its argument by appending the latter to the former. See also ccp for the unit that prepends instead.

Expand source code Browse git

class cca(Unit):
    """
    Short for ConCatAppend: This unit concatenates the input data with its argument by
    appending the latter to the former. See also `refinery.ccp` for the unit that prepends
    instead.
    """

    def __init__(self, data: Arg(help='Binary string to be appended to the input.')):
        super().__init__(data=data)

    def process(self, data: bytearray):
        data.extend(self.args.data)
        return data

class ccp (data)

Short for ConCatPrepend: This unit concatenates the input data with its argument by prepending the latter to the former. See also cca for the unit that appends instead.

Expand source code Browse git

class ccp(Unit):
    """
    Short for ConCatPrepend: This unit concatenates the input data with its argument by
    prepending the latter to the former. See also `refinery.cca` for the unit that appends
    instead.
    """

    def __init__(self, data: Arg(help='Binary string to be prepended to the input.')):
        super().__init__(data=data)

    def process(self, data: bytearray):
        data[:0] = self.args.data
        return data

class chacha (key, stateful=False, discard=0, nonce=b'REFINERY', magic=b'', offset=0, rounds=20)

ChaCha encryption and decryption. The nonce must be 8 bytes long as currently, only the original Bernstein algorithm is implemented. When 64 bytes are provided as the key, this data is interpreted as the initial state box and all other parameters are ignored.

Expand source code Browse git

class chacha(LatinCipherUnit):
    """
    ChaCha encryption and decryption. The nonce must be 8 bytes long as currently, only the
    original Bernstein algorithm is implemented. When 64 bytes are provided as the key, this
    data is interpreted as the initial state box and all other parameters are ignored.
    """
    def keystream(self) -> Iterable[int]:
        key = self.args.key
        if len(key) == 64:
            it = ChaChaCipher.FromState(key)
        else:
            it = ChaChaCipher(
                key,
                self.args.nonce,
                self.args.magic,
                self.args.rounds,
                self.args.offset,
            )
        yield from it

class chacha20 (key, nonce=b'REFINERY')

ChaCha20 and XChaCha20 encryption and decryption. For ChaCha20, the IV (nonce) must be 8 or 12 bytes long; for XChaCha20, choose an IV which is 24 bytes long. Invoking this unit for ChaCha20 is functionally equivalent to chacha with 20 rounds, but this unit uses the PyCryptodome library C implementation rather than the pure Python implementation used by chacha.

Expand source code Browse git

class chacha20(LatinCipherStandardUnit, cipher=PyCryptoFactoryWrapper(ChaCha20)):
    """
    ChaCha20 and XChaCha20 encryption and decryption. For ChaCha20, the IV (nonce) must
    be 8 or 12 bytes long; for XChaCha20, choose an IV which is 24 bytes long. Invoking
    this unit for ChaCha20 is functionally equivalent to `refinery.chacha` with 20 rounds,
    but this unit uses the PyCryptodome library C implementation rather than the pure
    Python implementation used by `refinery.chacha`.
    """
    pass

class chacha20poly1305 (key, nonce=b'REFINERY')

ChaCha20-Poly1305 and XChaCha20-Poly1305 encryption and decryption. For the ChaCha20 variant, the nonce must be 8 or 12 bytes long; for XChaCha20, provide a 24 bytes nonce instead.

Expand source code Browse git

class chacha20poly1305(LatinCipherStandardUnit, cipher=PyCryptoFactoryWrapper(ChaCha20_Poly1305)):
    """
    ChaCha20-Poly1305 and XChaCha20-Poly1305 encryption and decryption. For the ChaCha20
    variant, the nonce must be 8 or 12 bytes long; for XChaCha20, provide a 24 bytes nonce
    instead.
    """
    def _get_cipher(self, reset_cache=False):
        cipher = super()._get_cipher(reset_cache)
        cipher.block_size = 1
        return cipher

class chaskey (key, iv=b'', padding=None, mode=None, raw=False, rounds=12, swap=False, *, aad=None, tag=None, segment_size=0, little_endian=False)

This implements a block cipher based on the Chaskey algorithm. No subkeys are computed and the default Chaskey operation is performed on all blocks. Notably, the Donut framework uses Chaskey with 16 rounds and in CTR mode.

Expand source code Browse git

class chaskey(StandardBlockCipherUnit, cipher=BlockCipherFactory(Chaskey)):
    """
    This implements a block cipher based on the Chaskey algorithm. No subkeys are computed and the
    default Chaskey operation is performed on all blocks. Notably, the Donut framework uses Chaskey
    with 16 rounds and in CTR mode.
    """
    def __init__(
        self, key, iv=b'', padding=None, mode=None, raw=False,
        rounds: Arg.Number('-k', help='Number of rounds to use, the default is {default}') = _R,
        swap: Arg.Switch('-s', help='Use big endian byte order for all blocks.') = False,
        **more
    ):
        super().__init__(key, iv=iv, padding=padding, mode=mode, raw=raw, rounds=rounds, swap=swap, **more)

    def _new_cipher(self, **optionals) -> CipherInterface:
        return super()._new_cipher(
            swap=self.args.swap,
            rounds=self.args.rounds,
            **optionals
        )

class chop (size, step=None, truncate=False)

Reinterprets the input as a sequence of equally sized chunks and outputs this sequence.

Expand source code Browse git

class chop(Unit):
    """
    Reinterprets the input as a sequence of equally sized chunks and outputs this sequence.
    """

    def __init__(
        self,
        size: Arg.Number('size', help='Chop data into chunks of this size'),
        step: Arg.Number('step', help=(
            'Optionally specify a step size (which is equal to the size by default) which indicates the number of bytes by '
            'which the cursor will be increased after extracting a chunk.')) = None,
        truncate: Arg.Switch('-t', help=(
            'Truncate possible excess bytes at the end of the input, by default they are appended as a single chunk.')) = False,
    ):
        return super().__init__(size=size, step=step, truncate=truncate)

    def process(self, data):
        view = memoryview(data)
        size = self.args.size
        step = self.args.step
        if size < 1:
            raise ValueError('The chunk size has to be a positive integer value.')
        yield from splitchunks(view, size, step, self.args.truncate)

class clower

Stands for "Convert to LOWER case"; The unit simply converts all latin alphabet chacters in the input to lowercase.

Expand source code Browse git

class clower(Unit):
    """
    Stands for "Convert to LOWER case"; The unit simply converts all latin alphabet chacters in the
    input to lowercase.
    """
    def process(self, data):
        return data.lower()

class cm (invert=False, all=False, reset=False, size=False, ext=False, entropy=False, ic=False, magic=False, sha1=False, sha256=False, crc32=False, md5=False, hashes=False, *names)

The Common Meta variables unit populates the set of meta variables of the current chunk with commonly used metadata. The unit has no effect outside a frame.

Expand source code Browse git

class cm(Unit):
    """
    The Common Meta variables unit populates the set of meta variables of the current chunk with commonly
    used metadata. The unit has no effect outside a frame.
    """
    def __init__(
        self,
        invert  : Arg.Switch('-x', group='ALL', help='populate only options that have not been specified') = False,
        all     : Arg.Switch('-a', group='ALL', help='populate all options') = False,
        reset   : Arg.Switch('-r', help='discard all meta variables that were not explicitly specified') = False,
        size    : Arg.Switch('-S', help='size of the chunk') = False,
        ext     : Arg.Switch('-X', help='guess file extension') = False,
        entropy : Arg.Switch('-E', help='compute data entropy') = False,
        ic      : Arg.Switch('-C', help='compute the index of coincidence') = False,
        magic   : Arg.Switch('-M', help='compute file magic') = False,
        sha1    : Arg.Switch('-1', help='compute hash: SHA-1') = False,
        sha256  : Arg.Switch('-2', help='compute hash: SHA-256') = False,
        crc32   : Arg.Switch('-3', help='compute hash: CRC32') = False,
        md5     : Arg.Switch('-5', help='compute hash: MD5') = False,
        hashes  : Arg.Switch('-H', help='compute all common hashes') = False,
        *names  : Arg(metavar='name', help=(
            F'A variable name that can include the common properties: {_COMMON_PROPERTIES_LIST}.'
            R' If none is given, the size variable is populated. For most of these, an optional '
            R'argument is available that can be used as a shorthand:'))
    ):
        def stringify(name):
            if isinstance(name, str):
                return name
            return name.decode(self.codec)

        names = {stringify(name) for name in names}
        if hashes:
            md5 = sha256 = sha1 = crc32 = True
        if size:
            names.add('size')
        if ext:
            names.add('ext')
        if entropy:
            names.add('entropy')
        if ic:
            names.add('ic')
        if magic:
            names.add('magic')
        if sha1:
            names.add('sha1')
        if sha256:
            names.add('sha256')
        if crc32:
            names.add('crc32')
        if md5:
            names.add('md5')
        if not names and not reset:
            names.add('size')
        if all:
            if invert:
                raise ValueError('invert and all are both enabled, resulting in empty configuration.')
            names = set(LazyMetaOracle.derivations)
        elif invert:
            names = set(LazyMetaOracle.derivations) - names
        super().__init__(names=names, reset=reset)

    def process(self, data):
        return data

    def filter(self, chunks):
        names = self.args.names
        reset = self.args.reset
        for chunk in chunks:
            chunk: Chunk
            if not chunk.visible:
                yield chunk
                continue
            meta = metavars(chunk)
            if reset:
                chunk.meta.clear()
            for name in names:
                chunk[name] = meta[name]
            yield chunk

class cp1252

Encodes and decodes Windows CP 1252 (aka Latin1) encoded string data.

Expand source code Browse git

class cp1252(Unit):
    """
    Encodes and decodes Windows CP 1252 (aka Latin1) encoded string data.
    """

    def process(self, data):
        return data.decode(self.codec).encode('cp1252')

    def reverse(self, data):
        return data.decode('cp1252').encode(self.codec)

class crc32 (reps=1, text=False)

Returns the CRC32 hash of the input data.

Expand source code Browse git

class crc32(HashUnit):
    """
    Returns the CRC32 hash of the input data.
    """
    def _algorithm(self, data: bytes) -> bytes:
        return zlib.crc32(data).to_bytes(4, 'big')

class csb (format, utf16=True, ascii=True, stripspace=False)

Short for carve single buffer; carves the single largest buffer of a given format from the input data and returns it.

Expand source code Browse git

class csb(carve):
    """
    Short for carve single buffer; carves the single largest buffer of a given format from the
    input data and returns it.
    """
    def __init__(self, format, utf16=True, ascii=True, stripspace=False):
        super().__init__(
            format,
            decode=False,
            single=True,
            utf16=utf16,
            ascii=ascii,
            stripspace=stripspace,
        )

class csd (format, utf16=True, ascii=True, stripspace=False)

Short for carve & decode; carves the single largest buffer of a given format from the input and decodes it with the appropriate decoder.

Expand source code Browse git

class csd(carve):
    """
    Short for carve & decode; carves the single largest buffer of a given format from the input
    and decodes it with the appropriate decoder.
    """
    def __init__(self, format, utf16=True, ascii=True, stripspace=False):
        super().__init__(
            format,
            decode=True,
            single=True,
            utf16=utf16,
            ascii=ascii,
            stripspace=stripspace,
        )

class csv (quote=b'"', delim=b',')

Extracts the rows of a CSV document with header and converts them into JSON chunks.

Expand source code Browse git

class csv(Unit):
    """
    Extracts the rows of a CSV document with header and converts them into JSON chunks.
    """
    def __init__(
        self,
        quote: Unit.Arg('-q', help='Specify the quote character, the default is a double quote.') = B'"',
        delim: Unit.Arg('-d', help='Specify the delimiter, the default is a single comma.') = B','
    ):
        super().__init__(quote=quote, delim=delim)

    def json_to_csv(self, table: dict):
        quote = self.args.quote.decode(self.codec)
        delim = self.args.delim.decode(self.codec)

        if not isinstance(table, list):
            raise ValueError('Input must be a JSON list.')

        out = MemoryFile()

        with io.TextIOWrapper(out, self.codec, newline='') as stream:
            writer = _csv.writer(stream, quotechar=quote, delimiter=delim, skipinitialspace=True)
            for row in table:
                if not isinstance(row, list):
                    break
                if not all(isinstance(item, str) for item in row):
                    break
                writer.writerow(row)
            else:
                return out.getvalue()

        keys = {}
        # A dictionary is used here over a set because dictionaries remember insertion order.
        # When feeding the unit a sequence of JSON objects, the user would likely expect the
        # column order in the resulting CSV to derive from the entry oder in the JSON data.

        for row in table:
            for key in row:
                if not isinstance(key, str):
                    continue
                keys[key] = None

        keys = list(keys)
        out = MemoryFile()

        with io.TextIOWrapper(out, self.codec, newline='') as stream:
            writer = _csv.writer(stream, quotechar=quote, delimiter=delim, skipinitialspace=True)
            writer.writerow(keys)
            for row in table:
                writer.writerow([str(row.get(key, '')) for key in keys])
            return out.getvalue()

    def reverse(self, data: bytearray):
        try:
            table: List[Dict[str, Any]] = json.loads(data)
        except Exception:
            table: List[Dict[str, Any]] = [json.loads(line) for line in data.splitlines()]
        return self.json_to_csv(table)

    def process(self, data):
        quote = self.args.quote.decode(self.codec)
        delim = self.args.delim.decode(self.codec)

        def convert(field: str):
            if field.isdigit() and not field.startswith('0'):
                return int(field)
            date = isodate(field)
            if date is not None:
                return date.isoformat(' ', 'seconds')
            return field

        with io.TextIOWrapper(MemoryFile(data), self.codec) as stream:
            rows = _csv.reader(stream, quotechar=quote, delimiter=delim, skipinitialspace=True)
            keys = next(rows)
            for row in rows:
                out = {key: convert(value) for key, value in zip(keys, row)}
                yield json.dumps(out, indent=4).encode(self.codec)

class cswap

Swap the case of the input string; all lowercase letters are turned into their uppercase variant and vice-versa.

Expand source code Browse git

class cswap(Unit):
    """
    Swap the case of the input string; all lowercase letters are turned into their uppercase
    variant and vice-versa.
    """
    def process(self, data: bytearray):
        lcase = bytes(range(B'a'[0], B'z'[0] + 1))
        ucase = bytes(range(B'A'[0], B'Z'[0] + 1))
        delta = lcase[0] - ucase[0]
        for k, letter in enumerate(data):
            if letter in ucase:
                data[k] += delta
            elif letter in lcase:
                data[k] -= delta
        return data

class cupper

Stands for "Convert to UPPER case"; The unit simply converts all latin alphabet chacters in the input to uppercase.

Expand source code Browse git

class cupper(Unit):
    """
    Stands for "Convert to UPPER case"; The unit simply converts all latin alphabet chacters in the
    input to uppercase.
    """
    def process(self, data):
        return data.upper()

class datefix (format='%Y-%m-%d %H:%M:%S', dos=False)

Parses all kinds of date formats and unifies them into the same format.

Expand source code Browse git

class datefix(Unit):
    """
    Parses all kinds of date formats and unifies them into the same format.
    """
    _TIMEZONE_REGEXES = [re_compile(p) for p in [
        R'([+-])(\d{2})(\d{2})$',           # Thu Apr 24 2014 12:32:21 GMT-0700
        R'([+-])(\d{2}):(\d{2})$',          # 2017:09:11 23:47:22+02:00
        R'GMT([+-])(\d{2})(\d{2}) \(.+\)$'  # Thu Apr 24 2014 12:32:21 GMT-0700 (PDT)
    ]]

    def __init__(
        self,
        format: Arg(help='Specify the output format as a strftime-like string, using ISO by default.') = '%Y-%m-%d %H:%M:%S',
        dos: Arg('-d', help='Parse timestamps in DOS rather than Unix format.') = False
    ):
        super().__init__(format=format, dos=dos)

    @staticmethod
    def dostime(stamp: int) -> datetime:
        """
        Parses a given DOS timestamp into a datetime object.
        """
        d, t = stamp >> 16, stamp & 0xFFFF
        s = (t & 0x1F) << 1

        return datetime(
            year   = ((d & 0xFE00) >> 0x9) + 1980,  # noqa
            month  = ((d & 0x01E0) >> 0x5),         # noqa
            day    = ((d & 0x001F) >> 0x0),         # noqa
            hour   = ((t & 0xF800) >> 0xB),         # noqa
            minute = ((t & 0x07E0) >> 0x5),         # noqa
            second = 59 if s == 60 else s,          # noqa
        )

    def _format(self, dt: datetime) -> str:
        return dt.strftime(self.args.format)

    def _extract_timezone(self, data):
        for r in self._TIMEZONE_REGEXES:
            m = r.search(data)
            if not m:
                continue
            pm = m[1]
            td = timedelta(
                hours=int(m[2]), minutes=int(m[3]))
            if pm == '-':
                td = -td
            return data[:-len(m[0])].strip(), td

        return data, None

    @linewise
    def process(self, data: str) -> str:
        data = data.strip()

        # replace colons (i.e. for exiftool dates: 2017:01:01)
        if len(data) > 10 and data[4] == ':' and data[7] == ':':
            data = F'{data[0:4]}-{data[5:7]}-{data[8:]}'

        # strips Z at end (i.e. 20171022055144Z)
        if data.endswith('Z'):
            data = data[:-1]

        if data.startswith('0x'):
            try:
                data = str(int(data, 16))
            except Exception:
                pass

        # parses timestamps and dates without much format
        if data.isdigit():
            time_stamp = int(data)
            if len(data) > 14:
                raise Exception('cannot parse all-numeric string as date: %s' % data)
            elif len(data) == 14:
                # i.e. 20111020193727
                return self._format(datetime.strptime(data, '%Y%m%d%H%M%S'))
            elif len(data) == 13:
                # i.e. 1458016535000
                time_stamp //= 1000
                data = data[:-3]
            if self.args.dos:
                return self._format(self.dostime(time_stamp))
            else:
                return self._format(date_from_timestamp(time_stamp))

        data, time_delta = self._extract_timezone(data)

        for f in _FORMATS:
            try:
                dt = datetime.strptime(data, f)
            except ValueError:
                continue
            return self._format(dt if time_delta is None else dt - time_delta)

        return data

class decompress (prepend=True, tolerance=12, max_ratio=1.0, min_ratio=0.0001, strict_limits=False)

Attempts all available decompression units against the input and returns the output of the first successful one. If none succeeds, the data is returned unaltered. The process is heavily biased against LZNT1 decompression due to a large tendency for LZNT1 false positives.

Expand source code Browse git

class decompress(Unit):
    """
    Attempts all available decompression units against the input and returns
    the output of the first successful one. If none succeeds, the data is
    returned unaltered. The process is heavily biased against LZNT1 decompression
    due to a large tendency for LZNT1 false positives.
    """
    def __init__(
        self,
        prepend: Arg.Switch('-P', '--no-prepend', off=True, help=(
            'By default, if decompression fails, the unit attempts to prefix '
            'the data with all possible values of a single byte and decompress '
            'the result. This behavior can be disabled with this flag.')
        ) = True,
        tolerance: Arg.Number('-t', help=(
            'Maximum number of bytes to strip from the beginning of the data; '
            'The default value is 12.')
        ) = 12,
        max_ratio: Arg('-m', metavar='R', help=(
            'To determine whether a decompression algorithm was successful, the '
            'ratio of compressed size to decompressed size may at most be as large '
            'as this number, a floating point value R; default value is {default}.')
        ) = 1.0,
        min_ratio: Arg('-n', metavar='R', help=(
            'Require that compression ratios must be at least as large as R. This '
            'is a "too good to be true" heuristic against algorithms like lznt1 '
            'that can produce false positives. The default is {default}.')
        ) = 0.0001,
        strict_limits: Arg('-l', action='store_true', help=(
            'For recognized formats, i.e. when a magic signature is present, the '
            'above limits are disabled by default. Activate this flag to enforce '
            'them in every case.')
        ) = False

    ):
        if min_ratio <= 0:
            raise ValueError('The compression factor must be nonnegative.')
        super().__init__(
            tolerance=tolerance,
            prepend=prepend,
            min_ratio=min_ratio,
            max_ratio=max_ratio,
            strict_limits=strict_limits,
        )
        self.engines: Dict[str, Unit] = {}
        for mode in (
            MSCF_MODE.XPRESS,
            MSCF_MODE.XPRESS_HUFF,
        ):
            mode = normalize_to_display(mode.name).casefold()
            unit = mscf.assemble(mode)
            self.engines[F'{unit.name}[{mode}]'] = unit
        for engine in [
            mscf,
            pkw,
            zstd,
            szdd,
            bz2,
            zl,
            lzf,
            lzma,
            lzw,
            jcalg,
            lzo,
            aplib,
            qlz,
            brotli,
            blz,
            lzjb,
            lz4,
            lznt1,
            nrv2e,
            nrv2d,
            nrv2b
        ]:
            unit: Unit = engine.assemble()
            _, _, name = unit.name.rpartition('auto-decompress-')
            self.engines[name] = unit
        for unit in self.engines.values():
            unit.log_detach()

    def process(self, data):

        data = memoryview(data)

        class Decompression(NamedTuple):
            method: str
            engine: Unit
            rating: _R
            result: Optional[ByteString] = None
            cutoff: int = 0
            prefix: Optional[int] = None

            def __str__(self):
                status = self.rating.summary
                method = self.method
                prefix = self.prefix
                if prefix is not None:
                    prefix = F'0x{prefix:02X}'
                return F'prefix={prefix}, cutoff=0x{self.cutoff:02X}, [{status}] method={method}'

            def __len__(self):
                return len(self.result)

            @property
            def ratio(self):
                if not self.result:
                    return INF
                return len(data) / len(self)

            @property
            def unmodified(self):
                return self.prefix is None and self.cutoff == 0

        if self.args.prepend:
            buffer = bytearray(1 + len(data))
            buffer[1:] = data

        best_by_rating: Dict[_R, Decompression] = {}

        def best_current_rating():
            return max(best_by_rating, default=_R.InvalidData)

        def decompress(method: str, engine: Unit, cutoff: int = 0, prefix: Optional[int] = None, careful: bool = False):
            ingest = data[cutoff:]
            rating = _R.ValidData
            if cutoff == 0 and prefix is None and not careful:
                rating |= _R.NotMangled
            if prefix is not None:
                buffer[0] = prefix
                ingest = buffer
            is_handled = engine.handles(ingest)
            if is_handled is True:
                rating |= _R.KnownFormat
            if is_handled is False:
                return Decompression(method, engine, _R.InvalidData, None, cutoff, prefix)
            try:
                result = next(engine.act(ingest))
            except RefineryPartialResult as pr:
                rating |= _R.HadOutput
                result = pr.partial
            except Exception:
                result = None
            else:
                rating |= _R.Successful
            return Decompression(method, engine, rating, result, cutoff, prefix)

        def update(new: Decompression, discard_if_too_good=False):
            ratio = new.ratio
            if self.args.strict_limits or not new.rating & _R.KnownFormat:
                if ratio > self.args.max_ratio:
                    return
                if ratio < self.args.min_ratio:
                    return
            best = best_by_rating.get(new.rating, None)
            prefix = new.prefix
            if prefix is not None:
                prefix = F'0x{prefix:02X}'
            if new.unmodified and best and not best.unmodified:
                threshold = 1
            else:
                threshold = 0.95
            if not best or len(new) < len(best):
                q = 0
            else:
                q = len(best) / len(new)
            ratio *= 100
            brief = new.rating.brief
            if q < threshold:
                if best and discard_if_too_good:
                    if q < 0.5:
                        return
                    if new.rating & _R.Successful != _R.Successful:
                        return
                self.log_info(lambda:
                    F'[switch] [{brief}] [q={q:07.4f}] compression ratio {ratio:07.4f}% with: {new!s}')
                best_by_rating[new.rating] = new
            else:
                self.log_debug(lambda:
                    F'[reject] [{brief}] [q={q:07.4f}] compression ratio {ratio:07.4f}% with: {new!s}')

        for method, engine in self.engines.items():
            self.log_debug(F'attempting engine: {method}')
            careful = isinstance(engine, (lznt1, lzf, lzjb))
            for t in range(self.args.tolerance + 1):
                if best_current_rating() >= _R.Successful and careful and t > 0:
                    break
                update(decompress(method, engine, t, None, careful), careful)
            if self.args.prepend and method not in _NO_PREFIX and best_current_rating() < _R.Successful:
                for p in range(0x100):
                    update(decompress(method, engine, 0, p, careful), careful)

        for r in sorted(best_by_rating, reverse=True):
            if dc := best_by_rating[r]:
                if not dc.rating & _R.HadOutput:
                    continue
                self.log_info(F'settling on {dc.method} decompression, cutoff={dc.cutoff} and prefix={dc.prefix}.')
                if dc.rating & _R.NotMangled:
                    self.log_info('supporting evidence: no modifications to the buffer were necessary')
                if dc.rating & _R.KnownFormat:
                    self.log_info('supporting evidence: found a known magic signature')
                if dc.rating & _R.HadNoErrors:
                    self.log_info('supporting evidence: engine produced output without errors')
                elif dc.rating & _R.HadOutput:
                    self.log_info('supporting evidence: there were errors, but the engine produced output')
                if not dc.rating & _R.Successful:
                    self.log_info('the only decompression with result returned only a partial result.')
                return self.labelled(dc.result, method=dc.method)

        raise ValueError('no compression engine worked')

class dedup (key=None, count=False)

Deduplicates a sequence of multiple inputs. The deduplication is limited to the current refinery.lib.frame.

Expand source code Browse git

class dedup(Unit):
    """
    Deduplicates a sequence of multiple inputs. The deduplication is limited to the current `refinery.lib.frame`.
    """
    def __init__(
        self,
        key: Arg('key', type=str, help='An optional meta variable expression to deduplicate.') = None,
        count: Arg.Switch('-c', help='Store the count of each deduplicated chunk.') = False
    ):
        super().__init__(key=key, count=count)

    def filter(self, chunks):
        keyvar = self.args.key

        if keyvar is not None:
            def key(chunk):
                v = PythonExpression.Evaluate(keyvar, metavars(chunk))
                if isbuffer(v):
                    v = md5(v).digest()
                return v
        else:
            def key(chunk):
                return md5(chunk).digest()

        if self.args.count:
            counts = {}
            buffer = {}
            hashes = None
        else:
            hashes = set()
            counts = None
            buffer = None

        for chunk in chunks:
            if not chunk.visible:
                yield chunk
                continue

            uid = key(chunk)

            if hashes is None:
                counts[uid] = counts.get(uid, 0) + 1
                buffer.setdefault(uid, chunk)
            elif uid in hashes:
                continue
            else:
                hashes.add(uid)
                yield chunk

        if hashes is None:
            for uid, chunk in buffer.items():
                yield self.labelled(chunk, count=counts[uid])

class defang (url_only=False, url_protocol=False, dot_only=False, quote_md=False)

Defangs all URL, domain and IPv4 address indicators in the input data by replacing the last dot in the expression by [.]. For example, 127.0.0.1 will be replaced by 127.0.0[.]1. For URL indicators, the colon after the procol scheme is also wrapped in brackets.

Expand source code Browse git

class defang(Unit):
    """
    Defangs all URL, domain and IPv4 address indicators in the input data by replacing the last dot
    in the expression by `[.]`. For example, `127.0.0.1` will be replaced by `127.0.0[.]1`. For URL
    indicators, the colon after the procol scheme is also wrapped in brackets.
    """

    _WHITELIST = [
        B'wscript.shell',
    ]

    _PROTOCOL_ESCAPES = {
        B'http': B'hxxp',
        B'https': B'hxxps',
        B'ftp': B'fxp',
        B'ftps': B'fxps',
    }

    def __init__(
        self,
        url_only: Arg.Switch('-u', help='Only defang URLs, do not look for domains or IPs.') = False,
        url_protocol: Arg.Switch('-p', help='Escape the protocol in URLs.') = False,
        dot_only: Arg.Switch('-d', help='Do not escape the protocol colon in URLs.') = False,
        quote_md: Arg.Switch('-q', help='Wrap all indicators in backticks for markdown code.') = False
    ):
        self.superinit(super(), **vars())

    def _quote(self, word):
        return word if not self.args.quote_md else B'`%s`' % word

    def reverse(self, data: bytearray):
        def refang(hostname):
            return hostname[0].replace(B'[.]', B'.')
        data = defanged.hostname.sub(refang, data)
        data = data.replace(B'[:]//', B'://')
        data = data.replace(B'[://]', B'://')
        data = re.sub(B'h.{3}?(s?)://', B'http\\1://', data)
        data = re.sub(B'fxp(s?)://', B'ftp\\1://', data)
        return data

    def process(self, data):
        def replace_hostname(hostname: bytes, match=True):
            if match:
                return self._quote(replace_hostname(hostname[0], False))
            self.log_info('replace:', hostname)
            host = hostname
            user, atsgn, host = host.rpartition(B'@')
            host, colon, port = host.rpartition(B':')
            host = host.lower()
            if not colon:
                host = port
                port = B''
            if host in self._WHITELIST:
                return hostname
            host = re.split(R'(?:\[\.\]|\.)', host.decode('latin1'))
            if len(host) == 1:
                return hostname
            components = iter(reversed(host))
            defanged_parts = [next(components)]
            separator = '[.]'
            for part in components:
                defanged_parts.append(separator)
                defanged_parts.append(part)
                separator = '[.]' if part in tlds else '.'
            defanged_host = ''.join(reversed(defanged_parts)).encode('latin1')
            return user + atsgn + defanged_host + colon + port

        def replace_url(url: bytes):
            if not url:
                return url
            self.log_info('replace:', url)
            url = url.replace(B'[:]//', B'://', 1)
            url = url.replace(B'[.]', B'.')
            prefix = B'tcp'
            if url.startswith(B'://'):
                scheme = 0
            elif url.startswith(B'//'):
                scheme = 1
                prefix = prefix + B':'
            else:
                scheme = 2
                prefix = B''
            parsed = urlparse(prefix + url)
            operations = {
                name: self.process(getattr(parsed, name))
                for name in ('path', 'params', 'query', 'fragment')
            }
            if self.args.url_protocol and parsed.scheme:
                operations.update(scheme=self._PROTOCOL_ESCAPES.get(parsed.scheme.lower(), scheme))
            if scheme < 2:
                operations.update(scheme=B'')
            operations.update(netloc=replace_hostname(parsed.netloc, False))
            url = urlunparse(parsed._replace(**operations))
            if scheme == 0:
                url = B':' + url
            if not self.args.dot_only:
                url = url.replace(B'://', B'[:]//')
            return self._quote(url)

        urlsplit = defanged.url.split(data)
        step = defanged.url.value.groups + 1
        urlsplit[1::step] = [replace_url(t) for t in itertools.islice(iter(urlsplit), 1, None, step)]

        if not self.args.url_only:
            urlsplit[0::step] = [
                indicators.hostname.sub(replace_hostname, t)
                for t in itertools.islice(iter(urlsplit), 0, None, step)
            ]

        def fuse(urlsplit):
            txt = itertools.islice(iter(urlsplit), 0, None, step)
            url = itertools.islice(iter(urlsplit), 1, None, step)
            while True:
                try:
                    yield next(txt)
                    yield next(url)
                except StopIteration:
                    break

        return B''.join(fuse(urlsplit))

class deob_js_arithmetic

Expand source code Browse git

class deob_js_arithmetic(Deobfuscator):
    def deobfuscate(self, data):
        strings = StringLiterals(formats.string, data)

        @strings.outside
        def evaluate(match: re.Match[str]):
            expression = match[0]
            expression = expression.strip()
            if not any(c.isdigit() for c in expression):
                return expression
            brackets = 0
            positions = []
            ok = True
            head = tail = rest = ''
            for end, character in enumerate(expression):
                if character == '(':
                    brackets += 1
                    positions.append(end)
                    continue
                if character == ')':
                    brackets -= 1
                    if brackets < 0:
                        expression, tail = expression[:end], expression[end:]
                        break
                    else:
                        positions.pop()
                    if brackets == 0 and expression[0] == '(':
                        expression, rest = expression[:end + 1], expression[end + 1:]
                        break
            if expression.isdigit():
                return match[0]
            if brackets > 0:
                pos = positions[~0] + 1
                head = expression[:pos]
                expression = expression[pos:]
            try:
                result = str(cautious_eval(expression + rest))
            except Exception:
                ok = False
            else:
                rest = ''
            if not ok and rest:
                try:
                    result = str(cautious_eval(expression))
                except Exception:
                    expression += rest
                else:
                    ok = True
            if not ok:
                result = expression
                self.log_info(F'error trying to parse arithmetic expression at offset {match.start()}: ({expression})')
            else:
                if expression.startswith('(') and expression.endswith(')'):
                    result = F'({result})'
            if tail:
                tail = self.deobfuscate(tail)
            return F'{head}{result}{rest}{tail}'

        pattern = re.compile(R'(?:{i}|{f}|[-+(])(?:[^\S\r\n]{{0,20}}(?:{i}|{f}|[-%|&~<>()+/*^]))+'.format(
            i=str(formats.integer), f=str(formats.float)))

        return pattern.sub(evaluate, data)

class deob_js_arrays

JavaScript deobfuscator to turn ["Z", "t", "s", "e"][0] into "Z".

Expand source code Browse git

class deob_js_arrays(Deobfuscator):
    """
    JavaScript deobfuscator to turn `["Z", "t", "s", "e"][0]` into `"Z"`.
    """

    def deobfuscate(self, data):
        strlit = StringLiterals(formats.string, data)

        @strlit.outside
        def litpick(match: re.Match[str]):
            try:
                array = match[1]
                index = int(match[2])
                lpick = array.split(',')[index].strip()
                self.log_debug(lambda: F'{lpick} = {match[0]}')
            except (TypeError, IndexError):
                lpick = match[0]
            return lpick

        p = R'\s{{0,5}}'.join([
            '\\[', '((?:{i}|{s})', '(?:,', '(?:{i}|{s})', ')*)', '\\]', '\\[', '({i})', '\\]'
        ]).format(i=formats.integer, s=formats.string)
        return re.sub(p, litpick, data)

class deob_js_comments

JavaScript deobfuscator that removes comments from the script.

Expand source code Browse git

class deob_js_comments(Deobfuscator):
    """
    JavaScript deobfuscator that removes comments from the script.
    """
    def deobfuscate(self, data):
        strings = StringLiterals(formats.string, data)
        @strings.outside
        def remove(_): return ''

        data = re.sub(R'/\*.*?\*/', remove, data, flags=re.DOTALL)
        data = re.sub(R'(?m)//.*$', remove, data)
        return data

class deob_js_concat (timeout=100)

Expand source code Browse git

class deob_js_concat(IterativeDeobfuscator):
    _SENTINEL = re.compile(R'''['"]\s*\+\s*['"]''')

    def deobfuscate(self, data):
        def concat(data):
            strlit = StringLiterals(formats.string, data)
            repeat = True
            while repeat:
                for match in self._SENTINEL.finditer(data):
                    a, b = match.span()
                    a = strlit.get_container(a)
                    if a is None:
                        continue
                    b = strlit.get_container(b)
                    if b is None or b != a + 1:
                        continue
                    _, a = strlit.ranges[a]
                    b, c = strlit.ranges[b]
                    yield data[:a - 1] + data[b + 1:c]
                    data = data[c:]
                    strlit.update(data)
                    break
                else:
                    repeat = False
            yield data

        return ''.join(concat(data))

class deob_js_getattr

JavaScript deobfuscator to turn WScript["CreateObject"] into WScript.CreateObject.

Expand source code Browse git

class deob_js_getattr(Deobfuscator):
    """
    JavaScript deobfuscator to turn `WScript["CreateObject"]` into `WScript.CreateObject`.
    """

    def deobfuscate(self, data):
        strlit = StringLiterals(formats.string, data)

        @strlit.outside
        def dottify(match: re.Match[str]):
            name = match[2][1:-1]
            if name.isidentifier():
                return F'{match[1]}.{name}'
            return match[0]

        return re.sub(FR'(\w+)\[({formats.string})\]', dottify, data)

class deob_js_tuples

JavaScript deobfuscator to turn ("Z", "t", "s", "e") into "e".

Expand source code Browse git

class deob_js_tuples(Deobfuscator):
    """
    JavaScript deobfuscator to turn `("Z", "t", "s", "e")` into `"e"`.
    """

    def deobfuscate(self, data):

        def litpick(match):
            try:
                array = match[1]
                lpick = array.split(',')[-1].strip()
                self.log_debug(lambda: F'{lpick} = {match[0]}')
            except (TypeError, IndexError):
                lpick = match[0]
            return lpick

        p = R'\s{{0,5}}'.join([
            '\\(', '((?:{i}|{s})', '(?:,', '(?:{i}|{s})', ')*)', '\\)'
        ]).format(i=formats.integer, s=formats.string)
        return re.sub(p, litpick, data)

class deob_ps1 (timeout=100)

Expand source code Browse git

class deob_ps1(IterativeDeobfuscator):

    _SUBUNITS: List[Type[Deobfuscator]] = [
        deob_ps1_escape,
        deob_ps1_cases,
        deob_ps1_brackets,
        deob_ps1_format,
        deob_ps1_typecast,
        deob_ps1_stringreplace,
        deob_ps1_b64convert,
        deob_ps1_encodings,
        deob_ps1_concat,
        deob_ps1_invoke,
        deob_ps1_uncurly
    ]

    def deobfuscate(self, data):
        units = [u() for u in self._SUBUNITS]
        for u in units:
            u.log_level = self.log_level
        for unit in units:
            self.log_debug(lambda: F'invoking {unit.name}')
            checkpoint = hash(data)
            data = unit.deobfuscate(data)
            if checkpoint != hash(data) and not self.log_debug('data has changed.'):
                self.log_info(F'used {unit.name}')
        return re.sub(R'[\r\n]+', '\n', data)

class deob_ps1_b64convert

Expand source code Browse git

class deob_ps1_b64convert(Deobfuscator):

    _SENTINEL = re.compile('\\s*'.join(
        (re.escape('[System.Convert]::FromBase64String'), '\\(', '({s})', '\\)')
    ).format(s=formats.ps1str), flags=re.IGNORECASE)

    def deobfuscate(self, data):
        strlit = Ps1StringLiterals(data)

        def replacer(match: re.Match[str]):
            if strlit.get_container(match.start()):
                return match[0]
            try:
                string, = string_unquote(match[1])
            except ValueError:
                return match[0]
            try:
                bytes = base64.b64decode(string)
            except Exception:
                return match[0]
            return '@({})'.format(','.join(F'0x{b:02X}' for b in bytes))

        return self._SENTINEL.sub(replacer, data)

class deob_ps1_brackets

PowerShell deobfuscation that removes superfluous brackets around constant literals, i.e. ("{0}{2}{1}") is transformed to "{0}{2}{1}". Currently, only integer and string constants are supported.

Expand source code Browse git

class deob_ps1_brackets(Deobfuscator):
    """
    PowerShell deobfuscation that removes superfluous brackets around constant
    literals, i.e. `("{0}{2}{1}")` is transformed to `"{0}{2}{1}"`. Currently,
    only integer and string constants are supported.
    """
    _SENTINEL = re.compile(
        RF'''(?<![\w"']{{2}})'''  # this may be a function call
        RF'''(\-\w+)?'''  # not a function call but an argument
        RF'''\(\s*({formats.integer}|{formats.ps1str})\s*(\S)''',
        flags=re.IGNORECASE
    )

    def deobfuscate(self, data):
        strlit = Ps1StringLiterals(data)
        repeat = True

        @strlit.outside
        def replacement(match):
            nonlocal repeat
            if match[3] == ')':
                repeat = True
                return (match[1] or '') + match[2]

        while repeat:
            repeat = False
            data = self._SENTINEL.sub(replacement, data)

        return data

class deob_ps1_cases

Expand source code Browse git

class deob_ps1_cases(Deobfuscator):
    _NAMES = [
        '-BXor',
        '-Exec Bypass',
        '-NoLogo',
        '-NonInter',
        '-Replace',
        '-Windows Hidden',
        '.Invoke',
        'Assembly',
        'Byte',
        'Char',
        'ChildItem',
        'CreateThread',
        'Get-Variable',
        'GetType',
        'IntPtr',
        'Invoke-Expression',
        'Invoke',
        'Length',
        'Net.WebClient',
        'PowerShell',
        'PSVersionTable',
        'Set-Item',
        'Set-Variable',
        'Start-Sleep',
        'ToString',
        'Type',
        'Value',
        'Void',
    ]

    @outside(formats.ps1str)
    def deobfuscate(self, data):
        for name in self._NAMES:
            data = re.sub(RF'\b{re.escape(name)}\b', name, data, flags=re.IGNORECASE)
        return data

class deob_ps1_concat (timeout=100)

Expand source code Browse git

class deob_ps1_concat(IterativeDeobfuscator):
    _SENTINEL = re.compile(R'''['"]\s*[+&]\s*['"]''')

    def deobfuscate(self, data):

        def concat(data):
            strlit = Ps1StringLiterals(data)
            repeat = True
            while repeat:
                for match in self._SENTINEL.finditer(data):
                    a, b = match.span()
                    a = strlit.get_container(a)
                    if a is None:
                        continue
                    b = strlit.get_container(b)
                    if b is None or b != a + 1:
                        continue
                    a = strlit.ranges[a]
                    b = strlit.ranges[b]
                    stra = data[slice(*a)]
                    strb = data[slice(*b)]
                    parts = list(string_unquote(stra))
                    it = iter(string_unquote(strb))
                    parts[~0] += next(it)
                    parts.extend(it)
                    yield data[:a[0]] + string_quote(parts)
                    data = data[b[1]:]
                    strlit.update(data)
                    break
                else:
                    repeat = False
            yield data

        return ''.join(concat(data))

class deob_ps1_encodings

Expand source code Browse git

class deob_ps1_encodings(Deobfuscator):

    _SENTINEL = re.compile('\\s*'.join(
        (re.escape('[System.Text.Encoding]::') + '(\\w+)\\.GetString', '\\(', '@\\(', '({a})', '\\)', '\\)')
    ).format(a=formats.intarray), flags=re.IGNORECASE)

    def deobfuscate(self, data):
        strlit = Ps1StringLiterals(data)

        def replacer(match: re.Match[str]):
            if strlit.get_container(match.start()):
                return match[0]
            try:
                bytes = bytearray(int(x.strip(), 0) for x in match[2].split(','))
            except Exception:
                return match[0]
            encoding = {
                'ASCII': 'ascii',
                'BigEndianUnicode': 'utf-16be',
                'Default': 'latin1',
                'Unicode': 'utf-16le',
            }.get(match[1], match[1])
            try:
                codecs.lookup(encoding)
            except LookupError:
                encoding = 'utf8'
            try:
                string = bytes.decode(encoding)
            except Exception:
                return match[0]
            return string_quote(string)

        return self._SENTINEL.sub(replacer, data)

class deob_ps1_escape

Expand source code Browse git

class deob_ps1_escape(Deobfuscator):

    def deobfuscate(self, data):
        strlit = Ps1StringLiterals(data)
        @strlit.outside
        def repl(m): return m[1]
        return re.sub(R'''`([^0abfnrtv`#'"\$])''', repl, data)

class deob_ps1_format

PowerShell deobfuscation for the following "format string"-based technique:

"{0}{2}{1}"-f 'signa','ures','t'
"{0}na{2}{1}"-f 'sig','ures','t'

Expand source code Browse git

class deob_ps1_format(Deobfuscator):
    """
    PowerShell deobfuscation for the following "format string"-based technique:

    - `"{0}{2}{1}"-f 'signa','ures','t'`
    - `"{0}na{2}{1}"-f 'sig','ures','t'`
    """

    def deobfuscate(self, data):

        repeat = True

        while repeat:

            repeat = False

            for string in re.finditer(str(formats.ps1str), data):
                argmatch = re.search(R'^\s*-[fF]\s*((?:{s},\s*)*{s})'.format(s=formats.ps1str), data[string.end():])
                if not argmatch:
                    continue

                def dbgmsg():
                    sample = string[0]
                    if len(sample) > 33:
                        sample = F"{sample[1:30]}...{sample[0]}"
                    return F'found match at {string.start()}: {sample}'

                self.log_debug(dbgmsg)

                args = re.split(F'({formats.ps1str})', argmatch[1])
                args = [list(string_unquote(a.strip())) for a in args[1::2]]

                def formatter(string):
                    buffer = []
                    for k, part in enumerate(re.split(R'(\{\d+\})', string)):
                        if k % 2 == 0:
                            if part:
                                buffer.append(part)
                            continue
                        try:
                            index = int(part[1:-1])
                            arg = args[index]
                        except IndexError as IE:
                            raise IndexError(F'only found {len(args)} arguments and format sequence {index}, aborting.') from IE

                        it = iter(arg)
                        buffer.append(next(it))

                        if len(arg) > 1:
                            yield ''.join(buffer)
                            buffer = []
                            for last, part in lookahead(it):
                                if last:
                                    buffer.append(part)
                                    break
                                yield part

                    yield ''.join(buffer)

                try:
                    result = string_apply(string[0], formatter)
                except IndexError:
                    continue

                data = data[:string.start()] + result + data[argmatch.end() + string.end():]
                repeat = True
                break

        return data

class deob_ps1_invoke

Expand source code Browse git

class deob_ps1_invoke(Deobfuscator):
    def deobfuscate(self, data):
        strlit = Ps1StringLiterals(data)

        @strlit.outside
        def invrepl1(m): return m[1] + m[3]

        data = re.sub(
            R'''(\.|::)'''                    # preceeded by dot or namespace delimiter
            R'''(['"])(\w{1,200})\2'''        # quoted string (actually a method name)
            R'''(?=[\s\(\.\,\;\+\-])''',      # only if followed by certain characters
            invrepl1, data                    # remove quotes around symbol
        )

        @strlit.outside
        def invrepl2(m): return m[1] + '('

        data = re.sub(
            '\\s{0,5}'.join([
                '[.&]', '(\\(',               # sourcing operator
                '(?:gcm|get-command)', ')?',  # potentially a get-command
                '([\'"])([-a-z]{1,100})\\2'   # string enclosing a command
                '(?(1)\\s{0,5}\\)|)',         # closing bracket for get-command
            ]), '\\3', data, flags=re.IGNORECASE
        )
        data = re.sub(
            R'''(\w{1,200})\.Invoke\s*\(''',
            invrepl2, data,
            flags=re.IGNORECASE
        )

        return data

class deob_ps1_secstr (*a)

Expand source code Browse git

class deob_ps1_secstr(Deobfuscator):
    def __init__(self, *a, **kw):
        super().__init__(*a, **kw)

        self._pack = pack()
        self._secstr = secstr()

        self._pattern = re.compile(
            R'\s{{0,20}}'.join([
                R'''(['"])({b})\1''',
                R'\|', R'\.?', R'&?',
                R'''(['"]?)ConvertTo-SecureString\3''',
                R'-ke?y?',
                R'''(\(?)({a}|{i}\s{{0,20}}\.\.\s{{0,20}}{i})''',
                R'((?:\)\s{{0,20}}){{0,10}})?'
            ]).format(
                b=formats.b64,
                a=formats.intarray,
                i=formats.integer
            ),
            flags=re.IGNORECASE | re.DOTALL
        )

    def _decrypt_block(self, data, match):
        if '..' in match[5]:
            a, b = [int(x.strip(), 0) for x in match[5].split('..')]
            key = range(min(a, b), max(a, b) + 1)
            if a > b:
                key = reversed(key)
            self._secstr.args.key = bytes(bytearray(key))
        else:
            self._secstr.args.key = self._pack(match[5].encode(self.codec))
        decoded = self._secstr(match[2].encode(self.codec))
        decoded = decoded.decode(self.codec)
        result = F'\n\n{decoded}\n\n'
        brackets = match[6].count(')')
        start = match.start()
        if match[4]:
            brackets -= 1
        if brackets <= 0:
            if brackets < 0:
                result += ')'
            return start, result
        while brackets:
            start -= 1
            if data[start] == '(':
                brackets -= 1
            if data[start] == ')':
                brackets += 1
        return start, result

    def deobfuscate(self, data):
        while True:
            match = self._pattern.search(data)
            if not match:
                break
            start, result = self._decrypt_block(data, match)
            data = data[:start] + result + data[match.end():]
        return data

class deob_ps1_stringreplace

Expand source code Browse git

class deob_ps1_stringreplace(Deobfuscator):

    _SENTINEL = re.compile((
        R'(?i)[\'"]\s*'               # end of haystack string
        R'(-c|-i|-|\.)replace'        # the replace call
        R'([\(\s]*)({s})([\)\s]*),'   # needle (with brackets)
        R'([\(\s]*)({s})([\)\s]*)'    # insert (with brackets)
    ).format(s=formats.ps1str), flags=re.IGNORECASE)

    def deobfuscate(self, data):
        repeat = True
        strlit = Ps1StringLiterals(data)

        while repeat:
            repeat = False
            needle = None

            for match in self._SENTINEL.finditer(data):
                k = strlit.get_container(match.start())
                if k is None:
                    continue
                offset, end = strlit.ranges[k]
                if match.start() != end - 1:
                    continue
                string = data[offset:end]
                pf, bl1, needle, bl2, br1, insert, br2 = match.groups()
                end = match.end()
                case = '' if pf[0] in '.c' else '(?i)'
                bl = bl1.count('(') - bl2.count(')')
                br = br2.count(')') - br1.count('(')
                if pf[0] == '.':
                    bl -= 1
                    br -= 1
                if bl != 0 or br < 0:
                    continue
                needle = list(string_unquote(needle))
                if len(needle) > 1:
                    continue

                needle = needle[0]
                head, *body = string_unquote(insert)

                self.log_info('replacing', needle, 'by', insert)

                if not body:
                    def perform_replacement(string):
                        return re.sub(F'{case}{re.escape(needle)}', lambda _: head, string)
                else:
                    *body, tail = body
                    def perform_replacement(string): # noqa
                        parts = re.split(F'{case}{re.escape(needle)}', string)
                        if len(parts) == 1:
                            yield string
                            return
                        it = iter(parts)
                        yield next(it) + head
                        yield from body
                        for last, part in lookahead(it):
                            if last:
                                yield tail + part
                            else:
                                yield tail + part + head
                                yield from body

                replaced = string_apply(string, perform_replacement) + (br * ')')
                strlit.ranges[k] = offset, offset + len(replaced) - br
                strlit.ranges[k + 1: k + 3] = []
                strlit.shift(len(replaced) + offset - end, k + 1)
                data = data[:offset] + replaced + data[end:]
                repeat = True
                break

        return data

class deob_ps1_typecast

Replaces sequences like [Char]120 to their string representation, in this case the string "x".

Expand source code Browse git

class deob_ps1_typecast(Deobfuscator):
    """
    Replaces sequences like [Char]120 to their string representation, in this
    case the string "x".
    """

    def deobfuscate(self, data):
        strlit = Ps1StringLiterals(data)

        @strlit.outside
        def strip_typecast(m): return m[1]

        data = re.sub(
            FR'\[(?:string|char\[\])\]\s*({formats.ps1str!s})',
            strip_typecast,
            data,
            flags=re.IGNORECASE
        )

        @strlit.outside
        def char_literal(match):
            c = chr(int(match[1].lower(), 0))
            if c == "'":
                return '''"'"'''
            return F"'{c}'"

        data = re.sub(
            R'\[char\]\s*0*(0x[0-9a-f]+|\d+)',
            char_literal,
            data,
            flags=re.IGNORECASE
        )

        def char_array(match):
            result = bytes(int(x, 0) for x in match[1].split(','))
            try:
                result = result.decode('ascii')
                if not all(x in string.printable or x.isspace() for x in result):
                    raise ValueError
            except ValueError:
                return match[0]
            else:
                return string_quote(result)

        data = re.sub(
            R'\s*'.join([
                R'\[char\[\]\]',
                R'\((',
                R'(?:\s*(?:0x[0-9a-f]+|\d+)\s*,)+',
                R'(?:0x[0-9a-f]+|\d+)',
                R')\)'
            ]),
            char_array,
            data,
            flags=re.IGNORECASE
        )

        return data

class deob_ps1_uncurly

PowerShell deobfuscation that removes superfluous curly braces around variable names that do not require it, i.e. ${variable} is transformed to just $variable.

Expand source code Browse git

class deob_ps1_uncurly(Deobfuscator):
    """
    PowerShell deobfuscation that removes superfluous curly braces around variable
    names that do not require it, i.e. `${variable}` is transformed to just `$variable`.
    """

    _SENTINEL = re.compile(R'\$\{(\w+)\}')

    def deobfuscate(self, data):
        strlit = Ps1StringLiterals(data)
        @strlit.outside
        def strip(m): return F'${m[1]}'
        return self._SENTINEL.sub(strip, data)

class deob_vba (timeout=100)

Expand source code Browse git

class deob_vba(IterativeDeobfuscator):

    _SUBUNITS: List[Type[Deobfuscator]] = [
        deob_vba_comments,
        deob_vba_brackets,
        deob_vba_char_function,
        deob_vba_concat,
        deob_vba_arithmetic,
        deob_vba_constants,
        deob_vba_dummy_variables,
        deob_vba_stringreplace,
        deob_vba_stringreverse,
    ]

    def deobfuscate(self, data):
        units = [u() for u in self._SUBUNITS]
        for u in units:
            u.log_level = self.log_level
        for unit in units:
            self.log_debug(lambda: F'invoking {unit.name}')
            checkpoint = hash(data)
            data = unit.deobfuscate(data)
            if checkpoint != hash(data) and not self.log_debug('data has changed.'):
                self.log_info(F'used {unit.name}')
        return re.sub(R'[\r\n]+', '\n', data)

class deob_vba_arithmetic

Expand source code Browse git

class deob_vba_arithmetic(Deobfuscator):
    def deobfuscate(self, data):
        strings = StringLiterals(formats.vbastr, data)

        def vba_int_eval(match: re.Match[str]) -> str:
            s = match[0].lower()
            if not s.startswith('&'):
                return s
            t, s = s[1], s[2:].rstrip('&')
            if t == 'h':
                return str(int(s, 16))
            if t == 'b':
                return str(int(s, 2))
            if t == 'o':
                return str(int(s, 8))

        @strings.outside
        def evaluate(match: re.Match[str]):
            expression = match[0]
            expression = expression.strip()
            if not any(c.isdigit() for c in expression):
                return expression
            expression = re.sub(str(formats.vbaint), vba_int_eval, expression)
            brackets = 0
            positions = []
            ok = True
            head = tail = rest = ''
            for end, character in enumerate(expression):
                if character == '(':
                    brackets += 1
                    positions.append(end)
                    continue
                if character == ')':
                    brackets -= 1
                    if brackets < 0:
                        expression, tail = expression[:end], expression[end:]
                        break
                    else:
                        positions.pop()
                    if brackets == 0 and expression[0] == '(':
                        expression, rest = expression[:end + 1], expression[end + 1:]
                        break
            if expression.isdigit():
                return match[0]
            if brackets > 0:
                pos = positions[~0] + 1
                head = expression[:pos]
                expression = expression[pos:]
            try:
                result = str(_cautious_vba_eval(expression + rest))
            except Exception:
                ok = False
            else:
                rest = ''
            if not ok and rest:
                try:
                    result = str(_cautious_vba_eval(expression))
                except Exception:
                    expression += rest
                else:
                    ok = True
            if not ok:
                result = expression
                self.log_info(F'error trying to parse arithmetic expression at offset {match.start()}: ({expression})')
            else:
                if expression.startswith('(') and expression.endswith(')'):
                    result = F'({result})'
            if tail:
                tail = self.deobfuscate(tail)
            return F'{head}{result}{rest}{tail}'

        pattern = re.compile(R'(?:{i}|{f}|[-+(])(?:[^\S\r\n]{{0,20}}(?:{i}|{f}|[-%|&~<>()+/*^]))+'.format(
            i=str(formats.vbaint), f=str(formats.float)))

        return pattern.sub(evaluate, data)

class deob_vba_brackets

Expand source code Browse git

class deob_vba_brackets(Deobfuscator):
    _SENTINEL = re.compile(
        RF'''(?<![\w"']{{2}})'''  # this may be a function call
        RF'''\(\s*({formats.vbaint}|{formats.vbastr}|{formats.float})\s*(\S)''',
        flags=re.IGNORECASE
    )

    def deobfuscate(self, data):
        strlit = StringLiterals(formats.vbastr, data)
        repeat = True

        @strlit.outside
        def replacement(match):
            nonlocal repeat
            if match[2] == ')':
                repeat = True
                return match[1]

        while repeat:
            repeat = False
            data = self._SENTINEL.sub(replacement, data)

        return data

class deob_vba_char_function

Expand source code Browse git

class deob_vba_char_function(Deobfuscator):
    def deobfuscate(self, data):
        strings = StringLiterals(formats.vbastr, data)

        @strings.outside
        def evaluate_char_function(match: re.Match[str]):
            try:
                c = chr(int(match[1]))
            except ValueError:
                return match[0]
            if c == '"':
                return '""""'
            if c == '\\':
                return '"\\"'
            c = repr(c)[1:-1]
            if len(c) > 1:
                return match[0]
            return '"{}"'.format(c)

        return re.sub(R'(?i)\bchrw?\s*\(\s*(\d+)\s*\)', evaluate_char_function, data)

class deob_vba_chr_literals

Expand source code Browse git

class deob_vba_chr_literals(Unit):
    def process(self, data):
        def _chr(m):
            code = int(m[1], 0)
            if code == 34:
                return B'""""'
            return B'"%s"' % chr(code).encode('unicode_escape')
        data = re.sub(BR'Chr\((\d+x?\d+)\)', _chr, data, flags=re.IGNORECASE)
        data = re.sub(BR'"\s*\&\s*"', B'', data)
        return data

class deob_vba_comments

Expand source code Browse git

class deob_vba_comments(Deobfuscator):
    def deobfuscate(self, data):
        return re.sub(R"(?im)^\s{0,20}(?:'|rem\b|dim\b).*(?:\Z|$\n\r?)", '', data)

class deob_vba_concat (timeout=100)

Expand source code Browse git

class deob_vba_concat(IterativeDeobfuscator):
    _SENTINEL = re.compile(R'''"\s*(\++|&)\s*"''')

    def deobfuscate(self, data):

        def concat(data):
            strlit = StringLiterals(formats.vbastr, data)
            repeat = True
            while repeat:
                for match in self._SENTINEL.finditer(data):
                    a, b = match.span()
                    a = strlit.get_container(a)
                    if a is None:
                        continue
                    b = strlit.get_container(b)
                    if b is None or b != a + 1:
                        continue
                    _, a = strlit.ranges[a]
                    b, c = strlit.ranges[b]
                    yield data[:a - 1] + data[b + 1:c]
                    data = data[c:]
                    strlit.update(data)
                    break
                else:
                    repeat = False
            yield data

        return ''.join(concat(data))

class deob_vba_constants

Expand source code Browse git

class deob_vba_constants(Deobfuscator):
    def deobfuscate(self, data):
        codelines = data.splitlines(keepends=True)
        constants = {}
        constline = {}
        variables = set()
        for k, line in enumerate(codelines):
            match = re.match(R'(?im)^\s*(?:sub|function)\s*(\w+)', line)
            if match:
                variables.add(match[1])
                continue
            match = re.match(
                R'(?im)^(?:\s*const)?\s*(\w+)\s*=\s*({i}|{s})\s*(?:\'|rem|$)'.format(
                    s=formats.ps1str,
                    i=formats.integer
                ), line)
            if match is None or match[1] in variables:
                pass
            elif match[2] != constants.get(match[1], match[2]):
                self.log_debug(F'del {match[1]}')
                del constants[match[1]]
                del constline[match[1]]
                variables.add(match[1])
            else:
                self.log_debug(F'add {match[1]} = {match[2]}')
                constants[match[1]] = match[2]
                constline[match[1]] = k
        codelines = [line for k, line in enumerate(codelines) if k not in constline.values()]
        data = ''.join(codelines)
        for name, value in constants.items():
            data = re.sub(RF'\b{re.escape(name)!s}\b', lambda _: value, data)

        return data

class deob_vba_dummy_variables

Expand source code Browse git

class deob_vba_dummy_variables(Deobfuscator):
    def deobfuscate(self, data):
        lines = data.splitlines(keepends=False)
        names = collections.defaultdict(list)

        def might_be_used_in(name, line):
            # avoid finding the name within a string literal
            line = '""'.join(re.split(str(formats.ps1str), line))
            line = re.split(RF'\b{name}\b', line)
            try:
                L, R = line
            except ValueError:
                return False
            L = L.strip().lower()
            if L.startswith("'") or L.startswith('rem'):
                return False
            R = R.strip().lower()
            if R.startswith('=') and 'if' not in L:
                return False
            if L.startswith('dim'):
                return False
            return True

        pattern = re.compile(
            R'(?i)^\s{0,8}(?:const\s{1,8})?(\w+)\s{1,8}=\s{1,8}.*$'
        )

        for k, line in enumerate(lines):
            try:
                name = pattern.match(line)[1]
            except (AttributeError, TypeError):
                continue
            if re.search(r'\w+\(', line):
                # might be a function call
                continue
            names[name].append(k)

        for line in lines:
            while True:
                for name in names:
                    if might_be_used_in(name, line):
                        del names[name]
                        break
                else:
                    break

        return '\n'.join(line for k, line in enumerate(lines) if not any(
            k in rows for rows in names.values()))

class deob_vba_stringreplace

Expand source code Browse git

class deob_vba_stringreplace(Deobfuscator):

    _SENTINEL = re.compile((
        R'(?i)\bReplace\s*\('  # the replace call
        R'\s*({s}),'           # haystack (with brackets)
        R'\s*({s}),'           # needle (with brackets)
        R'\s*({s})\s*\)'       # insert (with brackets)
    ).format(s=formats.vbastr), flags=re.IGNORECASE)

    def deobfuscate(self, data):
        strlit = StringLiterals(formats.vbastr, data)

        @strlit.outside
        def replacement(match: re.Match[str]):
            return string_quote(
                string_unquote(match[1]).replace(
                    string_unquote(match[2]),
                    string_unquote(match[3])
                )
            )

        return self._SENTINEL.sub(replacement, data)

class deob_vba_stringreverse

Expand source code Browse git

class deob_vba_stringreverse(Deobfuscator):

    _SENTINEL = re.compile((
        R'(?i)\bStrReverse\s*\('  # the reverse call
        R'\s*({s})\s*\)'          # string
    ).format(s=formats.vbastr), flags=re.IGNORECASE)

    def deobfuscate(self, data):
        strlit = StringLiterals(formats.vbastr, data)

        @strlit.outside
        def replacement(match: re.Match[str]):
            return string_quote(''.join(reversed(string_unquote(match[1]))))

        return self._SENTINEL.sub(replacement, data)

class des (key, *, iv=b'', padding=None, mode=None, raw=False, little_endian=False, segment_size=0, tag=None, aad=None)

DES encryption and decryption.

Expand source code Browse git

class des(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(DES)):
    """
    DES encryption and decryption.
    """
    pass

class des3 (key, *, iv=b'', padding=None, mode=None, raw=False, little_endian=False, segment_size=0, tag=None, aad=None)

3-DES encryption and decryption.

Expand source code Browse git

class des3(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(DES3)):
    """
    3-DES encryption and decryption.
    """
    pass

class deskd (size=8)

Stands for "DES Key Derivation". It implements the same functionality as DES_string_to_key in OpenSSL. It converts a string to an 8 byte DES key with odd byte parity, per FIPS specification. This is not a modern key derivation function.

Expand source code Browse git

class deskd(KeyDerivation):
    """
    Stands for "DES Key Derivation". It implements the same functionality as `DES_string_to_key` in OpenSSL. It
    converts a string to an 8 byte DES key with odd byte parity, per FIPS specification. This is not a modern
    key derivation function.
    """
    def __init__(self, size: Arg(help='The number of bytes to generate, default is the maximum of 8.') = 8):
        super().__init__(size=size, salt=None)

    def process(self, password):
        from Cryptodome.Cipher import DES
        from Cryptodome.Util.strxor import strxor

        password = bytes(password)
        key = bytearray(8)

        for i, p in enumerate(password):
            if ((i % 16) < 8):
                key[i % 8] ^= (p << 1) & 0xFF
            else:
                p = (((p << 4) & 0xf0) | ((p >> 4) & 0x0f))
                p = (((p << 2) & 0xcc) | ((p >> 2) & 0x33))
                p = (((p << 1) & 0xaa) | ((p >> 1) & 0x55))
                key[7 - (i % 8)] ^= p

        des_set_odd_parity(key)

        if password:
            n = len(password)
            password = password.ljust(n + 7 - ((n - 1) % 8), b'\0')
            des = DES.new(key, DES.MODE_ECB)
            for k in range(0, n, 8):
                key[:] = des.encrypt(strxor(password[k:k + 8], key))
            des_set_odd_parity(key)

        if self.args.size > 8:
            raise RefineryPartialResult('can provide at most 8 bytes.', partial=key)

        return key[:self.args.size]

class dexstr

Extract strings from DEX (Dalvik Executable) files.

Expand source code Browse git

class dexstr(Unit):
    """
    Extract strings from DEX (Dalvik Executable) files.
    """
    def process(self, data):
        dex = DexFile(data)
        for string in dex.read_strings():
            yield string.encode(self.codec)

class djb2 (reps=1, text=False)

Computes the DJB2 hash of the input data.

Expand source code Browse git

class djb2(HashUnit):
    """
    Computes the DJB2 hash of the input data.
    """
    def _algorithm(self, data: bytes) -> bytes:
        h = 5381
        for b in data:
            h = ((h << 5) + h + b) & 0xFFFFFFFF
        return h.to_bytes(4, 'big')

class dnarrays

Extracts arrays of strings or integers that are encoded in the .NET binary as IL opcodes. The data is exported as JSON.

Expand source code Browse git

class dnarrays(Unit):
    """
    Extracts arrays of strings or integers that are encoded in the .NET binary as IL opcodes.
    The data is exported as JSON.
    """
    @staticmethod
    def _read_int(reader: StructReader):
        value = reader.read_byte() - 0x16
        if value < 0:
            raise ValueError
        elif value <= 8:
            return value
        elif value == 9:
            return reader.read_byte()
        elif value == 10:
            return reader.u32()
        else:
            raise ValueError

    @staticmethod
    def _read_str(reader: StructReader, header: DotNetHeader):
        if reader.read_byte() != 0x72:
            raise ValueError
        token: int = reader.read_integer(24)
        value: str = header.meta.Streams.US[token]
        if reader.read_byte() != 0x70:
            raise ValueError
        return value

    _STACK_ARRAY_PATTERN_STR = re.compile(
        BR'''(?x)
        (?: [\x16-\x1E]|\x1F.|\x20.{4} ) # load array length
        (?:  \x8D...\x01               ) # newarr System.String
        (?:
        (?:  \x25                      ) # dup
        (?: [\x16-\x1E]|\x1F.|\x20.{4} ) # load integer index
        (?:  \x72...\x70               ) # load the string
        (?:  \xA2                      ) # stelem.ref
        ){4,}
        ''', flags=re.DOTALL)

    def _str_arrays(self, data: ByteStr, header: DotNetHeader, tables: NetMetaDataTables):
        for match in self._STACK_ARRAY_PATTERN_STR.finditer(data):
            reader = StructReader(match[0])
            result: list[str] = []
            size = self._read_int(reader)
            if reader.read_byte() != 0x8D:
                raise RuntimeError
            stt = reader.read_integer(24)
            if reader.read_byte() != 0x01:
                raise RuntimeError
            if stt < 1 or tables.TypeRef[stt - 1].TypeName != 'String':
                continue
            self.log_info(F'str array pattern at 0x{match.start():X}, size {size}')
            for k in range(size):
                if reader.read_byte() != 0x25:
                    raise RuntimeError
                if self._read_int(reader) != k:
                    break
                result.append(self._read_str(reader, header))
                if reader.read_byte() != 0xA2:
                    raise RuntimeError
            else:
                yield match.start(), result

    _STACK_ARRAY_PATTERN_INT = re.compile(
        BR'''(?x)
        (    \x12.|\xFE\x0D..          ) # load array variable
        (?: [\x16-\x1E]|\x1F.|\x20.{4} ) # push integer value
        (?:  \x52                      ) # store value into array
        (?:
        (?:  \1                        ) # load same array variable
        (?: [\x16-\x1E]|\x1F.|\x20.{4} ) # load integer index
        (?:  \x58                      ) # add; compute offset
        (?: [\x16-\x1E]|\x1F.|\x20.{4} ) # push integer value
        (?:  \x52                      ) # store value into array
        ){4,}
        ''', flags=re.DOTALL)

    def _int_arrays(self, data: ByteStr, header: DotNetHeader, tables: NetMetaDataTables):
        for match in self._STACK_ARRAY_PATTERN_INT.finditer(data):
            self.log_info(F'int array pattern at 0x{match.start():X}')
            reader = StructReader(match[0])
            result: list[int] = []
            opc, = reader.peek(1)
            skip = {0x12: 2, 0xFE: 4}[opc]
            reader.seekrel(skip)
            for index in itertools.count(1):
                result.append(self._read_int(reader))
                assert reader.read_byte() == 0x52
                if reader.eof:
                    yield match.start(), result
                    break
                reader.seekrel(skip)
                if self._read_int(reader) != index:
                    self.log_info('index inconsistency; aborting')
                    break
                assert reader.read_byte() == 0x58

    def process(self, data):
        header = DotNetHeader(data)
        tables = header.meta.Streams.Tables
        cp = CodePath(header)

        arrays = dict(itertools.chain(
            self._int_arrays(data, header, tables),
            self._str_arrays(data, header, tables),
        ))
        result = collections.defaultdict(list)
        for offset in sorted(arrays):
            result[cp.method_spec(offset)].append(arrays[offset])

        result = {m: {F'v{k}': v for k, v in enumerate(t, 1)} for m, t in result.items()}
        return json.dumps(result, indent=4).encode(self.codec)

    @classmethod
    def handles(cls, data):
        from refinery.lib.id import is_likely_pe_dotnet
        return is_likely_pe_dotnet(data)

class dnasm (*, count=None, until=None, no_il_refs=False, no_address=False, no_hexdump=False, no_args=False, description=False)

Disassembles the input data as MSIL (.NET/C# bytecode) and produces a human-readable disassembly listing. If you are looking for a more programmatic disassembly, take a look at dnopc.

Expand source code Browse git

class dnasm(DotnetDisassemblerUnit):
    """
    Disassembles the input data as MSIL (.NET/C# bytecode) and produces a human-readable disassembly listing. If you are
    looking for a more programmatic disassembly, take a look at `refinery.dnopc`.
    """

    def __init__(
        self, *,
        count=None, until=None,
        no_il_refs: Arg.Switch('-I', help='Disable reference resolution to IL_*.') = False,
        no_address: Arg.Switch('-A', help='Disable address display.') = False,
        no_hexdump: Arg.Switch('-H', help='Disable opcodes hexdump.') = False,
        no_args: Arg.Switch('-O', help='Disable output of instruction arguments.') = False,
        description: Arg.Switch('-d', help='Enable opcodes descriptions in output.') = False,
    ):
        self._output_factory = OutputFactory(
            il_refs=not no_il_refs,
            address=not no_address,
            hexdump=not no_hexdump,
            arguments=not no_args,
        )
        self._disassembler = Disassembler()
        super().__init__(
            count=count,
            until=until,
            description=description,
        )

    def process(self, data):
        meta = metavars(data)
        r = re.compile(r't[0-9a-f]+', re.IGNORECASE)
        self._output_factory.extend_token_labels({int(k[1:], 16): v for k, v in meta.items() if r.match(k)})
        until = str(self.args.until or '').lower()

        max_line_length = 0
        if self.args.description:
            disasm = []
            for ins in self._disassembler.disasm(data, self.args.count):
                disasm.append(ins)
                line = self._output_factory.instruction(ins)
                max_line_length = max(max_line_length, len(line))
        else:
            disasm = self._disassembler.disasm(data, self.args.count)

        for ins in disasm:
            line = self._output_factory.instruction(ins)
            if self.args.description:
                line += ' ' * (max_line_length - len(line) + 2)
                line += f'-- {ins.op.description}'
            yield line.encode("utf-8")

            if until and until in line.lower():
                break

class dnblob

Extracts all blobs defined in the #Blob stream of .NET executables.

Expand source code Browse git

class dnblob(Unit):
    """
    Extracts all blobs defined in the `#Blob` stream of .NET executables.
    """
    def process(self, data):
        header = DotNetHeader(data, parse_resources=False)
        for blob in header.meta.Streams.Blob.values():
            yield blob

    @classmethod
    def handles(cls, data):
        from refinery.lib.id import is_likely_pe_dotnet
        return is_likely_pe_dotnet(data)

class dnds (dereference=True, encode=None, digest=None)

Stands for "DotNet DeSerialize": Expects data that has been serialized using the .NET class "BinaryFormatter". The output is a representation of the deserialized data in JSON format.

Expand source code Browse git

class dnds(JSONEncoderUnit):
    """
    Stands for "DotNet DeSerialize": Expects data that has been serialized using the .NET class
    "BinaryFormatter". The output is a representation of the deserialized data in JSON format.
    """

    def __init__(
        self, dereference: Arg.Switch('-r', '--keep-references', off=True,
            help='Do not resolve Object references in serialized data.') = True,
        encode=None, digest=None
    ):
        super().__init__(encode=encode, digest=digest, dereference=dereference)

    def process(self, data):
        self.log_debug('initializing parser, will fail on malformed stream')
        bf = BinaryFormatterParser(
            data,
            keep_meta=True,
            dereference=self.args.dereference,
            ignore_errors=not self.log_debug(),
        )

        return self.to_json([
            {
                'Type': repr(record),
                'Data': record
            } for record in bf
        ])

class dnfields (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

This unit can extract data from constant field variables in classes of .NET executables. Since the .NET header stores only the offset and not the size of constant fields, heuristics are used to search for opcode sequences that load the data and additional heuristics are used to guess the size of the data type.

Expand source code Browse git

class dnfields(PathExtractorUnit):
    """
    This unit can extract data from constant field variables in classes of .NET
    executables. Since the .NET header stores only the offset and not the size of
    constant fields, heuristics are used to search for opcode sequences that load
    the data and additional heuristics are used to guess the size of the data
    type.
    """
    @classmethod
    def handles(cls, data):
        from refinery.lib.id import is_likely_pe_dotnet
        return is_likely_pe_dotnet(data)

    def unpack(self, data):
        header = DotNetHeader(data, parse_resources=False)
        tables = header.meta.Streams.Tables
        fields = tables.FieldRVA
        cpaths = CodePath(header)

        if not fields:
            return

        icache: Dict[bytes, FieldInfo] = {}
        memory = memoryview(data)

        def _guess_field_info(t: int, signature: bytes, field_name: Optional[str] = None, sizemap: dict = {
            '^s?byte$'       : 1,
            '^s?char$'       : 2,
            '^[us]?int.?16$' : 2,
            '^[us]?int.?32$' : 4,
            '^[us]?int.?64$' : 8,
        }) -> Tuple[Optional[str], FieldInfo]:
            try:
                info = icache[signature]
            except KeyError:
                info = None
            else:
                if field_name is not None:
                    return field_name, info
            pattern = (
                BR'(\x20....|\x1F.|[\x17-\x1E])'    # ldc.i4  count
                BR'\x8D(...)([\x01\x02])'           # newarr  col|row
                BR'\x25'                            # dup
                BR'\xD0\x%02x\x%02x\x%02x\x04'      # ldtoken t
                BR'(?:.{0,12}?'                     # ...
                BR'\x80(...)\x04)?' % (             # stsfld variable
                    (t >> 0x00) & 0xFF,
                    (t >> 0x08) & 0xFF,
                    (t >> 0x10) & 0xFF
                )
            )
            for match in re.finditer(pattern, memory, flags=re.DOTALL):
                if info is None:
                    count, j, r, name = match.groups()
                    count = integer_from_ldc(count)
                    j, r = struct.unpack('<LB', B'%s\0%s' % (j, r))
                    typename = tables[r][j - 1].TypeName
                else:
                    name = match.group(4)
                    typename = info.type
                for pattern, size in sizemap.items():
                    if not re.match(pattern, typename, flags=re.IGNORECASE):
                        continue
                    if name:
                        try:
                            name = struct.unpack('<L', B'%s\0' % name)
                            name = name[0]
                            name = tables[4][name - 1].Name
                        except Exception as E:
                            self.log_info(F'attempt to parse field name failed: {E!s}')
                            name = None
                    if name is None:
                        name = field_name
                    if info is None:
                        info = FieldInfo(typename, count, size, match.start())
                    icache[signature] = info
                    return name, info
            else:
                return None, None

        iwidth = len(str(len(fields)))
        rwidth = max(len(F'{field.RVA:X}') for field in fields)
        rwidth = max(rwidth, 4)
        remaining_field_indices = set(range(len(tables.Field)))

        unpack = []
        name_count = Counter(tables.Field[rv.Field.Index - 1].Name for rv in fields)
        name_width = len(str(len(fields)))

        for k, rv in enumerate(fields):
            _index = rv.Field.Index
            field = tables.Field[_index - 1]
            remaining_field_indices.discard(_index - 1)
            if not field.Flags.HasFieldRVA:
                continue
            fname = field.Name
            type = None
            signature: bytes = field.Signature
            offset = header.pe.rva_to_offset(rv.RVA)

            if len(signature) == 2:
                # Crude signature parser for non-array case. Reference:
                # https://www.codeproject.com/Articles/42649/NET-File-Format-Signatures-Under-the-Hood-Part-1
                # https://www.codeproject.com/Articles/42655/NET-file-format-Signatures-under-the-hood-Part-2
                guess = {
                    0x03: FieldInfo('Char',   1, 1, 0),  # noqa
                    0x04: FieldInfo('SByte',  1, 1, 0),  # noqa
                    0x05: FieldInfo('Byte',   1, 1, 0),  # noqa
                    0x06: FieldInfo('Int16',  1, 2, 0),  # noqa
                    0x07: FieldInfo('UInt16', 1, 2, 0),  # noqa
                    0x08: FieldInfo('Int32',  1, 4, 0),  # noqa
                    0x09: FieldInfo('UInt32', 1, 4, 0),  # noqa
                    0x0A: FieldInfo('Int64',  1, 8, 0),  # noqa
                    0x0B: FieldInfo('UInt64', 1, 8, 0),  # noqa
                    0x0C: FieldInfo('Single', 1, 4, 0),  # noqa
                    0x0D: FieldInfo('Double', 1, 8, 0),  # noqa
                }.get(signature[1], None)
            else:
                fname, guess = _guess_field_info(_index, signature, fname)

            if guess is None:
                self.log_warn(lambda: F'field {k:0{iwidth}d} with signature {field.Signature.hex()}: unable to guess type information')
                continue
            if not fname.isprintable() or name_count[fname] > 1:
                fname = F'Field{k + 1:0{name_width}d}'
            type = guess.type.lower()
            if guess.count > 1:
                type += F'[{guess.count}]'
            self.log_debug(
                F'field {k:0{iwidth}d}; token 0x{_index:06X}; RVA 0x{rv.RVA:04X}; count {guess.count}; type {guess.type}; name {fname}')
            end = offset + guess.count * guess.size
            path = cpaths.method_path(guess.offset) if guess.offset else ''
            unpack.append(UnpackResult(F'{path}/{fname}', memory[offset:end], name=fname, type=type))

        for _index in remaining_field_indices:
            field = tables.Field[_index]
            index = _index + 1
            name = field.Name
            if field.Flags.HasFieldRVA:
                self.log_warn(F'field {name} has RVA flag set, but no RVA was found')
            token = index.to_bytes(3, 'little')
            values = {}
            for match in re.finditer((
                BR'\x72(?P<token>...)\x70'          # ldstr
                BR'(?:\x6F(?P<function>...)\x0A)?'  # call GetBytes
                BR'\x80%s\x04'                      # stsfld
            ) % re.escape(token), data, re.DOTALL):
                md = match.groupdict()
                fn_token = md.get('function')
                fn_index = fn_token and int.from_bytes(fn_token, 'little') or None
                if fn_index is not None:
                    fn_name = tables.MemberRef[fn_index].Name
                    if fn_name != 'GetBytes':
                        self.log_info(F'skipping string assignment passing through call to {fn_name}')
                        continue
                k = int.from_bytes(md['token'], 'little')
                values[match.start()] = header.meta.Streams.US[k].encode(self.codec)
            if not values:
                continue
            if len(values) == 1:
                offset, value = values.popitem()
                path = cpaths.method_path(offset)
                unpack.append(UnpackResult(F'{path}/{name}', value, name=name, type='string'))

        unpack.sort(key=lambda u: u.path)
        yield from unpack

class dnhdr (resources=False, encode=None, digest=None)

Expects data that has been formatted with the BinaryFormatter class. The output is a representation of the deserialized data in JSON format.

Expand source code Browse git

class dnhdr(JSONEncoderUnit):
    """
    Expects data that has been formatted with the `BinaryFormatter` class. The
    output is a representation of the deserialized data in JSON format.
    """
    def __init__(
        self,
        resources: Arg.Switch('-r', '--resources', help='Also parse .NET resources.') = False,
        encode=None, digest=None
    ):
        super().__init__(encode=encode, digest=digest, resources=resources)

    def process(self, data):
        dn = DotNetHeader(data, parse_resources=self.args.resources)
        dn = {
            'Head': dn.head,
            'Meta': dn.meta
        }

        if self.args.resources:
            dn['RSRC'] = dn.resources

        return self.to_json(dn)

    @classmethod
    def handles(cls, data):
        from refinery.lib.id import is_likely_pe_dotnet
        return is_likely_pe_dotnet(data)

class dnmr (*paths, list=False, join_path=False, drop_path=False, exact=False, fuzzy=0, regex=False, path=b'name', raw=False)

Extracts subfiles from .NET managed resources.

Expand source code Browse git

class dnmr(PathExtractorUnit):
    """
    Extracts subfiles from .NET managed resources.
    """
    def __init__(
        self, *paths, list=False, join_path=False, drop_path=False, exact=False, fuzzy=0, regex=False, path=b'name',
        raw: Arg.Switch('-w', help='Do not deserialize the managed resource entry data.') = False
    ):
        super().__init__(
            *paths,
            list=list,
            join_path=join_path,
            drop_path=drop_path,
            path=path,
            raw=raw,
            fuzzy=fuzzy,
            exact=exact,
            regex=regex,
        )

    def unpack(self, data):
        try:
            managed = NetStructuredResources(data)
        except NoManagedResource:
            managed = None
        if not managed:
            raise RefineryPartialResult('no managed resources found', partial=data)
        for entry in managed:
            if entry.Error:
                self.log_warn(F'entry {entry.Name} carried error message: {entry.Error}')
            data = entry.Data
            if not self.args.raw:
                if isinstance(entry.Value, str):
                    data = entry.Value.encode('utf-16le')
                elif isbuffer(entry.Value):
                    data = entry.Value
            yield UnpackResult(entry.Name, data)

    @classmethod
    def handles(cls, data):
        return data[:4] == b'\xCE\xCA\xEF\xBE'

class dnopc (*, count=None, until=None, nvar='name', avar='addr', ovar='arg')

Disassembles the input data as MSIL (.NET/C# bytecode) and generates opcodes with metadata as output. This is useful for programmatic disassembly, while the dnasm unit outputs a human-readable representation.

Expand source code Browse git

class dnopc(DotnetDisassemblerUnit):
    """
    Disassembles the input data as MSIL (.NET/C# bytecode) and generates opcodes with metadata as output. This
    is useful for programmatic disassembly, while the `refinery.dnasm` unit outputs a human-readable
    representation.
    """

    def __init__(
        self,
        *,
        count=None,
        until=None,
        nvar: Arg.String(
            '-n',
            help='Variable to receive the disassembled mnemonic. Default is "{default}".',
        ) = 'name',
        avar: Arg.String(
            '-a',
            help='Variable to receive the address of the instruction. Default is "{default}".',
        ) = 'addr',
        ovar: Arg.String(
            '-o',
            help=('Variable prefix for instruction operands. Default is "{default}". The complete operand '
                  'string will be in {default}s, the first argument in {default}1, the second in {default}2, '
                  'and so on.'),
        ) = 'arg',
        **more
    ):
        super().__init__(
            count=count,
            until=until,
            nvar=nvar,
            avar=avar,
            ovar=ovar,
            **more
        )

    def process(self, data):
        until = str(self.args.until or '').lower()
        factory = OutputFactory()
        for ins in Disassembler().disasm(data, self.args.count):
            kwargs = {
                self.args.avar: ins.offset,
                self.args.nvar: ins.op.mnemonic,
            }
            for k, arg in enumerate(ins.arguments, 1):
                kwargs[F'{self.args.ovar}{k}'] = arg.value
            yield self.labelled(ins.data, **kwargs)

            if until and until in factory.instruction(ins).lower():
                break

class dnrc (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

Extracts all .NET resources whose name matches any of the given patterns and outputs them. Use the refinery.units.formats.pe.dotnet.dnmr unit to extract subfiles from managed .NET resources.

Expand source code Browse git

class dnrc(PathExtractorUnit):
    """
    Extracts all .NET resources whose name matches any of the given patterns
    and outputs them. Use the `refinery.units.formats.pe.dotnet.dnmr` unit to
    extract subfiles from managed .NET resources.
    """
    def unpack(self, data):
        header = DotNetHeader(data)

        if not header.resources:
            if self.args.list:
                return
            raise ValueError('This file contains no resources.')

        for resource in header.resources:
            yield UnpackResult(resource.Name, resource.Data)

    @classmethod
    def handles(cls, data):
        from refinery.lib.id import is_likely_pe_dotnet
        return is_likely_pe_dotnet(data)

class dnsdomain (min=1, max=None, len=None, stripspace=False, duplicates=False, longest=False, take=None)

Extracts domain names in the format as they appear in DNS requests. This can be used as a quick and dirty way to extract domains from PCAP files, for example.

Expand source code Browse git

class dnsdomain(PatternExtractorBase):
    """
    Extracts domain names in the format as they appear in DNS requests. This
    can be used as a quick and dirty way to extract domains from PCAP files,
    for example.
    """

    _DOMAIN_CHARACTERS = (
        B'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        B'abcdefghijklmnopqrstuvwxyz'
        B'0123456789-_'
    )

    _DOMAIN_PATTERN = BR'(?:%s){1,20}(?:%s)\b' % (_lps(0xFF), _lps(25))

    def process(self, data):

        def transform(match):
            match = bytearray(match[0])
            pos = 0
            while pos < len(match):
                length = match[pos]
                match[pos] = 0x2E
                if len(match) < length + pos:
                    return None
                if any(x not in self._DOMAIN_CHARACTERS for x in match[pos + 1 : pos + length]):
                    return None
                pos += 1 + length
            return match[1:]

        yield from self.matches_filtered(memoryview(data), self._DOMAIN_PATTERN, transform)

class dnsfx (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

Extracts files from .NET single file applications.

Expand source code Browse git

class dnsfx(PathExtractorUnit):
    """
    Extracts files from .NET single file applications.
    """
    _SIGNATURE = bytes([
        # 32 bytes represent the bundle signature: SHA-256 for '.net core bundle'
        0x8b, 0x12, 0x02, 0xb9, 0x6a, 0x61, 0x20, 0x38,
        0x72, 0x7b, 0x93, 0x02, 0x14, 0xd7, 0xa0, 0x32,
        0x13, 0xf5, 0xb9, 0xe6, 0xef, 0xae, 0x33, 0x18,
        0xee, 0x3b, 0x2d, 0xce, 0x24, 0xb3, 0x6a, 0xae
    ])

    def unpack(self, data):
        reader = StreamReader(data)
        reader.seek(self._find_bundle_manifest_offset(data))

        major_version = reader.expect(UInt32)
        minor_version = reader.expect(UInt32)
        self.log_info(F'version {major_version}.{minor_version}')

        count = reader.expect(UInt32)
        bhash = reader.expect(StringPrimitive)
        self.log_info(F'bundle {bhash} contains {count} files')

        if major_version >= 2:
            reader.expect(UInt64) # depsOffset
            reader.expect(UInt64) # depsSize
            reader.expect(UInt64) # runtimeConfigOffset
            reader.expect(UInt64) # runtimeConfigSize
            reader.expect(UInt64) # flags

        for _ in range(count):
            try:
                offset = reader.expect(UInt64)
                size = reader.expect(UInt64)
                compressed_size = 0
                if major_version >= 6:
                    compressed_size = reader.expect(UInt64)
                type = reader.expect(Byte)
                path = reader.expect(StringPrimitive)

                def _logmsg():
                    _log = F'read item at offset 0x{offset:08X}, type 0x{type:02X}, size {SizeInt(size)!r}'
                    if compressed_size:
                        return F'{_log}, compressed to size {SizeInt(compressed_size)!r}'
                    return F'{_log}, uncompressed'

                self.log_debug(_logmsg)

                with reader.checkpoint():
                    reader.seek(offset)
                    if compressed_size:
                        item_data = reader.read(compressed_size) | zl | bytearray
                    else:
                        item_data = reader.read(size)

                yield UnpackResult(path, item_data)
            except ParserEOF:
                self.log_warn('unexpected EOF while parsing bundle, terminating')
                break

    def _find_bundle_manifest_offset(self, data: bytearray) -> int:
        bundle_sig_offset = data.find(self._SIGNATURE, 0)
        if bundle_sig_offset < 0:
            raise ValueError('Cannot find valid Bundle Manifest offset. Is this a .NET Bundle?')
        return int.from_bytes(data[bundle_sig_offset - 8:bundle_sig_offset], 'little')

    @classmethod
    def handles(cls, data: bytearray):
        return cls._SIGNATURE in data

class dnstr (user=True, meta=True)

Extracts all strings defined in the #Strings and #US streams of .NET executables.

Expand source code Browse git

class dnstr(Unit):
    """
    Extracts all strings defined in the `#Strings` and `#US` streams of .NET executables.
    """

    def __init__(
        self,
        user: Arg.Switch('-m', '--meta', off=True, group='HEAP', help='Only extract from #Strings.') = True,
        meta: Arg.Switch('-u', '--user', off=True, group='HEAP', help='Only extract from #US.') = True,
    ):
        if not meta and not user:
            raise ValueError('Either ascii or utf16 strings must be enabled.')
        super().__init__(meta=meta, user=user)

    def process(self, data):
        header = DotNetHeader(data, parse_resources=False)
        if self.args.meta:
            for string in header.meta.Streams.Strings.values():
                yield string.encode(self.codec)
        if self.args.user:
            for string in header.meta.Streams.US.values():
                yield string.encode(self.codec)

    @classmethod
    def handles(cls, data):
        from refinery.lib.id import is_likely_pe_dotnet
        return is_likely_pe_dotnet(data)

class docmeta (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

Extract metadata from Word Documents such as custom document properties.

Expand source code Browse git

class docmeta(PathExtractorUnit):
    """
    Extract metadata from Word Documents such as custom document properties.
    """
    @PathExtractorUnit.Requires('olefile', 'formats', 'office')
    def _olefile():
        import olefile
        return olefile

    def unpack(self, data: bytearray):
        properties = data | xtdoc('docProps/custom.xml') | str
        if not properties:
            return
        properties = xml.parse(properties)
        while properties.tag.lower() != 'properties':
            properties = properties.children[0]
        for node in properties:
            assert node.tag.lower() == 'property'
            assert len(node.children) == 1
            content = node.children[0].content
            assert content is not None
            yield UnpackResult(node.attributes['name'], content.encode(self.codec))

class doctxt

Extracts the text body from Word documents.

Expand source code Browse git

class doctxt(Unit):
    """
    Extracts the text body from Word documents.
    """

    @Unit.Requires('olefile', 'formats', 'office', 'extended')
    def _olefile():
        import olefile
        return olefile

    def process(self, data: bytearray):
        extractors: Dict[str, Callable[[bytearray], str]] = OrderedDict(
            doc=self._extract_ole,
            docx=self._extract_docx,
            odt=self._extract_odt,
        )
        if data.startswith(B'PK'):
            self.log_debug('document contains zip file signature, likely a odt or docx file')
            extractors.move_to_end('doc')
            if 'opendocument' in str(data | xtzip('mimetype')):
                self.log_debug('odt signature detected')
                extractors.move_to_end('odt', last=False)
        for filetype, extractor in extractors.items():
            self.log_debug(F'trying to extract as {filetype}')
            try:
                result = extractor(data)
            except ImportError:
                raise
            except Exception as error:
                self.log_info(F'failed extractring as {filetype}: {error!s}')
            else:
                return result.encode(self.codec)
        raise ValueError('All extractors failed, the input data is not recognized as any known document format.')

    def _extract_docx(self, data: Chunk) -> str:
        NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
        PARAGRAPH = F'{NAMESPACE}p'
        TEXT = F'{NAMESPACE}t'
        chunk = data | xtzip('word/document.xml') | bytearray
        if not chunk:
            raise ValueError('No document.xml file found.')
        root: Element = XML(chunk)
        with StringIO() as output:
            for index, paragraph in enumerate(root.iter(PARAGRAPH)):
                if index > 0:
                    output.write('\n')
                for node in paragraph.iter(TEXT):
                    if node.text:
                        output.write(node.text)
            return output.getvalue()

    def _extract_odt(self, data: bytes):
        def _extract_text(node: Element):
            NAMESPACE = '{urn:oasis:names:tc:opendocument:xmlns:text:1.0}'
            PARAGRAPH = F'{NAMESPACE}p'
            SPAN = F'{NAMESPACE}span'
            SPACE = F'{NAMESPACE}s'
            with StringIO() as res:
                for element in node:
                    tag = element.tag
                    text = element.text or ''
                    tail = element.tail or ''
                    if tag in [PARAGRAPH, SPAN]:
                        res.write(text)
                    elif tag == SPACE:
                        res.write(' ')
                    else:
                        self.log_debug(F'unknown tag: {tag}')
                    res.write(_extract_text(element))
                    res.write(tail)
                    if tag == PARAGRAPH:
                        res.write('\n')
                return res.getvalue()

        NAMESPACE = '{urn:oasis:names:tc:opendocument:xmlns:office:1.0}'
        BODY = F'{NAMESPACE}body'
        TEXT = F'{NAMESPACE}text'
        for part in xtzip().unpack(data):
            if part.path != 'content.xml':
                continue
            xml_content: bytes = part.get_data()
            root: Element = XML(xml_content)
            body: Element = root.find(BODY)
            text: Element = body.find(TEXT)
            return _extract_text(text)
        else:
            raise ValueError('found no text')

    def _extract_ole(self, data: bytearray) -> str:
        stream = MemoryFile(data)
        with self._olefile.OleFileIO(stream) as ole:
            doc = ole.openstream('WordDocument').read()
            with StructReader(doc) as reader:
                table_name = F'{(doc[11] >> 1) & 1}Table'
                reader.seek(0x1A2)
                offset = reader.u32()
                length = reader.u32()
            with StructReader(ole.openstream(table_name).read()) as reader:
                reader.seek(offset)
                table = reader.read(length)
            piece_table = self._load_piece_table(table)
            return self._get_text(doc, piece_table)

    def _load_piece_table(self, table: bytes) -> bytes:
        with StructReader(table) as reader:
            while not reader.eof:
                entry_type = reader.read_byte()
                if entry_type == 1:
                    reader.seekrel(reader.read_byte())
                    continue
                if entry_type == 2:
                    length = reader.u32()
                    return reader.read(length)
                raise NotImplementedError(F'Unsupported table entry type value 0x{entry_type:X}.')

    def _get_text(self, doc: bytes, piece_table: bytes) -> str:
        piece_count: int = 1 + (len(piece_table) - 4) // 12
        with StringIO() as text:
            with StructReader(piece_table) as reader:
                character_positions = [reader.u32() for _ in range(piece_count)]
                for i in range(piece_count - 1):
                    cp_start = character_positions[i]
                    cp_end = character_positions[i + 1]
                    fc_value = reader.read_struct('xxLxx', unwrap=True)
                    is_ansi = bool((fc_value >> 30) & 1)
                    fc = fc_value & 0xBFFFFFFF
                    cb = cp_end - cp_start
                    if is_ansi:
                        encoding = 'cp1252'
                        fc = fc // 2
                    else:
                        encoding = 'utf16'
                        cb *= 2
                    raw = doc[fc : fc + cb]
                    text.write(raw.decode(encoding).replace('\r', '\n'))
            return text.getvalue()

class drp (consecutive=False, align=False, min=1, max=∞, len=None, all=False, threshold=20, weight=0, buffer=1024, chug=False)

Detect Repeating Patterns - detects the most prevalent repeating byte pattern in a chunk of data. The unit computes a suffix tree which may require a lot of memory for large buffers.

Expand source code Browse git

class drp(Unit):
    """
    Detect Repeating Patterns - detects the most prevalent repeating byte pattern
    in a chunk of data. The unit computes a suffix tree which may require a lot of
    memory for large buffers.
    """
    def __init__(
        self,
        consecutive: Arg.Switch('-c', help='Assume that the repeating pattern is consecutive when observable.') = False,
        align: Arg.Switch('-d', help='Assume that the pattern occurs at offsets that are multiples of its length.') = False,
        min: Arg.Number('-n', help='Minimum size of the pattern to search for. Default is {default}.') = 1,
        max: Arg.Number('-N', help='Maximum size of the pattern to search for. Default is {default}.') = INF,
        len: Arg.Number('-l', help='Set the exact size of the pattern. This is equivalent to --min=N --max=N.') = None,
        all: Arg.Switch('-a', help='Produce one output for each repeating pattern that was detected.') = False,
        threshold: Arg.Number('-t', help='Patterns must match this performance threshold in percent, lest they be discarded.') = 20,
        weight: Arg.Number('-w', help='Specifies how much longer patterns are favored over small ones. Default is {default}.') = 0,
        buffer: Arg.Number('-b', group='BFR', help='Maximum number of bytes to inspect at once. The default is {default}.') = 1024,
        chug  : Arg.Switch('-g', group='BFR', help='Compute the prefix tree for the entire buffer instead of chunking it.') = False
    ):
        if len is not None:
            min = max = len
        super().__init__(
            min=min,
            max=max,
            all=all,
            consecutive=consecutive,
            align=align,
            weight=weight,
            buffer=buffer,
            chug=chug,
            threshold=threshold
        )

    def _get_patterns(self, data):
        with stackdepth(len(data)):
            tree = SuffixTree(data)
        min_size = self.args.min
        max_size = self.args.max
        patterns = set()
        cursor = 0
        while cursor < len(data):
            node = tree.root
            rest = data[cursor:]
            remaining = len(rest)
            length = 0
            offset = None
            while node.children and length < remaining:
                for child in node.children.values():
                    if tree.data[child.start] == rest[length]:
                        node = child
                        break
                if node.start >= cursor:
                    break
                offset = node.start - length
                length = node.end + 1 - offset
            if offset is None:
                cursor += 1
                continue
            length = min(remaining, length)
            if max_size >= length >= min_size:
                pattern = rest[:length].tobytes()
                patterns.add(pattern)
            cursor += length
        del tree
        return patterns

    @staticmethod
    def _consecutive_count(data, pattern):
        length = len(pattern)
        if length == 1:
            return data.count(pattern)
        view = memoryview(data)
        return max(sum(1 for i in range(k, len(view), length) if view[i:i + length] == pattern)
            for k in range(len(pattern)))

    @staticmethod
    def _truncate_pattern(pattern):
        offset = 0
        for byte in pattern[1:]:
            if byte == pattern[offset]:
                offset += 1
            else:
                offset = 0
        if offset > 0:
            pattern = pattern[:-offset]
        return pattern

    def process(self, data: bytearray):
        if len(data) <= 1:
            yield data
            return

        memview = memoryview(data)
        weight = 1 + (self.args.weight / 10)

        if self.args.chug:
            patterns = self._get_patterns(memview)
        else:
            patterns = set()
            chunksize = self.args.buffer
            for k in range(0, len(memview), chunksize):
                patterns |= self._get_patterns(memview[k:k + chunksize])
        if not patterns:
            raise RefineryPartialResult('no repeating sequences found', data)

        self.log_debug('removing duplicate pattern detections')
        duplicates = set()
        maxlen = max(len(p) for p in patterns)
        for pattern in sorted(patterns, key=len):
            for k in range(2, maxlen // len(pattern) + 1):
                repeated = pattern * k
                if repeated in patterns:
                    duplicates.add(repeated)
        patterns -= duplicates

        self.log_debug(F'counting coverage of {len(patterns)} patterns')
        pattern_count = {p: data.count(p) for p in patterns}
        pattern_performance = dict(pattern_count)

        for consecutive in (False, True):
            if consecutive:
                self.log_debug(F're-counting coverage of {len(patterns)} patterns')
                patterns = {self._truncate_pattern(p) for p in patterns}
                pattern_performance = {p: self._consecutive_count(data, p) for p in patterns}

            self.log_debug('evaluating pattern performance')
            for pattern, count in pattern_performance.items():
                pattern_performance[pattern] = count * (len(pattern) ** weight)
            best_performance = max(pattern_performance.values())
            for pattern, performance in pattern_performance.items():
                pattern_performance[pattern] = performance / best_performance

            self.log_debug('removing patterns below performance threshold')
            threshold = self.args.threshold
            patterns = {p for p in patterns if pattern_performance[p] * 100 >= threshold}
            pattern_count = {p: data.count(p) for p in patterns}

            if not self.args.consecutive:
                break

        if self.args.all:
            for pattern in sorted(patterns, key=pattern_performance.get, reverse=True):
                yield self.labelled(pattern, count=pattern_count[pattern])
            return

        best_patterns = [p for p in patterns if pattern_performance[p] == 1.0]

        if len(best_patterns) > 1:
            self.log_warn('could not determine unique best repeating pattern, returning the first of these:')
            for k, pattern in enumerate(best_patterns):
                self.log_warn(F'{k:02d}.: {pattern.hex()}')

        result = best_patterns[0]

        if self.args.align:
            def rotated(pattern):
                for k in range(len(pattern)):
                    yield pattern[k:] + pattern[:k]
            rotations = {k % len(result): r for k, r in (
                (data.find(r), r) for r in rotated(result)) if k >= 0}
            result = rotations[min(rotations)]

        yield result

class dsjava

Deserialize Java serialized data and re-serialize as JSON.

Expand source code Browse git

class dsjava(Unit):
    """
    Deserialize Java serialized data and re-serialize as JSON.
    """
    @Unit.Requires('javaobj-py3>=0.4.0.1', 'formats')
    def _javaobj():
        import javaobj.v2
        return javaobj.v2

    def process(self, data):
        with JavaEncoder as encoder:
            return encoder.dumps(self._javaobj.loads(data)).encode(self.codec)

class dsphp

Deserialize PHP serialized data and re-serialize as JSON.

Expand source code Browse git

class dsphp(Unit):
    """
    Deserialize PHP serialized data and re-serialize as JSON.
    """
    @Unit.Requires('phpserialize', 'formats')
    def _php():
        import phpserialize
        return phpserialize

    def reverse(self, data):
        return self._php.dumps(json.loads(data))

    def process(self, data):
        phpobject = self._php.phpobject

        class encoder(json.JSONEncoder):
            def default(self, obj):
                try:
                    return super().default(obj)
                except TypeError:
                    pass
                if isinstance(obj, bytes) or isinstance(obj, bytearray):
                    return obj.decode('utf8')
                if isinstance(obj, phpobject):
                    return obj._asdict()

        return json.dumps(
            self._php.loads(
                data,
                object_hook=phpobject,
                decode_strings=True
            ),
            indent=4,
            cls=encoder
        ).encode(self.codec)

class dump (*files, tee=False, stream=False, plain=False, force=False)

Dump incoming data to files on disk. It is possible to specify filenames with format fields. Any metadata field on an incoming chunk is available. Additionally, any field that can be populated by the cm unit is also available. These include the following:

{ext}    : Automatically guessed file extension
{crc32}  : CRC32 checksum of the data
{index}  : Index of the data in the input stream, starting at 0
{size}   : Size of the data in bytes
{md5}    : MD5 hash of the data
{sha1}   : SHA1 hash of the data
{sha256} : SHA-256 hash of the data
{path}   : Associated path; defaults to {sha256} if none is given.

When not using formatted file names, the unit ingests as many incoming inputs as filenames were specified on the command line. Unless connected to a terminal, the remaining inputs will be forwarded on STDOUT. The -t or --tee switch can be used to forward all inputs, under all circumstances, regardless of whether or not they have been processed.

If no file is specified, all ingested inputs are concatenated and written to the clipboard. This will only succeed when the data can successfully be encoded.

Expand source code Browse git

class dump(Unit):
    """
    Dump incoming data to files on disk. It is possible to specify filenames with format fields.
    Any metadata field on an incoming chunk is available. Additionally, any field that can be
    populated by the `refinery.cm` unit is also available. These include the following:

        {ext}    : Automatically guessed file extension
        {crc32}  : CRC32 checksum of the data
        {index}  : Index of the data in the input stream, starting at 0
        {size}   : Size of the data in bytes
        {md5}    : MD5 hash of the data
        {sha1}   : SHA1 hash of the data
        {sha256} : SHA-256 hash of the data
        {path}   : Associated path; defaults to {sha256} if none is given.

    When not using formatted file names, the unit ingests as many incoming inputs as filenames were
    specified on the command line. Unless connected to a terminal, the remaining inputs will be
    forwarded on STDOUT. The `-t` or `--tee` switch can be used to forward all inputs, under all
    circumstances, regardless of whether or not they have been processed.

    If no file is specified, all ingested inputs are concatenated and written to the clipboard. This
    will only succeed when the data can successfully be encoded.
    """

    def __init__(
        self, *files: Arg(metavar='file', type=str, help='Optionally formatted filename.'),
        tee    : Arg.Switch('-t', help='Forward all inputs to STDOUT.') = False,
        stream : Arg.Switch('-s', help='Dump all incoming data to the same file.') = False,
        plain  : Arg.Switch('-p', help='Never apply any formatting to file names.') = False,
        force  : Arg.Switch('-f', help='Remove files if necessary to create dump path.') = False,
    ):
        if stream and len(files) != 1:
            raise ValueError('Can only use exactly one file in stream mode.')
        super().__init__(files=files, tee=tee, stream=stream, force=force)
        self.stream = None
        self._formatted = not plain and any(self._has_format(f) for f in files)
        self._reset()

    @staticmethod
    def _has_format(filename):
        if not isinstance(filename, str):
            return False
        formatter = Formatter()
        return any(
            any(t.isalnum() for t in fields)
            for _, fields, *__ in formatter.parse(filename) if fields
        )

    def _reset(self):
        self.exhausted = False
        self.paths = cycle(self.args.files) if self._formatted else iter(self.args.files)
        self._close()

    @property
    def _clipcopy(self):
        return not self.args.files

    def _components(self, path):
        def _reversed_components(path):
            while True:
                path, component = os.path.split(path)
                if not component:
                    break
                yield component
            yield path
        components = list(_reversed_components(path))
        components.reverse()
        return components

    def _open(self, path, unc=False):
        if hasattr(path, 'close'):
            return path
        path = os.path.abspath(path)
        base = os.path.dirname(path)
        if not unc:
            self.log_info('opening:', path)
        try:
            os.makedirs(base, exist_ok=True)
        except FileExistsError:
            self.log_info('existed:', path)
            part, components = '', self._components(path)
            while components:
                component, *components = components
                part = os.path.join(part, component)
                if os.path.exists(part) and os.path.isfile(part):
                    if self.args.force:
                        os.unlink(part)
                        return self._open(path, unc)
                    break
            raise RefineryCriticalException(F'Unable to dump to {path} because {part} is a file.')
        except FileNotFoundError:
            if unc or os.name != 'nt':
                raise
            path = F'\\\\?\\{path}'
            return self._open(path, unc=True)
        except OSError as e:
            if not self.log_info():
                self.log_warn('opening:', path)
            self.log_warn('errored:', e.args[1])
            return open(os.devnull, 'wb')
        else:
            mode = 'ab' if self.args.stream else 'wb'
            return open(path, mode)

    def _close(self, final=False):
        if not self.stream:
            return
        self.stream.flush()
        if self.args.stream and not final:
            return
        if self._clipcopy:
            if os.name == 'nt':
                from refinery.lib.winclip import ClipBoard, CF
                try:
                    img = self._image.open(self.stream)
                    with io.BytesIO() as out:
                        img.save(out, 'BMP')
                except Exception:
                    with ClipBoard(CF.TEXT) as cpb:
                        cpb.copy(self.stream.getvalue())
                else:
                    with ClipBoard(CF.DIB) as cpb:
                        out.seek(14, io.SEEK_SET)
                        cpb.copy(out.read())
            else:
                data = self.stream.getvalue()
                data = data.decode(self.codec, errors='backslashreplace')
                self._pyperclip.copy(data)
        self.stream.close()
        self.stream = None

    @Unit.Requires('pyperclip')
    def _pyperclip():
        import pyperclip
        return pyperclip

    @Unit.Requires('Pillow', 'formats')
    def _image():
        from PIL import Image
        return Image

    def process(self, data: bytes):
        forward_input_data = self.args.tee
        if self._clipcopy:
            self.stream.write(data)
        elif not self.exhausted:
            if not self.stream:
                # This should happen only when the unit is called from Python code
                # rather than via the command line.
                try:
                    path = next(self.paths)
                except StopIteration:
                    raise RefineryCriticalException('the list of filenames was exhausted.')
                else:
                    with self._open(path) as stream:
                        stream.write(data)
            else:
                self.stream.write(data)
                self.log_debug(F'wrote 0x{len(data):08X} bytes')
                self._close()
        else:
            forward_input_data = forward_input_data or not self.isatty()
            if not forward_input_data:
                size = metavars(data).size
                self.log_warn(F'discarding unprocessed chunk of size {size!s}.')
        if forward_input_data:
            yield data

    def filter(self, chunks):
        if self.exhausted:
            self._reset()

        nostream = not self.args.stream
        clipcopy = self._clipcopy

        if clipcopy:
            self.stream = io.BytesIO()

        for index, chunk in enumerate(chunks, 0):
            if not chunk.visible:
                continue
            if not clipcopy and not self.exhausted and (nostream or not self.stream):
                try:
                    path = next(self.paths)
                except StopIteration:
                    self.exhausted = True
                else:
                    if self._has_format(path):
                        meta = metavars(chunk)
                        meta.ghost = True
                        meta.index = index
                        path = meta.format_str(path, self.codec, [chunk])
                    self.stream = self._open(path)
            yield chunk

        self._close(final=True)
        self.exhausted = True

class eat (name)

Consume a meta variable and replace the contents of the current chunk with it. If the variable contains a string, it is encoded with the default codec. If the variable cannot be converted to a byte string, the data is lost and an empty chunk is returned.

Expand source code Browse git

class eat(Unit):
    """
    Consume a meta variable and replace the contents of the current chunk with it. If the variable
    contains a string, it is encoded with the default codec. If the variable cannot be converted to
    a byte string, the data is lost and an empty chunk is returned.
    """
    def __init__(
        self,
        name: Arg(help='The name of the variable to be used.', type=str),
    ):
        super().__init__(name=check_variable_name(name))

    def process(self, data: Chunk):
        def invalid_type():
            return F'variable {name} is of type "{type}", unable to convert to byte string - data is lost'
        name = self.args.name
        meta = metavars(data)
        data = meta.pop(name)
        type = data.__class__.__name__
        if isinstance(data, int):
            self.log_info(F'variable {name} is an integer, converting to string.')
            data = str(data).encode(self.codec)
        if isinstance(data, str):
            self.log_info(F'variable {name} is a string, encoding as {self.codec}')
            data = data.encode(self.codec)
        elif not isbuffer(data):
            try:
                wrapped = bytearray(data)
            except Exception:
                self.log_warn(invalid_type())
                data = None
            else:
                data = wrapped
        return data

class ef (*filenames, list=False, meta=False, size=None, read=0, wild=False, tame=False, symlinks=False, linewise=False)

Short for "emit file". The unit reads files from disk and outputs them individually. Has the ability to read large files in chunks.

Expand source code Browse git

class ef(Unit):
    """
    Short for "emit file". The unit reads files from disk and outputs them individually. Has the ability to
    read large files in chunks.
    """

    def __init__(self,
        *filenames: Arg(metavar='FILEMASK', nargs='+', type=str, help=(
            'A list of file masks. Each matching file will be read from disk and '
            'emitted. The file masks can include format string expressions which '
            'will be substituted from the current meta variables. The masks can '
            'use wild-card expressions, but this feature is disabled by default on '
            'Posix platforms, where it has to be enabled explicitly using the -w '
            'switch. On Windows, the feature is enabled by default and can be '
            'disabled using the -t switch.'
        )),
        list: Arg.Switch('-l', help='Only lists files with metadata.') = False,
        meta: Arg.Switch('-m', help=(
            'Adds the atime, mtime, ctime, and size metadata variables.'
        )) = False,
        size: Arg.Bounds('-s', range=True, help=(
            'If specified, only files are read whose size is in the given range.')) = None,
        read: Arg.Number('-r', help=(
            'If specified, files will be read in chunks of size N and each '
            'chunk is emitted as one element in the output list.'
        )) = 0,
        wild: Arg.Switch('-w', group='W', help='Force use of wildcard patterns in file masks.') = False,
        tame: Arg.Switch('-t', group='W', help='Disable wildcard patterns in file masks.') = False,
        symlinks: Arg.Switch('-y', help='Follow symbolic links and junctions, these are ignored by default.') = False,
        linewise: Arg.Switch('-i', help=(
            'Read the file linewise. By default, one line is read at a time. '
            'In line mode, the --read argument can be used to read the given '
            'number of lines in each chunk.'
        )) = False
    ):
        if wild and tame:
            raise ValueError('Cannot be both wild and tame!')
        super().__init__(
            size=size,
            read=read,
            list=list,
            meta=meta,
            wild=wild,
            tame=tame,
            symlinks=symlinks,
            linewise=linewise,
            filenames=filenames
        )

    def _read_chunks(self, fd):
        while True:
            buffer = fd.read(self.args.read)
            if not buffer:
                break
            yield buffer

    def _read_lines(self, fd):
        count = self.args.read or 1
        if count == 1:
            while True:
                buffer = fd.readline()
                if not buffer:
                    break
                yield buffer
            return
        with MemoryFile() as out:
            while True:
                for _ in range(count):
                    buffer = fd.readline()
                    if not buffer:
                        break
                    out.write(buffer)
                if not out.tell():
                    break
                yield out.getvalue()
                out.seek(0)
                out.truncate()

    def _absolute_path(self, path_string: str):
        path = Path(path_string).resolve().absolute()
        if os.name == 'nt' and not path.parts[0].startswith('\\\\?\\'):
            # The pathlib glob method will simply fail mid-traversal if it attempts to descend into
            # a folder or to a file whose path exceeds MAX_PATH on Windows. As a workaround, we use
            # UNC paths throughout and truncate to relative paths after enumeration.
            path = Path(F'\\\\?\\{path!s}')
        return path

    def _glob(self, pattern: str) -> Iterable[Path]:
        if pattern.endswith('**'):
            pattern += '/*'
        wildcard = re.search(R'[\[\?\*]', pattern)
        if wildcard is None:
            yield self._absolute_path(pattern)
            return
        k = wildcard.start()
        base, pattern = pattern[:k], pattern[k:]
        path = self._absolute_path(base or '.')
        last = path.parts[-1]
        if base.endswith(last):
            # /base/something.*
            pattern = F'{last}{pattern}'
            path = path.parent

        scandir = os.scandir

        class EmptyIterator:
            def __enter__(self): return self
            def __exit__(self, *_, **__): pass
            def __next__(self): raise StopIteration
            def __iter__(self): return self

        if sys.version_info >= (3, 12):
            def islink(path):
                return os.path.islink(path) or os.path.isjunction(path)
        else:
            def islink(path):
                try:
                    return bool(os.readlink(path))
                except OSError:
                    return False

        paths_scanned = set()

        def _patched_scandir(path):
            if islink(path):
                if not self.args.symlinks:
                    return EmptyIterator()
                try:
                    rp = os.path.realpath(path, strict=True)
                except OSError:
                    return EmptyIterator()
                if rp in paths_scanned:
                    self.log_warn(F'file system loop at: {path!s}')
                    return EmptyIterator()
                paths_scanned.add(rp)
                path = rp
            try:
                return scandir(path)
            except Exception as e:
                ignore = _ERROR_IGNORES.get(os.name, set())
                if not any(p.lower() in ignore for p in Path(path).parts):
                    self.log_warn(F'error calling scandir, {exception_to_string(e)}: {path}')
                return EmptyIterator()

        try:
            os.scandir = _patched_scandir
            for match in path.glob(pattern):
                yield match
        finally:
            os.scandir = scandir

    def process(self, data):
        meta = metavars(data)
        size = self.args.size
        size = size and range(size.start, size.stop, size.step)
        meta.ghost = True
        wild = (os.name == 'nt' or self.args.wild) and not self.args.tame
        root = self._absolute_path('.')
        paths = self._glob if wild else lambda mask: [self._absolute_path(mask)]
        do_meta = self.args.meta
        do_stat = size or do_meta

        class SkipErrors:
            unit = self

            def __init__(self):
                self._history: Set[type] = set()
                self._message: Dict[type, Optional[str]] = {
                    ValueError: (
                        None
                    ), PermissionError: (
                        'access error while scanning: {}'
                    ), OSError: (
                        'system error while scanning: {}'
                    ), FileNotFoundError: (
                        'file unexpectedly not found: {}'
                    ), Exception: (
                        'unknown error while reading: {}'
                    ),
                }
                self.path = None

            def reset(self, path):
                self._history.clear()
                self.path = path
                return self

            def __enter__(self):
                return self

            def __exit__(self, et, ev, trace):
                if et is None:
                    return False
                for t, msg in self._message.items():
                    if issubclass(et, t):
                        if t not in self._history:
                            self._history.add(t)
                            if msg is not None:
                                self.unit.log_info(msg.format(self.path))
                        return True
                else:
                    return False

        for mask in self.args.filenames:
            mask = meta.format_str(mask, self.codec, [data])
            self.log_debug('scanning for mask:', mask)
            kwargs = dict()
            skip_errors = SkipErrors()
            for path in paths(mask):
                skip_errors.reset(path)
                filesize = None
                with skip_errors:
                    path = path.relative_to(root)
                with skip_errors:
                    if wild and not path.is_file():
                        continue
                with skip_errors:
                    if do_stat:
                        stat = path.stat()
                        filesize = stat.st_size
                    if do_meta:
                        kwargs.update(
                            fsize=filesize,
                            atime=datetime.fromtimestamp(stat.st_atime).isoformat(' ', 'seconds'),
                            ctime=datetime.fromtimestamp(stat.st_ctime).isoformat(' ', 'seconds'),
                            mtime=datetime.fromtimestamp(stat.st_mtime).isoformat(' ', 'seconds')
                        )
                if size is not None and filesize not in size:
                    continue
                with skip_errors:
                    if self.args.list:
                        yield self.labelled(str(path).encode(self.codec), **kwargs)
                        continue
                    with path.open('rb') as stream:
                        if self.args.linewise:
                            yield from self._read_lines(stream)
                        elif self.args.read:
                            yield from self._read_chunks(stream)
                        else:
                            contents = stream.read()
                            self.log_info(lambda: F'reading: {path!s} ({len(contents)} bytes)')
                            yield self.labelled(contents, path=path.as_posix(), **kwargs)

class emit (*data)

Expand source code Browse git

class emit(Unit):

    def __init__(self, *data: Arg(help=(
        'Data to be emitted. If no argument is specified, data is retrieved from '
        'the clipboard. Multiple arguments are output in framed format.'
    ))):
        super().__init__(data=data)

    @Unit.Requires('pyperclip')
    def _pyperclip():
        import pyperclip
        return pyperclip

    def process(self, data):
        if self.args.data:
            yield from self.args.data
            return
        if os.name == 'nt':
            from refinery.lib.winclip import get_any_data
            mode, data = get_any_data()
            if mode is not None:
                self.log_info(F'retrieved clipboard data in {mode.name} format')
            yield data
        else:
            data = self._pyperclip.paste()
            if not data:
                return
            yield data.encode(self.codec, 'replace')

class esc (hex=False, unicode=False, greedy=False, unquoted=False, quoted=False, bare=False)

Encodes and decodes common ASCII escape sequences.

Expand source code Browse git

class esc(Unit):
    """
    Encodes and decodes common ASCII escape sequences.
    """
    _ESCAPE = {
        0x00: BR'\0',
        0x07: BR'\a',
        0x08: BR'\b',
        0x0C: BR'\f',
        0x0A: BR'\n',
        0x0D: BR'\r',
        0x09: BR'\t',
        0x0B: BR'\v',
        0x5C: BR'\\',
        0x27: BR'\'',
        0x22: BR'\"'
    }
    _UNESCAPE = {
        BR'0': B'\x00',
        BR'a': B'\x07',
        BR'b': B'\x08',
        BR'f': B'\x0C',
        BR'n': B'\x0A',
        BR'r': B'\x0D',
        BR't': B'\x09',
        BR'v': B'\x0B',
        B'\\': B'\x5C',
        BR"'": B'\x27',
        BR'"': B'\x22'
    }

    def __init__(self,
        hex     : Arg.Switch('-x', help='Hex encode everything, do not use C escape sequences.') = False,
        unicode : Arg.Switch('-u', help='Use unicode escape sequences and UTF-8 encoding.') = False,
        greedy  : Arg.Switch('-g', help='Replace \\x by x and \\u by u when not followed by two or four hex digits, respectively.') = False,
        unquoted: Arg.Switch('-p', group='Q', help='Never remove enclosing quotes.') = False,
        quoted  : Arg.Switch('-q', group='Q', help='Remove enclosing quotes while decoding and add them for encoding.') = False,
        bare    : Arg.Switch('-b', help='Do not escape quote characters.') = False,
    ) -> Unit: pass  # noqa

    def process(self, data):
        data = memoryview(data)

        if self.args.quoted:
            quote = data[0]
            if data[-1] != quote:
                self.log_info('string is not correctly quoted')
            else:
                data = data[1:-1]
        elif not self.args.unquoted:
            quote = data[:1]
            strip = data[1:-1]
            if data[-1:] == quote and not re.search(br'(?<!\\)' + re.escape(quote), strip):
                self.log_info('removing automatically detected quotes')
                data = strip

        def unescape(match):
            c = match[1]
            if len(c) > 1:
                if c[0] == 0x75:
                    # unicode
                    upper = int(c[1:3], 16)
                    lower = int(c[3:5], 16)
                    if self.args.unicode:
                        return bytes((lower, upper)).decode('utf-16le').encode(self.codec)
                    return bytes((lower,))
                elif c[0] == 0x78:
                    # hexadecimal
                    return bytes((int(c[1:3], 16),))
                else:
                    # octal escape sequence
                    return bytes((int(c, 8) & 0xFF,))
            elif c in B'ux':
                return c if self.args.greedy else match[0]
            return self._UNESCAPE.get(c, c)
        data = re.sub(
            RB'\\(u[a-fA-F0-9]{4}|x[a-fA-F0-9]{1,2}|[0-7]{3}|.)', unescape, data)
        return data

    def reverse(self, data):
        if self.args.unicode:
            string = data.decode(self.codec).encode('UNICODE_ESCAPE')
        else:
            if not self.args.hex:
                def escape(match):
                    c = match[0][0]
                    return self._ESCAPE.get(c, RB'\x%02x' % c)
                pattern = RB'[\x00-\x1F\x22\x27\x5C\x7F-\xFF]'
                if self.args.bare:
                    pattern = RB'[\x00-\x1F\x5C\x7F-\xFF]'
                string = re.sub(pattern, escape, data)
            else:
                string = bytearray(4 * len(data))
                for k in range(len(data)):
                    a = k * 4
                    b = k * 4 + 4
                    string[a:b] = RB'\x%02x' % data[k]
        if self.args.quoted:
            string = B'"%s"' % string
        return string

class escps

Escapes and unescapes PowerShell strings.

Expand source code Browse git

class escps(Unit):
    """
    Escapes and unescapes PowerShell strings.
    """
    UNESCAPE = {
        '`0': '\0',
        '`a': '\a',
        '`b': '\b',
        '`f': '\f',
        '`n': '\n',
        '`r': '\r',
        '`t': '\t',
        '`v': '\v',
        '``': '`',
        "`'": '\'',
        '`"': '\"',
    }
    ESCAPE = {
        '`' : '``',
        '$' : '`$',
        '\0': '`0',
        '\a': '`a',
        '\b': '`b',
        '\f': '`f',
        '\n': '`n',
        '\r': '`r',
        '\t': '`t',
        '\v': '`v',
        '\'': "`'",
        '\"': '""',
    }

    def __init__(self): pass

    @unicoded
    def process(self, data):
        match = re.fullmatch(R'''@(['"])\s*?[\r\n](.*?)[\r\n]\1@''', data, flags=re.DOTALL)
        if match:
            return match.group(2)
        if data[0] not in '\'\"' or data[-1] != data[0]:
            raise ValueError(
                'No quotes found at beginning of input. To escape a PowerShell string, the '
                'quotes must be included because quote escaping depends on whether a single '
                'or double quote was used.')

        quote, data = data[0], data[1:-1]

        def unescape(match):
            string = match[0]
            return self.UNESCAPE.get(string, string[1:])

        if quote == '"':
            if re.search(R'(?<!`)\$(?=[\w\(\{\$\?\^:])', data):
                self.log_warn('Loss of information: double quoted string contains variable substitutions.')
            data = re.sub('`.', unescape, data)

        return data.replace(quote + quote, quote)

    @unicoded
    def reverse(self, data):
        def escaper(match):
            char = match[0]
            return escps.ESCAPE.get(char, char)
        return '"{}"'.format(re.sub(R'''[\x00\x07-\x0D`$'"]''', escaper, data))

class escvb

Escapes and unescapes Visual Basic strings.

Expand source code Browse git

class escvb(Unit):
    """
    Escapes and unescapes Visual Basic strings.
    """
    def process(self, data):
        if data[:1] == B'"' and data[-1:] == B'"':
            data = data[1:-1]
        return data.replace(B'""', B'"')

    def reverse(self, data):
        return B'"%s"' % data.replace(B'"', B'""')

class evtx (raw=False)

Extracts data from Windows Event Log files (EVTX). Each extracted log entry is returned as a single output chunk in XML format.

Expand source code Browse git

class evtx(Unit):
    """
    Extracts data from Windows Event Log files (EVTX). Each extracted log entry is returned as a single
    output chunk in XML format.
    """

    def __init__(self, raw: Unit.Arg.Switch('-r', help='Extract raw event data rather than XML.') = False):
        super().__init__(raw=raw)

    @Unit.Requires('python-evtx', 'formats')
    def _evtx():
        from Evtx.Evtx import Evtx
        return Evtx

    def process(self, data):
        with VirtualFileSystem() as vfs:
            raw = self.args.raw
            with self._evtx(vfs.new(data)) as log:
                for record in log.records():
                    yield record.data() if raw else record.xml().encode(self.codec)

class fernet (key)

Decrypt Fernet messages.

Expand source code Browse git

class fernet(Unit):
    """
    Decrypt Fernet messages.
    """
    def __init__(self, key: Arg(help='A fernet key, either in base64 or raw binary.')):
        super().__init__(key=key)

    def _b64(self, data):
        try:
            return data | b64(urlsafe=True) | bytearray
        except Exception:
            return data

    def process(self, data):
        fk = self._b64(self.args.key)
        if len(fk) != 32:
            raise ValueError(F'The given Fernet key has length {len(fk)}, expected 32 bytes.')
        signing_key = fk[:16]
        encryption_key = fk[16:]
        decoded = self._b64(data)
        reader = StructReader(memoryview(decoded), bigendian=True)
        signed_data = reader.peek(reader.remaining_bytes - 32)
        version = reader.u8()
        timestamp = datetime.fromtimestamp(reader.u64())
        iv = reader.read(16)
        if version != 0x80:
            self.log_warn(F'The Fernet version is 0x{version:02X}, the only documented one is 0x80.')
        ciphertext = reader.read(reader.remaining_bytes - 32)
        if len(ciphertext) % 16 != 0:
            raise ValueError('The encoded ciphertext is not 16-byte block aligned.')
        signature = reader.read(32)
        hmac = HMAC.new(signing_key, digestmod=SHA256)
        hmac.update(signed_data)
        if hmac.digest() != signature:
            self.log_warn('HMAC verification failed; the message has been tampered with.')
            self.log_info(F'computed signature: {hmac.hexdigest().upper()}')
            self.log_info(F'provided signature: {signature.hex().upper()}')
        plaintext = ciphertext | aes(mode='cbc', iv=iv, key=encryption_key) | bytearray
        return self.labelled(plaintext, timestamp=timestamp.isoformat(' ', 'seconds'))

class gost (key, iv=b'', padding=None, mode=None, raw=False, swap=False, sbox=SBOX.R34, *, aad=None, tag=None, segment_size=0, little_endian=False)

GOST encryption and decryption.

Expand source code Browse git

class gost(StandardBlockCipherUnit, cipher=BlockCipherFactory(GOST)):
    """
    GOST encryption and decryption.
    """
    def __init__(
        self, key, iv=B'', padding=None, mode=None, raw=False,
        swap: Arg.Switch('-s', help='Decode blocks as big endian rather than little endian.') = False,
        sbox: Arg.Option('-x', choices=SBOX, help=(
            'Choose an SBOX. The default is {default}, which corresponds to the R-34.12.2015 standard. '
            'The other option is CBR, which is the SBOX used by the Central Bank of Russia.'
        )) = SBOX.R34, **more
    ):
        sbox = Arg.AsOption(sbox, SBOX)
        super().__init__(key, iv=iv, padding=padding, mode=mode, raw=raw, swap=swap, sbox=sbox, **more)

    def _new_cipher(self, **optionals) -> CipherInterface:
        return super()._new_cipher(
            swap=self.args.swap,
            sbox=self.args.sbox,
            **optionals
        )

class group (size)

Group incoming chunks into frames of the given size.

Expand source code Browse git

class group(Unit):
    """
    Group incoming chunks into frames of the given size.
    """
    def __init__(self, size: Arg.Number(help='Size of each group; must be at least 2.', bound=(2, None))):
        super().__init__(size=size)

    def process(self, data: Chunk):
        if not data.temp:
            return
        yield data
        yield from islice(data.temp, 0, self.args.size - 1)

    def filter(self, chunks):
        it = iter(chunks)
        while True:
            try:
                head: Chunk = next(it)
            except StopIteration:
                return
            head.temp = it
            yield head

class groupby (name)

Group incoming chunks by the contents of a meta variable. Note that the unit blocks and cannot stream any output until the input frame is consumed: It has to read every input chunk to make sure that all groupings are complete.

Expand source code Browse git

class groupby(Unit):
    """
    Group incoming chunks by the contents of a meta variable. Note that the unit
    blocks and cannot stream any output until the input frame is consumed: It has
    to read every input chunk to make sure that all groupings are complete.
    """
    def __init__(self, name: Arg(type=str, help='name of the meta variable')):
        super().__init__(name=check_variable_name(name))

    def process(self, data):
        yield from data.temp

    def filter(self, chunks: Iterable[Chunk]) -> Generator[Chunk, None, None]:
        name = self.args.name
        members = defaultdict(list)
        for chunk in chunks:
            try:
                value = chunk.meta[name]
            except KeyError:
                value = None
            members[value].append(chunk)
        for chunklist in members.values():
            dummy = chunklist[0]
            dummy.temp = chunklist
            yield dummy

class hc128 (key, discard=0, stateful=False)

HC-128 encryption and decryption.

Expand source code Browse git

class hc128(StreamCipherUnit):
    """
    HC-128 encryption and decryption.
    """
    key_size = {32}

    def keystream(self) -> Iterable[int]:
        return HC128(self.args.key)

class hc256 (key, iv=b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', discard=0, stateful=False)

HC-256 encryption and decryption.

Expand source code Browse git

class hc256(StreamCipherUnit):
    """
    HC-256 encryption and decryption.
    """
    key_size = {32}

    def __init__(
        self, key,
        iv: Arg(help='An initialization vector; the default is a sequence of 32 zero bytes.') = bytes(32),
        discard=0, stateful=False,
    ):
        super().__init__(key=key, iv=iv, stateful=stateful, discard=discard)
        self._keystream = None

    def keystream(self) -> Iterable[int]:
        for num in HC256(self.args.key, self.args.iv):
            yield from num.to_bytes(4, 'little')

class hex

Hex-decodes and encodes binary data. Non-hex characters are removed from the input. For decoding, any odd trailing hex digits are stripped as two hex digits are required to represent a byte.

Expand source code Browse git

class hex(Unit):
    """
    Hex-decodes and encodes binary data. Non-hex characters are removed from
    the input. For decoding, any odd trailing hex digits are stripped as two
    hex digits are required to represent a byte.
    """

    def reverse(self, data):
        import base64
        return base64.b16encode(data)

    def process(self, data):
        import re
        import base64
        data = re.sub(B'[^A-Fa-f0-9]+', B'', data)
        if len(data) % 2:
            data = data[:-1]
        return base64.b16decode(data, casefold=True)

    @classmethod
    def handles(self, data: bytearray):
        from refinery.lib.patterns import formats
        if formats.spaced_hex.fullmatch(data):
            return True

class hexload (blocks=1, dense=False, expand=False, narrow=False, width=0)

Convert hex dumps back to the original data and vice versa. All options of this unit apply to its reverse operation where binary data is converted to a readable hexdump format. The default mode of the unit expects the input data to contain a readable hexdump and converts it back to binary.

Expand source code Browse git

class hexload(HexViewer):
    """
    Convert hex dumps back to the original data and vice versa. All options of this unit apply
    to its reverse operation where binary data is converted to a readable hexdump format.
    The default mode of the unit expects the input data to contain a readable hexdump and
    converts it back to binary.
    """
    @regex
    class _ENCODED_BYTES:
        R"""
        (?ix)(?:^|(?<=\s))                      # encoded byte patches must be prefixed by white space
        (?:
            (?:                                 # separated chunks of hex data
                [a-f0-9]{2}                     # hexadecimal chunk; single byte (two hexadecimal letters)
                \s{1,2}                         # encoded byte followed by whitespace
                (?:                             # at least one more encoded byte
                    [a-f0-9]{2}                 # followed by more encoded bytes
                    (?:\s{1,2}[a-f0-9]{2})*     # unless it was just a single byte
                )?
            )
            | (?:[a-f0-9]{4}\s{1,2}             # 2-byte chunks
              (?:[a-f0-9]{4}
              (?:\s{1,2}[a-f0-9]{4})*)?)
            | (?:[a-f0-9]{8}\s{1,2}             # 4-byte chunks
              (?:[a-f0-9]{8}
              (?:\s{1,2}[a-f0-9]{8})*)?)
            | (?:(?:[a-f0-9]{2})+)              # continuous line of hexadecimal characters
        )(?=\s|$)                               # terminated by a whitespace or line end
        """

    def __init__(self, blocks=1, dense=False, expand=False, narrow=False, width=0):
        super().__init__(blocks=blocks, dense=dense, expand=expand, narrow=narrow, width=width)
        self._hexline_pattern = re.compile(F'{make_hexline_pattern(1)}(?:[\r\n]|$)', flags=re.MULTILINE)

    def process(self, data: bytearray):
        lines = data.decode(self.codec).splitlines(keepends=False)

        if not lines:
            return None

        decoded_bytes = bytearray()
        encoded_byte_matches: List[Dict[int, int]] = []

        for line in lines:
            matches: Dict[int, int] = {}
            encoded_byte_matches.append(matches)
            for match in self._ENCODED_BYTES.finditer(line):
                a, b = match.span()
                matches[a] = b - a

        it = iter(encoded_byte_matches)
        offsets = set(next(it).keys())
        for matches in it:
            offsets.intersection_update(matches.keys())
        if not offsets:
            raise ValueError('unable to determine the position of the hex bytes in this dump')
        lengths: Dict[int, List[int]] = {offset: [] for offset in offsets}
        del offsets
        for matches in encoded_byte_matches:
            for offset in lengths:
                lengths[offset].append(matches[offset])
        for offset in lengths:
            lengths[offset].sort()
        midpoint = len(encoded_byte_matches) // 2
        offset, length = max(((offset, lengths[offset][midpoint]) for offset in lengths),
            key=operator.itemgetter(1))
        end = offset + length
        del lengths
        for k, line in enumerate(lines, 1):
            encoded_line = line[offset:end]
            onlyhex = re.search(r'^[\sA-Fa-f0-9]+', encoded_line)
            if not onlyhex:
                self.log_warn(F'ignoring line without hexadecimal data: {line}')
                continue
            if onlyhex.group(0) != encoded_line:
                if k != len(lines):
                    self.log_warn(F'ignoring line with mismatching hex data length: {line}')
                    continue
                encoded_line = onlyhex.group(0)
            self.log_debug(F'decoding: {encoded_line.strip()}')
            decoded_line = bytes.fromhex(encoded_line)
            decoded_bytes.extend(decoded_line)
            txt = line[end:]
            txt_stripped = re.sub('\\s+', '', txt)
            if not txt_stripped:
                continue
            if len(decoded_line) not in range(len(txt_stripped), len(txt) + 1):
                self.log_warn(F'preview size {len(txt_stripped)} does not match decoding: {line}')
        if decoded_bytes:
            yield decoded_bytes

    def reverse(self, data):
        metrics = self._get_metrics(len(data))
        if not self.args.width:
            metrics.fit_to_width(allow_increase=True)
        for line in self.hexdump(data, metrics):
            yield line.encode(self.codec)

class hkdf (size, salt, hash='SHA512')

HKDF Key derivation

Expand source code Browse git

class hkdf(KeyDerivation):
    """HKDF Key derivation"""

    def __init__(self, size, salt, hash='SHA512'):
        super().__init__(size=size, salt=salt, hash=hash)

    def process(self, data):
        from Cryptodome.Protocol.KDF import HKDF
        return HKDF(data, self.args.size, self.args.salt, self.hash)

class hmac (salt, hash='SHA1', size=None)

HMAC based key derivation

Expand source code Browse git

class hmac(KeyDerivation):
    """
    HMAC based key derivation
    """

    def __init__(self, salt, hash='SHA1', size=None):
        super().__init__(salt=salt, size=size, hash=hash)

    def process(self, data):
        from Cryptodome.Hash import HMAC
        return HMAC.new(data, self.args.salt, digestmod=self.hash).digest()

class htmlesc

Encodes and decodes HTML entities.

Expand source code Browse git

class htmlesc(Unit):
    """
    Encodes and decodes HTML entities.
    """

    @unicoded
    def process(self, data: str) -> str:
        return html_entities.unescape(data)

    @unicoded
    def reverse(self, data: str) -> str:
        return html_entities.escape(data)

class httprequest

Parses HTTP request data, as you would obtain from a packet dump. The unit extracts POST data in any format; each uploaded file is emitted as a separate chunk.

Expand source code Browse git

class httprequest(Unit):
    """
    Parses HTTP request data, as you would obtain from a packet dump. The unit extracts
    POST data in any format; each uploaded file is emitted as a separate chunk.
    """
    def process(self, data):
        def header(line: bytes):
            name, colon, data = line.decode('utf8').partition(':')
            if colon:
                yield (name.strip().lower(), data.strip())

        head, _, body = data.partition(b'\r\n\r\n')
        request, *headers = head.splitlines(False)
        headers = dict(t for line in headers for t in header(line))
        method, path, _, *rest = request.split()

        mode = _Fmt.RawBody

        if rest:
            self.log_warn('unexpected rest data while parsing HTTP request:', rest)

        if method == b'GET' and not body:
            mode = _Fmt.UrlEncode
            body = path.partition(B'?')[1]
        if method == b'POST' and (ct := headers.get('content-type', None)):
            ct, _ = _parse_header(ct)
            try:
                mode = _Fmt(ct)
            except ValueError:
                mode = _Fmt.RawBody

        def chunks(upload: Dict[Union[str, bytes], List[bytes]]):
            for key, values in upload.items():
                if not isinstance(key, str):
                    key = key.decode('utf8')
                for value in values:
                    yield self.labelled(value, name=key)

        if mode is _Fmt.RawBody:
            yield body
            return
        if mode is _Fmt.Multipart:
            _, _, message_data = data.partition(b'\n')
            msg = BytesParser().parsebytes(message_data)
            for part in msg.walk():
                payloads = part.get_payload(decode=True)
                if not isinstance(payloads, list):
                    payloads = [payloads]
                for payload in payloads:
                    if not isbuffer(payload):
                        continue
                    if name := part.get_filename():
                        payload = self.labelled(payload, name=name)
                    yield payload

        if mode is _Fmt.UrlEncode:
            yield from chunks(parse_qs(body, keep_blank_values=1))

    @classmethod
    def handles(self, data: bytearray) -> bool | None:
        return data.startswith(B'POST ') or data.startswith(B'GET ')

class httpresponse

Parses HTTP response text, as you would obtain from a packet dump. This can be useful if chunked or compressed transfer encoding was used.

Expand source code Browse git

class httpresponse(Unit):
    """
    Parses HTTP response text, as you would obtain from a packet dump. This can be
    useful if chunked or compressed transfer encoding was used.
    """
    def process(self, data):
        with SockWrapper(data) as mock:
            mock.seek(0)
            parser = HTTPResponse(mock)
            parser.begin()
            try:
                return parser.read()
            except IncompleteRead as incomplete:
                msg = F'incomplete read: {len(incomplete.partial)} bytes processed, {incomplete.expected} more expected'
                raise RefineryPartialResult(msg, incomplete.partial) from incomplete

class iemap (legend=False, background=False, block_char='#', *label)

The information entropy map displays a colored bar on the terminal visualizing the file's local entropy from beginning to end.

Expand source code Browse git

class iemap(Unit):
    """
    The information entropy map displays a colored bar on the terminal visualizing the file's local
    entropy from beginning to end.
    """
    def __init__(
        self,
        legend: Unit.Arg.Switch('-l', help='Show entropy color legend.') = False,
        background: Unit.Arg.Switch('-b', help='Generate the bar by coloring the background.') = False,
        block_char: Unit.Arg('-c', '--block-char', type=str, metavar='C',
            help='Character used for filling the bar, default is {default}') = '#',
        *label: Unit.Arg(type=str, metavar='label-part', help=(
            'The remaining command line specifies a format string expression that will be printed '
            'over the heat map display of each processed chunk.'
        ))
    ):
        super().__init__(label=' '.join(label), background=background, legend=legend, block_char=block_char)

    @Unit.Requires('colorama', 'display', 'default', 'extended')
    def _colorama():
        import colorama
        return colorama

    def process(self, data):
        from sys import stderr
        from os import name as os_name
        colorama = self._colorama
        colorama.init(autoreset=False, convert=(os_name == 'nt'))

        nobg = not self.args.background
        meta = metavars(data)

        label = meta.format_str(self.args.label, self.codec, [data])
        if label:
            if not label.endswith(' '):
                label = F'{label} '
            if not label.startswith(' '):
                label = F' {label}'

        bgmap = [
            colorama.Back.BLACK,
            colorama.Back.WHITE,
            colorama.Back.YELLOW,
            colorama.Back.CYAN,
            colorama.Back.BLUE,
            colorama.Back.GREEN,
            colorama.Back.LIGHTRED_EX,
            colorama.Back.MAGENTA,
        ]
        fgmap = [
            colorama.Fore.LIGHTBLACK_EX,
            colorama.Fore.LIGHTWHITE_EX,
            colorama.Fore.LIGHTYELLOW_EX,
            colorama.Fore.LIGHTCYAN_EX,
            colorama.Fore.LIGHTBLUE_EX,
            colorama.Fore.LIGHTGREEN_EX,
            colorama.Fore.LIGHTRED_EX,
            colorama.Fore.LIGHTMAGENTA_EX,
        ]

        _reset = colorama.Back.BLACK + colorama.Fore.WHITE + colorama.Style.RESET_ALL

        clrmap = fgmap if nobg else bgmap
        header = '['
        header_length = 1
        footer_length = 4 + 7

        if self.args.legend:
            header = '[{1}{0}] {2}'.format(_reset, ''.join(F'{bg}{k}' for k, bg in enumerate(clrmap, 1)), header)
            header_length += 3 + len(clrmap)

        _tw = get_terminal_size()
        width = _tw - header_length - footer_length
        if width < 16:
            raise RuntimeError(F'computed terminal width {_tw} is too small for heatmap')

        def entropy_select(value, map):
            index = min(len(map) - 1, math.floor(value * len(map)))
            return map[index]

        view = memoryview(data)
        size = len(data)
        chunk_size = 0

        for block_size in range(1, width + 1):
            block_count = width // block_size
            chunk_size = size // block_count
            if chunk_size > 1024:
                break

        q, remainder = divmod(width, block_size)
        assert q == block_count
        indices = list(range(q))
        random.seed(sum(view[:1024]))
        random.shuffle(indices)

        block_sizes = [block_size] * q
        q, r = divmod(remainder, block_count)
        for i in indices:
            block_sizes[i] += q
        for i in indices[:r]:
            block_sizes[i] += 1
        assert sum(block_sizes) == width

        q, remainder = divmod(size, block_count)
        assert q == chunk_size
        chunk_sizes = [chunk_size] * block_count
        for i in indices[:remainder]:
            chunk_sizes[i] += 1
        assert sum(chunk_sizes) == size

        stream = MemoryFile(view)
        filler = self.args.block_char if nobg else ' '

        try:
            stderr.write(header)
            if label is not None:
                stderr.write(colorama.Fore.WHITE)
                stderr.flush()
            it = itertools.chain(itertools.repeat(filler, 3), label, itertools.cycle(filler))
            cp = None
            for chunk_size, block_size in zip(chunk_sizes, block_sizes):
                chunk = stream.read(chunk_size)
                chunk_entropy = entropy(chunk)
                pp = entropy_select(chunk_entropy, clrmap)
                string = ''.join(itertools.islice(it, block_size))
                if pp != cp:
                    string = F'{pp}{string}'
                cp = pp
                stderr.write(string)
                stderr.flush()
        except BaseException:
            eraser = ' ' * width
            stderr.write(F'\r{_reset}{eraser}\r')
            raise
        else:
            stderr.write(F'{_reset}] [---.--%]')
            te = meta['entropy']
            stderr.write('\b' * footer_length)
            stderr.write(F'] [{te!r:>7}]\n')
            stderr.flush()
        if not self.isatty():
            yield data

class iff (*expression, ge=None, gt=None, le=None, lt=None, ct=None, ne=None, iN=None, eq=None, retain=False)

Filter incoming chunks depending on whether a given Python expression evaluates to true. If no expression is given, the unit filters out empty chunks.

Note: The reverse operation of a conditional unit uses the logical negation of its condition.

Expand source code Browse git

class iff(ConditionalUnit, docs='{0}{p}{1}'):
    """
    Filter incoming chunks depending on whether a given Python expression evaluates to true. If no
    expression is given, the unit filters out empty chunks.
    """
    def __init__(
        self,
        *expression: Arg(metavar='token', type=str, help=(
            'All "token" arguments to this unit are joined with spaces to produce the expression '
            'to be evaluated. This is done so that unnecessary shell quoting is avoided.')),
        ge: Arg('-ge', type=str, metavar='RHS', group='OP',
            help='check that the expression is greater or equal to {varname}') = None,
        gt: Arg('-gt', type=str, metavar='RHS', group='OP',
            help='check that the expression is greater than {varname}') = None,
        le: Arg('-le', type=str, metavar='RHS', group='OP',
            help='check that the expression is less or equal to {varname}') = None,
        lt: Arg('-lt', type=str, metavar='RHS', group='OP',
            help='check that the expression is less than {varname}') = None,
        ct: Arg('-ct', type=str, metavar='RHS', group='OP',
            help='check that the expression contains {varname}') = None,
        ne: Arg('-ne', type=str, metavar='RHS', group='OP',
            help='check that the expression is equal to {varname}') = None,
        iN: Arg('-in', type=str, metavar='RHS', group='OP',
            help='check that the expression is contained in {varname}') = None,
        eq: Arg('-eq', type=str, metavar='RHS', group='OP',
            help='check that the expression is equal to {varname}') = None,
        retain=False,
    ):
        def encodings(v: str):
            if not isinstance(v, str):
                return
            for codec in [self.codec, 'latin1', 'utf-16le']:
                yield v.encode(codec)

        def __br_contains__(container, value):
            if value in container:
                return True
            if isinstance(value, str):
                return any(b in container for b in encodings(value))
            else:
                return any(value == b for v in container for b in encodings(v))

        operators = [
            (ge, operator.__ge__),
            (gt, operator.__gt__),
            (le, operator.__le__),
            (lt, operator.__lt__),
            (eq, operator.__eq__),
            (ne, operator.__ne__),
            (ct, __br_contains__),
            (iN, lambda a, b: __br_contains__(b, a)),
        ]

        operators = [
            (rhs, cmp)
            for (rhs, cmp) in operators
            if rhs is not None
        ]

        rhs, cmp, lhs = None, None, '\x20'.join(expression) or None

        if len(operators) > 0:
            if not lhs:
                raise ValueError('Comparison operator with empty left hand side.')
            if len(operators) > 1:
                raise ValueError('Only one comparison operation can be specified.')
            rhs, cmp = operators[0]

        super().__init__(
            lhs=lhs,
            rhs=rhs,
            cmp=cmp,
            retain=retain,
        )

    def match(self, chunk):
        meta = metavars(chunk)
        lhs: Optional[str] = self.args.lhs
        rhs: Optional[Any] = self.args.rhs
        cmp: Optional[Callable[[Any, Any], bool]] = self.args.cmp

        if cmp is None and rhs is not None:
            raise ValueError('right hand side defined but no operator')

        if lhs is not None:
            if rhs is not None:
                lhs = DelayedNumSeqArgument(lhs, additional_types=(float, str))(chunk)
            else:
                lhs = PythonExpression.Evaluate(lhs, meta)

        rhs = rhs and DelayedNumSeqArgument(rhs, additional_types=(float, str))(chunk)

        self.log_info(F'lhs: type={lhs.__class__.__name__}; value={lhs!r}')
        self.log_info(F'rhs: type={rhs.__class__.__name__}; value={rhs!r}')

        if lhs is None:
            return bool(chunk)
        if rhs is None:
            return bool(lhs)

        return cmp(lhs, rhs)

class iffc (*bounds, retain=False)

Filter incoming chunks depending on whether their size is within any of the given bounds.

Note: The reverse operation of a conditional unit uses the logical negation of its condition.

Expand source code Browse git

class iffc(ConditionalUnit, docs='{0}{p}{1}'):
    """
    Filter incoming chunks depending on whether their size is within any of the given bounds.
    """
    def __init__(
        self,
        *bounds: Arg.Bounds(help='Specifies an (inclusive) range to check for.', intok=True),
        retain=False,
    ):
        if not bounds:
            raise ValueError('cannot filter for size without specifying any bounds')
        super().__init__(
            bounds=bounds,
            retain=retain,
        )

    def match(self, chunk):
        length: int = len(chunk)
        for bounds in self.args.bounds:
            if isinstance(bounds, int):
                if length == bounds:
                    return True
            if isinstance(bounds, slice):
                a = bounds.start or 0
                b = bounds.stop or INF
                t = bounds.step or 1
                if a <= length <= b and not (length - a) % t:
                    return True
        return False

class iffp (*patterns, partial=False, retain=False)

Filter incoming chunks depending on whether it matches any of a given set of patterns. The available patterns are the following: integer, float, number, string, multiline_string, cmdstr, ps1str, vbastr, vbaint, printable, urlquote, urlquote_coarse, urlquote_narrow, intarray, numarray, word, letters, wshenc, alphanumeric, b32, b58, b62, b64, b85, a85, b92, b64any, b64url, hex, uppercase_hex, spaced_hex, spaced_b64, spaced_b85, spaced_a85, utf8, hexdump, hexarray, uuencode, domain, email, guid, date, ipv4, ipv6, md5, sha1, sha256, hostname, socket, subdomain, url, btc, pem, xmr, path_terse, path, winpath, nixpath, environment_variable.

Note: The reverse operation of a conditional unit uses the logical negation of its condition.

Expand source code Browse git

class iffp(ConditionalUnit, docs='{0}{p}{1}'):
    """
    Filter incoming chunks depending on whether it matches any of a given set of patterns. The
    available patterns are the following: {}.
    """

    def __init__(
        self,
        *patterns: Arg.Choice(metavar='pattern', choices=_PATTERNS),
        partial: Arg.Switch('-p', help='Allow partial matches on the data.') = False,
        retain=False
    ):
        super().__init__(
            retain=retain,
            patterns=patterns,
            partial=partial
        )

    def match(self, chunk):
        for name in self.args.patterns:
            p: pattern = _PATTERNS[name]
            matcher = p.match if self.args.partial else p.fullmatch
            if matcher(chunk):
                return True
        return False

class iffs (needle, retain=False)

Filter incoming chunks depending on whether they contain a given binary substring.

Note: The reverse operation of a conditional unit uses the logical negation of its condition.

Expand source code Browse git

class iffs(ConditionalUnit, docs='{0}{p}{1}'):
    """
    Filter incoming chunks depending on whether they contain a given binary substring.
    """
    def __init__(
        self,
        needle: Arg(help='the string to search for'),
        retain=False,
    ):
        super().__init__(
            needle=needle,
            retain=retain,
        )

    def match(self, chunk):
        return self.args.needle in chunk

class iffx (regex, count=0, fullmatch=False, multiline=False, ignorecase=False)

Filter incoming chunks by discarding those that do not match the given regular expression.

Note: The reverse operation of a conditional unit uses the logical negation of its condition.

Expand source code Browse git

class iffx(SingleRegexUnit, ConditionalUnit, docs='{0}{p}{1}'):
    """
    Filter incoming chunks by discarding those that do not match the given
    regular expression.
    """
    def match(self, chunk):
        return bool(self.matcher(chunk))

class ifps (bytes, codec='cp1252')

Disassembles compiled Pascal script files that start with the magic sequence "IFPS". These scripts can be found, for example, when unpacking InnoSetup installers using innounp.

Expand source code Browse git

class ifps(IFPSBase):
    """
    Disassembles compiled Pascal script files that start with the magic sequence "IFPS". These
    scripts can be found, for example, when unpacking InnoSetup installers using innounp.
    """
    def __init__(
        self,
        bytes: IFPSBase.Arg.Switch('-b', help='Print opcode bytes in the disassembly.'),
        codec='cp1252'
    ):
        super().__init__(codec=codec, bytes=bytes)

    def process(self, data):
        return IFPSFile(data, self.args.codec).disassembly(self.args.bytes).encode(self.codec)

    @classmethod
    def handles(cls, data) -> bool:
        return data[:len(IFPSFile.Magic)] == IFPSFile.Magic

class ifpsstr (codec='cp1252')

Extracts strings from compiled Pascal script files that start with the magic sequence "IFPS". These scripts can be found, for example, when unpacking InnoSetup installers using innounp.

Expand source code Browse git

class ifpsstr(IFPSBase):
    """
    Extracts strings from compiled Pascal script files that start with the magic sequence "IFPS".
    These scripts can be found, for example, when unpacking InnoSetup installers using innounp.
    """
    def process(self, data):
        ifps = IFPSFile(data, self.args.codec)
        for string in ifps.strings:
            yield string.encode(self.codec)

    @classmethod
    def handles(self, data: bytearray) -> bool:
        return data.startswith(IFPSFile.Magic)

class imgdb

Provides access to the direct bytes of an image file. Each row of pixels is emitted as an individual chunk.

Expand source code Browse git

class imgdb(Unit):
    """
    Provides access to the direct bytes of an image file. Each row of pixels is emitted as an
    individual chunk.
    """
    @Unit.Requires('Pillow', 'formats')
    def _image():
        from PIL import Image
        return Image

    def _get_rows(self, image: Image):
        width = image.width
        pixels = iter(image.getdata())
        while row := list(islice(pixels, 0, width)):
            yield row

    def process(self, data):
        try:
            image = self._image.open(MemoryFile(data, read_as_bytes=True))
        except Exception:
            raise ValueError('input could not be parsed as an image')
        test = image.getpixel((0, 0))
        if isinstance(test, int):
            for row in self._get_rows(image):
                yield bytearray(row)
        else:
            count = len(test)
            total = count * image.width
            out = bytearray(total)
            for row in self._get_rows():
                for pixel, offset in zip(row, range(0, total, count)):
                    out[offset:offset + count] = pixel
            yield out

class imgtp (transformation='R')

Perform a number of transpositions on an input image. The transformation string must be a sequence composed of the letters H, V, and R. Each letter represents an operation:

R rotates the image to the left by 90 degrees.
V flips the image top to bottom (vertically).
H flips the image left to right (horizontally).

These transpositions are performed in the order in which they are specified.

Expand source code Browse git

class imgtp(Unit):
    """
    Perform a number of transpositions on an input image. The transformation string must be a
    sequence composed of the letters H, V, and R. Each letter represents an operation:

    - R rotates the image to the left by 90 degrees.
    - V flips the image top to bottom (vertically).
    - H flips the image left to right (horizontally).

    These transpositions are performed in the order in which they are specified.
    """
    def __init__(
        self,
        transformation: Arg.String(help='The transformation sequence; default is {default}.') = 'R'
    ):
        transformation = [Arg.AsOption(t, T) for t in transformation]
        super().__init__(transformation=transformation)

    @Unit.Requires('Pillow', 'formats')
    def _image():
        from PIL import Image
        return Image

    def process(self, data):
        imglib = self._image

        try:
            image = imglib.open(MemoryFile(data, read_as_bytes=True))
        except Exception:
            raise ValueError('input could not be parsed as an image')
        else:
            format = image.format
        conversion = {
            T.V: imglib.Transpose.FLIP_TOP_BOTTOM,
            T.H: imglib.Transpose.FLIP_LEFT_RIGHT,
            T.R: imglib.Transpose.ROTATE_90,
        }
        for tf in self.args.transformation:
            image = image.transpose(conversion[tf])
        with io.BytesIO() as out:
            image.save(out, format)
            return out.getvalue()

class imphash (reps=1, text=False)

Implements the import hash for PE files.

Expand source code Browse git

class imphash(HashUnit):
    """
    Implements the import hash for PE files.
    """

    def _algorithm(self, data):
        pe = lief.load_pe(data)
        th = lief.PE.get_imphash(pe, lief.PE.IMPHASH_MODE.PEFILE)
        return bytes.fromhex(th)

class innopwd

This unit emulates an InnoSetup installer in an attempt to determine the installer password. This works only when the password is contained within the script, but several malware samples are known to use this technique.

Expand source code Browse git

class innopwd(Unit):
    """
    This unit emulates an InnoSetup installer in an attempt to determine the installer password.
    This works only when the password is contained within the script, but several malware samples
    are known to use this technique.
    """
    def process(self, data: bytearray):
        if data.startswith(IFPSFile.Magic):
            inno = IFPSFile(data)
            self.log_info('running in script-only mode; cannot check passwords')
            file = None
        else:
            inno = InnoArchive(data, self)
            file = inno.get_encrypted_sample()
            if file is None:
                self.log_info('the archive is not password-protected, password is empty')
                return None
            self.log_info('password type:', file.password_type.name)
            self.log_info('password hash:', file.password_hash)
            self.log_info('password salt:', file.password_salt)

        emulator = InnoSetupEmulator(inno)

        for password in emulator.emulate_installation():
            if not isinstance(password, NewPassword):
                continue
            if file and not inno.check_password(file, password):
                self.log_info('discarding password:', password)
                continue
            yield password.encode(self.codec)
            if file is not None:
                self.log_info('aborting emulation after validating password')
                return

    @classmethod
    def handles(self, data):
        import re
        if data[:2] != B'MZ':
            return False
        if re.search(re.escape(InnoArchive.ChunkPrefix), data) is None:
            return False
        return bool(
            re.search(BR'Inno Setup Setup Data \(\d+\.\d+\.', data))

class isaac (key, discard=0, stateful=False)

The ISAAC (Indirection, Shift, Accumulate, Add, Count) cipher.

Expand source code Browse git

class isaac(StreamCipherUnit):
    """
    The ISAAC (Indirection, Shift, Accumulate, Add, Count) cipher.
    """

    def keystream(self) -> Iterable[int]:
        key = self.args.key

        A: int = 0
        B: int = 0
        C: int = 0
        S: List[int] = [0x9E3779B9] * 8
        T: List[int] = []
        K = list(chunks.unpack(key + bytearray(0x400 - len(key)), 4, bigendian=False))
        U = 0xFFFFFFFF

        def _mix_state():
            a, b, c, d, e, f, g, h = S
            a ^= (b << 0x0B) & U; d = d + a & U; b = b + c & U # noqa
            b ^= (c >> 0x02) & U; e = e + b & U; c = c + d & U # noqa
            c ^= (d << 0x08) & U; f = f + c & U; d = d + e & U # noqa
            d ^= (e >> 0x10) & U; g = g + d & U; e = e + f & U # noqa
            e ^= (f << 0x0A) & U; h = h + e & U; f = f + g & U # noqa
            f ^= (g >> 0x04) & U; a = a + f & U; g = g + h & U # noqa
            g ^= (h << 0x08) & U; b = b + g & U; h = h + a & U # noqa
            h ^= (a >> 0x09) & U; c = c + h & U; a = a + b & U # noqa
            S[:] = a, b, c, d, e, f, g, h
            return S

        def _initialize_with(R: List[int]):
            for i in range(0, 0x100, 8):
                S[:] = (x + R[j] & U for j, x in enumerate(S, i))
                T[i:i + 8] = _mix_state()

        for _ in range(4):
            _mix_state()

        _initialize_with(K)
        _initialize_with(T)

        operations = [
            (__lshift__, 0x0D),
            (__rshift__, 0x06),
            (__lshift__, 0x02),
            (__rshift__, 0x10),
        ]

        while True:
            C = (C + 1) & U
            B = (B + C) & U
            for i in range(0x100):
                X = T[i]
                shift, k = operations[i % 4]
                A = (A ^ shift(A, k)) & U
                A = (A + T[i ^ 0x80]) & U
                Y = T[+i] = T[X // 4 & 0xFF] + A + B & U
                B = K[~i] = X + T[Y // 1024 & 0xFF] & U
            yield from chunks.pack(K, 4, True)

class jamv (name, data=None)

Short for "Join as Meta Variables": It joins all chunks in the current frame into a single one by storing the contents of each chunk as the contents of a meta variable in the output.

Expand source code Browse git

class jamv(Unit):
    """
    Short for "Join as Meta Variables": It joins all chunks in the current frame into a single one
    by storing the contents of each chunk as the contents of a meta variable in the output.
    """
    def __init__(
        self,
        name: Arg.String(metavar='format', help=(
            'A format string that specifies the variable name for storing the chunk.')),
        data: Arg.Binary(metavar='data', help=(
            'Optionally specify the body of the fused output chunk; empty by default.')) = None,
    ):
        super().__init__(name=name, data=data)

    def process(self, data: Chunk):
        try:
            meta = data.temp
        except Exception:
            meta = None
        if not isinstance(meta, dict):
            raise RuntimeError('this unit can only be used inside a frame')
        data.meta.update(meta)
        data[:] = self.args.data or B''
        return data

    def filter(self, inputs):
        head = None
        spec = self.args.name
        meta = {}
        for chunk in inputs:
            if not chunk.visible:
                yield chunk
                continue
            used = set()
            name = chunk.meta.format_str(spec, self.codec, [chunk], used=used)
            if head is None:
                for u in used:
                    chunk.meta.discard(u)
                head = chunk
            if name in meta:
                self.log_warn('overwriting duplicate variable:', name, clip=True)
            meta[name] = chunk
        if head:
            head.temp = meta
            yield head

class jcalg (ignore_header=False)

JCALG decompression.

Expand source code Browse git

class jcalg(Unit):
    """
    JCALG decompression.
    """
    def __init__(
        self,
        ignore_header: Unit.Arg('-g', help=(
            'Keep decompressing even after the output has reached the final size as given by the header value.')) = False,
    ):
        super().__init__(ignore_header=ignore_header)

    def process(self, data: bytearray):
        with MemoryFile() as output, StructReader(data) as reader:
            if reader.read(2) != B'JC':
                self.log_warn('data does not begin with magic sequence, assuming that header is missing')
                reader.seek(0)
                size = checksum = None
            else:
                size = reader.u32()
                checksum = reader.u32()
            if self.args.ignore_header:
                size = None
            self._decompress(output, reader, size)
            if size is not None:
                if len(output) > size:
                    self.log_info(F'tuncating to size {size}')
                    output.truncate(size)
                elif len(output) < size:
                    self.log_warn(F'header size was {size}, but only {len(data)} bytes were decompressed')
            data = output.getvalue()
            if checksum:
                c = self._checksum(data)
                if c != checksum:
                    self.log_warn(F'header checksum was {checksum:08X}, computed value is {c:08X}')
            return data

    @classmethod
    def handles(cls, data: bytearray):
        if data[:2] == B'JC':
            return True

    def _checksum(self, data):
        from refinery.lib import chunks
        checksum = 0
        it = chunks.unpack(data, 4)
        if len(data) % 4:
            import itertools
            it = itertools.chain(it, (int.from_bytes(data[-4:], 'little'),))
        for chunk in it:
            checksum += chunk
            checksum ^= ((chunk & 0x7FFFFFFF) << 1) + (chunk >> 31) + 1
            checksum &= 0xFFFFFFFF
        return checksum

    def _decompress(self, writer: MemoryFile, reader_: StructReader[bytearray], size: Optional[int] = None):
        index = 1
        base = 8
        literal_bits = None
        literal_offset = None
        flags = BitBufferedReader(reader_, 32)

        while True:
            if size and len(writer) >= size:
                break
            if flags.next():
                b = flags.read(literal_bits) + literal_offset
                b = b & 0xFF
                writer.write_byte(b)
                continue
            if flags.next():
                high = flags.variable_length_integer()
                if high == 2:
                    match_length = flags.variable_length_integer()
                else:
                    index = ((high - 3) << base) + flags.read(base)
                    match_length = flags.variable_length_integer()
                    if index >= 0x10000:
                        match_length += 3
                    elif index >= 0x37FF:
                        match_length += 2
                    elif index >= 0x27F:
                        match_length += 1
                    elif index <= 127:
                        match_length += 4
                writer.replay(index, match_length)
                continue
            if not flags.next():
                new_index = flags.read(7)
                match_length = 2 + flags.read(2)
                if new_index == 0:
                    if match_length == 2:
                        break
                    base = flags.read(match_length + 1)
                else:
                    index = new_index
                    writer.replay(index, match_length)
                continue
            one_byte_phrase_value = flags.read(4) - 1
            if one_byte_phrase_value == 0:
                writer.write_byte(0)
            elif one_byte_phrase_value > 0:
                b = writer.getvalue()[-one_byte_phrase_value]
                writer.write_byte(b)
            else:
                if not flags.next():
                    literal_bits = 7 + flags.next()
                    literal_offset = 0
                    if literal_bits != 8:
                        literal_offset = flags.read(8)
                    continue
                while True:
                    for _ in range(0x100):
                        b = flags.read(8)
                        writer.write_byte(b)
                    if not flags.next():
                        break

class jvdasm (*paths, gray=False, path=b'path', regex=False, exact=False, fuzzy=0, drop_path=False, join_path=False, list=False)

Disassembles the JVM bytecode instructions of methods of classes defined in Java class files. The unit is implemented as a path extractor and each path name corresponds to the name of one method defined in the class file.

Expand source code Browse git

class jvdasm(PathExtractorUnit):
    """
    Disassembles the JVM bytecode instructions of methods of classes defined in Java class
    files. The unit is implemented as a path extractor and each path name corresponds to the
    name of one method defined in the class file.
    """
    _OPC_STRLEN = max(len(op.name) for op in opc)

    def _hex(self, bytestring, sep=''):
        return sep.join(F'{x:02x}' for x in bytestring)

    def __init__(
        self, *paths,
        gray: PathExtractorUnit.Arg.Switch('-g', help='Disable colored output.') = False,
        **keywords
    ):
        super().__init__(*paths, gray=gray, **keywords)

    def unpack(self, data):
        def _name(method: JvClassMember):
            name = method.name
            if name == '<init>':
                _, _, name = str(jc.this).rpartition('/')
            elif m := re.fullmatch('<(.*?)>', name):
                name = F'.{m[0]}'
            return name

        def _path(method: JvClassMember):
            return F'{jc.this!s}/{_name(method)}'
        try:
            if self.args.gray or not self.isatty():
                raise ImportError
            import colorama
        except ImportError:
            class _FG():
                def __getattr__(self, _):
                    return ''
            FG = _FG()
            RS = ''
        else:
            FG = colorama.Fore
            RS = colorama.Style.RESET_ALL
        finally:
            c_none = RS
            c_space = FG.LIGHTCYAN_EX
            c_types = FG.LIGHTCYAN_EX
            c_member = FG.LIGHTYELLOW_EX
            c_kwd = FG.LIGHTYELLOW_EX
            c_const = FG.LIGHTRED_EX
            c_string = FG.LIGHTRED_EX
            c_address = FG.LIGHTBLACK_EX
            c_label = RS

        def _color(arg, offset):
            if isinstance(arg, (str, JvString)):
                color = c_string
            elif isinstance(arg, (JvClassProperty, JvTypePath)):
                ns, dd, prop = str(arg).partition('::')
                if not dd:
                    return repr(arg)
                ns = ns.split('.')
                ns = '.'.join(F'{c_space}{p}{c_none}' for p in ns)
                return F'{ns}{dd}{c_member}{prop}{c_none}'
            elif isinstance(arg, int) and arg + offset in labels:
                return F'{c_label}0x{arg + offset:08X}{c_none}'
            elif isinstance(arg, (bool, int, float)):
                color = c_const
            elif isinstance(arg, JvBaseType):
                color = c_kwd
            else:
                return repr(arg)
            return F'{color}{arg!r}{c_none}'

        jc = JvClassFile(data)
        tab = ' '
        namespace = '.'.join(str(jc.this).split('/'))
        opcw = self._OPC_STRLEN
        path_counter = collections.defaultdict(int)
        path_index = collections.defaultdict(int)

        for method in jc.methods:
            path_counter[_path(method)] += 1
        for method in jc.methods:
            for attribute in method.attributes:
                if attribute.name == 'Code':
                    break
            else:
                self.log_warn(F'no code found for method: {method.name}')
                continue
            code: JvCode = attribute.parse(JvCode)
            with io.StringIO() as display:
                rv, args = _parse_descriptor(method.descriptor, c_none, c_space, c_types, c_kwd)
                args = ', '.join(args)
                print(
                    F'{c_types}{rv}{c_none} {c_space}{namespace}{c_none}'
                    F'::{c_member}{_name(method)}{c_none}({args})', file=display)
                offset = 0
                labels = set()
                addresses = set()

                for op in code.disassembly:
                    addresses.add(offset)
                    if op.table:
                        labels.update(offset + jmp for jmp in op.table.values())
                    elif op.code in (opc.goto, opc.goto_w):
                        labels.update(offset + arg for arg in op.arguments if isinstance(arg, int))
                    offset += len(op.raw)

                offset = 0
                labels = labels & addresses

                for op in code.disassembly:
                    if offset in labels:
                        label = F'{c_label}{offset:08X}{c_none}:'
                    else:
                        label = F'{c_address}{offset:08X}{c_none}:'
                    addr = offset
                    olen = len(op.raw)
                    offset += olen
                    if op.table is None:
                        args = ', '.join(_color(a, addr) for a in op.arguments)
                    else:
                        ow = 4 if op.code is opc.tableswitch else 8
                        olen = olen - (len(op.table) - 1) * ow
                        args = F'___default => {c_label}{op.table[None] + addr:#010x}{c_none}'
                        jmps = []
                        for k, (key, jmp) in enumerate(op.table.items()):
                            if key is None:
                                continue
                            raw = self._hex(op.raw[olen + k * ow: olen + k * ow + ow], ' ')
                            jmps.append(
                                F'{label}{tab}'
                                F'{raw!s:<{opcw + 15}} '
                                F'{c_const}{key:#010x}{c_none} => '
                                F'{c_label}{jmp + addr:#010x}{c_none}')
                        args = '\n'.join((args, *jmps))
                    opch = self._hex(op.raw[:olen], ' ')
                    if len(opch) > 14:
                        opch += F'\n{label}{tab}{tab:<15}'
                    print(
                        F'{label}{tab}'
                        F'{opch:<15}'
                        F'{c_kwd}{op.code!r:<{opcw}}{c_none} {args}', file=display)
                path = _path(method)
                if path_counter[path] > 1:
                    k = path_index[path]
                    path_index[path] = k + 1
                    path = F'{path}[{k}]'
                yield UnpackResult(path, display.getvalue().encode(self.codec))

    @classmethod
    def handles(self, data):
        return data[:4] == B'\xCA\xFE\xBA\xBE'

class jvstr

Extract string constants from Java class files.

Expand source code Browse git

class jvstr(Unit):
    """
    Extract string constants from Java class files.
    """
    def process(self, data):
        jc = JvClassFile(data)
        for string in jc.strings:
            yield string.encode(self.codec)

    @classmethod
    def handles(self, data):
        return data[:4] == B'\xCA\xFE\xBA\xBE'

class kblob

Extracts a key from a Microsoft Crypto API BLOB structure.

Expand source code Browse git

class kblob(Unit):
    """
    Extracts a key from a Microsoft Crypto API BLOB structure.
    """

    def process(self, data):
        blob = CRYPTOKEY(data)
        try:
            return self.labelled(
                bytes(blob.key),
                type=blob.header.type.name,
                algorithm=blob.header.algorithm.name
            )
        except AttributeError as A:
            raise ValueError(F'unable to derive key from {blob.header.type!s}') from A

class keccak256 (reps=1, text=False)

Returns the KECCAK256 hash of the input data.

class kramer

Deobfuscate Python samples obfuscated with Kramer.

Expand source code Browse git

class kramer(Unit):
    """
    Deobfuscate Python samples obfuscated with Kramer.
    """

    _LINEBREAK_MAGIC = 950

    def process(self, data):
        kramer = str()
        secret = set()
        _pyver = None

        def crawl(code: CodeType, depth=1):
            nonlocal kramer
            for instruction in disassemble_code(code, _pyver):
                arg = instruction.argval
                if arg is None:
                    continue
                if isinstance(arg, tuple):
                    continue
                if isinstance(arg, str):
                    if len(arg) > len(kramer):
                        kramer = arg
                    continue
                if isinstance(arg, int):
                    secret.add(arg)
                    continue
                try:
                    crawl(arg, depth + 1)
                except Exception as E:
                    self.log_info(F'error crawling arg of type {type(arg).__name__} at depth {depth}: {E}')

        for code in extract_code_from_buffer(bytes(data)):
            _pyver = code.version
            crawl(code.container)

        if not kramer:
            raise ValueError('could not find the encoded string')

        separator = re.search('[^a-fA-F0-9]+', kramer)

        if not separator:
            raise ValueError('no separator detected; encoding method may have changed')

        def rotchar(c: int):
            if c in range(0x61, 0x7a) or c in range(0x30, 0x39):
                return c + 1
            if c == 0x7a:
                return 0x30
            if c == 0x39:
                return 0x61
            return c

        def decrypt(c: int, k: int):
            if c >= k:
                out = rotchar(c - k)
                if out not in range(0x100):
                    raise _WrongKey
                return out
            if c == self._LINEBREAK_MAGIC:
                return 0x0A
            raise _WrongKey

        def decrypt_with_key(key: int):
            decrypted = bytearray(decrypt(c, key) for c in encrypted)
            if not re.fullmatch(B'[\\s!-~]+', decrypted):
                raise _WrongKey
            return decrypted

        separator = separator.group(0)
        encrypted = [ord(bytes.fromhex(e).decode()) for e in kramer.split(separator)]

        ubound = min(x for x in encrypted if x != self._LINEBREAK_MAGIC)
        lbound = ubound - 0xFF

        secret = {k for k in secret if k > lbound and k < ubound}
        self.log_debug('potential secrets from code:', secret)

        for key in sorted(secret, reverse=True):
            try:
                return decrypt_with_key(key)
            except _WrongKey:
                pass

        self.log_info(F'all candidates failed, searching [{lbound}, {ubound}]')

        for key in range(ubound, lbound - 1, -1):
            try:
                self.log_debug('attempting key:', key)
                return decrypt_with_key(key)
            except _WrongKey:
                pass

        raise RuntimeError('could not find decryption key')

class lnk (tabular=False, details=False)

Parse Windows Shortcuts (LNK files) and returns the parsed information in JSON format. This unit is a thin wrapper around the LnkParse3 library.

Expand source code Browse git

class lnk(Unit):
    """
    Parse Windows Shortcuts (LNK files) and returns the parsed information in JSON format. This
    unit is a thin wrapper around the LnkParse3 library.
    """

    @Unit.Requires('LnkParse3>=1.4.0', 'formats', 'default', 'extended')
    def _LnkParse3():
        import LnkParse3
        return LnkParse3

    _PATHS = {
        'data': ...,
        'header': {'creation_time', 'accessed_time', 'modified_time', 'windowstyle'},
        'link_info': {'local_base_path', 'location'},
    }

    def __init__(
        self,
        tabular: Unit.Arg('-t', help='Print information in a table rather than as JSON.') = False,
        details: Unit.Arg('-d', help='Print all details; some properties are hidden by default.') = False,
    ):
        super().__init__(tabular=tabular, details=details)

    def process(self, data):
        with NoLogging():
            parsed = self._LnkParse3.lnk_file(MemoryFile(data)).get_json()
        if not self.args.details:
            paths = self._PATHS
            noise = [key for key in parsed if key not in paths]
            for key in noise:
                del parsed[key]
            for path, scope in paths.items():
                if scope is (...):
                    continue
                try:
                    section = parsed[path]
                except KeyError:
                    continue
                noise = [key for key in section if key not in scope]
                for key in noise:
                    del section[key]
        with JSONEncoderEx as encoder:
            pp = ppjson(tabular=self.args.tabular)
            yield from pp._pretty_output(
                parsed, indent=4, cls=encoder, ensure_ascii=False)

class loop (count, suffix, do_while, do_until, fullmatch=False, multiline=False, ignorecase=False)

Applies a given multibin suffix to the input chunk repeatedly. For example, the following command would carve the largest base64-encoded buffer from the input, decode it, and then decompress the result 20 times:

emit data | loop 20 csd[b64]:zl

Notably, the argument after the count is a suffix, which means that handlers are applied from left to right (not from right to left). The loop is aborted and the previous result returned if the newly computed result is empty. If the an error occurs while computing the suffix and the unit is lenient (i.e. the -L switch is set), the last known result is returned.

Expand source code Browse git

class loop(RegexUnit):
    """
    Applies a given multibin suffix to the input chunk repeatedly. For example, the following
    command would carve the largest base64-encoded buffer from the input, decode it, and then
    decompress the result 20 times:

        emit data | loop 20 csd[b64]:zl

    Notably, the argument after the count is a suffix, which means that handlers are applied
    from left to right (not from right to left). The loop is aborted and the previous result
    returned if the newly computed result is empty. If the an error occurs while computing
    the suffix and the unit is lenient (i.e. the `-L` switch is set), the last known result
    is returned.
    """

    def __init__(
        self,
        count: Arg.Number(metavar='count', help='The number of repeated applications of the suffix.'),
        suffix: Arg(type=str, help='A multibin expression suffix.'),
        do_while: Arg('-w', '--while', type=regexp, metavar='RE',
            help='Halt when the given regular expression does not match the data.'),
        do_until: Arg('-u', '--until', type=regexp, metavar='RE',
            help='Halt when the given regular expression matches the data.'),
        fullmatch=False, multiline=False, ignorecase=False,
    ):
        super().__init__(
            count=count,
            suffix=suffix,
            do_while=do_while,
            do_until=do_until,
            fullmatch=fullmatch,
            multiline=multiline,
            ignorecase=ignorecase,
        )

    def process(self, data):
        _count = self.args.count
        _width = len(str(_count))
        _while = self._while
        _until = self._until

        for k in range(_count):
            if _while and not _while(data):
                self.log_info(F'step {k:0{_width}}: stopping, while-condition violated')
                break
            if _until and _until(data):
                self.log_info(F'step {k:0{_width}}: stopping, until-condition satisfied')
                break
            try:
                out = DelayedBinaryArgument(
                    self.args.suffix, reverse=True, seed=data)(data)
            except Exception as error:
                self.log_info(F'step {k:0{_width}}: error;', exception_to_string(error))
                msg = F'Stopped after {k} steps, increase verbosity for additional details.'
                raise RefineryPartialResult(msg, data) from error
            if not out:
                self.log_info(F'step {k:0{_width}}: stopping after empty result')
                break
            data[:] = out
            self.log_debug(F'step {k:0{_width}}: data =', data, clip=True)

        return data

    @property
    def _while(self):
        return self._make_matcher(self.args.do_while)

    @property
    def _until(self):
        return self._make_matcher(self.args.do_until)

class lz4

LZ4 block decompression. See also: https://github.com/lz4/lz4/blob/master/doc/lz4_Block_format.md#compressed-block-format

Expand source code Browse git

class lz4(Unit):
    """
    LZ4 block decompression. See also:
    https://github.com/lz4/lz4/blob/master/doc/lz4_Block_format.md#compressed-block-format
    """
    def _read_block(self, reader: StructReader, output: io.BytesIO, ubound=None):
        entry = reader.tell()
        lastend = 0

        def ubound_check():
            if ubound is None:
                return False
            consumed = reader.tell() - entry
            if consumed > ubound:
                raise ValueError(F'upper bound {ubound} exceeded by {consumed - ubound} in LZ4 block')
            return consumed == ubound

        while not reader.eof:
            reflen = reader.read_nibble()
            litlen = reader.read_nibble()
            litlen = reader.read_size(litlen)
            literal = reader.read(litlen)
            output.write(literal)
            if ubound_check():
                break
            try:
                refpos = reader.u16()
            except EOFError:
                break
            if refpos - 1 not in range(output.tell()):
                with StreamDetour(output, lastend):
                    if output.read(len(literal)) == literal:
                        # This literal could have been encoded in the last match, but it wasn't.
                        # Therefore, it is very likely that we have reached the end of the stream.
                        break
                position = reader.tell()
                remaining = len(literal) - position
                raise RefineryPartialResult(
                    F'encountered invalid match offset value {refpos} at position {position} with {remaining} bytes remaining',
                    partial=output.getvalue())
            reflen = reader.read_size(reflen)
            if ubound_check():
                raise ValueError('last sequence in block contained a match')
            reflen += 4
            available_bytes = min(refpos, reflen)
            q, r = divmod(reflen, available_bytes)
            with StreamDetour(output, -refpos, io.SEEK_CUR):
                match = output.read(available_bytes)
                match = q * match + match[:r]
                assert len(match) == reflen
                lastend = output.tell() - available_bytes + r
            output.write(match)

    def process(self, data):
        output = io.BytesIO()
        reader = LZ4Reader(memoryview(data))
        try:
            magic = reader.u32() == 0x184D2204
        except EOFError:
            magic = False
        if not magic:
            reader.seek(0)
            self._read_block(reader, output)
            return output.getvalue()

        (v1, v2, blocks_independent, blocks_checksummed,
            content_size, content_checksummed, rsrv1, dict_id) = reader.read_bits(8)
        rsrv2 = reader.read_nibble()
        try:
            block_maximum = {
                7: 0x400000,
                6: 0x100000,
                5: 0x040000,
                4: 0x010000,
            }[reader.read_integer(3)]
        except KeyError:
            raise ValueError('unknown maximum block size value in LZ4 frame header')
        rsrv3 = reader.read_bit()
        if any((rsrv1, rsrv2, rsrv3)):
            self.log_warn('nonzero reserved value in LZ4 frame header')
        if (v1, v2) != (0, 1):
            self.log_warn(F'invalid version ({v1},{v2}) in LZ4 frame header')
        content_size = content_size and reader.u64() or None
        dict_id = dict_id and reader.u32() or None
        # Header Checksum
        xxh = xxhash(data[4:reader.tell()]).intdigest() >> 8 & 0xFF
        chk = reader.read_byte()
        if chk != xxh:
            self.log_warn(F'header checksum {chk:02X} does not match computed value {xxh:02X}')

        self.log_debug(lambda: F'dictionary id: {dict_id}')
        self.log_debug(lambda: F'block max: 0x{block_maximum:X}')
        if content_size is not None:
            self.log_debug(lambda: F'chunk max: 0x{content_size:X}')
        self.log_debug(lambda: F'blocks independent: {bool(blocks_independent)}')
        self.log_debug(lambda: F'blocks checksummed: {bool(blocks_checksummed)}')

        blockindex = 0

        while True:
            blockindex += 1
            size = reader.read_integer(31)
            uncompressed = reader.read_bit()
            if not size:
                assert not uncompressed
                break
            self.log_info(F'reading block of size 0x{size:06X}')
            assert reader.byte_aligned
            assert size <= block_maximum, 'block size exceeds maximum size'
            if uncompressed:
                output.write(reader.read_exactly(size))
            else:
                self._read_block(reader, output, size)
            if blocks_checksummed:
                with StreamDetour(reader, -size, io.SEEK_CUR):
                    xxh = xxhash(reader.read_exactly(size)).intdigest()
                chk = reader.u32()
                if chk != xxh:
                    self.log_warn(F'block {blockindex} had checksum {chk:08X} which did not match computed value {xxh:08X}')
        value = output.getvalue()
        if content_checksummed:
            self.log_info('computing checksum')
            xxh = xxhash(value).intdigest()
            chk = reader.u32()
            if chk != xxh:
                self.log_warn(F'the given checksum {chk:08X} did not match the computed checksum {xxh:08X}')
        if not reader.eof:
            pos = reader.tell()
            self.log_warn(F'found {len(data) - pos} additional bytes starting at position 0x{pos:X} after compressed data')
        return value

class lzf (fast=False)

This unit implements LZF compression and decompression.

Expand source code Browse git

class lzf(Unit):
    """
    This unit implements LZF compression and decompression.
    """

    def __init__(self, fast: Arg.Switch('-x', help='Enable fast compression mode.') = False):
        super().__init__(fast=fast)

    def reverse(self, data):
        def FRST(p: memoryview) -> int:
            return ((p[0]) << 8) | p[1]

        def NEXT(v: int, p: memoryview) -> int:
            return ((v << 8) | p[2]) & 0xFFFFFFFF

        def DELTA(p: memoryview):
            return view.nbytes - p.nbytes

        if self.args.fast:
            def HIDX(h: int) -> int:
                return (((h >> (3 * 8 - _HSLOG)) - h * 5) & (_HSIZE - 1))
        else:
            def HIDX(h: int) -> int:
                q = (h ^ (h << 5))
                return (((q >> (3 * 8 - _HSLOG)) - h * 5) & (_HSIZE - 1))

        if not data:
            return data

        ip = view = memoryview(data)
        op = bytearray()

        if len(data) == 1:
            op.append(0)
            op.extend(data)
            return op

        hval = FRST(ip)
        htab = [0] * _HSIZE
        fast = 1 if self.args.fast else 0

        lit = 0

        def begin_literal():
            nonlocal lit
            op.append(0)
            lit = 0

        def advance_literal():
            nonlocal lit, ip
            lit += 1
            op.append(ip[0])
            ip = ip[1:]
            if lit == _MAX_LIT:
                op[-lit - 1] = lit - 1
                begin_literal()

        def commit_literal():
            if lit > 0:
                op[-lit - 1] = lit - 1
            else:
                op.pop()

        begin_literal()

        while ip.nbytes > 2:
            hval = NEXT(hval, ip)
            hpos = HIDX(hval)
            ipos = DELTA(ip)
            length = 2
            r, htab[hpos] = htab[hpos], ipos
            off = ipos - r - 1
            ref = view[r:]

            if off >= _MAX_OFF or r <= 0 or ref[:3] != ip[:3]:
                advance_literal()
                continue
            else:
                commit_literal()

            maxlen = min(_MAX_REF, ip.nbytes - length)

            while True:
                length += 1
                if length >= maxlen or ref[length] != ip[length]:
                    length -= 2
                    break

            if length < 7:
                op.append((off >> 8) + (length << 5))
            else:
                op.append((off >> 8) + (7 << 5))
                op.append(length - 7)

            op.append(off & 0xFF)
            begin_literal()

            if ip.nbytes <= length + 3:
                ip = ip[length + 2:]
                break
            if fast:
                ip = ip[length:]
                hval = FRST(ip)
                for _ in range(2):
                    hval = NEXT(hval, ip)
                    htab[HIDX(hval)] = DELTA(ip)
                    ip = ip[1:]
            else:
                ip = ip[1:]
                for _ in range(length + 1):
                    hval = NEXT(hval, ip)
                    htab[HIDX(hval)] = DELTA(ip)
                    ip = ip[1:]
        while ip.nbytes:
            advance_literal()
        commit_literal()
        return op

    def _decompress_chunk(self, data: memoryview, out: MemoryFile):
        ip = StructReader(data)
        while not ip.eof:
            ctrl = ip.u8()
            if ctrl < 0B100000:
                ctrl += 1
                out.write(ip.read_exactly(ctrl))
            else:
                length = ctrl >> 5
                offset = 1 + ((ctrl & 0B11111) << 8)
                if length == 7:
                    length += ip.u8()
                offset += ip.u8()
                length += 2
                out.replay(offset, length)

    def process(self, data):
        mem = memoryview(data)
        out = MemoryFile()

        try:
            reader = StructReader(mem)
            header = LZFHeader(reader)
        except Exception:
            self.log_info('no header detected, decompressing as raw stream')
            self._decompress_chunk(mem, out)
            return out.getvalue()

        for k in itertools.count(1):
            self.log_info(F'chunk: e=0x{header.encoded_size:04X} d=0x{header.decoded_size:04X}')
            chunk = reader.read(header.encoded_size)
            if header.compressed:
                self._decompress_chunk(chunk, out)
            else:
                out.write(chunk)
            if reader.eof:
                break
            try:
                header = LZFHeader(reader)
            except Exception as E:
                msg = F'failed parsing next header after {k} chunks: {E!s}'
                raise RefineryPartialResult(msg, out.getvalue())

        return out.getvalue()

    @classmethod
    def handles(self, data: bytearray):
        if data[:2] == LZFHeader.MAGIC:
            return True

class lzg

LZG decompression.

Expand source code Browse git

class lzg(Unit):
    """
    LZG decompression.
    """
    def process(self, data: bytearray):
        stream = LZGStream(data)
        out = stream.decompress()
        if len(out) != stream.decoded_size:
            msg = F'LZG header announced {stream.decoded_size} bytes, but decompressed buffer had size {len(out)}.'
            raise RefineryPartialResult(msg, out)
        return out

    @classmethod
    def handles(cls, data: bytearray):
        if data[:3] == B'LZG':
            return True

class lzip

LZIP decompression

Expand source code Browse git

class lzip(Unit):
    """
    LZIP decompression
    """
    def process(self, data: bytearray):
        view = memoryview(data)
        with MemoryFile() as output, StructReader(view) as reader:
            for k in count(1):
                if reader.eof:
                    break
                trailing_size = len(data) - reader.tell()
                try:
                    ID, VN, DS = reader.read_struct('4sBB')
                    if ID != B'LZIP':
                        if k > 1:
                            raise EOF
                        else:
                            self.log_warn(F'ignoring invalid LZIP signature: {ID.hex()}')
                    if VN != 1:
                        self.log_warn(F'ignoring invalid LZIP version: {VN}')
                    dict_size = 1 << (DS & 0x1F)
                    dict_size -= (dict_size // 16) * ((DS >> 5) & 7)
                    if dict_size not in range(_MIN_DICT_SIZE, _MAX_DICT_SIZE + 1):
                        raise ValueError(
                            F'The dictionary size {dict_size} is out of the valid range '
                            F'[{_MIN_DICT_SIZE}, {_MAX_DICT_SIZE}]; unable to proceed.'
                        )
                    decoder = MemberDecoder(dict_size, reader, output)
                    if not decoder():
                        raise ValueError(F'Data error in stream {k}.')
                    crc32, data_size, member_size = reader.read_struct('<LQQ')
                    if crc32 != decoder.crc32:
                        self.log_warn(F'checksum in stream {k} was {decoder.crc:08X}, should have been {crc32:08X}.')
                    if member_size - 20 != decoder.member_position:
                        self.log_warn(F'member size in stream {k} was {decoder.member_position}, should have been {member_size}.')
                    if data_size != decoder.data_position:
                        self.log_warn(F'data size in stream {k} was {decoder.data_position}, should have been {data_size}.')
                except EOFError:
                    if k <= 1:
                        raise
                    self.log_info(F'silently ignoring {trailing_size} bytes of trailing data')
                    break

            return output.getvalue()

    @classmethod
    def handles(self, data: bytearray):
        return data[:4] == B'LZIP'

class lzjb

LZJB compression and decompression. This LZ-type compression is used in the ZFS file system.

Expand source code Browse git

class lzjb(Unit):
    """
    LZJB compression and decompression. This LZ-type compression is used in the ZFS file system.
    """
    def reverse(self, src):
        # https://web.archive.org/web/20100807223517/ ..
        # .. http://cvs.opensolaris.org/source/xref/onnv/onnv-gate/usr/src/uts/common/fs/zfs/lzjb.c
        output = bytearray()
        lempel = [0] * _LEMPEL_SIZE
        copymask = 0x80
        position = 0
        while position < len(src):
            copymask <<= 1
            if copymask >= 0x100:
                copymask = 1
                copymap = len(output)
                output.append(0)
            if position > len(src) - _MATCH_MAX:
                output.append(src[position])
                position += 1
                continue
            hsh = (src[position] << 16) + (src[position + 1] << 8) + src[position + 2]
            hsh += hsh >> 9
            hsh += hsh >> 5
            hsh %= len(lempel)
            offset = (position - lempel[hsh]) & _OFFSET_MASK
            lempel[hsh] = position
            cpy = position - offset
            if cpy >= 0 and cpy != position and src[position:position + 3] == src[cpy:cpy + 3]:
                output[copymap] |= copymask
                for mlen in range(_MATCH_MIN, min(len(src) - position, _MATCH_MAX)):
                    if src[position + mlen] != src[cpy + mlen]:
                        break
                output.append(((mlen - _MATCH_MIN) << (8 - _MATCH_LEN)) | (offset >> 8))
                output.append(offset & 255)
                position += mlen
            else:
                output.append(src[position])
                position += 1
        return output

    def process(self, data):
        dst = bytearray()
        src = StructReader(data)
        while not src.eof:
            copy = src.read_byte()
            for mask in (0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80):
                if src.eof:
                    break
                if not copy & mask:
                    dst.append(src.read_byte())
                    continue
                elif not dst:
                    raise ValueError('copy requested against empty buffer')
                with src.be:
                    match_len = src.read_integer(6) + _MATCH_MIN
                    match_pos = src.read_integer(10)
                if not match_pos or match_pos > len(dst):
                    raise RuntimeError(F'invalid match offset at position {src.tell()}')
                match_pos = len(dst) - match_pos
                while match_len > 0:
                    match = dst[match_pos:match_pos + match_len]
                    dst.extend(match)
                    match_pos += len(match)
                    match_len -= len(match)
        return dst

class lzma (raw=False, alone=False, xz=False, level=9, delta=None)

LZMA compression and decompression.

Expand source code Browse git

class lzma(Unit):
    """
    LZMA compression and decompression.
    """

    _SEARCH_MIN_DICT = 0x1_0000
    _SEARCH_MAX_DICT = 0x1000_0000
    _SEARCH_MAX_BLOW = 1.2
    _SEARCH_SKIP1 = 0x08
    _SEARCH_SKIP2 = 0x10
    _ATTEMPT_PARTIAL = True

    def __init__(
        self,
        raw   : Arg.Switch('-r', group='MODE', help='Use raw (no container) format.') = False,
        alone : Arg.Switch('-a', group='MODE', help='Use the lzma container format.') = False,
        xz    : Arg.Switch('-x', group='MODE', help='Use the default xz format.') = False,
        level : Arg.Number('-l', bound=(0, 9), help='The compression level preset; between 0 and 9.') = 9,
        delta : Arg.Number('-d', help='Add a delta filter when compressing.') = None,
    ):
        if (raw, alone, xz).count(True) > 1:
            raise ValueError('Only one container format can be enabled.')
        if level not in range(10):
            raise ValueError('Compression level must be a number between 0 and 9.')
        super().__init__(filter=filter, raw=raw, alone=alone, xz=xz, delta=delta,
            level=level | PRESET_EXTREME)

    def reverse(self, data):
        filters = []
        if self.args.delta is not None:
            self.log_debug('adding delta filter')
            filters.append({'id': FILTER_DELTA, 'dist': self.args.delta})
        if self.args.alone:
            self.log_debug('setting alone format')
            mode = FORMAT_ALONE
            filters.append({'id': FILTER_LZMA1, 'preset': self.args.level})
        elif self.args.raw:
            self.log_debug('setting raw format')
            mode = FORMAT_RAW
            filters.append({'id': FILTER_LZMA2, 'preset': self.args.level})
        else:
            if not self.args.xz:
                self.log_info('choosing default .xz container format for compression')
            mode = FORMAT_XZ
            filters.append({'id': FILTER_LZMA2, 'preset': self.args.level})
        lz = LZMACompressor(mode, filters=filters)
        output = lz.compress(data)
        output += lz.flush()
        return output

    def _decompress(self, data: bytearray, lz: LZMADecompressor, partial: bool = False):
        temp = bytearray()
        sizes = repeat(1) if partial else [len(data)]
        with MemoryFile(temp) as output:
            with MemoryFile(data) as stream:
                for size in sizes:
                    if stream.eof or stream.closed:
                        break
                    try:
                        offset = stream.tell()
                        output.write(lz.decompress(stream.read(size)))
                    except (EOFError, LZMAError):
                        raise RefineryPartialResult(
                            F'compression failed at offset {offset}', temp)
        if n := len(lz.unused_data):
            raise RefineryPartialResult(F'Data stream is truncated, {n} bytes unused.', temp)
        return temp

    def _process(self, data: bytearray, partial=False):
        try:
            dc = LZMADecompressor()
            return self._decompress(data, dc, partial)
        except RefineryPartialResult as pe:
            best = pe
        except Exception:
            best = None
            self.log_info('default LZMA decompressor failed, brute-forcing custom header')
        view = memoryview(data)
        min_original_size = {
            # https://sourceforge.net/p/sevenzip/discussion/45797/thread/b6bd62f8/
            1: int((len(data) - 64_000) / 1.100), # noqa
            2: int((len(data) -  1_000) / 1.001), # noqa
        }
        for (version, p), offset_prop, to_data in product(
            ((1, 5),
             (2, 1)),
            range(self._SEARCH_SKIP1 + 1),
            range(self._SEARCH_SKIP2 + 1),
        ):
            if offset_prop + to_data > p + 20:
                # expect no more than a 20 byte header on top of the properties
                # that would be enough for, e.g. compressed & uncompressed size
                # each filling a full 64bit integer and 4 additional bytes.
                continue
            try:
                filter = parse_lzma_properties(
                    view[offset_prop:offset_prop + p],
                    version,
                    min_dict=self._SEARCH_MIN_DICT,
                    max_dict=self._SEARCH_MAX_DICT,
                )
                self.log_debug(F'attempt LZMA{version} at {offset_prop:02d}, skipping {to_data:02d}, filter: {filter!r}')
                engine = LZMADecompressor(FORMAT_RAW, filters=[filter])
                result = self._decompress(view[offset_prop + p + to_data:], engine, partial)
            except RefineryPartialResult as pe:
                if best is None:
                    best = pe
                elif len(best.partial) < len(pe.partial):
                    best = pe
                continue
            except Exception:
                continue
            if len(result) < min_original_size[version]:
                continue
            if len(result) * self._SEARCH_MAX_BLOW < len(data):
                continue
            self.log_info(
                F'success with LZMA{version} properties at {offset_prop} and raw stream starting at {to_data + offset_prop + p}')
            return result
        if partial or not self._ATTEMPT_PARTIAL:
            if best and len(best.partial) > 0:
                raise best
            raise ValueError('unable to find an LZMA stream')

    def process(self, data: bytearray):
        if out := self._process(data):
            return out
        return self._process(data, partial=True)

    @classmethod
    def handles(self, data: bytearray):
        if data[:4] == B'\x5D\0\0\0':
            return True
        if data[:5] == B'\xFD7zXZ':
            return True

class lznt1 (chunk_size=4096)

LZNT1 compression and decompression. This compression algorithm is expected by the Win32 API routine RtlDecompressBuffer, for example.

Expand source code Browse git

class lznt1(Unit):
    """
    LZNT1 compression and decompression. This compression algorithm is expected
    by the Win32 API routine `RtlDecompressBuffer`, for example.
    """

    def _decompress_chunk(self, chunk):
        out = B''
        while chunk:
            flags = chunk[0]
            chunk = chunk[1:]
            for i in range(8):
                if not (flags >> i & 1):
                    out += chunk[:1]
                    chunk = chunk[1:]
                else:
                    flag = struct.unpack('<H', chunk[:2])[0]
                    pos = len(out) - 1
                    l_mask = 0xFFF
                    o_shift = 12
                    while pos >= 0x10:
                        l_mask >>= 1
                        o_shift -= 1
                        pos >>= 1
                    length = (flag & l_mask) + 3
                    offset = (flag >> o_shift) + 1
                    if length >= offset:
                        tmp = out[-offset:] * (0xFFF // len(out[-offset:]) + 1)
                        out += tmp[:length]
                    else:
                        out += out[-offset:length - offset]
                    chunk = chunk[2:]
                if len(chunk) == 0:
                    break
        return out

    def _find(self, src, target, max_len):
        result_offset = 0
        result_length = 0
        for i in range(1, max_len):
            offset = src.rfind(target[:i])
            if offset == -1:
                break
            tmp_offset = len(src) - offset
            tmp_length = i
            if tmp_offset == tmp_length:
                tmp = src[offset:] * (0xFFF // len(src[offset:]) + 1)
                for j in range(i, max_len + 1):
                    offset = tmp.rfind(target[:j])
                    if offset == -1:
                        break
                    tmp_length = j
            if tmp_length > result_length:
                result_offset = tmp_offset
                result_length = tmp_length
        if result_length < 3:
            return 0, 0
        return result_offset, result_length

    def _compress_chunk(self, chunk):
        blob = copy.copy(chunk)
        out = B''
        pow2 = 0x10
        l_mask3 = 0x1002
        o_shift = 12
        while len(blob) > 0:
            bits = 0
            tmp = B''
            for i in range(8):
                bits >>= 1
                while pow2 < (len(chunk) - len(blob)):
                    pow2 <<= 1
                    l_mask3 = (l_mask3 >> 1) + 1
                    o_shift -= 1
                if len(blob) < l_mask3:
                    max_len = len(blob)
                else:
                    max_len = l_mask3
                offset1, length1 = self._find(
                    chunk[:len(chunk) - len(blob)], blob, max_len)
                # try to find more compressed pattern
                offset2, length2 = self._find(
                    chunk[:len(chunk) - len(blob) + 1], blob[1:], max_len)
                if length1 < length2:
                    length1 = 0
                if length1 > 0:
                    symbol = ((offset1 - 1) << o_shift) | (length1 - 3)
                    tmp += struct.pack('<H', symbol)
                    bits |= 0x80  # set the highest bit
                    blob = blob[length1:]
                else:
                    tmp += blob[:1]
                    blob = blob[1:]
                if len(blob) == 0:
                    break
            out += struct.pack('B', bits >> (7 - i))
            out += tmp
        return out

    def reverse(self, buf):
        out = B''
        while buf:
            chunk = buf[:self.args.chunk_size]
            compressed = self._compress_chunk(chunk)
            if len(compressed) < len(chunk):  # chunk is compressed
                flags = 0xB000
                header = struct.pack('<H', flags | (len(compressed) - 1))
                out += header + compressed
            else:
                flags = 0x3000
                header = struct.pack('<H', flags | (len(chunk) - 1))
                out += header + chunk
            buf = buf[self.args.chunk_size:]
        return out

    def process(self, data):
        out = io.BytesIO()
        offset = 0
        while offset < len(data):
            try:
                header, = struct.unpack('<H', data[offset:offset + 2])
            except struct.error as err:
                raise RefineryPartialResult(str(err), partial=out.getvalue())
            offset += 2
            size = (header & 0xFFF) + 1
            if size + 1 >= len(data):
                raise RefineryPartialResult(
                    F'chunk header indicates size {size}, but only {len(data)} bytes remain.',
                    partial=out.getvalue()
                )
            chunk = data[offset:offset + size]
            offset += size
            if header & 0x8000:
                chunk = self._decompress_chunk(chunk)
            out.write(chunk)
        return out.getvalue()

    def __init__(self, chunk_size: Arg.Number('-c', help='Optionally specify the chunk size for compression, default is 0x1000.') = 0x1000):
        super().__init__(chunk_size=chunk_size)

class lzo

LZO decompression. The code works against simple test cases, but it is known to fail for certain outputs produced by the lzop command-line tool when high compression ratio is favoured (i.e. when the -9 switch is used).

Expand source code Browse git

class lzo(Unit):
    """
    LZO decompression. The code works against simple test cases, but it is known to fail for certain outputs produced by the lzop
    command-line tool when high compression ratio is favoured (i.e. when the -9 switch is used).
    """
    def decompress_stream(self, data: ByteString, LZOv1: bool = False) -> bytearray:
        """
        An implementation of LZO decompression. We use the article
        "[LZO stream format as understood by Linux's LZO decompressor](https://www.kernel.org/doc/html/latest/staging/lzo.html)"
        as a reference since no proper specification is available.
        """
        def integer() -> int:
            length = 0
            while True:
                byte = src.read_byte()
                if byte:
                    return length + byte
                length += 0xFF
                if length > 0x100000:
                    raise LZOError('Too many zeros in integer encoding.')

        def literal(count):
            dst.write(src.read_bytes(count))

        def copy(distance: int, length: int):
            if distance > len(dst):
                raise LZOError(F'Distance {distance} > bufsize {len(dst)}')
            buffer = dst.getvalue()
            if distance > length:
                start = len(buffer) - distance
                end = start + length
                dst.write(buffer[start:end])
            else:
                block = buffer[-distance:]
                while len(block) < length:
                    block += block[:length - len(block)]
                if len(block) > length:
                    block[length:] = ()
                dst.write(block)

        src = StructReader(memoryview(data))
        dst = MemoryFile()

        state = 0
        first = src.read_byte()

        if first == 0x10:
            raise LZOError('Invalid first stream byte 0x10.')
        elif first <= 0x12:
            src.seekrel(-1)
        elif first <= 0x15:
            state = first - 0x11
            literal(state)
        else:
            state = 4
            literal(first - 0x11)

        while True:
            instruction = src.read_byte()
            if instruction < 0x10:
                if state == 0:
                    length = instruction or integer() + 15
                    state = length + 3
                    if state < 4:
                        raise LZOError('Literal encoding is too short.')
                else:
                    state = instruction & 0b0011
                    D = (instruction & 0b1100) >> 2
                    H = src.read_byte()
                    distance = (H << 2) + D + 1
                    if state >= 4:
                        distance += 0x800
                        length = 3
                    else:
                        length = 2
                    copy(distance, length)
            elif instruction < 0x20:
                L = instruction & 0b0111
                H = instruction & 0b1000
                length = L or integer() + 7
                argument = src.u16()
                state = argument & 3
                distance = (H << 11) + (argument >> 2)
                if not distance:
                    return dst.getvalue()
                if LZOv1 and distance & 0x803F == 0x803F and length in range(261, 265):
                    raise LZOError('Compressed data contains sequence that is banned in LZOv1.')
                if LZOv1 and distance == 0xBFFF:
                    X = src.read_byte()
                    count = ((X << 3) | L) + 4
                    self.log_debug(F'Writing run of {X} zero bytes according to LZOv1.')
                    dst.write(B'\0' * count)
                else:
                    copy(distance + 0x4000, length + 2)
            elif instruction < 0x40:
                L = instruction & 0b11111
                length = L or integer() + 31
                argument = src.u16()
                state = argument & 3
                distance = (argument >> 2) + 1
                copy(distance, length + 2)
            else:
                if instruction < 0x80:
                    length = 3 + ((instruction >> 5) & 1)
                else:
                    length = 5 + ((instruction >> 5) & 3)
                H = src.read_byte()
                D = (instruction & 0b11100) >> 2
                state = instruction & 3
                distance = (H << 3) + D + 1
                copy(distance, length)
            if state:
                literal(state)

    def process(self, data):
        try:
            lzo = LZO(data)
        except LZOError:
            self.log_info('Not an LZO archive, processing raw stream.')
            return self.decompress_stream(data)
        with MemoryFile() as output:
            for k, chunk in enumerate(lzo, 1):
                self.log_debug(F'decompressing chunk {k}')
                output.write(self.decompress_stream(chunk.data))
            return self.labelled(
                output.getvalue(),
                path=lzo.name,
                date=date_from_timestamp(lzo.mtime)
            )

    @classmethod
    def handles(self, data: bytearray) -> Optional[bool]:
        sig = LZO.SIGNATURE
        if data[:len(sig)] == sig:
            return True

class lzw

LZW decompression based on ancient Unix sources.

Expand source code Browse git

class lzw(Unit):
    '''
    LZW decompression based on ancient Unix sources.
    '''

    _MAGIC = B'\x1F\x9D'

    def process(self, data: bytearray):
        out = MemoryFile()
        inf = StructReader(memoryview(data))

        if inf.peek(2) != self._MAGIC:
            self.log_info('No LZW signature found, assuming raw stream.')
            maxbits = LZW.BITS
            block_mode = True
        else:
            inf.seekrel(2)
            maxbits = inf.read_integer(5)
            if inf.read_integer(2) != 0:
                self.log_info('reserved bits were set in LZW header')
            block_mode = bool(inf.read_bit())

        if maxbits > LZW.BITS:
            raise ValueError(F'Compressed with {maxbits} bits; cannot handle file.')

        maxmaxcode = 1 << maxbits

        ibuf = inf.read()

        tab_suffix = bytearray(LZW.WSIZE * 2)
        tab_prefix = array('H', itertools.repeat(0, 1 << LZW.BITS))

        n_bits = LZW.INIT_BITS
        maxcode = (1 << n_bits) - 1
        bitmask = (1 << n_bits) - 1
        oldcode = ~0
        finchar = +0
        posbits = +0

        free_entry = LZW.FIRST if block_mode else 0x100
        tab_suffix[:0x100] = range(0x100)
        resetbuf = True

        while resetbuf:
            resetbuf = False

            ibuf = ibuf[posbits >> 3:]
            insize = len(ibuf)
            posbits = 0
            inbits = (insize << 3) - (n_bits - 1)

            while inbits > posbits:
                if free_entry > maxcode:
                    n = n_bits << 3
                    p = posbits - 1
                    posbits = p + (n - (p + n) % n)
                    n_bits += 1
                    if (n_bits == maxbits):
                        maxcode = maxmaxcode
                    else:
                        maxcode = (1 << n_bits) - 1
                    bitmask = (1 << n_bits) - 1
                    resetbuf = True
                    break

                p = ibuf[posbits >> 3:]
                code = int.from_bytes(p[:3], 'little') >> (posbits & 7) & bitmask
                posbits += n_bits

                if oldcode == -1:
                    if code >= 256:
                        raise ValueError('corrupt input.')
                    oldcode = code
                    finchar = oldcode
                    out.write_byte(finchar)
                    continue

                if code == LZW.CLEAR and block_mode:
                    tab_prefix[:0x100] = array('H', itertools.repeat(0, 0x100))
                    free_entry = LZW.FIRST - 1
                    n = n_bits << 3
                    p = posbits - 1
                    posbits = p + (n - (p + n) % n)
                    n_bits = LZW.INIT_BITS
                    maxcode = (1 << n_bits) - 1
                    bitmask = (1 << n_bits) - 1
                    resetbuf = True
                    break

                incode = code
                stack = bytearray()

                if code >= free_entry:
                    if code > free_entry:
                        raise RefineryPartialResult('corrupt input.', out.getvalue())
                    stack.append(finchar)
                    code = oldcode
                while code >= 256:
                    stack.append(tab_suffix[code])
                    code = tab_prefix[code]

                finchar = tab_suffix[code]
                stack.append(finchar)
                stack.reverse()
                out.write(stack)
                code = free_entry

                if code < maxmaxcode:
                    tab_prefix[code] = oldcode & 0xFFFF
                    tab_suffix[code] = finchar & 0x00FF
                    free_entry = code + 1

                oldcode = incode

        return out.getvalue()

    @classmethod
    def handles(self, data: bytearray) -> Optional[bool]:
        sig = self._MAGIC
        if data[:len(sig)] == sig:
            return True

class lzx (window=15, wim=False)

Expand source code Browse git

class lzx(Unit):

    def __init__(
        self,
        window: Arg.Number('window', metavar='window',
            help='Optionally specify the window size; the default is {default}.') = 15,
        wim: Arg.Switch('-w', help='Use the WIM flavor of LZX.') = False,
    ):
        super().__init__(window=window, wim=wim)

    def process(self, data):
        lzx = LzxDecoder(self.args.wim)
        lzx.set_params_and_alloc(self.args.window)

        try:
            return lzx.decompress(memoryview(data))
        except Exception as E:
            if out := lzx.get_output_data():
                raise RefineryPartialResult(str(E), out) from E
            raise

class m2h (seed=0, reps=1, text=False)

Returns the 32bit Murmur Hash, Version 2.

Expand source code Browse git

class m2h(MurMurHash):
    """
    Returns the 32bit Murmur Hash, Version 2.
    """
    def _algorithm(self, data: bytes) -> bytes:
        return v2_mmh32digest(data, self.args.seed)

class m2h64a (seed=0, reps=1, text=False)

Returns the 64bit Murmur Hash, Version 2, Variant A.

Expand source code Browse git

class m2h64a(MurMurHash):
    """
    Returns the 64bit Murmur Hash, Version 2, Variant A.
    """
    def _algorithm(self, data: bytes) -> bytes:
        return v2_mmh64digestA(data, self.args.seed)

class m2h64b (seed=0, reps=1, text=False)

Returns the 64bit Murmur Hash, Version 2, Variant B.

Expand source code Browse git

class m2h64b(MurMurHash):
    """
    Returns the 64bit Murmur Hash, Version 2, Variant B.
    """
    def _algorithm(self, data: bytes) -> bytes:
        return v2_mmh64digestB(data, self.args.seed)

class m2ha (seed=0, reps=1, text=False)

Returns the 32bit Murmur Hash, Version 2, Variant A.

Expand source code Browse git

class m2ha(MurMurHash):
    """
    Returns the 32bit Murmur Hash, Version 2, Variant A.
    """
    def _algorithm(self, data: bytes) -> bytes:
        return v2_mmh32digestA(data, self.args.seed)

class m3h (seed=0, reps=1, text=False)

Returns the 32bit Murmur Hash, Version 3.

Expand source code Browse git

class m3h(MurMurHash):
    """
    Returns the 32bit Murmur Hash, Version 3.
    """
    def _algorithm(self, data: bytes) -> bytes:
        return v3_mmh32digest(data, self.args.seed)

class m3h32 (seed=0, reps=1, text=False)

Returns the 128bit Murmur Hash, Version 3, 32bit digest size.

Expand source code Browse git

class m3h32(MurMurHash):
    """
    Returns the 128bit Murmur Hash, Version 3, 32bit digest size.
    """
    def _algorithm(self, data: bytes) -> bytes:
        return v3_mmh128digest32(data, self.args.seed)

class m3h64 (seed=0, reps=1, text=False)

Returns the 128bit Murmur Hash, Version 3, 64bit digest size.

Expand source code Browse git

class m3h64(MurMurHash):
    """
    Returns the 128bit Murmur Hash, Version 3, 64bit digest size.
    """
    def _algorithm(self, data: bytes) -> bytes:
        return v3_mmh128digest64(data, self.args.seed)

class machometa (all=True, header=False, linked_images=False, signatures=False, version=False, load_commands=False, exports=False, imports=False, tabular=False)

Extract metadata from Mach-O files.

Expand source code Browse git

class machometa(Unit):
    """
    Extract metadata from Mach-O files.
    """
    def __init__(
        self, all: Arg('-c', '--custom',
            help='Unless enabled, all default categories will be extracted.') = True,
        header: Arg('-H', help='Parse basic data from the Mach-O header.') = False,
        linked_images: Arg('-K', help='Parse all library images linked by the Mach-O.') = False,
        signatures: Arg('-S', help='Parse signature and entitlement information.') = False,
        version: Arg('-V', help='Parse version information from the Mach-O load commands.') = False,
        load_commands: Arg('-D', help='Parse load commands from the Mach-O header.') = False,
        exports: Arg('-E', help='List all exported functions.') = False,
        imports: Arg('-I', help='List all imported functions.') = False,
        tabular: Arg('-t', help='Print information in a table rather than as JSON') = False,
    ):
        super().__init__(
            header=all or header,
            linked_images=all or linked_images,
            version=all or version,
            signatures=all or signatures,
            load_commands=load_commands,
            imports=imports,
            exports=exports,
            tabular=tabular,
        )

    def compute_symhash(self, macho: lief.MachO.Binary) -> Dict:
        def _symbols(symbols: Iterable[lief.MachO.Symbol]):
            for sym in symbols:
                if sym.category != lief.MachO.Symbol.CATEGORY.UNDEFINED:
                    continue
                yield lief.string(sym.name)
        symbols = sorted(set(_symbols(macho.symbols)))
        symbols: str = ','.join(symbols)
        return md5(symbols.encode('utf8')).hexdigest()

    def parse_macho_header(self, macho: lief.MachO.Binary, data=None) -> Dict:
        info = {}
        if header := macho.header:
            st = header.cpu_subtype & 0x7FFFFFFF
            ht = 'mach_header_64' if header.magic in {
                lief.MachO.MACHO_TYPES.CIGAM_64,
                lief.MachO.MACHO_TYPES.MAGIC_64,
            } else 'mach_header'
            info['Type'] = ht
            info['Magic'] = header.magic.value
            info['CPUType'] = header.cpu_type.__name__.upper()
            info['CPUSubType'] = _CPU_SUBTYPES.get(header.cpu_type, {}).get(st, st)
            info['FileType'] = header.file_type.__name__
            info['LoadCount'] = header.nb_cmds
            info['LoadSize'] = header.sizeof_cmds
            info['Flags'] = sorted(flag.__name__ for flag in header.flags_list)
            info['Reserved'] = header.reserved
        return info

    def parse_linked_images(self, macho: lief.MachO.Binary, data=None) -> Dict:
        load_command_images = {}
        load_commands: Iterable[lief.MachO.LoadCommand] = macho.commands
        for load_command in load_commands:
            if not isinstance(load_command, lief.MachO.DylibCommand):
                continue
            images: List[str] = load_command_images.setdefault(load_command.command.__name__, [])
            images.append(load_command.name)
        return load_command_images

    def parse_signature(self, macho_image: lief.MachO.Binary, data=None) -> Dict:

        if not macho_image.has_code_signature:
            return {}

        info = {}
        reader = StructReader(macho_image.code_signature.content)
        super_blob = SuperBlob(reader)

        for blob in super_blob.blobs:

            if blob.type == BlobType.CODEDIRECTORY:
                codedirectory_blob = CodeDirectoryBlob(blob.data)
                if codedirectory_blob.flags & CS_ADHOC != 0:
                    info['AdHocSigned'] = True
                else:
                    info['AdHocSigned'] = False
                reader.seekset(codedirectory_blob.identOffset + blob.offset)
                info['SignatureIdentifier'] = reader.read_c_string('utf8')
                continue

            if blob.type == BlobType.CMS_SIGNATURE:
                reader.seekset(blob.offset)
                cms_signature = blob.data
                if not cms_signature:
                    continue
                try:
                    parsed_cms_signature = pemeta.parse_signature(bytearray(cms_signature))
                    info['Signature'] = parsed_cms_signature
                except ValueError as pkcs7_parse_error:
                    self.log_warn(F'Could not parse the data in CSSLOT_CMS_SIGNATURE as valid PKCS7 data: {pkcs7_parse_error!s}')
                continue

            if blob.type == BlobType.REQUIREMENTS:
                # TODO: Parse the requirements blob,
                # which is encoded according to the code signing requirements language:
                # https://developer.apple.com/library/archive/documentation/Security
                #        /Conceptual/CodeSigningGuide/RequirementLang/RequirementLang.html
                info['Requirements'] = blob.data.hex()
                continue

            if blob.type == BlobType.XML_ENTITLEMENTS:
                entitlements = bytes(blob.data)
                if not entitlements:
                    continue
                try:
                    entitlements = plistlib.loads(entitlements)
                except Exception as error:
                    self.log_warn(F'failed to parse entitlements: {error!s}')
                else:
                    info['Entitlements'] = entitlements

        return info

    def parse_version(self, macho: lief.MachO.Binary, data=None) -> Dict:
        info = {}
        load_commands: Iterable[lief.MachO.LoadCommand] = macho.commands
        for load_command in load_commands:
            if load_command.command == lief.MachO.LoadCommand.TYPE.SOURCE_VERSION:
                if 'SourceVersion' not in info:
                    cmd: lief.MachO.SourceVersion = load_command
                    info['SourceVersion'] = cmd.version[0]
                else:
                    self.log_warn('More than one load command of type SOURCE_VERSION found; the MachO file is possibly malformed')
                continue
            if load_command.command == lief.MachO.LoadCommand.TYPE.BUILD_VERSION:
                if 'BuildVersion' not in info:
                    cmd: lief.MachO.BuildVersion = load_command
                    info['BuildVersion'] = {}
                    info['BuildVersion']['Platform'] = cmd.platform.__name__
                    info['BuildVersion']['MinOS'] = '.'.join((str(v) for v in cmd.minos))
                    info['BuildVersion']['SDK'] = '.'.join((str(v) for v in cmd.sdk))
                    info['BuildVersion']['Ntools'] = len(cmd.tools)
                else:
                    self.log_warn('More than one load command of type BUILD_VERSION found; the MachO file is possibly malformed')
                continue
        return info

    def parse_load_commands(self, macho: lief.MachO.Binary, data=None) -> List:
        info = []
        load_commands: Iterable[lief.MachO.LoadCommand] = macho.commands
        for load_command in load_commands:
            info.append(dict(
                Type=load_command.command.__name__,
                Size=load_command.size,
                Data=load_command.data.hex(),
            ))
        return info

    def parse_imports(self, macho: lief.MachO.Binary, data=None) -> List:
        info = []
        imports: Iterable[lief.MachO.Symbol] = macho.imported_symbols
        for imp in imports:
            info.append(lief.string(imp.name))
        return info

    def parse_exports(self, macho: lief.MachO.Binary, data=None) -> List:
        info = []
        exports: Iterable[lief.MachO.Symbol] = macho.exported_symbols
        for exp in exports:
            info.append(lief.string(exp.name))
        return info

    def process(self, data: bytearray):
        result = {}
        slices = []
        macho = lief.load_macho(data)
        macho_slices: List[lief.MachO.Binary] = []

        for k in itertools.count():
            if not (ms := macho.at(k)):
                break
            macho_slices.append(ms)

        result['FileType'] = 'FAT' if len(macho_slices) > 1 else 'THIN'

        for image in macho_slices:
            slice_result = {}

            for switch, resolver, name in [
                (self.args.header,          self.parse_macho_header,  'Header'),       # noqa
                (self.args.linked_images,   self.parse_linked_images, 'LinkedImages'), # noqa
                (self.args.signatures,      self.parse_signature,     'Signatures'),   # noqa
                (self.args.version,         self.parse_version,       'Version'),      # noqa
                (self.args.load_commands,   self.parse_load_commands, 'LoadCommands'), # noqa
                (self.args.imports,         self.parse_imports,       'Imports'),      # noqa
                (self.args.exports,         self.parse_exports,       'Exports'),      # noqa
            ]:
                if not switch:
                    continue
                self.log_debug(F'parsing: {name}')
                try:
                    info = resolver(image, data)
                except Exception as E:
                    self.log_info(F'failed to obtain {name}: {E!s}')
                    continue
                if info:
                    slice_result[name] = info

            if image.uuid is not None:
                uuid = bytes(image.uuid.uuid)
                slice_result['UUID'] = uuid.hex()
            slice_result['SymHash'] = self.compute_symhash(image)
            if fileset_name := image.fileset_name:
                slice_result['FilesetName'] = fileset_name
            slices.append(slice_result)

        if slices:
            result['Slices'] = slices
            yield from ppjson(
                tabular=self.args.tabular
            )._pretty_output(result, indent=4, ensure_ascii=False)

class map (index, image, default=None, blocksize=None)

Each block of the input data which occurs as a block of the index argument is replaced by the corresponding block of the image argument. If a block size is specified, and if the index or image argument are byte sequences, they are unpacked into chunks of that size, and excess bytes that are not an integer multiple of the block size are discarded. To prevent any automatic chunking, the DelayedArgument.btoi() handler can be used. An optional default value can be provided to serve as inserts for any blocks in the input that do not occur in the index sequence. If this argument is not specified, such blocks are left unchanged.

Expand source code Browse git

class map(BlockTransformation):
    """
    Each block of the input data which occurs as a block of the index argument is replaced by the
    corresponding block of the image argument. If a block size is specified, and if the index or
    image argument are byte sequences, they are unpacked into chunks of that size, and excess bytes
    that are not an integer multiple of the block size are discarded. To prevent any automatic
    chunking, the `refinery.lib.argformats.DelayedArgument.btoi` handler can be used.
    An optional default value can be provided to serve as inserts for any blocks in the input that
    do not occur in the index sequence. If this argument is not specified, such blocks are left
    unchanged.
    """
    _map: Optional[Dict[int, int]]

    def __init__(
        self,
        index: Arg.NumSeq(help='index characters'),
        image: Arg.NumSeq(help='image characters'),
        default: Arg.NumSeq(help='default value') = None,
        blocksize=None
    ):
        super().__init__(blocksize=blocksize, index=index, image=image, default=default, _truncate=2)
        self._map = None

    def reverse(self, data):
        return self._process(data, self.args.image, self.args.index, self.args.default)

    def process(self, data):
        return self._process(data, self.args.index, self.args.image, self.args.default)

    def _process(self, data: bytearray, index: Sequence[int], image: Sequence[int], default: Optional[Sequence[int]]):
        if not self.bytestream:
            if isbuffer(index):
                self.log_info(F'chunking index sequence into blocks of size {self.blocksize}')
                index = list(self.chunk(index))
                self.log_debug(F'index sequence: {index}')
            if isbuffer(image):
                self.log_info(F'chunking image sequence into blocks of size {self.blocksize}')
                image = list(self.chunk(image))
                self.log_debug(F'image sequence: {image}')
            if isbuffer(default):
                self.log_info(F'chunking default sequence into blocks of size {self.blocksize}')
                default = list(self.chunk(default))
                self.log_debug(F'default sequence: {default}')
        if len(set(index)) != len(index):
            raise ValueError('The index sequence contains duplicates.')
        if len(index) > len(image):
            raise ValueError('The index sequence is longer than the image sequence.')

        if self.bytestream:
            mapping = dict(zip(index, image))
            if default is not None:
                d = iter(cycle(default))
                mapping = bytes(mapping.get(c, d) for c in range(0x100))
            else:
                mapping = bytes(mapping.get(c, c) for c in range(0x100))
            if not isinstance(data, bytearray):
                data = bytearray(data)
            data[:] = (mapping[b] for b in data)
            return data
        try:
            self.log_info(default)
            self._def = default if default is None else cycle(default)
            self._map = dict(zip(index, image))
            return super().process(data)
        finally:
            self._map = None

    def process_block(self, token):
        default = next(it) if (it := self._def) else token
        return self._map.get(token, default)

class maru (seed=0, reps=1, text=False)

Returns the 64bit maru hash of the input data.

Expand source code Browse git

class maru(HashUnit):
    """
    Returns the 64bit maru hash of the input data.
    """
    def __init__(
        self,
        seed: Arg.Number(help='optional seed value') = 0,
        reps=1,
        text=False,
    ):
        super().__init__(seed=seed, text=text, reps=reps)

    def _algorithm(self, data: bytes) -> bytes:
        return maru32digest(data, self.args.seed)

class max_ (key=None)

Picks the maximum of all elements in the current refinery.lib.frame.

Expand source code Browse git

class max_(Unit):
    """
    Picks the maximum of all elements in the current `refinery.lib.frame`.
    """

    def __init__(
        self,
        key: Arg('key', type=str, help='A meta variable expression to sort by instead of sorting the content.') = None,
    ):
        super().__init__(key=key)

    def filter(self, chunks: Iterable[Chunk]):
        def get_value(chunk: Chunk):
            if key is None:
                return chunk
            return metavars(chunk).get(key)

        key = self.args.key
        it = iter(chunks)

        for max_chunk in it:
            if not max_chunk.visible:
                yield max_chunk
            else:
                max_index = 0
                max_value = get_value(max_chunk)
                break
        else:
            return

        for index, chunk in enumerate(chunks, 1):
            if not chunk.visible:
                yield chunk
                continue
            value = get_value(chunk)
            try:
                is_max = value > max_value
            except TypeError:
                if max_value is None:
                    self.log_info(
                        F'Discarding chunk {max_index} in favor of {index} because {key} was not '
                        F'set on the former; new maximum is {value!r}.')
                    is_max = True
                else:
                    self.log_info(
                        F'Discarding chunk {index} because {key} had value {value!r}; it could not '
                        F'be compared to the current maximum {max_value!r} on chunk {max_index}.')
                    is_max = False
            if is_max:
                max_value = value
                max_chunk = chunk
                max_index = index

        yield max_chunk

class md2 (reps=1, text=False)

Returns the MD2 hash of the input data.

class md4 (reps=1, text=False)

Returns the MD4 hash of the input data.

class md5 (reps=1, text=False)

Returns the MD5 hash of the input data.

class mimewords

Implements the decoding of MIME encoded-word syntax from RFC-2047.

Expand source code Browse git

class mimewords(Unit):
    """
    Implements the decoding of MIME encoded-word syntax from RFC-2047.
    """

    @unicoded
    def process(self, data: str) -> str:
        def replacer(match):
            self.log_info('encoded mime word:', match[0])
            decoded, = decode_header(match[0])
            raw, codec = decoded
            return codecs.decode(raw, codec, errors='surrogateescape')
        return re.sub(R"=(?:\?[^\?]*){3}\?=", replacer, data)

class min_ (key=None)

Picks the minimum of all elements in the current refinery.lib.frame.

Expand source code Browse git

class min_(Unit):
    """
    Picks the minimum of all elements in the current `refinery.lib.frame`.
    """

    def __init__(
        self,
        key: Arg('key', type=str, help='A meta variable expression to sort by instead of sorting the content.') = None,
    ):
        super().__init__(key=key)

    def filter(self, chunks: Iterable[Chunk]):
        def get_value(chunk: Chunk):
            if key is None:
                return chunk
            return metavars(chunk).get(key)

        key = self.args.key
        it = iter(chunks)

        for min_chunk in it:
            if not min_chunk.visible:
                yield min_chunk
            else:
                min_index = 0
                min_value = get_value(min_chunk)
                break
        else:
            return

        for index, chunk in enumerate(chunks, 1):
            if not chunk.visible:
                yield chunk
                continue
            value = get_value(chunk)
            try:
                is_min = value < min_value
            except TypeError:
                if min_value is None:
                    self.log_info(
                        F'Discarding chunk {min_index} in favor of {index} because {key} was not '
                        F'set on the former; new minimum is {value!r}.')
                    is_min = True
                else:
                    self.log_info(
                        F'Discarding chunk {index} because {key} had value {value!r}; it could not '
                        F'be compared to the current minimum {min_value!r} on chunk {min_index}.')
                    is_min = False
            if is_min:
                min_value = value
                min_chunk = chunk
                min_index = index

        yield min_chunk

class morse (language=None)

Morse encoding and decoding. All tokens in the input data which consist of dashes and dots are replaced by their Morse decoding.

Expand source code Browse git

class morse(Unit):
    """
    Morse encoding and decoding. All tokens in the input data which consist of dashes and dots are
    replaced by their Morse decoding.
    """
    def __init__(
        self,
        language: Arg.Option(choices=MorseLanguage, help=(
            'Optionally choose a language. If none is specified, the unit will attempt to detect '
            'the language automatically. Options are: {choices}')) = None,
    ):
        super().__init__(language=Arg.AsOption(language, MorseLanguage))

    @classmethod
    def handles(self, data: bytearray):
        if re.fullmatch(BR'[-.\s]+', data, re.DOTALL):
            return True

    @unicoded
    def process(self, data: str):
        language: MorseLanguage = self.args.language
        parsed = re.split('(\\s+)', data)
        tokens = {t for t in parsed[::2] if t}
        tables = [
            self._DECODE_SYMBOL,
            self._DECODE_DIGITS,
        ]

        if language is not None:
            tables.append(self._DECODE[language])
        else:
            special = set(self._DECODE_SYMBOL) | set(self._DECODE_DIGITS)
            best_ratio = 1 # number of unused codes
            best_table = None
            for language in MorseLanguage:
                table = self._DECODE[language]
                codes = set(table)
                if not tokens <= codes | special:
                    continue
                if language == MorseLanguage.EN:
                    best_table = table
                    break
                ratio = len(codes - tokens) / len(codes)
                if ratio < best_ratio:
                    best_ratio = ratio
                    best_table = table
            if best_table is None:
                raise LookupError('Unable to determine language, please specify it manually.')
            tables.append(best_table)

        with io.StringIO() as out:
            for k, string in enumerate(parsed):
                if k % 2 == 1:
                    string = string[1:]
                    if len(string) > 1:
                        string = string[:-1]
                    out.write(string)
                    continue
                if not string:
                    continue
                for table in tables:
                    try:
                        out.write(table[string])
                        break
                    except KeyError:
                        continue
                else:
                    raise ValueError(F'invalid token: {string}')
            return out.getvalue()

    @unicoded
    def reverse(self, data: str):
        language: MorseLanguage = self.args.language
        tables = [
            self._ENCODE_SYMBOL,
            self._ENCODE_DIGITS,
        ]
        if language is not None:
            tables.append(self._ENCODE[language])
        else:
            tables.extend(self._ENCODE.values())

        def _encode(letter):
            for table in tables:
                try:
                    return table[letter]
                except KeyError:
                    continue
            else:
                raise ValueError(F'cannot encode letter "{letter}"')

        with io.StringIO() as out:
            for k, word in enumerate(re.split('(\\s+)', data)):
                if k % 2 == 1:
                    out.write(F' {word} ')
                    continue
                out.write(' '.join(_encode(letter) for letter in word.lower()))
            return out.getvalue()

    _ENCODE = {
        MorseLanguage.EN: {
            'a': '.-',
            'b': '-...',
            'c': '-.-.',
            'd': '-..',
            'e': '.',
            'f': '..-.',
            'g': '--.',
            'h': '....',
            'i': '..',
            'j': '.---',
            'k': '-.-',
            'l': '.-..',
            'm': '--',
            'n': '-.',
            'o': '---',
            'p': '.--.',
            'q': '--.-',
            'r': '.-.',
            's': '...',
            't': '-',
            'u': '..-',
            'v': '...-',
            'w': '.--',
            'x': '-..-',
            'y': '-.--',
            'z': '--..',
        }
    }
    _ENCODE[MorseLanguage.ES] = _extend_dictionary(_ENCODE[MorseLanguage.EN], {
        'á': '.--.-',
        'é': '..-..',
        'í': '..',
        'ñ': '--.--',
        'ó': '---.',
        'ú': '..-',
        'ü': '..--',
        '¿': '..-.-',
        '¡': '--...-',
    })
    _ENCODE[MorseLanguage.DE] = _extend_dictionary(_ENCODE[MorseLanguage.EN], {
        'ä': '.-.-',
        'ö': '---.',
        'ü': '..--',
        'ß': '...--..',
    })
    _ENCODE[MorseLanguage.FR] = _extend_dictionary(_ENCODE[MorseLanguage.EN], {
        'à': '.--.-',
        'â': '.--.-',
        'ç': '-.-..',
        'è': '.-..-',
        'é': '..-..',
        'ê': '-..-.',
        'ë': '..-..',
        'î': '..',
        'ï': '-..--',
        'ô': '---',
        'ù': '..-',
        'ü': '..--',
    })
    _ENCODE[MorseLanguage.RU] = {
        'а': '.-',
        'б': '-...',
        'в': '.--',
        'г': '--.',
        'д': '-..',
        'е': '.',
        'ё': '.',
        'ж': '...-',
        'з': '--..',
        'и': '..',
        'й': '.---',
        'к': '-.-',
        'л': '.-..',
        'м': '--',
        'н': '-.',
        'о': '---',
        'п': '.--.',
        'р': '.-.',
        'с': '...',
        'т': '-',
        'у': '..-',
        'ф': '..-.',
        'х': '....',
        'ц': '-.-.',
        'ч': '---.',
        'ш': '----',
        'щ': '--.-',
        'ъ': '--.--',
        'ы': '-.--',
        'ь': '-..-',
        'э': '..-..',
        'ю': '..--',
        'я': '.-.-',
    }
    _ENCODE[MorseLanguage.UA] = _extend_dictionary(_ENCODE[MorseLanguage.RU], {
        'ґ': '--.',
        'и': '-.--',
        'ї': '.---.',
    })
    _ENCODE[MorseLanguage.UA]['є'] = _ENCODE[MorseLanguage.UA].pop('э')
    _ENCODE[MorseLanguage.UA]['і'] = _ENCODE[MorseLanguage.UA].pop('и')

    _ENCODE[MorseLanguage.HE] = {
        'א': '.-',
        'ב': '-...',
        'ג': '--.',
        'ד': '-..',
        'ה': '---',
        'ו': '.',
        'ז': '--..',
        'ח': '....',
        'ט': '..--',
        'י': '..',
        'כ': '-.',
        'ל': '.-..',
        'מ': '--',
        'נ': '--.',
        'ס': '-.-.',
        'ע': '.---',
        'פ': '.--.',
        'צ': '.--',
        'ק': '--.-',
        'ר': '.-.',
        'ש': '...',
        'ת': '-',
    }

    _ENCODE[MorseLanguage.AR] = {
        'ا': '.-',
        'ب': '-...',
        'ت': '-',
        'ث': '-.-.',
        'ج': '.---',
        'ح': '....',
        'خ': '---',
        'د': '-..',
        'ذ': '--..',
        'ر': '.-.',
        'ز': '---.',
        'س': '...',
        'ش': '----',
        'ص': '-..-',
        'ض': '...-',
        'ط': '..-',
        'ظ': '-.--',
        'ع': '.-.-',
        'غ': '--.',
        'ف': '..-.',
        'ق': '--.-',
        'ك': '-.-',
        'ل': '.-..',
        'م': '--',
        'ن': '-.',
        'ه': '..-..',
        'و': '.--',
        'ي': '..',
        'ﺀ': '.',
    }

    _ENCODE_DIGITS = {
        '0': '-----',
        '1': '.----',
        '2': '..---',
        '3': '...--',
        '4': '....-',
        '5': '.....',
        '6': '-....',
        '7': '--...',
        '8': '---..',
        '9': '----.'
    }

    _ENCODE_SYMBOL = {
        '_': '..--.-',
        '-': '-....-',
        ',': '--..--',
        ';': '-.-.-.',
        ':': '---...',
        '!': '-.-.--',
        '?': '..--..',
        '.': '.-.-.-',
        '"': '.-..-.',
        '(': '-.--.',
        ')': '-.--.-',
        '@': '.--.-.',
        '/': '-..-.',
        '\\': '-..-.',
        '&': '.-...',
        '+': '.-.-.',
        '=': '-...-',
        '$': '...-..-',
        "'": '.----.',
    }

    _DECODE = {
        lng: _reverse_dictionary(tbl) for lng, tbl in _ENCODE.items()}
    _DECODE_SYMBOL = _reverse_dictionary(_ENCODE_SYMBOL)
    _DECODE_DIGITS = _reverse_dictionary(_ENCODE_DIGITS)

class mscdk (size, hash='MD5')

An implementation of the CryptDeriveKey routine available from the Win32 API.

Expand source code Browse git

class mscdk(KeyDerivation):
    """
    An implementation of the CryptDeriveKey routine available from the Win32 API.
    """

    def __init__(self, size, hash='MD5'):
        super().__init__(size=size, salt=None, hash=hash)

    def process(self, data):
        def digest(x):
            return self.hash.new(x).digest()
        size = self.args.size
        if self.args.hash in (HASH.SHA224, HASH.SHA256, HASH.SHA384, HASH.SHA512):
            buffer = digest(data)
            max_size = len(buffer)
        else:
            max_size = 2 * self.hash.digest_size
            value = digest(data)
            del data
            buffer1 = bytearray([0x36] * 64)
            buffer2 = bytearray([0x5C] * 64)
            for k, b in enumerate(value):
                buffer1[k] ^= b
                buffer2[k] ^= b
            buffer = digest(buffer1) + digest(buffer2)
        if size > max_size:
            raise RefineryPartialResult(F'too many bytes requested, can only provide {max_size}', partial=buffer)
        return buffer[:size]

class mscf (mode=None)

The Microsoft Compression Format unit implements the format and algorithms used by the Microsoft Compression API. The implementation for LZMS is currently missing, but MSZIP and XPRESS (both with and without Huffman table) are supported. This pure Python implementation is very slow when compared to native code, so decompressing very large inputs can take several minutes.

Expand source code Browse git

class mscf(Unit):
    """
    The Microsoft Compression Format unit implements the format and algorithms used by the Microsoft
    Compression API. The implementation for LZMS is currently missing, but MSZIP and XPRESS (both
    with and without Huffman table) are supported. This pure Python implementation is very slow when
    compared to native code, so decompressing very large inputs can take several minutes.
    """

    _SIGNATURE = B'\x0A\x51\xE5\xC0'

    def __init__(
        self,
        mode: Unit.Arg.Option(choices=MODE, help=(
            'Manually select decompression mode ({choices}); by default the unit attempts to derive the '
            'mode from the header, but this will fail for raw streams. However, even if a header is '
            'found, a manually specified mode will take precedence.')) = None,
    ):
        mode = Unit.Arg.AsOption(mode, MODE)
        super().__init__(mode=mode)

    def process(self, data):
        mode: MODE = self.args.mode
        with StructReader(memoryview(data)) as reader, MemoryFile() as writer:
            reader: StructReader[memoryview]
            check = zlib.crc32(reader.peek(6))
            magic = reader.read(4)
            if magic != self._SIGNATURE:
                if mode is None:
                    self.log_warn(
                        F'data starts with {magic.hex().upper()} rather than the expected sequence '
                        F'{self._SIGNATURE.hex().upper()}; this could be a raw stream.')
                else:
                    reader.seek(0)
                    handler = self._get_handler(mode)
                    handler(reader, writer, None)
                    return writer.getvalue()

            header_size = reader.u16()
            if header_size != 24:
                self.log_warn(F'the header size {header_size} was not equal to 24')

            crc32byte = reader.u8()
            check = zlib.crc32(reader.peek(0x11), check) & 0xFF
            if check != crc32byte:
                self.log_warn(F'the CRC32 check byte was {crc32byte}, computed value was {check}')

            _mode_code = reader.u8()

            try:
                _mode = MODE(_mode_code)
            except ValueError:
                msg = F'header contains unknown compression type code {_mode_code}'
                if mode is None:
                    raise ValueError(msg)
                else:
                    self.log_warn(msg)
            else:
                if mode is not None and mode != _mode:
                    logger = self.log_warn
                else:
                    logger = self.log_info
                    mode = _mode
                logger(F'header specifies algorithm {_mode.name}')

            self.log_info(F'using algorithm {mode.name}')
            decompress = self._get_handler(mode)

            final_size = reader.u32()
            _unknown_1 = reader.u32()
            chunk_size = reader.u32()
            _unknown_2 = reader.u32()

            if _unknown_1 != 0:
                self.log_warn(F'unknown value 1 was unexpectedly nonzero: 0x{_unknown_1:08X}')
            if _unknown_2 != 0:
                self.log_warn(F'unknown value 2 was unexpectedly nonzero: 0x{_unknown_2:08X}')

            self.log_debug(F'final size: 0x{final_size:08X}')
            self.log_debug(F'chunk size: 0x{chunk_size:08X}')

            if chunk_size > COMPRESS_MAX_CHUNK:
                raise ValueError('the header chunk size is greater than the maximum value')

            while len(writer) < final_size:
                src_size = reader.u32()
                src_data = reader.read(src_size)
                if len(src_data) != src_size:
                    raise IndexError(F'Attempted to read {src_size} bytes, but got only {len(src_data)}.')
                if src_size + len(writer) == final_size:
                    self.log_debug(F'final chunk is uncompressed, appending {src_size} raw bytes to output')
                    writer.write(src_data)
                    break
                self.log_debug(F'reading chunk of size {src_size}')
                start = writer.tell()
                chunk = StructReader(src_data)
                target = min(chunk_size, final_size - len(writer))
                decompress(chunk, writer, target)
                writer.flush()
                written = writer.tell() - start
                if written != target:
                    raise RuntimeError(F'decompressed output had unexpected size {written} instead of {chunk_size}')

            if not reader.eof:
                self.log_info(F'compression complete with {reader.remaining_bytes} bytes remaining in input')
            return writer.getvalue()

    def _get_handler(self, mode: MODE) -> Callable[[StructReader, MemoryFile, Optional[int]], None]:
        decompress = {
            mode.MSZIP       : self._decompress_mszip,
            mode.XPRESS_HUFF : self._decompress_xpress_huffman,
            mode.XPRESS      : self._decompress_xpress,
        }.get(mode, None)
        if decompress is None:
            raise NotImplementedError(F'algorithm {mode.name} is not yet implemented')
        return decompress

    def _decompress_mszip(self, reader: StructReader, writer: MemoryFile, target: Optional[int] = None):
        header = bytes(reader.read(2))
        if header != B'CK':
            raise ValueError(F'chunk did not begin with CK header, got {header!r} instead')
        decompress = zlib.decompressobj(-zlib.MAX_WBITS, zdict=writer.getvalue())
        writer.write(decompress.decompress(reader.read()))
        writer.write(decompress.flush())

    def _decompress_xpress_huffman(
        self,
        reader: StructReader,
        writer: MemoryFile,
        target: Optional[int] = None,
        max_chunk_size: int = 0x10000
    ) -> None:
        limit = writer.tell()
        if target is not None:
            target += limit

        while not reader.eof:

            if reader.remaining_bytes < XPRESS_NUM_SYMBOLS // 2:
                raise IndexError(
                    F'There are only {reader.remaining_bytes} bytes reamining in the input buffer,'
                    F' but at least {XPRESS_NUM_SYMBOLS // 2} are required to read a Huffman table.')

            table = bytearray(reader.read_integer(4) for _ in range(XPRESS_NUM_SYMBOLS))
            table = make_huffman_decode_table(table, XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN)
            limit = limit + max_chunk_size
            flags = BitBufferedReader(reader, 16)

            while True:
                position = writer.tell()
                if position == target:
                    if reader.remaining_bytes:
                        self.log_info(F'chunk decompressed with {reader.remaining_bytes} bytes remaining in input buffer')
                    return
                if position >= limit:
                    if position > limit:
                        limit = position
                        self.log_info(F'decompression of one chunk generated more than the limit of {max_chunk_size} bytes')
                    flags.collect()
                    break
                try:
                    sym = read_huffman_symbol(flags, table, XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN)
                except EOFError:
                    self.log_debug('end of file while reading huffman symbol')
                    break
                if sym < XPRESS_NUM_CHARS:
                    writer.write_byte(sym)
                    continue
                length = sym & 0xF
                offsetlog = (sym >> 4) & 0xF
                flags.collect()
                if reader.eof:
                    break
                offset = (1 << offsetlog) | flags.read(offsetlog)
                if length == 0xF:
                    nudge = reader.read_byte()
                    if nudge < 0xFF:
                        length += nudge
                    else:
                        length = reader.u16() or reader.u32()
                length += XPRESS_MIN_MATCH_LEN
                writer.replay(offset, length)

    def _decompress_xpress(self, reader: StructReader, writer: MemoryFile, target: Optional[int] = None) -> bytearray:
        if target is not None:
            target += writer.tell()
        flags = BitBufferedReader(reader)
        nibble_cache = None
        while not reader.eof:
            if target is not None and writer.tell() >= target:
                return
            if not flags.next():
                writer.write(reader.read(1))
                continue
            offset, length = divmod(reader.u16(), 8)
            offset += 1
            if length == 7:
                length = nibble_cache
                if length is None:
                    length_pair = reader.u8()
                    nibble_cache = length_pair >> 4
                    length = length_pair & 0xF
                else:
                    nibble_cache = None
                if length == 15:
                    length = reader.u8()
                    if length == 0xFF:
                        length = reader.u16() or reader.u32()
                        length -= 22
                        if length < 0:
                            raise RuntimeError(F'Invalid match length of {length} for long delta sequence')
                    length += 15
                length += 7
            length += 3
            writer.replay(offset, length)

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        sig = cls._SIGNATURE
        if data[:len(sig)] == sig:
            return True

class msgpack

Converts a message-pack (msgpack) buffer to JSON and vice-versa.

Expand source code Browse git

class msgpack(Unit):
    """
    Converts a message-pack (msgpack) buffer to JSON and vice-versa.
    """
    def reverse(self, data):
        return mp.dumps(json.loads(data))

    def process(self, data):
        unpacker: mp.fallback.Unpacker = mp.Unpacker(MemoryFile(data, read_as_bytes=True))
        for k in itertools.count():
            try:
                last = unpacker.tell()
                item = unpacker.unpack()
            except Exception as E:
                if isinstance(E, mp.OutOfData) and k == 1:
                    break
                raise RefineryPartialResult(str(E), memoryview(data)[last:]) from E
            else:
                yield json.dumps(item).encode(self.codec)

class mspdb (size, salt, iter=100, hash='SHA1')

An implementation of the PasswordDeriveBytes routine available from the .NET standard library. According to documentation, it is an extension of PBKDF1.

Expand source code Browse git

class mspdb(KeyDerivation):
    """
    An implementation of the PasswordDeriveBytes routine available from the .NET
    standard library. According to documentation, it is an extension of PBKDF1.
    """
    def __init__(self, size, salt, iter=100, hash='SHA1'):
        self.superinit(super(), **vars())

    def process(self, data):
        if self.codec != 'UTF8':
            data = data.decode(self.codec).encode('UTF8')
        data += self.args.salt
        for _ in range(self.args.iter - 1):
            data = self.hash.new(data).digest()
        counter, seedhash = 1, data
        data = self.hash.new(data).digest()
        while len(data) < self.args.size:
            data += self.hash.new(B'%d%s' % (counter, seedhash)).digest()
            counter += 1
        return data[:self.args.size]

class mvg (*names, top=False)

Short for "Make Variable Global": This unit can move meta variables into the scope of the parent frame. If used at the end of a frame, the variables will be moved the scope of the frame that the pipeline will return to. Otherwise and if the –top switch is being used, variables will be moved to scope 0, i.e. to the topmost frame in the current tree.

Note that it is not possible to promote a variable to a parent frame if that variable does not have the same value on all chunks in the current frame - such variables will always be removed when the frame closes.

Expand source code Browse git

class mvg(Unit):
    """
    Short for "Make Variable Global": This unit can move meta variables into the scope of the
    parent frame. If used at the end of a frame, the variables will be moved the scope of the
    frame that the pipeline will return to. Otherwise and if the --top switch is being used,
    variables will be moved to scope 0, i.e. to the topmost frame in the current tree.

    Note that it is not possible to promote a variable to a parent frame if that variable does not
    have the same value on all chunks in the current frame - such variables will always be removed
    when the frame closes.
    """
    def __init__(
        self,
        *names: Arg(type=str, metavar='name', help=(
            'Name of a variable to be removed. If no variables are explicitly specified, all '
            'variables in the current chunk will be rescoped.'
        )),
        top: Arg.Switch('-t', help='Move the variable(s) to the topmost frame layer.') = False
    ):
        super().__init__(names=names, top=top)

    def process(self, data):
        meta = metavars(data)
        nest = self.args.nesting
        if nest < 0 and not self.args.top:
            spot = meta.scope + nest
        else:
            spot = 1
        for name in self.args.names or meta.variable_names():
            try:
                if meta.get_scope(name) <= spot:
                    continue
                meta.set_scope(name, spot)
            except KeyError:
                self.log_info(F'variable not defined: {name}')
        return data

class n40 (key)

Decrypts hex-encoded strings in various latin-american banker families, including N40.

Expand source code Browse git

class n40(Unit):
    """
    Decrypts hex-encoded strings in various latin-american banker families, including N40.
    """
    def __init__(self, key: Arg(help='Decryption key.')):
        ...

    def process(self, data):
        try:
            data = b16decode(data, casefold=True)
        except Error:
            self.log_info('Input was not hex-encoded; ignoring this step.')
        mask = data[1:] | xor(self.args.key) | bytearray
        return bytearray(0xFF + b - a if b <= a else b - a for a, b in zip(data, mask))

class neg (bigendian=False, blocksize=None)

Each block of the input data is negated bitwise. This is sometimes also called the bitwise complement or inverse.

Expand source code Browse git

class neg(UnaryOperation):
    """
    Each block of the input data is negated bitwise. This is sometimes
    also called the bitwise complement or inverse.
    """
    def operate(self, a): return ~a
    def inplace(self, a): a ^= self.fmask

class netbios (key=b'A')

Encodes and decodes strings using the same algorithm that is used for NetBIOS labels. Each byte 0xUL is encoded as two bytes, which are the sum of 0xU and 0xL with an offset character, respectively. The default offset is the capital letter A.

Expand source code Browse git

class netbios(Unit):
    """
    Encodes and decodes strings using the same algorithm that is used for NetBIOS
    labels. Each byte 0xUL is encoded as two bytes, which are the sum of 0xU and
    0xL with an offset character, respectively. The default offset is the capital
    letter A.
    """

    def __init__(self, key: Arg(help="Provide a single letter to use as the offset.") = B'A'):
        if len(key) != 1:
            raise ValueError("The key must be a binary string of length exactly 1")
        super().__init__(key=key[0])

    def reverse(self, data):
        result = bytearray(2 * len(data))
        for k, byte in enumerate(data):
            hi, lo = byte >> 4, byte & 15
            result[2 * k + 0] = hi + self.args.key
            result[2 * k + 1] = lo + self.args.key
        return result

    def process(self, data):
        def merge(it):
            while True:
                try:
                    hi = next(it) - self.args.key
                    lo = next(it) - self.args.key
                    if hi not in range(16) or lo not in range(16):
                        raise ValueError(F'Invalid character encoding detected: hi={hi:X}, lo={lo:X}.')
                    yield (hi << 4) | lo
                except StopIteration:
                    break
        return bytearray(merge(iter(data)))

class ngrams (size=slice(2, None, None))

Extract all n-grams from the input. The algorithm is naive, i.e. it simply iterates all n-grams and deduplicates using a set data structure. The number n is taken from an arbitrary range given as a Python slice expression.

Expand source code Browse git

class ngrams(Unit):
    """
    Extract all n-grams from the input. The algorithm is naive, i.e. it simply iterates all n-grams
    and deduplicates using a set data structure. The number n is taken from an arbitrary range given
    as a Python slice expression.
    """
    def __init__(
        self, size: Arg.Bounds(
            help='Specifies the sizes of each n-gram, i.e. the number n. Defaults to {default}.') = slice(2, None),
    ):
        super().__init__(size=size)

    def process(self, data: bytearray):
        for n in integers_of_slice(self.args.size):
            self.log_info(F'emitting {n}-grams')
            if n > len(data):
                break
            deduplicator = set()
            view = memoryview(data)
            for index in range(len(data) - n + 1):
                block = bytes(view[index:index + n])
                if block in deduplicator:
                    continue
                deduplicator.add(block)
                yield self.labelled(block, offset=index)

class nop

The unit generates the exact output that was received as input. All unknown arguments passed to nop are completely ignored, which is different from the behavior of other units. As such, nop can be used to comment out other units in longer refinery pipelines by simply prefixing a command with nop.

Expand source code Browse git

class nop(Unit):
    """
    The unit generates the exact output that was received as input. All unknown arguments passed
    to nop are completely ignored, which is different from the behavior of other units. As such,
    nop can be used to comment out other units in longer refinery pipelines by simply prefixing a
    command with nop.
    """
    @classmethod
    def argparser(cls, **keywords):
        argp = NopArgParser(
            keywords, prog=cls.name, description=documentation(cls), add_help=False)
        argp.set_defaults(nesting=0)
        return cls._interface(argp)

class nrv2b (bits=32)

Decompress data using the NRV2B algorithm.

Expand source code Browse git

class nrv2b(NRVUnit):
    """
    Decompress data using the NRV2B algorithm.
    """
    def _decompress(self, src: StructReader, dst: MemoryFile, bb: BitBufferedReader):
        last_offset = 1
        while not src.eof:
            while next(bb):
                dst.write_byte(src.read_byte())
            offset = 2 + next(bb)
            while not next(bb):
                offset = 2 * offset + next(bb)
            if offset == 2:
                offset = last_offset
            else:
                offset = (offset - 3) * 0x100 + src.read_byte()
                if offset & 0xFFFFFFFF == 0xFFFFFFFF:
                    break
                offset += 1
                last_offset = offset
            length = next(bb)
            length = 2 * length + next(bb)
            if length == 0:
                length = 2 + next(bb)
                while not next(bb):
                    length = 2 * length + next(bb)
                length += 2
            length += int(bool(offset > 0xD00))
            dst.replay(offset, length + 1)

class nrv2d (bits=32)

Decompress data using the NRV2D algorithm.

Expand source code Browse git

class nrv2d(NRVUnit):
    """
    Decompress data using the NRV2D algorithm.
    """
    def _decompress(self, src: StructReader, dst: MemoryFile, bb: BitBufferedReader):
        last_offset = 1
        while not src.eof:
            while next(bb):
                dst.write_byte(src.read_byte())
            offset = 2 + next(bb)
            while not next(bb):
                offset = 2 * (offset - 1) + next(bb) # noqa
                offset = 2 *  offset      + next(bb) # noqa
            if offset == 2:
                offset = last_offset
                length = next(bb)
            else:
                offset = (offset - 3) * 0x100 + src.read_byte()
                if offset & 0xFFFFFFFF == 0xFFFFFFFF:
                    break
                length = (offset  ^ 1) & 1 # noqa
                offset = (offset >> 1) + 1
                last_offset = offset
            length = 2 * length + next(bb)
            if length == 0:
                length = 2 + next(bb)
                while not next(bb):
                    length = 2 * length + next(bb)
                length += 2
            length += int(bool(offset > 0x500))
            dst.replay(offset, length + 1)

class nrv2e (bits=32)

Decompress data using the NRV2E algorithm.

Expand source code Browse git

class nrv2e(NRVUnit):
    """
    Decompress data using the NRV2E algorithm.
    """
    def _decompress(self, src: StructReader, dst: MemoryFile, bb: BitBufferedReader):
        last_offset = 1
        while not src.eof:
            while next(bb):
                dst.write_byte(src.read_byte())
            offset = 2 + next(bb)
            while not next(bb):
                offset = 2 * (offset - 1) + next(bb) # noqa
                offset = 2 *  offset      + next(bb) # noqa
            if offset == 2:
                offset = last_offset
                length = next(bb)
            else:
                offset = (offset - 3) * 0x100 + src.read_byte()
                if offset & 0xFFFFFFFF == 0xFFFFFFFF:
                    break
                length = (offset ^  1) & 1 # noqa
                offset = (offset >> 1) + 1
                last_offset = offset
            if length:
                length = 1 + next(bb)
            elif next(bb):
                length = 3 + next(bb)
            else:
                length = 2 + next(bb)
                while not next(bb):
                    length = 2 * length + next(bb)
                length += 3
            length += int(bool(offset > 0x500))
            dst.replay(offset, length + 1)

class ntlm (reps=1, text=False)

Returns the Windows NTLM hash of the input.

Expand source code Browse git

class ntlm(HashUnit):
    """
    Returns the Windows NTLM hash of the input.
    """
    def _algorithm(self, data: bytes) -> bytes:
        from Cryptodome.Hash import MD4
        return MD4.new(data.decode(self.codec).encode('utf-16le')).digest()

class officecrypt (password=b'VelvetSweatshop')

A simple proxy for the msoffcrypto package to decrypt office documents.

Expand source code Browse git

class officecrypt(Unit):
    """
    A simple proxy for the `msoffcrypto` package to decrypt office documents.
    """

    def __init__(self, password: Arg.Binary(help=(
        'The document password. By default, the Excel default password "{default}" is used.'
    )) = b'VelvetSweatshop'):
        super().__init__(password=password)

    @Unit.Requires('msoffcrypto-tool', 'formats', 'office')
    def _msoffcrypto():
        import msoffcrypto
        return msoffcrypto

    def process(self, data):
        password: bytes = self.args.password
        with MemoryFile(data) as stream:
            doc = self._msoffcrypto.OfficeFile(stream)
            if not doc.is_encrypted():
                self.log_warn('the document is not encrypted; returning input')
                return data
            if password:
                doc.load_key(password=password.decode(self.codec))
            with MemoryFile(bytearray()) as output:
                doc.decrypt(output)
                return output.getvalue()

class opc (mode='x32', *, count=None, until=None, nvar='name', avar='addr', ovar='arg')

Disassembles the input data using capstone and generates opcodes with metadata as output. This is useful for programmatic disassembly, while the asm unit outputs a human-readable representation. Internally, asm uses this unit and pretty-prints the output.

Expand source code Browse git

class opc(Unit):
    """
    Disassembles the input data using capstone and generates opcodes with metadata as output. This
    is useful for programmatic disassembly, while the `refinery.asm` unit outputs a human-readable
    representation. Internally, `refinery.asm` uses this unit and pretty-prints the output.
    """
    def __init__(
        self,
        mode: Arg.Choice(
            help='Machine code architecture, default is {default}. Select from the following list: {choices}.',
            choices=_ARCHES, metavar='[x32|x64|..]') = 'x32', *,
        count: Arg.Number('-c', help='Maximum number of bytes to disassemble, infinite by default.') = None,
        until: Arg.String('-u', help='Disassemble until the given string appears among the disassembly.') = None,
        nvar: Arg.String('-n', help=(
            'Variable to receive the disassembled mnemonic. Default is "{default}".')) = 'name',
        avar: Arg.String('-a', help=(
            'Variable to receive the address of the instruction. Default is "{default}".')) = 'addr',
        ovar: Arg.String('-o', help=(
            'Variable prefix for instruction operands. Default is "{default}". The complete operand '
            'string will be in {default}s, the first argument in {default}1, the second in {default}2, '
            'and so on.')) = 'arg',
        **more
    ):
        super().__init__(
            mode=mode,
            count=count,
            until=until,
            nvar=nvar,
            avar=avar,
            ovar=ovar,
            **more)

    @Unit.Requires('capstone')
    def _capstone():
        import capstone
        return capstone

    @property
    def _capstone_engine(self) -> Cs:
        cs = self._capstone
        return cs.Cs(*{
            'arm'    : (cs.CS_ARCH_ARM, cs.CS_MODE_ARM),
            'mips32' : (cs.CS_ARCH_MIPS, cs.CS_MODE_MIPS32),
            'mips64' : (cs.CS_ARCH_MIPS, cs.CS_MODE_MIPS64),
            'ppc32'  : (cs.CS_ARCH_PPC, cs.CS_MODE_32),
            'ppc64'  : (cs.CS_ARCH_PPC, cs.CS_MODE_64),
            'x16'    : (cs.CS_ARCH_X86, cs.CS_MODE_16),
            'x32'    : (cs.CS_ARCH_X86, cs.CS_MODE_32),
            'x64'    : (cs.CS_ARCH_X86, cs.CS_MODE_64),
        }.get(self.args.mode.lower()))

    def process(self, data):
        count = self.args.count or 0
        until = self.args.until
        nvar = self.args.nvar
        avar = self.args.avar
        ovar = self.args.ovar
        if isinstance(until, str):
            until = until.lower()
        for insn in self._capstone_engine.disasm(data, 0, count):
            kwargs = {
                avar: insn.address,
                nvar: insn.mnemonic,
            }
            try:
                ops = insn.op_str
                operands = [op.strip() for op in ops.split(',')]
            except Exception:
                operands = []
            else:
                kwargs[F'{ovar}s'] = ops
            for k, op in enumerate(operands, 1):
                if not op:
                    break
                try:
                    op = int(op, 0)
                except Exception:
                    pass
                kwargs[F'{ovar}{k}'] = op
            yield self.labelled(insn.bytes, **kwargs)
            if until is None:
                continue
            if until in ops.lower() or until in insn.mnemonic.lower():
                break

class p1

A shortcut for pick with the argument 0:1.

Expand source code Browse git

class p1(pick):
    """
    A shortcut for `refinery.pick` with the argument `0:1`.
    """
    def __init__(self):
        super().__init__(slice(0, 1))

class p2

A shortcut for pick with the argument 0:2.

Expand source code Browse git

class p2(pick):
    """
    A shortcut for `refinery.pick` with the argument `0:2`.
    """
    def __init__(self):
        super().__init__(slice(0, 2))

class p3

A shortcut for pick with the argument 0:3.

Expand source code Browse git

class p3(pick):
    """
    A shortcut for `refinery.pick` with the argument `0:3`.
    """
    def __init__(self):
        super().__init__(slice(0, 3))

class pack (base=0, prefix=False, strict=False, width=0, single_floats=False, double_floats=False, bigendian=False, blocksize=None)

Scans the input data for numeric constants and packs them into a binary format. This is useful to convert the textual representation of an array of numbers into its binary form. For example, 123,34,256,12,1,234 would be transformed into the byte sequence 7B22000C01EA, where 256 was wrapped and packed as a null byte because the default block size is one byte. If the above sequence would be packed with options -EB2, the result would be equal to 007B00220100000C000100EA in hexadecimal.

Expand source code Browse git

class pack(BlockTransformationBase):
    """
    Scans the input data for numeric constants and packs them into a binary format. This is useful to convert the textual representation of
    an array of numbers into its binary form. For example, `123,34,256,12,1,234` would be transformed into the byte sequence `7B22000C01EA`,
    where `256` was wrapped and packed as a null byte because the default block size is one byte. If the above sequence would be packed
    with options -EB2, the result would be equal to `007B00220100000C000100EA` in hexadecimal.
    """

    def __init__(self,
        base: Arg(type=number[2:36], help=(
            'Find only numbers in given base. Default of 0 means that '
            'common expressions for hexadecimal, octal and binary are '
            'accepted.')) = 0,
        prefix : Arg.Switch('-r', group='FLT', help='Add numeric prefixes like 0x, 0b, and 0o in reverse mode.') = False,
        strict : Arg.Switch('-s', help='Only parse integers that fit in one block of the given block size.') = False,
        width  : Arg.Number('-w', help='Pad numbers with the specified amount of leading zeros.') = 0,
        single_floats: Arg.Switch('-f', group='FLT', help='Pack single-precision floating-point numbers. Implies -B4.') = False,
        double_floats: Arg.Switch('-d', group='FLT', help='Pack double-precision floating-point numbers. Implies -B8.') = False,
        bigendian=False, blocksize=None
    ):
        if single_floats and double_floats:
            raise ValueError('The floats and doubles option are mutually exclusive.')
        elif single_floats:
            fmode = FMode.SINGLE
            blocksize = 4
        elif double_floats:
            fmode = FMode.DOUBLE
            blocksize = 8
        else:
            fmode = FMode.TO_INT
        super().__init__(
            base=base,
            prefix=prefix,
            strict=strict,
            width=width,
            bigendian=bigendian,
            blocksize=blocksize,
            fmode=fmode,
            _truncate=2,
        )

    @property
    def bytestream(self):
        # never alow bytes to be left unchunked
        return False

    def reverse(self, data):
        base = self.args.base or 10
        width = self.args.width
        mode: FMode = self.args.fmode
        prefix = B''

        self.log_debug(F'using base {base:d}')

        if self.args.prefix:
            prefix = {
                0x02: b'0b',
                0x08: b'0o',
                0x10: b'0x'
            }.get(base, B'')

        if mode is FMode.TO_INT:
            converter = BaseUnit(
                base,
                little_endian=not self.args.bigendian,
                strip_padding=True,
            )
            for n in self.chunk_into_bytes(data):
                converted = converter.reverse(n)
                if width:
                    converted = converted.rjust(width, B'0')
                if prefix:
                    converted = prefix + converted
                yield converted
            return

        elif mode is FMode.SINGLE:
            float_format = 'f'
            float_size = 4

        elif mode is FMode.DOUBLE:
            float_format = 'd'
            float_size = 8

        count, rest = divmod(len(data), float_size)
        if rest:
            self.log_warn(F'data contained {rest} trailing bytes that were ignored')
            data = memoryview(data)[:-rest]
        float_format *= count
        if self.args.bigendian:
            float_format = F'>{float_format}'
        else:
            float_format = F'<{float_format}'
        for n in struct.unpack(float_format, data):
            yield str(n).encode(self.codec)

    def process(self, data):
        base: int = self.args.base
        strict: bool = self.args.strict
        mode: FMode = self.args.fmode
        ep = '>' if self.args.bigendian else '<'

        def evaluate_literals(literals: Iterable[bytes]):
            for literal in literals:
                if mode is FMode.TO_INT:
                    if base == 0 and literal[0] == 0x30 and literal[1:].isdigit():
                        literal = B'0o%s' % literal
                    N = int(literal, base)
                elif mode is FMode.SINGLE:
                    N, = struct.unpack(F'{ep}I', struct.pack(F'{ep}f', float(literal)))
                elif mode is FMode.DOUBLE:
                    N, = struct.unpack(F'{ep}Q', struct.pack(F'{ep}d', float(literal)))
                else:
                    raise TypeError('unexpected floating point mode')
                M = N & self.fmask
                if strict and M != N:
                    continue
                yield M

        if base == 0:
            pattern = formats.number
        elif base <= 10:
            pattern = re.compile(B'[-+]?[0-%d]{1,64}' % (base - 1))
        else:
            pattern = re.compile(B'[-+]?[0-9a-%c]{1,20}' % (0x57 + base), re.IGNORECASE)

        return self.unchunk(evaluate_literals(m[0] for m in pattern.finditer(data)))

class pad (width, padding=b'\x00', left=False, absolute=False)

Allows padding of the input data.

Expand source code Browse git

class pad(Unit):
    """
    Allows padding of the input data.
    """

    def __init__(
        self,
        width: Arg.Number(help='Input is padded to the nearest multiple of this size.'),
        padding: Arg(help=(
            'This custom binary sequence is used (repeatedly, if necessary) to pad the '
            'input. The default is a zero byte.')) = B'\0',
        left: Arg.Switch('-l', help='Pad on the left instead of the right.') = False,
        absolute: Arg.Switch('-a', help=(
            'The width argument specifies an absolute size, not a block size.')) = False
    ):
        super().__init__(width=width, padding=padding, left=left, absolute=absolute)

    def process(self, data):
        width = self.args.width
        if self.args.absolute and len(data) >= width:
            return data
        q, r = divmod(len(data), width)
        size = (q + bool(r)) * width
        missing = (size - len(data))
        if missing <= 0:
            return data
        pad = self.args.padding
        if missing > len(pad):
            pad *= missing // len(pad)
        if self.args.left:
            return pad[:missing] + data
        else:
            data += pad[:missing]
            return data

class pbkdf1 (size, salt=b'\x00\x00\x00\x00\x00\x00\x00\x00', iter=1000, hash='SHA1')

PBKDF1 Key derivation

Expand source code Browse git

class pbkdf1(KeyDerivation):
    """PBKDF1 Key derivation"""

    @Arg('salt', help='Salt for the derivation; default are 8 null bytes.')
    def __init__(self, size, salt=bytes(8), iter=1000, hash='SHA1'):
        self.superinit(super(), **vars())

    def process(self, data):
        from Cryptodome.Protocol.KDF import PBKDF1
        return multidecode(data, lambda pwd: (
            PBKDF1(pwd, self.args.salt, dkLen=self.args.size, count=self.args.iter, hashAlgo=self.hash)
        ))

class pbkdf2 (size, salt, iter=1000, hash='SHA1')

PBKDF2 Key derivation. This is implemented as Rfc2898DeriveBytes in .NET binaries.

Expand source code Browse git

class pbkdf2(KeyDerivation):
    """
    PBKDF2 Key derivation. This is implemented as Rfc2898DeriveBytes in .NET
    binaries.
    """

    def __init__(self, size, salt, iter=1000, hash='SHA1'):
        self.superinit(super(), **vars())

    def process(self, data: ByteStr):
        from Cryptodome.Protocol.KDF import PBKDF2
        return multidecode(data, partial(
            PBKDF2,
            salt=self.args.salt,
            dkLen=self.args.size,
            hmac_hash_module=self.hash,
            count=self.args.iter
        ))

class pcap (merge=False, client=False, server=False)

Performs TCP stream reassembly from packet capture (PCAP) files. By default, the unit emits the parts of each TCP conversation, attaching several pieces of metadata to each such output: Included are the source and destination socket address as well as the variable stream which identifies the conversation which it was part of. The chunks are returned in the order that the bytes were exchanged between source and destination. When the --merge parameter is specified, the unit instead collects all bytes going forward and backwards, respectively, and emitting these as two chunks, for each TCP conversation that took place.

Expand source code Browse git

class pcap(Unit):
    """
    Performs TCP stream reassembly from packet capture (PCAP) files. By default, the unit emits the parts of
    each TCP conversation, attaching several pieces of metadata to each such output: Included are the source
    and destination socket address as well as the variable `stream` which identifies the conversation which
    it was part of. The chunks are returned in the order that the bytes were exchanged between source and
    destination. When the `--merge` parameter is specified, the unit instead collects all bytes going forward
    and backwards, respectively, and emitting these as two chunks, for each TCP conversation that took place.
    """

    def __init__(
        self,
        merge: Arg.Switch('-m', help='Merge both parts of each TCP conversation into one chunk.') = False,
        client: Arg.Switch('-c', group='D', help='Show only the client part of each conversation.') = False,
        server: Arg.Switch('-s', group='D', help='Show only the server part of each conversation.') = False,
    ):
        super().__init__(merge=merge, client=client, server=server)

    @Unit.Requires('pypcapkit[scapy]>=1.3', 'all')
    def _pcapkit():
        with NoLogging():
            import importlib
            importlib.import_module('scapy.layers.tls.session')
            import pcapkit
            return pcapkit

    @Unit.Requires('scapy', 'all')
    def _scapy():
        import scapy
        import scapy.packet
        return scapy

    def process(self, data):
        pcapkit = self._pcapkit
        merge = self.args.merge

        with NoLogging(), VirtualFileSystem() as fs:
            vf = VirtualFile(fs, data, 'pcap')
            pcap = pcapkit.extract(
                fin=vf.path,
                engine=pcapkit.Scapy,
                store=True,
                nofile=True,
                extension=False,
                ip=True,
                tcp=True,
                reassembly=True,
                reasm_strict=True,
            )
            tcp: List[Datagram] = list(pcap.reassembly.tcp)
            tcp.sort(key=lambda p: min(p.index, default=0))

        count, convo = 0, None
        src_buffer = MemoryFile()
        dst_buffer = MemoryFile()

        self.log_debug(F'extracted {len(pcap.frame)} packets, assembled {len(tcp)} datagrams')
        PT = self._scapy.packet

        def payload(packet: Packet):
            ok = (bytes, bytearray, PT.Raw)
            no = (PT.NoPayload, PT.Padding)
            circle = set()
            while True:
                try:
                    inner = packet.payload
                except AttributeError:
                    break
                if isinstance(packet, ok) and not isinstance(packet, no):
                    return packet.original
                if id(inner) in circle:
                    break
                packet = inner
                circle.add(id(inner))
            return B''

        def sequence(i: int):
            packet = pcap.frame[i - 1]
            while len(packet):
                try:
                    return packet.seq
                except AttributeError:
                    pass
                try:
                    packet = packet.payload
                except AttributeError:
                    break
            return 0

        client = self.args.client
        server = self.args.server

        def commit():
            if src_buffer.tell():
                if not server:
                    yield self.labelled(src_buffer.getvalue(), **convo.src_to_dst())
                src_buffer.truncate(0)
            if dst_buffer.tell():
                if not client:
                    yield self.labelled(dst_buffer.getvalue(), **convo.dst_to_src())
                dst_buffer.truncate(0)

        for datagram in tcp:
            this_convo = Conversation.FromID(datagram.id)
            if this_convo != convo:
                if count and merge:
                    yield from commit()
                count = count + 1
                convo = this_convo

            data = bytearray()
            for index in sorted(datagram.index, key=sequence):
                data.extend(payload(pcap.frame[index - 1]))
            if not data:
                continue
            if not merge:
                yield self.labelled(data, **this_convo.src_to_dst(), stream=count)
            elif this_convo.src == convo.src:
                src_buffer.write(data)
            elif this_convo.dst == convo.src:
                dst_buffer.write(data)
            else:
                raise RuntimeError(F'direction of packet {convo!s} in conversation {count} is unknown')

        yield from commit()

class pcap_http

Extracts HTTP payloads from packet capture (PCAP) files.

Expand source code Browse git

class pcap_http(Unit):
    """
    Extracts HTTP payloads from packet capture (PCAP) files.
    """
    def process(self, data):
        http_parser = httpresponse()
        requests: List[_HTTP_Request] = []
        responses: List[bytearray] = []

        def lookup(src, dst):
            for k, request in enumerate(requests):
                if request.src == dst and request.dst == src:
                    requests.pop(k)
                    return self.labelled(data, url=request.url)
            return None

        for stream in data | pcap():
            try:
                data = http_parser.process(stream)
            except Exception:
                try:
                    rq = _parse_http_request(stream)
                    requests.append(rq)
                except _HTTPParseError as E:
                    self.log_info(F'error parsing http request: {E!s}')
                except Exception:
                    pass
                continue
            if not data:
                continue
            src, dst = stream['src'], stream['dst']
            item = lookup(src, dst)
            if item is None:
                responses.append((src, dst, data))
                continue
            yield item

        while responses:
            src, dst, data = responses.pop()
            item = lookup(src, dst)
            yield data if item is None else item

class pecdb

Short for "PE: Clear Dynamic Base"; this unit will clear the bit in the PE header that allows for address space layout randomization. It will also set the integrity flag. With both bits set, this DLL when loaded into memory will usually be loaded at its header-defined base address, which can make debugging easier.

Expand source code Browse git

class pecdb(Unit):
    """
    Short for "PE: Clear Dynamic Base"; this unit will clear the bit in the PE header that allows
    for address space layout randomization. It will also set the integrity flag. With both bits
    set, this DLL when loaded into memory will usually be loaded at its header-defined base address,
    which can make debugging easier.
    """
    @Unit.Requires('pefile', 'default', 'extended')
    def _pefile():
        import pefile
        return pefile

    def process(self, data: bytearray):
        pe = self._pefile.PE(data=data, fast_load=True)
        dc = pe.OPTIONAL_HEADER.DllCharacteristics
        dc = dc & ~0x40 # IMAGE_DLLCHARACTERISTICS_DYNAMIC_BASE
        dc = dc & +0x80 # IMAGE_DLLCHARACTERISTICS_FORCE_INTEGRITY
        pe.OPTIONAL_HEADER.DllCharacteristics = dc
        return pe.write()

class pedebloat (*names, certificate=False, directories=False, memdump=False, resources=False, sections=False, trim_code=False, trim_rsrc=False, threshold=0.05, size_limit=10.0 MB, keep_limit=False, aggressive=False)

Removes junk or excess data from PE files and returns the stripped executable. By default, only the PE overlay is considered; use the flags -r and -s to also consider resources and entire sections. Any buffer is only considered for removal if it exceeds a certain size. If this condition is met, a binary search is performed to determine the offset inside the buffer up to which the compression ratio is above a certain threshold; everything beyond that point is then removed. By setting the threshold compression ratio to 1, each large buffer is removed entirely.

Expand source code Browse git

class pedebloat(OverlayUnit):
    """
    Removes junk or excess data from PE files and returns the stripped executable. By default, only
    the PE overlay is considered; use the flags `-r` and `-s` to also consider resources and entire
    sections. Any buffer is only considered for removal if it exceeds a certain size. If this
    condition is met, a binary search is performed to determine the offset inside the buffer up to
    which the compression ratio is above a certain threshold; everything beyond that point is then
    removed. By setting the threshold compression ratio to 1, each large buffer is removed entirely.
    """
    def __init__(
        self,
        *names: Arg(type=str),
        certificate=False,
        directories=False,
        memdump=False,
        resources: Arg.Switch('-r', help='Strip large resources.') = False,
        sections : Arg.Switch('-s', help='Strip large sections.') = False,
        trim_code: Arg.Switch('-X', help='Lift the exception on code sections for stripping.') = False,
        trim_rsrc: Arg.Switch('-Y', help='Lift the exception on rsrc sections for stripping.') = False,
        threshold: Arg('-t', metavar='T', type=percent, help=(
            'Trailing data from resources and sections is stripped until the compression ratio '
            'of the remaining data rises above this threshold. The default value is {default}. '
            'Set this to 1 to ignore the limit entirely and trim every structure as much as '
            'possible without violating alignment. Setting this value to 0 will only strip repeated '
            'occurrences of the last byte.')) = 0.05,
        size_limit: Arg.Number('-l', help=(
            'Structures below this size are not stripped. Default is {default!r}.')) = _STRIP,
        keep_limit: Arg.Switch('-k', help=(
            'Do not strip structures to below the above size limit.')) = False,
        aggressive: Arg.Switch('-a', help=(
            'Equivalent to -srt1: Strip large sections and resources aggressively.')) = False,
    ):
        if aggressive:
            sections = True
            resources = True
            threshold = 1

        super().__init__(
            certificate,
            directories,
            memdump,
            sections=sections,
            resources=resources,
            size_limit=size_limit,
            keep_limit=keep_limit,
            threshold=threshold,
            trim_rsrc=trim_rsrc,
            trim_code=trim_code,
            names=names,
        )

    @OverlayUnit.Requires('pefile', 'default', 'extended')
    def _pefile():
        import pefile
        return pefile

    def _right_strip_data(self, data: memoryview, alignment=1, block_size=_MB) -> int:
        if not data:
            return 0
        threshold = self.args.threshold
        data_overhang = len(data) % alignment
        result = data_overhang

        if 0 < threshold < 1:
            def compression_ratio(offset: int):
                ratio = len(zlib.compress(data[:offset], level=1)) / offset
                self.log_debug(F'compressing {SizeInt(offset)!r} ratio={ratio:6.4f}')
                return ratio
            upper = len(data)
            lower = result
            if compression_ratio(upper) <= threshold:
                while block_size < upper - lower:
                    pivot = (lower + upper) // 2
                    ratio = compression_ratio(pivot)
                    if ratio > threshold:
                        lower = pivot + 1
                        continue
                    upper = pivot
                    if abs(ratio - threshold) < 1e-10:
                        break
            result = upper
        elif threshold == 0:
            result = len(data)
        elif threshold == 1:
            result = 0

        while result > 1 and data[result - 2] == data[result - 1]:
            result -= 1

        result = max(result, data_overhang)

        if self.args.keep_limit:
            result = max(result, self.args.size_limit)

        result = result + (data_overhang - result) % alignment

        if result > len(data):
            excess = result - len(data)
            excess = excess + (-excess % alignment)
            result = result - excess

        return result

    def _adjust_offsets(self, pe: PE, gap_offset: int, gap_size: int):
        base = pe.OPTIONAL_HEADER.ImageBase
        alignment = pe.OPTIONAL_HEADER.FileAlignment
        rva_offset = pe.get_rva_from_offset(gap_offset)
        tva_offset = rva_offset + base

        section = pe.get_section_by_offset(gap_offset)
        new_section_size = section.SizeOfRawData - gap_size
        if new_section_size % alignment != 0:
            raise RuntimeError(
                F'trimming 0x{gap_size:X} bytes from section {_ASCII(section.Name)} of size 0x{section.SizeOfRawData:X} '
                F'violates required section alignment of 0x{alignment:X} bytes')
        inside_section_offset = gap_offset - section.PointerToRawData
        if inside_section_offset > new_section_size:
            overlap = inside_section_offset - new_section_size
            raise RuntimeError(F'trimming from section {_ASCII(section.Name)}; data extends {overlap} beyond section')

        rva_lbound = section.VirtualAddress
        rva_ubound = section.VirtualAddress + section.Misc_VirtualSize - 1
        tva_lbound = rva_lbound + base
        tva_ubound = rva_ubound + base

        def adjust_attributes_of_structure(
            structure: Structure,
            gap_offset: int,
            valid_values_lower_bound: Optional[int],
            valid_values_upper_bound: Optional[int],
            attributes: Iterable[str]
        ):
            for attribute in attributes:
                old_value = getattr(structure, attribute, 0)
                if old_value <= gap_offset:
                    continue
                if valid_values_lower_bound is not None and old_value < valid_values_lower_bound:
                    continue
                if valid_values_upper_bound is not None and old_value > valid_values_upper_bound:
                    continue
                new_value = old_value - gap_size
                if new_value < gap_offset:
                    raise BrokenLink(F'attribute {attribute} points into removed region')
                self.log_debug(F'adjusting field in {structure.name}: {attribute}')
                setattr(structure, attribute, new_value)

        it: Iterable[Structure] = iter(pe.__structures__)
        structure_class = self._pefile.SectionStructure
        remove = []

        for index, structure in enumerate(it):
            old_offset = structure.get_file_offset()
            new_offset = old_offset - gap_offset

            if old_offset > gap_offset:
                if old_offset < gap_offset + gap_size:
                    self.log_debug(F'removing structure {structure.name}; starts inside removed region')
                    remove.append(index)
                    continue
                if isinstance(structure, structure_class) and new_offset % alignment != 0:
                    raise RuntimeError(
                        F'structure {structure.name} would be moved to offset 0x{new_offset:X}, '
                        F'violating section alignment value 0x{alignment:X}.')
                structure.set_file_offset(new_offset)

            try:
                adjust_attributes_of_structure(structure, rva_offset, rva_lbound, rva_ubound, (
                    'OffsetToData',
                    'AddressOfData',
                    'VirtualAddress',
                    'AddressOfNames',
                    'AddressOfNameOrdinals',
                    'AddressOfFunctions',
                    'AddressOfEntryPoint',
                    'AddressOfRawData',
                    'BaseOfCode',
                    'BaseOfData',
                ))
                adjust_attributes_of_structure(structure, tva_offset, tva_lbound, tva_ubound, (
                    'StartAddressOfRawData',
                    'EndAddressOfRawData',
                    'AddressOfIndex',
                    'AddressOfCallBacks',
                ))
                adjust_attributes_of_structure(structure, gap_offset, None, None, (
                    'OffsetModuleName',
                    'PointerToRawData',
                ))
            except BrokenLink as error:
                self.log_debug(F'removing structure {structure.name}; {error!s}')
                remove.append(index)
                continue

            for attribute in (
                'CvHeaderOffset',
                'OffsetIn2Qwords',
                'OffsetInQwords',
                'Offset',
                'OffsetLow',
                'OffsetHigh'
            ):
                if not hasattr(structure, attribute):
                    continue
                self.log_warn(F'potential offset in structure {structure.name} ignored: {attribute}')

        while remove:
            index = remove.pop()
            pe.__structures__[index:index + 1] = []

        section.SizeOfRawData = new_section_size

    def _trim_sections(self, pe: PE, data: bytearray) -> int:
        S = self.args.size_limit
        P = self.args.names
        trimmed = 0
        for section in pe.sections:
            section: SectionStructure
            offset = section.PointerToRawData
            name = _ASCII(section.Name)
            if not self.args.trim_code and name.lower() in ('.text', '.code'):
                self.log_debug(F'skipping code section {name}; specify --trim-code to override.')
                continue
            if not self.args.trim_rsrc and name.lower() == '.rsrc':
                self.log_debug(F'skipping rsrc section {name}; specify --trim-rsrc to override.')
                continue
            old_size = section.SizeOfRawData
            if old_size <= S and not any(fnmatch(name, p) for p in P):
                self.log_debug(F'criteria not satisfied for section: {SizeInt(old_size)!r} {name}')
                continue
            new_size = self._right_strip_data(
                memoryview(data)[offset:offset + old_size],
                pe.OPTIONAL_HEADER.FileAlignment)
            if new_size == old_size:
                continue
            self.log_info(F'stripping section {name} from {TI(old_size)!r} to {TI(new_size)!r}')
            gap_size = old_size - new_size
            gap_offset = offset + new_size
            if gap_size <= 0:
                continue
            self._adjust_offsets(pe, gap_offset, gap_size)
            trimmed += gap_size
            data[gap_offset:gap_offset + gap_size] = []
        return trimmed

    def _trim_pe_resources(self, pe: PE, data: bytearray) -> int:
        S = self.args.size_limit
        P = self.args.names
        trimmed = 0

        def find_bloated_resources(pe: PE, directory, level: int = 0, *path) -> Generator[Structure, None, None]:
            for entry in directory.entries:
                name = getattr(entry, 'name')
                numeric = getattr(entry, 'id')
                if not name:
                    if level == 0 and numeric in iter(RSRC):
                        name = RSRC(entry.id)
                    elif numeric is not None:
                        name = str(numeric)
                name = name and str(name) or '?'
                if entry.struct.DataIsDirectory:
                    yield from find_bloated_resources(pe, entry.directory, level + 1, *path, name)
                    continue
                struct: Structure = entry.data.struct
                name = '/'.join((*path, name))
                if struct.Size <= S and not any(fnmatch(name, p) for p in P):
                    self.log_debug(F'criteria not satisfied for resource: {SizeInt(struct.Size)!r} {name}')
                    continue
                yield name, struct

        RSRC_INDEX = self._pefile.DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_RESOURCE']
        pe.parse_data_directories(directories=[RSRC_INDEX])

        try:
            resources = pe.DIRECTORY_ENTRY_RESOURCE
        except AttributeError:
            return 0
        for name, resource in find_bloated_resources(pe, resources):
            offset = pe.get_offset_from_rva(resource.OffsetToData)
            old_size = resource.Size
            new_size = self._right_strip_data(
                memoryview(data)[offset:offset + old_size],
                pe.OPTIONAL_HEADER.FileAlignment)
            self.log_info(F'stripping resource {name} from {old_size} to {new_size}')
            gap_size = old_size - new_size
            gap_offset = offset + new_size
            if gap_size <= 0:
                continue
            resource.Size = new_size
            self._adjust_offsets(pe, gap_offset, gap_size)
            trimmed += gap_size
            data[gap_offset:gap_offset + gap_size] = []

        pe.OPTIONAL_HEADER.DATA_DIRECTORY[RSRC_INDEX].Size -= trimmed
        self.log_info(F'trimming size of resource data directory by {TI(trimmed)!r}')
        return trimmed

    def process(self, data: bytearray) -> bytearray:
        overlay_offset = self._get_size(data)
        if len(data) - overlay_offset >= self.args.size_limit:
            view = memoryview(data)
            overlay_length = self._right_strip_data(view[overlay_offset:])
            body_size = overlay_offset + overlay_length
            try:
                data[body_size:] = []
            except Exception:
                data = data[:body_size]
        if not self.args.resources and not self.args.sections:
            return data
        pe = self._pefile.PE(data=data, fast_load=True)
        total = len(data)
        trimmed = 0
        view = pe.__data__
        copy = False
        if not isinstance(view, bytearray):
            view = memoryview(view)
            try:
                view[0] = 0x4D
            except Exception:
                copy = True
                view = bytearray(pe.__data__)
        if self.args.resources:
            trimmed += self._trim_pe_resources(pe, view)
        if self.args.sections:
            trimmed += self._trim_sections(pe, view)
        if copy:
            pe.__data__ = view
        data = pe.write()
        end = total - trimmed
        if end < len(data):
            self.log_warn(F'output contains {len(data) - end} trailing bytes')
        return data

class peek (lines=10, all=False, brief=False, decode=0, escape=False, bare=False, meta=0, gray=False, index=False, stdout=False, narrow=False, blocks=1, dense=False, expand=False, width=0)

The unit extracts preview information of the input data and displays it on the standard error stream. If the standard output of this unit is connected by a pipe, the incoming data is forwarded. However, if the unit outputs to a terminal, the data is discarded instead.

Expand source code Browse git

class peek(HexViewer):
    """
    The unit extracts preview information of the input data and displays it on the standard error stream. If the standard
    output of this unit is connected by a pipe, the incoming data is forwarded. However, if the unit outputs to a terminal,
    the data is discarded instead.
    """

    def __init__(
        self,
        lines  : Arg.Number('-l', group='SIZE', help='Specify number N of lines in the preview, default is 10.') = 10,
        all    : Arg.Switch('-a', group='SIZE', help='Output all possible preview lines without restriction') = False,
        brief  : Arg.Switch('-b', group='SIZE', help='One line peek, implies --lines=1.') = False,
        decode : Arg.Counts('-d', group='MODE', help=(
            'Attempt to decode and display printable data. Specify twice to enable line wrapping.')) = 0,
        escape : Arg.Switch('-e', group='MODE', help='Always peek data as string, escape characters if necessary.') = False,
        bare   : Arg.Switch('-r', group='META', help='Only peek the data itself, do not show a metadata preview.') = False,
        meta   : Arg.Counts('-m', group='META', help=(
            'Show more auto-derivable metadata. Specify multiple times to populate more variables.')) = 0,
        gray   : Arg.Switch('-g', help='Do not colorize the output.') = False,
        index  : Arg.Switch('-i', help='Display the index of each chunk within the current frame.') = False,
        stdout : Arg.Switch('-2', help='Print the peek to STDOUT rather than STDERR; the input data is lost.') = False,
        narrow=False, blocks=1, dense=False, expand=False, width=0
    ):
        if decode and escape:
            raise ValueError('The decode and esc options are exclusive.')
        if brief:
            narrow = True
        if environment.colorless.value:
            gray = True
        lines = 1 if brief else INF if all else lines
        super(peek, self).__init__(
            brief=brief,
            gray=gray,
            blocks=blocks,
            decode=decode,
            dense=dense,
            index=index,
            escape=escape,
            expand=expand,
            narrow=narrow,
            lines=lines,
            meta=meta,
            bare=bare,
            width=width,
            stdout=stdout,
        )

    @HexViewer.Requires('colorama', 'display', 'default', 'extended')
    def _colorama():
        import colorama
        return colorama

    def process(self, data):
        colorize = not self.args.gray and not self.args.stdout
        lines = self._peeklines(data, colorize)

        if self.args.stdout:
            for line in lines:
                yield line.encode(self.codec)
            return

        stderr = sys.stderr

        if colorize:
            colorama = self._colorama
            if os.name == 'nt':
                stderr = colorama.AnsiToWin32(stderr).stream
            _erase = ' ' * get_terminal_size()
            _reset = F'\r{colorama.Style.RESET_ALL}{_erase}\r'
        else:
            _reset = ''

        try:
            for line in lines:
                print(line, file=stderr)
        except BaseException:
            stderr.write(_reset)
            raise
        if not self.isatty():
            self.log_info('forwarding input to next unit')
            yield data

    def _peekmeta(self, linewidth, sep, meta: dict, peek=None) -> Generator[str, None, None]:
        if not meta and not peek:
            return
        width = max((len(name) for name in meta), default=0)
        separators = iter([sep])
        if peek is not None:
            if len(peek) > linewidth:
                peek = peek[:linewidth - 3] + '...'
            yield from separators
            yield peek
        for name in sorted(meta, key=lambda s: (len(s) <= 3, s)):
            value = meta[name]
            if value is None:
                continue
            if isinstance(value, CustomStringRepresentation):
                value = repr(value).strip()
            elif isbuffer(value):
                value = repr(ByteStringWrapper(value))
            elif isinstance(value, int):
                if value in range(-999, 1000):
                    value = str(value)
                elif value > 0:
                    value = F'0x{value:X}'
                else:
                    value = F'-0x{-value:X}'
            elif isinstance(value, float):
                value = F'{value:.4f}'
            metavar = F'{name:>{width + 2}} = {value!s}'
            if len(metavar) > linewidth:
                metavar = metavar[:linewidth - 3] + '...'
            yield from separators
            yield metavar

    def _trydecode(self, data, codec: Optional[str], width: int, linecount: int) -> str:
        remaining = linecount
        result = []
        wrap = self.args.decode > 1
        if codec is None:
            from refinery.units.encoding.esc import esc
            decoded = data[:abs(width * linecount)]
            decoded = str(decoded | -esc(bare=True))
            limit = abs(min(linecount * width, len(decoded)))
            for k in range(0, limit, width):
                result.append(decoded[k:k + width])
            return result
        try:
            import unicodedata
            unprintable = {'Cc', 'Cf', 'Co', 'Cs'}
            self.log_info(F'trying to decode as {codec}.')
            decoded = codecs.decode(data, codec, errors='strict')
            count = sum(unicodedata.category(c) not in unprintable for c in decoded)
            ratio = count / len(decoded)
        except UnicodeDecodeError as DE:
            self.log_info('decoding failed:', DE.reason)
            return None
        except ValueError as V:
            self.log_info('decoding failed:', V)
            return None
        if ratio < 0.8:
            self.log_info(F'data contains {ratio * 100:.2f}% printable characters, this is too low.')
            return None
        decoded = decoded.splitlines(False)
        if not wrap:
            for k, line in enumerate(decoded):
                line = line.replace('\t', '\x20' * 4)
                if len(line) <= width:
                    continue
                clipped = line[:width - 3]
                if self.args.gray:
                    color = ''
                    reset = ''
                else:
                    colorama = self._colorama
                    color = colorama.Fore.LIGHTRED_EX
                    reset = colorama.Style.RESET_ALL
                decoded[k] = F'{clipped}{color}...{reset}'
            return decoded[:abs(linecount)]
        for paragraph in decoded:
            if not remaining:
                break
            wrapped = [
                line for chunk in textwrap.wrap(
                    paragraph,
                    width,
                    break_long_words=True,
                    break_on_hyphens=False,
                    drop_whitespace=False,
                    expand_tabs=True,
                    max_lines=abs(remaining + 1),
                    replace_whitespace=False,
                    tabsize=4,
                )
                for line in chunk.splitlines(keepends=False)
            ]
            remaining -= len(wrapped)
            result.extend(wrapped)
        return result[:abs(linecount)]

    def _peeklines(self, data: bytearray, colorize: bool) -> Generator[str, None, None]:

        meta = metavars(data)

        codec = None
        lines = None
        final = data.temp or False
        empty = True

        if not self.args.index:
            meta.discard('index')
            index = None
        else:
            index = meta.get('index', None)

        if not self.args.brief:
            padding = 0
        else:
            padding = SizeInt.width + 2
            if index is not None:
                padding += 6

        metrics = self._get_metrics(len(data), self.args.lines, padding)

        if self.args.brief:
            metrics.address_width = 0
            metrics.fit_to_width(allow_increase=True)

        sepsize = metrics.hexdump_width
        txtsize = self.args.width or sepsize

        if self.args.lines and data:
            if self.args.escape:
                lines = self._trydecode(data, None, txtsize, metrics.line_count)
            if self.args.decode > 0:
                for codec in ('utf8', 'cp1251', 'cp1252', 'utf-16le', 'utf-16', 'utf-16be'):
                    lines = self._trydecode(data, codec, txtsize, metrics.line_count)
                    if lines:
                        codec = codec
                        break
                else:
                    codec = None
            if lines is None:
                lines = list(self.hexdump(data, metrics, colorize))
            else:
                sepsize = txtsize

        def separator(title=None):
            if title is None or sepsize <= len(title) + 8:
                return sepsize * '-'
            return '-' * (sepsize - len(title) - 5) + F'[{title}]---'

        if self.args.brief:
            final = False
        elif not self.args.bare:
            peek = repr(meta.size)
            line = separator()
            if len(data) <= 5_000_000:
                peek = F'{peek}; {meta.entropy!r} entropy'
            peek = F'{peek}; {meta.magic!s}'
            if self.args.lines == 0:
                peek = None
            elif not data:
                peek = None
                line = separator('empty chunk')
            if self.args.meta > 0:
                meta.derive('size')
                meta.derive('magic')
                meta.derive('entropy')
                peek = None
            if self.args.meta > 1:
                meta.derive('crc32')
                meta.derive('sha256')
            if self.args.meta > 2:
                for name in meta.derivations:
                    meta[name]
            for line in self._peekmeta(metrics.hexdump_width, line, meta, peek=peek):
                empty = False
                yield line

        if lines:
            empty = False
            if not self.args.brief:
                yield separator(codec or None)
                yield from lines
            else:
                brief = next(iter(lines))
                brief = F'{SizeInt(len(data))!r}: {brief}'
                if index is not None:
                    brief = F'#{index:03d}: {brief}'
                yield brief

        if final and (self.args.bare or not empty):
            yield separator()

    def filter(self, chunks):
        try:
            self._colorama.init(wrap=False)
        except ImportError:
            pass
        discarded = 0
        it = iter(chunks)
        buffer = collections.deque(itertools.islice(it, 0, 2))
        buffer.reverse()

        while buffer:
            if self.isatty() and not buffer[0].visible:
                buffer.popleft()
                discarded += 1
            else:
                item = buffer.pop()
                last = not bool(buffer)
                item.temp = last
                if not item.visible and self.isatty():
                    discarded += 1
                else:
                    yield item
            try:
                buffer.appendleft(next(it))
            except StopIteration:
                pass

        if discarded:
            self.log_warn(F'discarded {discarded} invisible chunks to prevent them from leaking into the terminal.')

class pefix

Take as input a buffer that represents a stripped PE file, i.e. magic numbers and other relevant parts of the header have been stripped. The unit attempts to repair the damage and return something that can be parsed.

Expand source code Browse git

class pefix(Unit):
    """
    Take as input a buffer that represents a stripped PE file, i.e. magic numbers and other
    relevant parts of the header have been stripped. The unit attempts to repair the damage
    and return something that can be parsed.
    """
    @Unit.Requires('pefile', 'default', 'extended')
    def _pefile():
        import pefile
        return pefile

    def process(self, data):
        sr = StructReader(data)
        sr.write(B'MZ')
        sr.seekset(0x3C)
        nt = sr.u16()
        oh = nt + 0x18
        sr.seekset(nt)
        sr.write(B'PE')
        sr.seekrel(2)
        mt = sr.u16()

        try:
            mt = MachineType(mt)
        except Exception:
            mt = None

        sr.seekset(oh)
        ms = bytes(sr.peek(2))

        try:
            ms = ImgState(ms)
        except ValueError:
            ms = {
                None: None,
                MachineType.I386  : ImgState.x32,
                MachineType.IA64  : ImgState.x64,
                MachineType.AMD64 : ImgState.x64,
            }.get(mt)

        if ms is None:
            self.log_warn('could not determine image state; nulling field')
            sr.write(B'\0\0')
        else:
            sr.write(ms.value)

        if mt is None:
            if mt := {
                None: None,
                ImgState.x32: MachineType.I386,
                ImgState.x64: MachineType.AMD64,
            }.get(ms):
                assert isinstance(mt, MachineType)
                sr.seekset(nt + 4)
                sr.write(mt.value.to_bytes(2, 'little'))

        pe = self._pefile.PE(data=data, fast_load=True)

        if (alignment := pe.OPTIONAL_HEADER.FileAlignment) not in {1 << k for k in range(9, 16)}:
            for k in range(9, 16):
                alignment = 1 << k
                size_of_headers = 0x28 * len(pe.sections) + oh + 0xF0
                soh = align(alignment, size_of_headers)
                if any(data[size_of_headers:soh]):
                    raise ValueError('nonzero bytes in what must be header padding')
                if any(data[soh:soh + 8]):
                    pe.OPTIONAL_HEADER.SizeOfHeaders = soh
                    break
            else:
                raise ValueError('unable to find a valid file alignment')

        pe.OPTIONAL_HEADER.FileAlignment = alignment
        pe.OPTIONAL_HEADER.SectionAlignment = max(pe.OPTIONAL_HEADER.SectionAlignment, alignment)

        return pe.write()

class pemeta (custom=False, debug=False, dotnet=False, signatures=False, timestamps=0, version=False, header=False, exports=0, imports=0, tabular=False, timeraw=False)

Extract metadata from PE files. By default, all information except for imports and exports are extracted.

Expand source code Browse git

class pemeta(Unit):
    """
    Extract metadata from PE files. By default, all information except for imports and exports are
    extracted.
    """
    def __init__(
        self, custom : Arg('-c', '--custom',
            help='Unless enabled, all default categories will be extracted.') = False,
        debug      : Arg.Switch('-D', help='Parse the PDB path from the debug directory.') = False,
        dotnet     : Arg.Switch('-N', help='Parse the .NET header.') = False,
        signatures : Arg.Switch('-S', help='Parse digital signatures.') = False,
        timestamps : Arg.Counts('-T', help='Extract time stamps. Specify twice for more detail.') = 0,
        version    : Arg.Switch('-V', help='Parse the VERSION resource.') = False,
        header     : Arg.Switch('-H', help='Parse base data from the PE header.') = False,
        exports    : Arg.Counts('-E', help='List all exported functions. Specify twice to include addresses.') = 0,
        imports    : Arg.Counts('-I', help='List all imported functions. Specify twice to include addresses.') = 0,
        tabular    : Arg.Switch('-t', help='Print information in a table rather than as JSON') = False,
        timeraw    : Arg.Switch('-r', help='Extract time stamps as numbers instead of human-readable format.') = False,
    ):
        if not custom and not any((debug, dotnet, signatures, timestamps, version, header)):
            debug = dotnet = signatures = timestamps = version = header = True
        super().__init__(
            debug=debug,
            dotnet=dotnet,
            signatures=signatures,
            timestamps=timestamps,
            version=version,
            header=header,
            imports=imports,
            exports=exports,
            timeraw=timeraw,
            tabular=tabular,
        )

    @classmethod
    def handles(self, data):
        return data[:2] == B'MZ'

    @classmethod
    def _ensure_string(cls, x):
        if not isinstance(x, str):
            x = repr(x) if not isinstance(x, bytes) else x.decode(cls.codec, 'backslashreplace')
        return x

    @classmethod
    def _parse_pedict(cls, bin):
        return dict((
            cls._ensure_string(key).replace(" ", ""),
            cls._ensure_string(val)
        ) for key, val in bin.items() if val)

    @classmethod
    def parse_signature(cls, data: bytearray) -> dict:
        """
        Extracts a JSON-serializable and human-readable dictionary with information about
        time stamp and code signing certificates that are attached to the input PE file.
        """
        from refinery.units.formats.pkcs7 import pkcs7

        try:
            signature = data | pkcs7 | json.loads
        except Exception as E:
            raise ValueError(F'PKCS7 parser failed with error: {E!s}')

        info = {}

        def _value(doc: dict, require_type=None):
            if require_type is not None:
                if doc.get('type', None) != require_type:
                    raise LookupError
            value = doc.get('value', None)
            value = [value] if value else doc.get('values', [])
            if not value:
                raise LookupError
            return value[0]

        def find_timestamps(entry) -> dict:
            if isinstance(entry, dict):
                try:
                    return {'Timestamp': _value(entry, 'signing_time')}
                except LookupError:
                    pass
                for value in entry.values():
                    result = find_timestamps(value)
                    if result is None:
                        continue
                    with suppress(KeyError):
                        result.setdefault('TimestampIssuer', entry['sid']['issuer']['common_name'])
                    return result
            elif isinstance(entry, list):
                for value in entry:
                    result = find_timestamps(value)
                    if result is None:
                        continue
                    return result

        timestamp_info = find_timestamps(signature)
        if timestamp_info is not None:
            info.update(timestamp_info)

        try:
            certificates = signature['content']['certificates']
            signer_infos = signature['content']['signer_infos']
        except KeyError:
            return info

        try:
            signer_serials = {info['sid']['serial_number']: info for info in signer_infos}
        except KeyError:
            return info

        signer_certificates = []

        for certificate in certificates:
            with suppress(Exception):
                crt = certificate['tbs_certificate']
                serial = crt['serial_number']
                signer = signer_serials[serial]
                if isinstance(serial, int):
                    serial = F'{serial:x}'
                if len(serial) % 2 != 0:
                    serial = F'0{serial}'
                assert bytes.fromhex(serial) in data
                subject = crt['subject']
                location = [subject.get(t, '') for t in ('locality_name', 'state_or_province_name', 'country_name')]
                cert_info = {}
                cert_info.update(Subject=subject['common_name'])
                if any(location):
                    cert_info.update(SubjectLocation=', '.join(filter(None, location)))
                for attr in signer['signed_attrs']:
                    if attr['type'] == 'authenticode_info':
                        auth = _value(attr)
                        cert_info.update(ProgramName=auth['programName'])
                        cert_info.update(MoreInfo=auth['moreInfo'])
                try:
                    valid_since = crt['validity']['not_before']
                    valid_until = crt['validity']['not_after']
                except KeyError:
                    pass
                else:
                    cert_info.update(ValidSince=valid_since, ValidUntil=valid_until)
                cert_info.update(
                    Issuer=crt['issuer']['common_name'], Fingerprint=certificate['fingerprint'], Serial=serial)
                signer_certificates.append(cert_info)

        if len(signer_certificates) == 1:
            info.update(signer_certificates[0])
        if len(signer_certificates) >= 2:
            info['Signer'] = signer_certificates
        return info

    def _pe_characteristics(self, pe: lief.PE.Binary):
        characteristics = {F'IMAGE_FILE_{flag.name}' for flag in lief.PE.Header.CHARACTERISTICS
            if pe.header.characteristics & flag.value}
        if pe.header.characteristics & 0x40:
            # TODO: Missing from LIEF
            characteristics.add('IMAGE_FILE_16BIT_MACHINE')
        return characteristics

    def _pe_address_width(self, pe: lief.PE.Binary, default=16) -> int:
        # TODO: missing from LIEF
        IMAGE_FILE_16BIT_MACHINE = 0x40
        if pe.header.characteristics & IMAGE_FILE_16BIT_MACHINE:
            return 4
        elif pe.header.machine == lief.PE.Header.MACHINE_TYPES.I386:
            return 8
        elif pe.header.machine in (
            lief.PE.Header.MACHINE_TYPES.AMD64,
            lief.PE.Header.MACHINE_TYPES.IA64,
        ):
            return 16
        else:
            return default

    def _vint(self, pe: lief.PE.Binary, value: int):
        if not self.args.tabular:
            return value
        aw = self._pe_address_width(pe)
        return F'0x{value:0{aw}X}'

    def parse_version(self, pe: lief.PE.Binary, data=None) -> dict:
        """
        Extracts a JSON-serializable and human-readable dictionary with information about
        the version resource of an input PE file, if available.
        """
        version_info = {}
        if not pe.resources_manager.has_version:
            return None
        version = pe.resources_manager.version

        if info := version.string_file_info:
            for lng in info.langcode_items:
                version_info.update({
                    k.replace(' ', ''): _STRING(v) for k, v in lng.items.items()
                })
                version_info.update(
                    CodePage=lng.code_page.name,
                    LangID=self._vint(pe, lng.lang << 0x10 | lng.sublang),
                    Language=LCID.get(lng.lang, 'Language Neutral'),
                    Charset=self._CHARSET.get(lng.sublang, 'Unknown Charset'),
                )

        def _to_version_string(hi: int, lo: int):
            a = hi >> 0x10
            b = hi & 0xFFFF
            c = lo >> 0x10
            d = lo & 0xFFFF
            return F'{a}.{b}.{c}.{d}'

        # TODO: Missing: Version.CompanyName
        # TODO: Missing: Version.FileDescription
        # TODO: Missing: Version.LegalCopyright
        # TODO: Missing: Version.ProductName

        if info := version.fixed_file_info:
            version_info.update(
                OSName=info.file_os.name,
                FileType=info.file_type.name,
            )
            if (s := info.file_subtype).value:
                version_info.update(FileSubType=s)
            if t := info.file_date_MS << 32 | info.file_date_LS:
                version_info.update(Timestamp=_FILETIME(t))
            version_info.update(
                ProductVersion=_to_version_string(info.product_version_MS, info.product_version_LS),
                FileVersion=_to_version_string(info.file_version_MS, info.file_version_LS),
            )

        if info := version.var_file_info:
            ...

        return version_info or None

    def parse_exports(self, pe: lief.PE.Binary, data=None, include_addresses=False) -> list:
        base = pe.optional_header.imagebase
        info = []
        if not pe.has_exports:
            return None
        for k, exp in enumerate(pe.get_export().entries):
            name = exp.demangled_name
            if not name:
                name = exp.name
            if not name:
                name = F'@{k}'
            if not isinstance(name, str):
                name = name.decode('latin1')
            item = {
                'Name': name, 'Address': self._vint(pe, exp.address + base)
            } if include_addresses else name
            info.append(item)
        return info

    def parse_imports(self, pe: lief.PE.Binary, data=None, include_addresses=False) -> list:
        info = {}
        for idd in itertools.chain(pe.imports, pe.delay_imports):
            dll = _STRING(idd.name)
            if dll.lower().endswith('.dll'):
                dll = dll[:~3]
            imports: list[str] = info.setdefault(dll, [])
            for imp in idd.entries:
                name = _STRING(imp.name) or F'@{imp.ordinal}'
                imports.append(dict(
                    Name=name, Address=self._vint(pe, imp.value)
                ) if include_addresses else name)
        return info

    def parse_header(self, pe: lief.PE.Binary, data=None) -> dict:
        major = pe.optional_header.major_operating_system_version
        minor = pe.optional_header.minor_operating_system_version
        version = self._WINVER.get(major, {0: 'Unknown'})

        try:
            MinimumOS = version[minor]
        except LookupError:
            MinimumOS = version[0]
        header_information = {
            'Machine': pe.header.machine.name,
            'Subsystem': pe.optional_header.subsystem.name,
            'MinimumOS': MinimumOS,
        }
        if pe.has_exports:
            export_name = _STRING(pe.get_export().name)
            if export_name.isprintable():
                header_information['ExportName'] = export_name

        if pe.has_rich_header:
            rich = []
            if self.args.tabular:
                cw = max(len(F'{entry.count:d}') for entry in pe.rich_header.entries)
            for entry in pe.rich_header.entries:
                idv = entry.build_id | (entry.id << 0x10)
                count = entry.count
                info = get_rich_info(idv)
                if not info:
                    continue
                pid = info.pid.upper()
                if self.args.tabular:
                    short_pid = get_rich_short_pid(pid)
                    rich.append(F'[{idv:08x}] {count:>0{cw}d} {short_pid!s} {info.ver}')
                else:
                    rich.append({
                        'Counter': count,
                        'Encoded': F'{idv:08x}',
                        'Library': pid,
                        'Product': info.ver,
                    })
            header_information['RICH'] = rich

        characteristics = self._pe_characteristics(pe)
        for typespec, flag in {
            'EXE': 'IMAGE_FILE_EXECUTABLE_IMAGE',
            'DLL': 'IMAGE_FILE_DLL',
            'SYS': 'IMAGE_FILE_SYSTEM'
        }.items():
            if flag in characteristics:
                header_information['Type'] = typespec

        base = pe.optional_header.imagebase
        header_information['ImageBase'] = self._vint(pe, base)
        header_information['ImageSize'] = self._vint(pe, pe.optional_header.sizeof_image)
        header_information['ComputedSize'] = get_pe_size(pe)
        header_information['Bits'] = 4 * self._pe_address_width(pe, 16)
        header_information['EntryPoint'] = self._vint(pe, pe.optional_header.addressof_entrypoint + base)
        return header_information

    def parse_time_stamps(self, pe: lief.PE.Binary, raw_time_stamps: bool, more_detail: bool) -> dict:
        """
        Extracts time stamps from the PE header (link time), as well as from the imports,
        exports, debug, and resource directory. The resource time stamp is also parsed as
        a DOS time stamp and returned as the "Delphi" time stamp.
        """
        def _id(x): return x
        dt = _id if raw_time_stamps else date_from_timestamp
        info = {}

        with suppress(AttributeError):
            info.update(Linker=dt(pe.header.time_date_stamps))

        import_timestamps = {}
        for entry in pe.imports:
            ts = entry.timedatestamp
            if ts == 0 or ts == 0xFFFFFFFF:
                continue
            import_timestamps[_STRING(entry.name, True)] = dt(ts)

        symbol_timestamps = {}
        for entry in pe.delay_imports:
            ts = entry.timestamp
            if ts == 0 or ts == 0xFFFFFFFF:
                continue
            symbol_timestamps[_STRING(entry.name, True)] = dt(ts)

        for key, impts in [
            ('Import', import_timestamps),
            ('Symbol', symbol_timestamps),
        ]:
            if not impts:
                continue
            if not more_detail:
                dmin = min(impts.values())
                dmax = max(impts.values())
                small_delta = 2 * 60 * 60
                if not raw_time_stamps:
                    small_delta = timedelta(seconds=small_delta)
                if dmax - dmin < small_delta:
                    impts = dmin
            info[key] = impts

        if pe.has_exports and (ts := pe.get_export().timestamp):
            info.update(Export=dt(ts))

        if pe.has_resources and pe.resources.is_directory:
            rsrc: lief.PE.ResourceDirectory = pe.resources
            if res_timestamp := rsrc.time_date_stamp:
                with suppress(ValueError):
                    from refinery.units.misc.datefix import datefix
                    dos = datefix.dostime(res_timestamp)
                    info.update(Delphi=dos)
                    info.update(RsrcTS=dt(res_timestamp))

        def norm(value):
            if isinstance(value, list):
                return [norm(v) for v in value]
            if isinstance(value, dict):
                return {k: norm(v) for k, v in value.items()}
            if isinstance(value, int):
                return value
            return str(value)

        return {key: norm(value) for key, value in info.items()}

    def parse_dotnet(self, pe: lief.PE.Binary, data):
        """
        Extracts a JSON-serializable and human-readable dictionary with information about
        the .NET metadata of an input PE file.
        """
        header = DotNetHeader(data, pe)
        tables = header.meta.Streams.Tables
        info = dict(
            RuntimeVersion=F'{header.head.MajorRuntimeVersion}.{header.head.MinorRuntimeVersion}',
            Version=F'{header.meta.MajorVersion}.{header.meta.MinorVersion}',
            VersionString=header.meta.VersionString
        )

        info['Flags'] = [name for name, check in header.head.KnownFlags.items() if check]

        if len(tables.Assembly) == 1:
            assembly = tables.Assembly[0]
            info.update(
                AssemblyName=assembly.Name,
                Release='{}.{}.{}.{}'.format(
                    assembly.MajorVersion,
                    assembly.MinorVersion,
                    assembly.BuildNumber,
                    assembly.RevisionNumber
                )
            )

        try:
            entry = self._vint(pe, header.head.EntryPointToken + pe.optional_header.imagebase)
            info.update(EntryPoint=entry)
        except AttributeError:
            pass

        if len(tables.Module) == 1:
            module = tables.Module[0]
            info.update(ModuleName=module.Name)

        return info

    def parse_debug(self, pe: lief.PE.Binary, data=None):
        result = []
        if not pe.has_debug:
            return None
        for entry in pe.debug:
            if entry.type != lief.PE.Debug.TYPES.CODEVIEW:
                continue
            try:
                entry: lief.PE.CodeViewPDB
                result.append(dict(
                    PdbPath=_STRING(entry.filename),
                    PdbGUID=entry.guid,
                    PdbAge=entry.age,
                ))
            except AttributeError:
                continue
        if len(result) == 1:
            result = result[0]
        return result

    def process(self, data):
        result = {}

        pe = lief.load_pe(
            data,
            parse_exports=self.args.exports,
            parse_imports=self.args.imports,
            parse_rsrc=self.args.version,
            parse_reloc=False,
            parse_signature=self.args.timestamps or self.args.signatures,
        )

        if pe is None:
            raise ValueError('Input not recognized as a PE file.')

        pe = NoLoggingProxy(pe)

        for switch, resolver, name in [
            (self.args.debug,   self.parse_debug,    'Debug'),    # noqa
            (self.args.dotnet,  self.parse_dotnet,   'DotNet'),   # noqa
            (self.args.header,  self.parse_header,   'Header'),   # noqa
            (self.args.version, self.parse_version,  'Version'),  # noqa
            (self.args.imports, self.parse_imports,  'Imports'),  # noqa
            (self.args.exports, self.parse_exports,  'Exports'),  # noqa
        ]:
            if not switch:
                continue
            self.log_debug(F'parsing: {name}')
            args = pe, data
            if switch > 1:
                args = *args, True
            try:
                info = resolver(*args)
            except Exception as E:
                self.log_info(F'failed to obtain {name}: {E!s}')
                continue
            if info:
                result[name] = info

        signature = {}

        if self.args.timestamps or self.args.signatures:
            with suppress(Exception):
                from refinery.units.formats.pe.pesig import pesig
                signature = self.parse_signature(next(data | pesig))

        if signature:
            try:
                verification = pe.verify_signature()
            except Exception:
                pass
            else:
                from lief.PE import Signature
                if verification == Signature.VERIFICATION_FLAGS.OK:
                    signature['IsValid'] = True
                else:
                    signature['Flags'] = [
                        vf.name for vf in Signature.VERIFICATION_FLAGS if vf & verification]
                    signature['IsValid'] = False

        if self.args.timestamps:
            ts = self.parse_time_stamps(pe, self.args.timeraw, self.args.timestamps > 1)
            with suppress(KeyError):
                ts.update(Signed=signature['Timestamp'])
            result.update(TimeStamp=ts)

        if signature and self.args.signatures:
            result['Signature'] = signature

        if result:
            yield from ppjson(tabular=self.args.tabular)._pretty_output(result, indent=4, ensure_ascii=False)

    _CHARSET = {
        0x0000: '7-bit ASCII',
        0x03A4: 'Japan (Shift ? JIS X-0208)',
        0x03B5: 'Korea (Shift ? KSC 5601)',
        0x03B6: 'Taiwan (Big5)',
        0x04B0: 'Unicode',
        0x04E2: 'Latin-2 (Eastern European)',
        0x04E3: 'Cyrillic',
        0x04E4: 'Multilingual',
        0x04E5: 'Greek',
        0x04E6: 'Turkish',
        0x04E7: 'Hebrew',
        0x04E8: 'Arabic',
    }

    _WINVER = {
        3: {
            0x00: 'Windows NT 3',
            0x0A: 'Windows NT 3.1',
            0x32: 'Windows NT 3.5',
            0x33: 'Windows NT 3.51',
        },
        4: {
            0x00: 'Windows 95',
            0x0A: 'Windows 98',
        },
        5: {
            0x00: 'Windows 2000',
            0x5A: 'Windows Me',
            0x01: 'Windows XP',
            0x02: 'Windows Server 2003',
        },
        6: {
            0x00: 'Windows Vista',
            0x01: 'Windows 7',
            0x02: 'Windows 8',
            0x03: 'Windows 8.1',
        },
        10: {
            0x00: 'Windows 10',
        }
    }

class peoverlay (certificate=False, directories=False, memdump=False)

Returns the overlay of a PE file, i.e. anything that may have been appended to the file. This does not include digital signatures. Use pestrip to obtain only the body of the PE file after removing the overlay.

Expand source code Browse git

class peoverlay(OverlayUnit):
    """
    Returns the overlay of a PE file, i.e. anything that may have been appended to the file.
    This does not include digital signatures. Use `refinery.pestrip` to obtain only the body
    of the PE file after removing the overlay.
    """
    def process(self, data: bytearray) -> bytearray:
        size = self._get_size(data)
        try:
            data[:size] = []
        except Exception:
            return data[size:]
        else:
            return data

class perc (*paths, pretty=False, path=b'path', regex=False, exact=False, fuzzy=0, drop_path=False, join_path=False, list=False)

Extract PE file resources.

Expand source code Browse git

class perc(PathExtractorUnit):
    """
    Extract PE file resources.
    """
    def __init__(
        self, *paths,
        pretty: Arg.Switch('-p', help='Add missing headers to bitmap and icon resources.') = False,
        **kwargs
    ):
        super().__init__(*paths, pretty=pretty, **kwargs)

    def _get_icon_dir(self, pe: lief.PE.Binary):
        for manifest_entry in pe.resources.childs:
            if manifest_entry.id != RSRC.ICON_GROUP.value:
                continue
            child: lief.PE.ResourceData = manifest_entry.childs[0].childs[0]
            return GRPICONDIR(bytearray(child.content))

    def _search(self, pe: lief.PE.Binary, directory: lief.PE.ResourceDirectory, *parts):
        if directory.depth >= 3:
            self.log_warn(F'unexpected resource tree level {directory.depth + 1:d}')
        for entry in directory.childs:
            entry: lief.PE.ResourceData
            if entry.has_name:
                identifier = str(entry.name)
            elif directory.depth == 0 and entry.id in iter(RSRC):
                identifier = RSRC(entry.id)
            elif entry.id is not None:
                identifier = entry.id
            else:
                self.log_warn(F'resource entry has name {entry.name} and id {entry.id} at level {directory.depth + 1:d}')
                continue
            if entry.is_directory:
                yield from self._search(pe, entry, *parts, identifier)
            else:
                def extract(_=pe, e=entry):
                    return bytearray(e.content)
                path = '/'.join(str(p) for p in (*parts, identifier))
                if self.args.pretty:
                    if parts[0] is RSRC.BITMAP:
                        extract = self._handle_bitmap(extract)
                    elif parts[0] is RSRC.ICON:
                        extract = self._handle_icon(pe, extract, parts)
                yield UnpackResult(
                    path,
                    extract,
                    lcid=self._get_lcid(entry),
                    offset=entry.offset,
                )

    def _get_lcid(self, node_data) -> Optional[str]:
        try:
            pid = node_data.id & 0x3FF
            sid = node_data.id >> 0x0A
        except AttributeError:
            return None
        try:
            pid = self._LANG_ID_TO_LCID[pid]
        except KeyError:
            return None
        lcid = pid.get(sid, 0)
        return LCID.get(lcid)

    def _handle_bitmap(self, extract_raw_data: Callable[[], ByteStr]) -> ByteStr:
        def extract():
            bitmap = extract_raw_data()
            total = (len(bitmap) + 14).to_bytes(4, 'little')
            return B'BM' + total + B'\0\0\0\0\x36\0\0\0' + bitmap
        return extract

    def _handle_icon(
        self,
        pe: lief.PE.Binary,
        extract_raw_data: Callable[[], ByteStr],
        parts: Tuple[RSRC, int, int]
    ) -> ByteStr:
        try:
            icondir = self._get_icon_dir(pe)
            index = int(parts[1]) - 1
            info = icondir.entries[index]
        except Exception as E:
            self.log_warn(F'unable to generate icon header: {E!s}')
            return extract_raw_data

        def extract(info=info):
            icon = extract_raw_data()
            if icon.startswith(B'(\0\0\0'):
                header = struct.pack('<HHHBBBBHHII',
                    0,
                    1,
                    1,
                    info.width,
                    info.height,
                    info.color_count,
                    0,
                    info.planes,
                    info.bit_count,
                    len(icon),
                    0x16
                )
                icon = header + icon
            return icon

        return extract

    def unpack(self, data):
        pe = lief.load_pe_fast(data, parse_rsrc=True)
        if not pe.has_resources:
            return
        yield from self._search(pe, pe.resources)

    def _mktbl(ids: List[Tuple[int, int, int]]) -> Dict[int, Dict[int, int]]:
        table = {}
        for pid, sid, lcid in ids:
            if pid not in table:
                table[pid] = {0: lcid}
            table[pid][sid] = lcid
        return table

    _LANG_ID_TO_LCID = _mktbl([
        (0x00, 0x03, 0x0C00),
        (0x00, 0x05, 0x1400),
        (0x7F, 0x00, 0x007F),
        (0x00, 0x00, 0x0000),
        (0x02, 0x02, 0x0800),
        (0x00, 0x04, 0x1000),
        (0x00, 0x01, 0x0400),
        (0x36, 0x01, 0x0436),
        (0x1c, 0x01, 0x041C),
        (0x84, 0x01, 0x0484),
        (0x5E, 0x01, 0x045E),
        (0x01, 0x05, 0x1401),
        (0x01, 0x0f, 0x3C01),
        (0x01, 0x03, 0x0C01),
        (0x01, 0x02, 0x0801),
        (0x01, 0x0B, 0x2C01),
        (0x01, 0x0D, 0x3401),
        (0x01, 0x0C, 0x3001),
        (0x01, 0x04, 0x1001),
        (0x01, 0x06, 0x1801),
        (0x01, 0x08, 0x2001),
        (0x01, 0x10, 0x4001),
        (0x01, 0x01, 0x0401),
        (0x01, 0x0A, 0x2801),
        (0x01, 0x07, 0x1C01),
        (0x01, 0x0E, 0x3801),
        (0x01, 0x09, 0x2401),
        (0x2B, 0x01, 0x042B),
        (0x4D, 0x01, 0x044D),
        (0x2C, 0x02, 0x082C),
        (0x2C, 0x01, 0x042C),
        (0x45, 0x02, 0x0445),
        (0x6D, 0x01, 0x046D),
        (0x2d, 0x01, 0x042D),
        (0x23, 0x01, 0x0423),
        (0x1A, 0x08, 0x201A),
        (0x1A, 0x05, 0x141A),
        (0x7E, 0x01, 0x047E),
        (0x02, 0x01, 0x0402),
        (0x92, 0x01, 0x0492),
        (0x5C, 0x01, 0x045C),
        (0x03, 0x01, 0x0403),
        (0x04, 0x03, 0x0C04),
        (0x04, 0x05, 0x1404),
        (0x04, 0x04, 0x1004),
        (0x04, 0x02, 0x0004),
        (0x04, 0x01, 0x7C04),
        (0x83, 0x01, 0x0483),
        (0x1A, None, 0x001A),
        (0x1a, 0x04, 0x101A),
        (0x1a, 0x01, 0x041A),
        (0x05, 0x01, 0x0405),
        (0x06, 0x01, 0x0406),
        (0x8C, 0x01, 0x048C),
        (0x65, 0x01, 0x0465),
        (0x13, 0x02, 0x0813),
        (0x13, 0x01, 0x0413),
        (0x09, 0x03, 0x0C09),
        (0x09, 0x0A, 0x2809),
        (0x09, 0x04, 0x1009),
        (0x09, 0x09, 0x2409),
        (0x09, 0x10, 0x4009),
        (0x09, 0x06, 0x1809),
        (0x09, 0x08, 0x2009),
        (0x09, 0x11, 0x4409),
        (0x09, 0x05, 0x1409),
        (0x09, 0x0D, 0x3409),
        (0x09, 0x12, 0x4809),
        (0x09, 0x07, 0x1c09),
        (0x09, 0x0B, 0x2C09),
        (0x09, 0x02, 0x0809),
        (0x09, 0x01, 0x0409),
        (0x09, 0x0C, 0x3009),
        (0x25, 0x01, 0x0425),
        (0x38, 0x01, 0x0438),
        (0x64, 0x01, 0x0464),
        (0x0B, 0x01, 0x040B),
        (0x0C, 0x02, 0x080c),
        (0x0C, 0x03, 0x0C0C),
        (0x0C, 0x01, 0x040c),
        (0x0C, 0x05, 0x140C),
        (0x0C, 0x06, 0x180C),
        (0x0C, 0x04, 0x100C),
        (0x62, 0x01, 0x0462),
        (0x56, 0x01, 0x0456),
        (0x37, 0x01, 0x0437),
        (0x07, 0x03, 0x0C07),
        (0x07, 0x01, 0x0407),
        (0x07, 0x05, 0x1407),
        (0x07, 0x04, 0x1007),
        (0x07, 0x02, 0x0807),
        (0x08, 0x01, 0x0408),
        (0x6F, 0x01, 0x046F),
        (0x47, 0x01, 0x0447),
        (0x68, 0x01, 0x0468),
        (0x75, 0x01, 0x0475),
        (0x0D, 0x01, 0x040D),
        (0x39, 0x01, 0x0439),
        (0x0E, 0x01, 0x040E),
        (0x0F, 0x01, 0x040F),
        (0x70, 0x01, 0x0470),
        (0x21, 0x01, 0x0421),
        (0x5D, 0x02, 0x085D),
        (0x5D, 0x01, 0x045D),
        (0x3C, 0x02, 0x083C),
        (0x34, 0x01, 0x0434),
        (0x35, 0x01, 0x0435),
        (0x10, 0x01, 0x0410),
        (0x10, 0x02, 0x0810),
        (0x11, 0x01, 0x0411),
        (0x4B, 0x01, 0x044B),
        (0x3F, 0x01, 0x043F),
        (0x53, 0x01, 0x0453),
        (0x86, 0x01, 0x0486),
        (0x87, 0x01, 0x0487),
        (0x57, 0x01, 0x0457),
        (0x12, 0x01, 0x0412),
        (0x40, 0x01, 0x0440),
        (0x54, 0x01, 0x0454),
        (0x26, 0x01, 0x0426),
        (0x27, 0x01, 0x0427),
        (0x2E, 0x02, 0x082E),
        (0x6E, 0x01, 0x046E),
        (0x2F, 0x01, 0x042F),
        (0x3E, 0x02, 0x083E),
        (0x3E, 0x01, 0x043e),
        (0x4C, 0x01, 0x044C),
        (0x3A, 0x01, 0x043A),
        (0x81, 0x01, 0x0481),
        (0x7A, 0x01, 0x047A),
        (0x4E, 0x01, 0x044E),
        (0x7C, 0x01, 0x047C),
        (0x50, 0x01, 0x0450),
        (0x50, 0x02, 0x0850),
        (0x61, 0x01, 0x0461),
        (0x14, 0x01, 0x0414),
        (0x14, 0x02, 0x0814),
        (0x82, 0x01, 0x0482),
        (0x48, 0x01, 0x0448),
        (0x63, 0x01, 0x0463),
        (0x29, 0x01, 0x0429),
        (0x15, 0x01, 0x0415),
        (0x16, 0x01, 0x0416),
        (0x16, 0x02, 0x0816),
        (0x67, 0x02, 0x0867),
        (0x46, 0x01, 0x0446),
        (0x46, 0x02, 0x0846),
        (0x6B, 0x01, 0x046B),
        (0x6B, 0x02, 0x086B),
        (0x6B, 0x03, 0x0C6B),
        (0x18, 0x01, 0x0418),
        (0x17, 0x01, 0x0417),
        (0x19, 0x01, 0x0419),
        (0x85, 0x01, 0x0485),
        (0x3B, 0x09, 0x243B),
        (0x3B, 0x04, 0x103B),
        (0x3B, 0x05, 0x143B),
        (0x3B, 0x03, 0x0C3B),
        (0x3B, 0x01, 0x043B),
        (0x3B, 0x02, 0x083B),
        (0x3B, 0x08, 0x203B),
        (0x3B, 0x06, 0x183B),
        (0x3B, 0x07, 0x1C3B),
        (0x4F, 0x01, 0x044F),
        (0x1a, 0x07, 0x1C1A),
        (0x1a, 0x06, 0x181A),
        (0x1a, 0x03, 0x0C1A),
        (0x1a, 0x02, 0x081A),
        (0x6C, 0x01, 0x046C),
        (0x32, 0x02, 0x0832),
        (0x32, 0x01, 0x0432),
        (0x32, 0x01, 0x0459),
        (0x32, 0x02, 0x0859),
        (0x5B, 0x01, 0x045B),
        (0x1b, 0x01, 0x041B),
        (0x24, 0x01, 0x0424),
        (0x0A, 0x0b, 0x2C0A),
        (0x0A, 0x10, 0x400A),
        (0x0A, 0x0D, 0x340A),
        (0x0A, 0x09, 0x240A),
        (0x0A, 0x05, 0x140A),
        (0x0A, 0x07, 0x1C0A),
        (0x0A, 0x0C, 0x300A),
        (0x0A, 0x11, 0x440A),
        (0x0A, 0x04, 0x100A),
        (0x0A, 0x12, 0x480A),
        (0x0A, 0x02, 0x080A),
        (0x0A, 0x13, 0x4C0A),
        (0x0A, 0x06, 0x180A),
        (0x0A, 0x0F, 0x3C0A),
        (0x0A, 0x0A, 0x280A),
        (0x0A, 0x14, 0x500A),
        (0x0A, 0x03, 0x0C0A),
        (0x0A, 0x01, 0x040A),
        (0x0A, 0x15, 0x540A),
        (0x0A, 0x0E, 0x380A),
        (0x0A, 0x08, 0x200A),
        (0x41, 0x01, 0x0441),
        (0x1D, 0x02, 0x081D),
        (0x1D, 0x01, 0x041D),
        (0x5A, 0x01, 0x045A),
        (0x28, 0x01, 0x0428),
        (0x5F, 0x02, 0x085F),
        (0x49, 0x01, 0x0449),
        (0x49, 0x02, 0x0849),
        (0x44, 0x01, 0x0444),
        (0x4A, 0x01, 0x044A),
        (0x1E, 0x01, 0x041E),
        (0x51, 0x01, 0x0451),
        (0x73, 0x02, 0x0873),
        (0x73, 0x01, 0x0473),
        (0x1F, 0x01, 0x041F),
        (0x42, 0x01, 0x0442),
        (0x22, 0x01, 0x0422),
        (0x2E, 0x01, 0x042E),
        (0x20, 0x02, 0x0820),
        (0x20, 0x01, 0x0420),
        (0x80, 0x01, 0x0480),
        (0x43, 0x02, 0x0843),
        (0x43, 0x01, 0x0443),
        (0x03, 0x02, 0x0803),
        (0x2A, 0x01, 0x042A),
        (0x52, 0x01, 0x0452),
        (0x88, 0x01, 0x0488),
        (0x78, 0x01, 0x0478),
        (0x6A, 0x01, 0x046A),
    ])

class pesig

Extracts the contents of the IMAGE_DIRECTORY_ENTRY_SECURITY entry of a PE file, i.e. the digital signatures in DER format.

Expand source code Browse git

class pesig(Unit):
    """
    Extracts the contents of the IMAGE_DIRECTORY_ENTRY_SECURITY entry of a PE file,
    i.e. the digital signatures in DER format.
    """
    def process(self, data: bytearray):
        view = memoryview(data)
        pe = lief.load_pe_fast(view)
        security = pe.data_directory(lief.PE.DataDirectory.TYPES.CERTIFICATE_TABLE)
        self.log_info(F'signature offset: 0x{security.rva:08X}')
        self.log_info(F'signature length: 0x{security.size:08X}')
        if security.rva == 0 or security.size == 0:
            raise ValueError('IMAGE_DIRECTORY_ENTRY_SECURITY is corrupt.')
        sgnoff = security.rva + 8
        sgnend = sgnoff + security.size
        length, _, _ = struct.unpack('<IHH', view[sgnoff - 8:sgnoff])
        signature = view[sgnoff:sgnend]
        if len(signature) + 8 != length:
            raise RefineryPartialResult(
                F'Found {len(signature) + 8} bytes of signature, but length should be {length}.',
                partial=signature)
        return signature

class pestrip (certificate=False, directories=False, memdump=False)

Removes the overlay of a PE file and returns the main executable. Use peoverlay to extract the overlay.

Expand source code Browse git

class pestrip(OverlayUnit):
    """
    Removes the overlay of a PE file and returns the main executable. Use `refinery.peoverlay` to
    extract the overlay.
    """

    def process(self, data: bytearray) -> bytearray:
        size = self._get_size(data)
        try:
            data[size:] = []
        except Exception:
            data = data[:size]
        else:
            return data

class pf (*formats, variable=None, separator=' ', multiplex=False, binary=False, unescape=False)

Stands for "Print Format": Transform a given chunk by applying a format string operation. The positional format string placeholder {} will be replaced by the incoming data, named placeholders have to exist as meta variables in the current chunk. For example, the following pipeline can be used to print all files in a given directory with their corresponding SHA-256 hash:

ef ** [| sha256 -t | pf {} {path} ]]

By default, format string arguments are simply joined along a space character to form a single format string.

Expand source code Browse git

class pf(Unit):
    """
    Stands for "Print Format": Transform a given chunk by applying a format string operation. The
    positional format string placeholder `{}` will be replaced by the incoming data, named
    placeholders have to exist as meta variables in the current chunk. For example, the following
    pipeline can be used to print all files in a given directory with their corresponding SHA-256
    hash:

        ef ** [| sha256 -t | pf {} {path} ]]

    By default, format string arguments are simply joined along a space character to form a single
    format string.
    """

    def __init__(
        self,
        *formats : Arg.Binary(help='Format strings.', metavar='format'),
        variable : Arg.String('-n', metavar='N', help='Store the formatted string in a meta variable.') = None,
        separator: Arg.String('-s', group='SEP', metavar='S',
            help='Separator to insert between format strings. The default is a space character.') = ' ',
        multiplex: Arg.Switch('-m', group='SEP',
            help='Do not join the format strings along the separator, generate one output for each.') = False,
        binary   : Arg.Switch('-b', help='Use the binary formatter instead of the string formatter.') = False,
        unescape : Arg.Switch('-e', help='Interpret escape sequences in format strings.') = False,
    ):
        def fixfmt(fmt: bytes | str):
            if unescape:
                if isinstance(fmt, str):
                    fmt = fmt.encode('latin1')
                return bytes(fmt).decode('unicode-escape')
            elif not isinstance(fmt, str):
                fmt = bytes(fmt).decode(self.codec)
            return fmt
        _formats = [fixfmt(f) for f in formats]
        if not multiplex:
            _formats = [fixfmt(separator).join(_formats)]
        super().__init__(formats=_formats, variable=variable, binary=binary)

    def process(self, data):
        meta = metavars(data)
        meta.ghost = True
        args = [data]
        variable = self.args.variable
        if self.args.binary:
            formatter = partial(meta.format_bin, codec=self.codec, args=args)
        else:
            def formatter(spec):
                return meta.format_str(spec, self.codec, args).encode(self.codec)
        for spec in self.args.formats:
            result = formatter(spec)
            if variable is not None:
                result = self.labelled(data, **{variable: result})
            yield result

class pick (*bounds)

Picks sequences from the array of multiple inputs. For example, pick 0 2: will return all but the second ingested input (which has index 1).

Expand source code Browse git

class pick(Unit):
    """
    Picks sequences from the array of multiple inputs. For example, `pick 0 2:`
    will return all but the second ingested input (which has index `1`).
    """
    def __init__(self, *bounds: Arg.Bounds(nargs='*', default=[0])):
        super().__init__(bounds=[sliceobj(s) for s in bounds])

    def process(self, data: Chunk):
        if not data.visible:
            yield data
            return

        state: _PickState = data.temp
        a = state.accessor
        lower = a.start
        upper = a.stop

        if lower is not None:
            lower -= state.discarded
        if upper is not None:
            upper -= state.discarded
        if state.consumed:
            yield from state.remaining[slice(lower, upper, a.step)]
            return

        while lower:
            try:
                chunk = next(state.chunks)
            except StopIteration:
                upper = None
                break
            if chunk.visible:
                lower -= 1
                upper -= 1
                state.discarded += 1
            else:
                yield chunk
        if upper is None:
            yield from state.chunks
            return
        while upper:
            try:
                chunk = next(state.chunks)
            except StopIteration:
                break
            if chunk.visible:
                upper -= 1
                state.discarded += 1
            yield chunk

    def filter(self, chunks: Iterable[Chunk]):
        chunks = begin(chunks)
        if chunks is None:
            return
        container, chunks = chunks
        if container.scope < 1:
            raise RuntimeError(F'{self.__class__.__name__} cannot be used outside a frame; maybe you meant to use snip?')
        container = container.copy()
        container.visible = True
        state = _PickState(deque(self.args.bounds), chunks)
        while state.next():
            if not state.consumed:
                if not state.discardable():
                    self.log_debug(F'consumed input into buffer after {state.discarded} skips')
                    for chunk in state.chunks:
                        if not chunk.visible:
                            yield chunk
                            continue
                        state.remaining.append(chunk)
                    state.consumed = True
            container.temp = state
            yield container

class pkcs7

Converts PKCS7 encoded data to a JSON representation.

Expand source code Browse git

class pkcs7(Unit):
    """
    Converts PKCS7 encoded data to a JSON representation.
    """
    @Unit.Requires('asn1crypto', 'default', 'extended')
    def _asn1crypto():
        import asn1crypto
        import asn1crypto.cms
        import asn1crypto.core
        import asn1crypto.x509
        return asn1crypto

    def process(self, data: bytes):
        asn1 = self._asn1crypto.core
        cms = self._asn1crypto.cms
        signature = cms.ContentInfo.load(data)

        def unsign(data):
            if isinstance(data, int):
                size = data.bit_length()
                if data < 0:
                    data = (1 << (size + 1)) - ~data - 1
                if data > 0xFFFFFFFF_FFFFFFFF:
                    size, r = divmod(size, 8)
                    size += bool(r)
                    data = data.to_bytes(size, 'big').hex()
                return data
            elif isinstance(data, dict):
                return {key: unsign(value) for key, value in data.items()}
            elif isinstance(data, list):
                return [unsign(x) for x in data]
            else:
                return data

        class SpcString(asn1.Choice):
            _alternatives = [
                ('unicode', asn1.BMPString, {'implicit': 0}),
                ('ascii', asn1.IA5String, {'implicit': 1})
            ]

        SpcUuid = asn1.OctetString

        class SpcSerializedObject(asn1.Sequence):
            _fields = [
                ('classId', SpcUuid),
                ('serializedData', asn1.OctetString),
            ]

        class SpcLink(asn1.Choice):
            _alternatives = [
                ('url', asn1.IA5String, {'implicit': 0}),
                ('monikier', SpcSerializedObject, {'implicit': 1}),
                ('file', SpcString, {'explicit': 2})
            ]

        class SpcSpOpusInfo(asn1.Sequence):
            _fields = [
                ('programName', SpcString, {'optional': True, 'explicit': 0}),
                ('moreInfo', SpcLink, {'optional': True, 'explicit': 1}),
            ]

        class SetOfInfos(asn1.SetOf):
            _child_spec = SpcSpOpusInfo

        cms.CMSAttributeType._map['1.3.6.1.4.1.311.2.1.12'] = 'authenticode_info'
        cms.CMSAttribute._oid_specs['authenticode_info'] = SetOfInfos

        class ParsedASN1ToJSON(BytesAsStringEncoder):
            unit = self

            @classmethod
            def _is_keyval(cls, obj):
                return (
                    isinstance(obj, dict)
                    and set(obj.keys()) == {'type', 'values'}
                    and len(obj['values']) == 1
                )

            @classmethod
            def handled(cls, obj) -> bool:
                return BytesAsStringEncoder.handled(obj) or cls._is_keyval(obj)

            def encode_bytes(self, obj: bytes):
                with suppress(Exception):
                    string = obj.decode('latin1')
                    if string.isprintable():
                        return string
                return super().encode_bytes(obj)

            def default(self, obj):
                if self._is_keyval(obj):
                    return dict(type=obj['type'], value=obj['values'][0])
                with suppress(TypeError):
                    return super().default(obj)
                if isinstance(obj, (set, tuple)):
                    return list(obj)
                if isinstance(obj, datetime):
                    return str(obj)
                dict_result = {}
                list_result = None
                if isinstance(obj, self.unit._asn1crypto.x509.Certificate):
                    dict_result.update(fingerprint=obj.sha1.hex())
                if isinstance(obj, asn1.BitString):
                    return {'bit_string': obj.native}
                with suppress(Exception):
                    list_result = list(obj)
                    if all(isinstance(k, str) for k in list_result):
                        dict_result.update((key, obj[key]) for key in list_result)
                if dict_result:
                    return dict_result
                if list_result is not None:
                    return list_result
                if isinstance(obj, self.unit._asn1crypto.cms.CertificateChoices):
                    return obj.chosen
                if isinstance(obj, asn1.Sequence):
                    children = obj.children
                    if children:
                        return children
                    return obj.dump()
                with suppress(Exception):
                    return obj.native
                if isinstance(obj, asn1.Any):
                    parsed = None
                    with suppress(Exception):
                        parsed = obj.parse()
                    if parsed:
                        return parsed
                    return obj.dump()
                if isinstance(obj, asn1.Asn1Value):
                    return obj.dump()
                raise ValueError(F'Unable to determine JSON encoding of {obj.__class__.__name__} object.')

        with ParsedASN1ToJSON as encoder:
            encoded = encoder.dumps(signature)
            converted = unsign(json.loads(encoded))
            return json.dumps(converted, indent=4).encode(self.codec)

class pkcs7sig (tabular=False)

Converts PKCS7 encoded signatures into a human-readable JSON representation. This can be used to parse authenticode signatures appended to files that are not PE files to get the same output that is produced by the pemeta unit.

Expand source code Browse git

class pkcs7sig(Unit):
    """
    Converts PKCS7 encoded signatures into a human-readable JSON representation. This can be used
    to parse authenticode signatures appended to files that are not PE files to get the same output
    that is produced by the pemeta unit.
    """
    def __init__(self, tabular: Arg('-t', help='Print information in a table rather than as JSON') = False):
        super().__init__(tabular=tabular)

    def process(self, data: bytes):
        json = pemeta.parse_signature(data)
        yield from ppjson(tabular=self.args.tabular)._pretty_output(json, indent=4, ensure_ascii=False)

class pkw

This unit implements PKWare decompression.

Expand source code Browse git

class pkw(Unit):
    """
    This unit implements PKWare decompression.
    """
    def process(self, data):

        def read_from_table(table: dict[tuple[int, int], int], start: int, stop: int):
            value = length = 0
            while length < start:
                value <<= 1
                value |= getint(1)
                length += 1
            while length < stop:
                try:
                    return table[length, value]
                except KeyError:
                    value <<= 1
                    value |= getint(1)
                    length += 1
            raise ValueError(
                'Failed to decode a symbol in the compressed data stream.')

        reader = StructReader(data)
        codelit = reader.u8()  # First byte is 0 if literals are uncoded, otherwise 1
        maxdict = reader.u8()  # Second byte is 4, 5, or 6 (max size of dictionary)

        if not 0 <= codelit <= 1:
            raise ValueError(F'Invalid literal encoding value {codelit}.')

        if not 4 <= maxdict <= 6:
            raise ValueError(F'Invalid dictionary size {maxdict}.')

        output = MemoryFile()
        getint = reader.read_integer

        while not reader.eof:
            try:
                if not getint(1):
                    if codelit:
                        code = read_from_table(_LITERALS, 4, 14)
                    else:
                        code = getint(8)
                    output.write_byte(code)
                else:
                    length = read_from_table(_COPY_LENGTHS, 2, 0x10)
                    if length == 519:
                        break
                    offset = read_from_table(_COPY_OFFSETS, 2, 0x09)
                    more = (2 if length == 2 else maxdict)
                    offset <<= more
                    offset += getint(more)
                    offset += 1
                    output.replay(offset, length)
            except Exception as E:
                if not (out := output.getvalue()):
                    raise
                raise RefineryPartialResult(str(E), out) from E

        return output.getvalue()

    @classmethod
    def handles(self, data: bytearray) -> bool:
        return (len(data) > 2) and (0 <= data[0] <= 1) and (4 <= data[1] <= 6)

class pop (*names)

In processing order, remove visible chunks from the current frame and store their contents in the given meta variables on all chunks that remain. All chunks in the input stream are consequently made visible again. If pop is used at the end of a frame, then variables will be local to the parent frame. A pop instruction has the following format:

count | @ | name[=source][:conversion]

If the instruction is an integer, it is interpreted as count, specifying a number of chunks to be skipped from the frame without storing them. The letter "@" can be used to remove a single chunk from the input and merge all of its meta data into the ones that follow. Otherwise, the pop instruction consists of the name of the variable to be created, an optional source variable name, and an optional conversion sequence. If no source variable is specified, the chunk contents are used as the source. The conversion is a sequence of multibin handlers that are applied to the source data from right to left before storing it. For example, the argument k:le:b64 first decodes the chunk data using base64, then converts it to an integer in little endian format, and store the integer result in the variable k. The visual aid is that the content is passed from right to left through all conversions, into the variable k. Similarly, the argument k=size will store the current chunk's size in k.

Expand source code Browse git

class pop(Unit):
    """
    In processing order, remove visible chunks from the current frame and store their contents in
    the given meta variables on all chunks that remain. All chunks in the input stream are
    consequently made visible again. If pop is used at the end of a frame, then variables will be
    local to the parent frame. A pop instruction has the following format:

        count | {_MERGE_META} | name[{_CHERRYPICK}source][{_CONVERSION}conversion]

    If the instruction is an integer, it is interpreted as `count`, specifying a number of chunks
    to be skipped from the frame without storing them. The letter "{_MERGE_META}" can be used to
    remove a single chunk from the input and merge all of its meta data into the ones that follow.
    Otherwise, the pop instruction consists of the name of the variable to be created, an optional
    source variable name, and an optional conversion sequence. If no source variable is specified,
    the chunk contents are used as the source. The conversion is a sequence of multibin handlers
    that are applied to the source data from right to left before storing it.
    For example, the argument `k:le:b64` first decodes the chunk data using base64, then converts
    it to an integer in little endian format, and store the integer result in the variable `k`. The
    visual aid is that the content is passed from right to left through all conversions, into the
    variable `k`. Similarly, the argument k=size will store the current chunk's size in `k`.
    """
    def __init__(
        self,
        *names: Arg(type=str, metavar='instruction', help='A sequence of instructions, see above.')
    ):
        if not names:
            names = _MERGE_META,
        super().__init__(names=[_popcount(n) for n in names])

    def process(self, data):
        return data

    def filter(self, chunks: Iterable[Chunk]):
        invisible = []
        variables = {}
        remaining: Iterator[_popcount] = iter(self.args.names)

        it = iter(chunks)
        pop = next(remaining).reset()
        done = False

        for chunk in it:
            if not chunk.visible:
                self.log_debug('buffering invisible chunk')
                invisible.append(chunk)
                continue
            try:
                while not pop.into(variables, chunk):
                    pop = next(remaining).reset()
            except StopIteration:
                done = True
                invisible.append(chunk)
                break

        if not done and pop.done:
            try:
                next(remaining)
            except StopIteration:
                done = True

        if not done:
            msg = 'Not all variables could be assigned.'
            if not self.leniency:
                raise ValueError(F'{msg} Increase leniency to downgrade this failure to a warning.')
            self.log_warn(msg)

        nesting = self.args.nesting

        for chunk in chain(invisible, it):
            meta = chunk.meta
            meta.update(variables)
            if nesting < 0:
                for name in variables:
                    meta.set_scope(name, chunk.scope + nesting)
            chunk.visible = True
            yield chunk

class ppjscript (indent=4, strip_comments=False, strip_lines=False, keep_lines=False, keep_escapes=False)

Pretty-prints JavaScript without any reflection or evaluation.

Expand source code Browse git

class ppjscript(Unit):
    """
    Pretty-prints JavaScript without any reflection or evaluation.
    """
    def __init__(
        self,
        indent: Arg.Number('-i', help=(
            'Number of space characters used for indentation in the output. Default is {default}.')) = 4,
        strip_comments: Arg.Switch('-c', help=(
            'Remove all comments from the input before pretty-printing.')) = False,
        strip_lines: Arg.Switch('-b', group='LINES', help=(
            'Remove all line breaks after potentially stripping comments, before beautifying.')) = False,
        keep_lines: Arg.Switch('-B', group='LINES', help=(
            'Preserve line breaks as they occur in the input.')) = False,
        keep_escapes: Arg.Switch('-E', help=(
            'Preserve unnecessary escape sequences in string literals.')) = False,
    ):
        return super().__init__(
            indent=indent,
            strip_comments=strip_comments,
            strip_lines=strip_lines,
            keep_lines=keep_lines,
            keep_escapes=keep_escapes,
        )

    @Unit.Requires('jsbeautifier', 'display', 'extended')
    def _jsb():
        import jsbeautifier
        import jsbeautifier.unpackers.javascriptobfuscator
        # TODO: This is a workaround for the following bug:
        # https://github.com/beautify-web/js-beautify/issues/1350
        jsbeautifier.unpackers.javascriptobfuscator.detect = lambda *_: False
        return jsbeautifier

    def process(self, data: bytes):
        if self.args.strip_comments:
            from refinery.units.obfuscation.js.comments import deob_js_comments
            code = data | deob_js_comments | str
        else:
            code = data.decode(self.codec)
        if self.args.strip_lines:
            code = ' '.join(code.splitlines(False))
        options = self._jsb.default_options()
        options.eval_code = False
        options.indent_size = self.args.indent
        options.unescape_strings = not self.args.keep_escapes
        options.preserve_newlines = self.args.keep_lines
        options.indent_level = 0
        options.keep_array_indentation = False
        return self._jsb.beautify(
            code.strip(), options).encode(self.codec)

class ppjson (tabular=False, indent=4)

Expects JSON input data and outputs it in a neatly formatted manner. If the indentation is set to zero, the output is minified.

Expand source code Browse git

class ppjson(Unit):
    """
    Expects JSON input data and outputs it in a neatly formatted manner.
    If the indentation is set to zero, the output is minified.
    """
    _TRAILING_COMMA = re.compile(BR',\s*(}|])')

    def __init__(
        self,
        tabular: Arg.Switch('-t', group='OUT', help='Convert JSON input into a flattened table.') = False,
        indent : Arg.Number('-i', group='OUT', help='Number of spaces used for indentation. Default is {default}.') = 4
    ):
        return super().__init__(indent=indent, tabular=tabular)

    def _pretty_output(self, parsed, **kwargs):
        encoded = json.dumps(parsed, **kwargs)
        if self.args.tabular:
            table = list(flattened(json.loads(encoded)))
            width = max(len(key) for key, _ in table)
            tsize = get_terminal_size(80) - width - 4
            for key, value in table:
                if isinstance(value, str):
                    value = value.strip()
                    if not is_printable(value) and all(ord(c) < 0x100 for c in value):
                        value = value.encode('latin1').hex(':')
                value = str(value).rstrip()
                value = textwrap.wrap(value, tsize)
                it = iter(value)
                try:
                    item = next(it)
                except StopIteration:
                    continue
                yield F'{key:<{width}} : {item}'.encode(self.codec)
                for wrap in it:
                    yield F'{"":<{width + 3}}{wrap}'.encode(self.codec)
        else:
            yield encoded.encode(self.codec)

    def process(self, data):
        if self._TRAILING_COMMA.search(data):
            def smartfix(match):
                k = match.start()
                return match.group(0 if any(k in s for s in strings) else 1)
            from refinery.lib.patterns import formats
            strings = {range(*m.span()) for m in formats.string.finditer(data)}
            data = self._TRAILING_COMMA.sub(smartfix, data)
        kwargs = {'indent': self.args.indent} if self.args.indent else {'separators': (',', ':')}
        yield from self._pretty_output(json.loads(data), **kwargs)

class ppxml (indent=4, header=False)

Expects XML input data and outputs it in a neatly formatted manner.

Expand source code Browse git

class ppxml(Unit):
    """
    Expects XML input data and outputs it in a neatly formatted manner.
    """

    def __init__(self,
        indent: Arg.Number('-i', help=(
            'Controls the amount of space characters used for indentation in the output. Default is 4.')) = 4,
        header: Arg.Switch('-x', help='Add an XML header to the formatted output.') = False
    ):
        super().__init__(indent=indent, header=header)

    def process(self, data):

        pad = self.args.indent * ' '
        etm = {}

        try:
            dom = ForgivingParse(data, etm)
        except Exception:
            from refinery.lib.meta import metavars
            msg = 'error parsing as XML, returning original content'
            path = metavars(data).get('path')
            if path:
                msg = F'{msg}: {path}'
            self.log_warn(msg)
            return data

        def indent(element, level=0, more_sibs=False):
            """
            The credit for this one goes to:
            https://stackoverflow.com/a/12940014
            """
            indentation = '\n'
            if level:
                indentation += (level - 1) * pad
            childcount = len(element)
            if childcount:
                if not element.text or not element.text.strip():
                    element.text = indentation + pad
                    if level:
                        element.text += pad
                for count, child in enumerate(element):
                    indent(child, level + 1, count < childcount - 1)
                if level and (not element.tail or element.tail.isspace()):
                    element.tail = indentation
                    if more_sibs:
                        element.tail += pad
            elif level and (not element.tail or element.tail.isspace()):
                element.tail = indentation
                if more_sibs: element.tail += pad

        indent(dom.getroot())

        with io.BytesIO() as output:
            dom.write(output, encoding=self.codec, xml_declaration=self.args.header)
            result = output.getvalue()

        for uid, key in etm.items():
            entity = F'&{key};'.encode(self.codec)
            needle = uid.encode(self.codec)
            result = result.replace(needle, entity)

        return result

    @classmethod
    def handles(cls, data):
        return is_xml(data)

class push (data=b'')

The unit inserts an additional chunk before each input chunk and moves the original data out of scope. This chunk is considered the "original" data, while the one inserted in front of it is used as an intermediate result. By default, this intermediate data is a copy of the input data. For example:

emit key=value | push [[| rex =(.*)$ {1} | pop v ]| repl var:v censored ]

will output key=censored. The application of rex turns the (duplicated) data into just the value, which is then stored in the variable v. The application of repl replaces this value with the hard-coded string censored.

Expand source code Browse git

class push(Unit):
    """
    The unit inserts an additional chunk before each input chunk and moves the original
    data out of scope. This chunk is considered the "original" data, while the one inserted
    in front of it is used as an intermediate result. By default, this intermediate data is
    a copy of the input data. For example:

        emit key=value | push [[| rex =(.*)$ {1} | pop v ]| repl var:v censored ]

    will output `key=censored`. The application of `refinery.rex` turns the (duplicated)
    data into just the value, which is then stored in the variable `v`. The application
    of `refinery.repl` replaces this value with the hard-coded string `censored`.
    """
    def __init__(self, data: Arg(help='The data to be pushed, by default a copy of the input.') = B''):
        super().__init__(data=data)

    def process(self, data: Chunk):
        src = self.args.data
        tos = data.copy(meta=True, data=False)
        tos[:] = src or data
        if self.args.nesting > 0:
            data.set_next_scope(False)
        else:
            try:
                data.visible = False
            except AttributeError:
                self.log_warn('application has no effect outside frame.')
        yield data
        yield tos

class put (name, value=<object object>)

Can be used to add a meta variable to the processed chunk. Note that meta variables cease to exist outside a frame.

Expand source code Browse git

class put(Unit):
    """
    Can be used to add a meta variable to the processed chunk. Note that meta variables
    cease to exist outside a frame.
    """
    def __init__(
        self,
        name : Arg(help='The name of the variable to be used.', type=str),
        value: Arg(help='The value for the variable. If no value is given, the entire current chunk is stored.',
            type=functools.partial(numseq, typecheck=False)) = _EMPTY
    ):
        super().__init__(name=check_variable_name(name), value=value)

    def process(self, data: Chunk):
        value = self.args.value
        if value is _EMPTY:
            value = data
        if not isinstance(value, (int, float)) and not isbuffer(value):
            try:
                len(value)
            except TypeError:
                if isinstance(value, itertools.repeat):
                    value = next(value)
                if not isinstance(value, (int, float)):
                    raise NotImplementedError(F'put does not support {value.__class__.__name__} values.')
            else:
                if not isinstance(value, (dict, list)):
                    value = list(value)
        self.log_debug(F'storing {typename(value)}:', value, clip=True)
        data.meta[self.args.name] = value
        return data

class pyc (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

Decompiles Python bytecode (PYC) files back to source code. A known limitation is that it does not work on recent Python versions, but anything below 3.9 should work.

Expand source code Browse git

class pyc(ArchiveUnit):
    """
    Decompiles Python bytecode (PYC) files back to source code. A known limitation is that it does
    not work on recent Python versions, but anything below 3.9 should work.
    """

    def unpack(self, data):
        input_path = metavars(data).get(self.args.path.decode(self.codec))
        for k, code in enumerate(extract_code_from_buffer(bytes(data), input_path)):
            path = code.container.co_filename or F'__unknown_name_{k:02d}.py'
            date = datetime.fromtimestamp(code.timestamp)
            data = decompile_buffer(code)
            yield self._pack(path, date, data)

class pym

Converts Python-Marshaled code objects to the PYC (Python Bytecode) format. If it is an older Python version, you can use the pyc unit to then decompile the code, but for more recent versions a separate Python decompiler will be required.

Expand source code Browse git

class pym(Unit):
    """
    Converts Python-Marshaled code objects to the PYC (Python Bytecode) format. If it is an
    older Python version, you can use the `refinery.pyc` unit to then decompile the code, but
    for more recent versions a separate Python decompiler will be required.
    """
    def reverse(self, data):
        return marshal.dumps(data)

    def process(self, data):
        def toblob(data):
            if isinstance(data, (bytes, bytearray)):
                self.log_info(U'unmarshalled a byte string, returning as is')
                return data
            if isinstance(data, str):
                self.log_info(F'unmarshalled a string object, encoding as {self.codec}')
                return data.encode(self.codec)
            if isinstance(data, CodeType):
                self.log_info(U'unmarshalled a code object, converting to pyc')
                import importlib
                return importlib._bootstrap_external._code_to_timestamp_pyc(data)
            if isinstance(data, int):
                self.log_info(U'unmarshalled an integer, returning big endian encoding')
                q, r = divmod(data.bit_length(), 8)
                q += int(bool(r))
                return data.to_bytes(q, 'big')
            if isinstance(data, dict):
                with BytesAsStringEncoder as encoder:
                    return encoder.dumps(data).encode(self.codec)
            raise NotImplementedError(
                F'No serialization implemented for object of type {data.__class__.__name__}')

        try:
            out = marshal.loads(data)
        except Exception:
            out = Marshal(memoryview(data)).object()

        if isinstance(out, (list, tuple, set, frozenset)):
            self.log_info('object is a collection, converting each item individually')
            for item in out:
                yield toblob(item)
        else:
            yield toblob(out)

class qb (*data)

Short for "queue back": Insert new chunks at the end of the current frame.

Expand source code Browse git

class qb(QueueUnit):
    """
    Short for "queue back": Insert new chunks at the end of the current frame.
    """
    def filter(self, chunks: Iterable[Chunk]):
        yield from self._queue(chunks, False)

class qf (*data)

Short for "queue front": Insert new chunks at the beginning of the current frame.

Expand source code Browse git

class qf(QueueUnit):
    """
    Short for "queue front": Insert new chunks at the beginning of the current frame.
    """
    def filter(self, chunks: Iterable[Chunk]):
        yield from self._queue(chunks, True)

class qlz

This unit implements QuickLZ decompression levels 1 and 3.

Expand source code Browse git

class qlz(Unit):
    """
    This unit implements QuickLZ decompression levels 1 and 3.
    """

    def process(self, data):
        source = memoryview(data)
        head = source[0]
        clvl = (head >> 2) & 0x3

        if head & 2:
            self.log_info('long header detected')
            size = int.from_bytes(source[5:9], 'little')
            source = source[9:]
        else:
            self.log_info('short header detected')
            size = source[3]
            source = source[3:]
        if head & 1 != 1:
            self.log_warn('header indicates that data is uncompressed, returning remaining data')
            return source
        else:
            self.log_info(F'compression level {clvl}, decompressed size {SizeInt(size)!r}')

        def fetchhash():
            return int.from_bytes(destination[hashvalue + 1:hashvalue + 4], byteorder='little')

        codeword = 1
        destination = bytearray()
        hashtable = [0] * _HASH_VALUES
        hashvalue = -1
        last_matchstart = size - _UNCONDITIONAL_MATCHLEN - _UNCOMPRESSED_END - 1
        fetch = 0

        if clvl == 2:
            raise ValueError("This version only supports level 1 and 3")
        while source:
            if codeword == 1:
                codeword = int.from_bytes(source[:4], byteorder='little')
                source = source[4:]
                if len(destination) <= last_matchstart:
                    c = 3 if clvl == 1 else 4
                    fetch = int.from_bytes(source[:c], byteorder='little')
            if codeword & 1:
                codeword = codeword >> 1
                if clvl == 1:
                    hash = (fetch >> 4) & 0xFFF
                    offset = hashtable[hash]
                    if fetch & 0xF:
                        matchlen = (fetch & 0xF) + 2
                        source = source[2:]
                    else:
                        matchlen = source[2]
                        source = source[3:]
                else:
                    if (fetch & 3) == 0:
                        delta = (fetch & 0xFF) >> 2
                        matchlen = 3
                        source = source[1:]
                    elif (fetch & 2) == 0:
                        delta = (fetch & 0xFFFF) >> 2
                        matchlen = 3
                        source = source[2:]
                    elif (fetch & 1) == 0:
                        delta = (fetch & 0xFFFF) >> 6
                        matchlen = ((fetch >> 2) & 15) + 3
                        source = source[2:]
                    elif (fetch & 127) != 3:
                        delta = (fetch >> 7) & 0x1FFFF
                        matchlen = ((fetch >> 2) & 0x1F) + 2
                        source = source[3:]
                    else:
                        delta = fetch >> 15
                        matchlen = ((fetch >> 7) & 255) + 3
                        source = source[4:]
                    offset = (len(destination) - delta) & 0xFFFFFFFF

                for i in range(offset, offset + matchlen):
                    destination.append(destination[i])

                if clvl == 1:
                    fetch = fetchhash()
                    while hashvalue < len(destination) - matchlen:
                        hashvalue += 1
                        hash = ((fetch >> 12) ^ fetch) & _HASH_MASK
                        hashtable[hash] = hashvalue
                        fetch = fetch >> 8 & 0xFFFF
                        try:
                            fetch |= destination[hashvalue + 3] << 16
                        except IndexError:
                            pass
                    fetch = int.from_bytes(source[:3], byteorder='little')
                else:
                    fetch = int.from_bytes(source[:4], byteorder='little')
                hashvalue = len(destination) - 1
            else:
                if len(destination) <= last_matchstart:
                    destination.append(source[0])
                    source = source[1:]
                    codeword = codeword >> 1
                    if clvl == 1:
                        while hashvalue < len(destination) - 3:
                            fetch2 = fetchhash()
                            hashvalue += 1
                            hash = ((fetch2 >> 12) ^ fetch2) & _HASH_MASK
                            hashtable[hash] = hashvalue
                        fetch = fetch >> 8 & 0xFFFF | source[2] << 16
                    else:
                        fetch = fetch >> 8 & 0xFFFF
                        fetch |= source[2] << 16
                        fetch |= source[3] << 24
                else:
                    while len(destination) <= size - 1:
                        if codeword == 1:
                            source = source[4:]
                            codeword = 0x80000000
                        destination.append(source[0])
                        source = source[1:]
                        codeword = codeword >> 1
                    break
        if len(destination) != size:
            raise RefineryPartialResult(
                F'Header indicates decompressed size 0x{size:X}, but 0x{len(destination):X} bytes '
                F'were decompressed.', destination)
        return destination

class rabbit (key, discard=0, stateful=False, iv=b'')

RABBIT encryption and decryption.

Expand source code Browse git

class rabbit(StreamCipherUnit):
    """
    RABBIT encryption and decryption.
    """
    key_size = {16}

    def __init__(self, key, discard=0, stateful=False, iv: Arg('-i', '--iv', help='Optional initialization vector.') = B''):
        super().__init__(key=key, iv=iv, stateful=stateful, discard=discard)

    def keystream(self) -> Iterable[int]:
        if len(self.args.iv) not in (0, 8):
            raise ValueError('The IV length must be exactly 8 bytes.')
        return RabbitCipher(self.args.key, self.args.iv)

class rc2 (key, *, iv=b'', eks=1024, derive_eks=False, padding=None, mode=None, raw=False, little_endian=False, segment_size=0, tag=None, aad=None)

RC2 encryption and decryption.

Expand source code Browse git

class rc2(StandardBlockCipherUnit, cipher=PyCryptoFactoryWrapper(ARC2)):
    """
    RC2 encryption and decryption.
    """

    def __init__(
        self, key, *,
        iv=b'',
        eks: Arg.Number('-k', '--eks', group='EKS',
            help='Set the effective key size. Default is {default}.') = 1024,
        derive_eks: Arg.Switch('-d', '--dks', group='EKS',
            help='Act as .NET and derive the effective key size from the key length.') = False,
        padding=None,
        mode=None,
        raw=False,
        little_endian=False,
        segment_size=0,
        tag=None,
        aad=None,
        **keywords
    ):
        super().__init__(
            key,
            iv=iv,
            eks=eks,
            derive_eks=derive_eks,
            padding=padding,
            mode=mode,
            raw=raw,
            little_endian=little_endian,
            segment_size=segment_size,
            tag=tag,
            aad=aad,
            **keywords
        )

    def _new_cipher(self, **optionals) -> CipherInterface:
        eks = len(self.args.key) * 8 if self.args.derive_eks else self.args.eks
        optionals.update(effective_keylen=eks)
        return super()._new_cipher(**optionals)

class rc4 (key, discard=0)

RC4 encryption and decryption.

Expand source code Browse git

class rc4(StandardCipherUnit, cipher=PyCryptoFactoryWrapper(ARC4)):
    """
    RC4 encryption and decryption.
    """
    def __init__(
        self, key,
        discard: Arg.Number('-d', help='Discard the first {varname} bytes of the keystream, {default} by default.') = 0,
    ):
        super().__init__(key, discard=discard)

    def _new_cipher(self, **optionals):
        return super()._new_cipher(drop=self.args.discard, **optionals)

class rc4mod (key, stateful=False, discard=0, *, size=256)

Implements a modified version of the RC4 stream cipher where the size of the RC4 SBox can be altered.

Expand source code Browse git

class rc4mod(StreamCipherUnit):
    """
    Implements a modified version of the RC4 stream cipher where the size of the RC4 SBox can be altered.
    """

    def __init__(
        self, key, stateful=False, discard=0, *,
        size: Arg.Number('-t', help='Table size, {default} by default.', bound=(1, None)) = 0x100
    ):
        super().__init__(key=key, stateful=stateful, discard=discard, size=size)

    def keystream(self):
        size = self.args.size
        tablerange = range(max(size, 0x100))
        b, table = 0, bytearray(k & 0xFF for k in tablerange)
        for a, keybyte in zip(tablerange, cycle(self.args.key)):
            t = table[a]
            b = (b + keybyte + t) % size
            table[a] = table[b]
            table[b] = t
        self.log_debug(lambda: F'SBOX = {table.hex(" ").upper()}', clip=True)
        b, a = 0, 0
        while True:
            a = (a + 1) % size
            t = table[a]
            b = (b + t) % size
            table[a] = table[b]
            table[b] = t
            yield table[(table[a] + t) % size]

class rc5 (key, *, iv=b'', padding=None, mode=None, raw=False, little_endian=False, segment_size=0, rounds=12, word_size=32, aad=None, tag=None)

RC5 encryption and decryption.

Expand source code Browse git

class rc5(StandardBlockCipherUnit, cipher=BlockCipherFactory(RC5)):
    """
    RC5 encryption and decryption.
    """
    def __init__(
        self, key, *, iv=b'', padding=None, mode=None, raw=False, little_endian=False, segment_size=0,
        rounds    : Arg.Number('-k', help='Number of rounds to use, the default is {default}') = _R,
        word_size : Arg.Number('-w', help='The word size in bits, {default} by default.') = _W,
        **more
    ):
        super().__init__(
            key,
            iv=iv,
            padding=padding,
            mode=mode,
            raw=raw,
            little_endian=little_endian,
            segment_size=segment_size,
            rounds=rounds,
            word_size=word_size,
            **more
        )

    @property
    def block_size(self):
        return self.args.word_size // 4

    def _new_cipher(self, **optionals) -> CipherInterface:
        return super()._new_cipher(
            rounds=self.args.rounds,
            word_size=self.args.word_size,
            **optionals
        )

class rc6 (key, *, iv=b'', padding=None, mode=None, raw=False, little_endian=False, segment_size=0, rounds=20, word_size=32)

RC6 encryption and decryption. The parameter defaults are the RC6 parameters that were chosen for the AES candidacy. Only key sizes of 128, 192, and 256 bits are used for AES candidates, but the unit will allow any key size up to 256 bits.

Expand source code Browse git

class rc6(StandardBlockCipherUnit, cipher=BlockCipherFactory(RC6)):
    """
    RC6 encryption and decryption. The parameter defaults are the RC6 parameters that were chosen
    for the AES candidacy. Only key sizes of 128, 192, and 256 bits are used for AES candidates, but
    the unit will allow any key size up to 256 bits.
    """
    def __init__(
        self, key, *, iv=b'', padding=None, mode=None, raw=False, little_endian=False, segment_size=0,
        rounds    : Arg.Number('-k', help='Number of rounds to use, the default is {default}') = _R,
        word_size : Arg.Number('-w', help='The word size in bits, {default} by default.') = _W,
    ):
        super().__init__(
            key,
            iv=iv,
            padding=padding,
            mode=mode,
            raw=raw,
            little_endian=little_endian,
            segment_size=segment_size,
            rounds=rounds,
            word_size=word_size
        )

    @property
    def block_size(self):
        return self.args.word_size // 2

    def _new_cipher(self, **optionals) -> CipherInterface:
        return super()._new_cipher(
            rounds=self.args.rounds,
            word_size=self.args.word_size,
            **optionals
        )

class recode (decode=None, encode='UTF8', decerr=None, encerr=None, errors=None)

Expects input string data encoded in the from encoding and encodes it in the to encoding, then outputs the result.

Expand source code Browse git

class recode(Unit):
    """
    Expects input string data encoded in the `from` encoding and encodes it in
    the `to` encoding, then outputs the result.
    """

    def __init__(
        self,
        decode: Arg(metavar='decode-as', type=str, help='Input encoding; Guess encoding by default.') = None,
        encode: Arg(metavar='encode-as', type=str, help=F'Output encoding; The default is {Unit.codec}.') = Unit.codec,
        decerr: Arg.Option('-d', choices=Handler,
            help='Specify an error handler for decoding.') = None,
        encerr: Arg.Option('-e', choices=Handler,
            help='Specify an error handler for encoding.') = None,
        errors: Arg.Option('-E', choices=Handler, help=(
            'Specify an error handler for both encoding and decoding. '
            'The possible choices are the following: {choices}')) = None,
    ):
        super().__init__(
            decode=decode,
            encode=encode,
            decerr=Arg.AsOption(decerr or errors or 'STRICT', Handler).value,
            encerr=Arg.AsOption(encerr or errors or 'STRICT', Handler).value
        )

    @Unit.Requires('chardet', 'default', 'extended')
    def _chardet():
        import chardet
        return chardet

    def _detect(self, data):
        mv = memoryview(data)
        if not any(mv[1::2]): return 'utf-16le'
        if not any(mv[0::2]): return 'utf-16be'
        detection = self._chardet.detect(data)
        codec = detection['encoding']
        self.log_info(lambda: F'Using input encoding: {codec}, detected with {int(detection["confidence"] * 100)}% confidence.')
        return codec

    def _recode(self, enc, dec, encerr, decerr, data):
        dec = dec or self._detect(data)
        return codecs.encode(codecs.decode(data, dec, errors=decerr), enc, errors=encerr)

    def reverse(self, data):
        return self._recode(self.args.decode, self.args.encode, self.args.decerr, self.args.encerr, data)

    def process(self, data):
        return self._recode(self.args.encode, self.args.decode, self.args.encerr, self.args.decerr, data)

class reduce (suffix, just=0, temp='t')

The reduce unit applies an arbitrary multibin suffix repeatedly to reduce a complete frame to a single chunk. The first chunk in the frame serves as initialization.

Expand source code Browse git

class reduce(Unit):
    """
    The reduce unit applies an arbitrary multibin suffix repeatedly to reduce a complete frame to a
    single chunk. The first chunk in the frame serves as initialization.
    """

    def __init__(self,
        suffix: Arg(type=str, help=(
            'The remaining command line is a multibin suffix. The reduction accumulator is initialized '
            'with the first chunk in the frame. Then, each remaining chunk is processed with the given '
            'suffix and the result is used to overwrite the accumulator.'
        )),
        just: Arg.Number('-j',
            help='Optionally specify a maximum number of chunks to process beyond the first.') = 0,
        temp: Arg.String('-t', metavar='name',
            help='The name of the accumulator variable. The default is "{default}".') = 't',
    ):
        super().__init__(suffix=suffix, temp=temp, just=just)

    def filter(self, chunks: Iterable[Chunk]):
        it = iter(chunks)
        just = self.args.just
        name = self.args.temp
        accu = next(it)
        if not just:
            scope = it
        else:
            import itertools
            self.log_info(F'reducing only the next {just} chunks')
            scope = itertools.islice(it, 0, just)
        for chunk in scope:
            chunk.meta[name] = accu
            accu[:] = DelayedBinaryArgument(self.args.suffix, reverse=True, seed=chunk)(chunk)
            self.log_debug('reduced:', accu, clip=True)
        accu.meta.discard(name)
        yield accu
        yield from it

class rep (count=2, label=None)

Duplicates the given input a given number of times. It is also possible to specify an iterable instead of a number, in which case the input will be replicated once for each item in this iterable.

Expand source code Browse git

class rep(Unit):
    """
    Duplicates the given input a given number of times. It is also possible to specify
    an iterable instead of a number, in which case the input will be replicated once for
    each item in this iterable.
    """

    def __init__(
        self,
        count: Arg.NumSeq(help=(
            'Defines the number of outputs to generate for each input. The default is {default}. '
            'You can specify any multibin expression that defines an integer iterable here: Each '
            'input chunk will be replicated once for each element of that sequence.')) = 2,
        label: Arg(type=str, help=(
            'If specified, the meta variable with this name will be populated with the index of '
            'the replicated chunk. When the count parameter is an integer, this label will be '
            'equivalent to the index meta variable.')) = None
    ):
        super().__init__(count=count, label=label)

    def process(self, data: bytes):
        def count():
            count = self.args.count
            if isinstance(count, int):
                return count
            return sum(1 for _ in count)

        if self.args.squeeze or not self._framed:
            self.log_debug('compressing all repeated items into a single chunk')
            yield data * count()
            return

        self.log_debug('emitting each repeated item as an individual chunk')

        label = self.args.label
        if label is None:
            yield from repeat(data, count())
            return

        meta = {}
        for counter in self.args.count:
            meta[label] = counter
            yield self.labelled(data, **meta)

class repl (search, replace=b'', count=-1)

Performs a simple binary string replacement on the input data.

Expand source code Browse git

class repl(Unit):
    """
    Performs a simple binary string replacement on the input data.
    """

    def __init__(
        self,
        search : Arg(help='This is the search term.'),
        replace: Arg(help='The substitution string. Leave this empty to remove all occurrences of the search term.') = B'',
        count  : Arg.Number('-n', help='Only replace the given number of occurrences') = -1
    ):
        super().__init__(search=search, replace=replace, count=count)

    def process(self, data: bytes):
        return data.replace(
            self.args.search,
            self.args.replace,
            self.args.count
        )

class resplit (regex=b'\\r?\\n', multiline=False, ignorecase=False, count=0)

Splits the data at the given regular expression and returns the sequence of chunks between the separators. By default, the input is split along line breaks.

Expand source code Browse git

class resplit(SingleRegexUnit):
    """
    Splits the data at the given regular expression and returns the sequence of
    chunks between the separators. By default, the input is split along line breaks.
    """

    def __init__(
        self, regex=RB'\r?\n', multiline=False, ignorecase=False, count=0
    ):
        super().__init__(regex=regex, multiline=multiline, ignorecase=ignorecase, count=count)

    def process(self, data):
        view = memoryview(data)
        cursor = 0
        count = self.args.count
        for k, match in enumerate(self.regex.finditer(view), 2):
            yield view[cursor:match.start()]
            cursor = match.end()
            yield from match.groups()
            if k > count > 0:
                break
        yield view[cursor:]

class resub (regex='\\s+', subst=b'', multiline=False, ignorecase=False, count=0)

A unit for performing substitutions based on a binary regular expression pattern. Besides the syntax {k} to insert the k-th match group, the unit supports processing the contents of match groups with arbitrary refinery units. To do so, use the following F-string-like syntax:

{match-group:handlers}

where :handlers is an optional reverse multibin expression that is used to post-process the binary data from the match. For example, {2:hex:b64} represents the base64-decoding of the hex-decoding of the second match group.

Expand source code Browse git

class resub(SingleRegexUnit):
    """
    A unit for performing substitutions based on a binary regular expression pattern. Besides the
    syntax `{k}` to insert the `k`-th match group, the unit supports processing the contents of
    match groups with arbitrary refinery units. To do so, use the following F-string-like syntax:

        {match-group:handlers}

    where `:handlers` is an optional reverse multibin expression that is used to post-process the
    binary data from the match. For example, `{2:hex:b64}` represents the base64-decoding of the
    hex-decoding of the second match group.
    """
    def __init__(
        self,
        regex: Arg(help='Regular expression to be searched and replaced. The default is "{default}".') = '\\s+',
        subst: Arg('subst', help=(
            'Substitution value: use {1} for group 1, {0} for entire match. Matches are removed '
            '(replaced by an empty string) by default.'
        )) = B'',
        multiline=False,
        ignorecase=False,
        count=0
    ):
        super().__init__(regex=regex, subst=subst, multiline=multiline, ignorecase=ignorecase, count=count)

    def process(self, data):
        def repl(match: Match):
            return meta.format_bin(spec, self.codec, [match[0], *match.groups()], match.groupdict())
        self.log_info('pattern:', getattr(self.regex, 'pattern', self.regex))
        self.log_info('replace:', self.args.subst)
        meta = metavars(data)
        spec = self.args.subst.decode('ascii', 'backslashreplace')
        substitute = self.regex.sub
        if self.args.count:
            from functools import partial
            substitute = partial(substitute, count=self.args.count)
        return substitute(repl, data)

class rev (blocksize=None)

The blocks of the input data are output in reverse order. If the length of the input data is not a multiple of the block size, the data is truncated.

Expand source code Browse git

class rev(UnaryOperation):
    """
    The blocks of the input data are output in reverse order. If the length of
    the input data is not a multiple of the block size, the data is truncated.
    """

    def __init__(self, blocksize=None):
        super().__init__(blocksize=blocksize, _truncate=2)

    def inplace(self, block: ndarray):
        return self._numpy.flip(block)

    operate = NotImplemented

    def process(self, data: bytearray):
        if self.bytestream:
            data.reverse()
            return data
        try:
            return self._fastblock(data)
        except FastBlockError:
            b = self.blocksize
            n = len(data)
            q = n // b
            m = q * b
            view = memoryview(data)
            temp = bytearray(b)
            for k in range(0, (q // 2) * b, b):
                lhs = slice(k, k + b)
                rhs = slice(m - k - b, m - k)
                temp[:] = view[rhs]
                data[rhs] = view[lhs]
                data[lhs] = temp
            if m < n:
                del view
                del temp
                del data[m:]
            return data

class rex (regex, *transformation, unicode=False, unique=False, multiline=False, ignorecase=False, min=1, max=None, len=None, stripspace=False, longest=False, take=None)

Short for Regular Expression eXtractor: A binary grep which can apply a transformation to each match. Each match is an individual output. Besides the syntax {k} to insert the k-th match group, the unit supports processing the contents of match groups with arbitrary refinery units. To do so, use the following F-string-like syntax:

{match-group:pipeline}

where :pipeline is an optional pipeline of refinery commands as it would be specified on the command line. The value of the corresponding match is post-processed with this command. The unit also supports the special output format {.} which represents the input data.

Expand source code Browse git

class rex(SingleRegexUnit, PatternExtractor):
    """
    Short for Regular Expression eXtractor: A binary grep which can apply a transformation to each
    match. Each match is an individual output. Besides the syntax `{k}` to insert the `k`-th match
    group, the unit supports processing the contents of match groups with arbitrary refinery units.
    To do so, use the following F-string-like syntax:

        {match-group:pipeline}

    where `:pipeline` is an optional pipeline of refinery commands as it would be specified on
    the command line. The value of the corresponding match is post-processed with this command. The
    unit also supports the special output format `{%s}` which represents the input data.
    """
    def __init__(
        self, regex,
        # TODO: Use positional only in Python 3.8
        # /,
        *transformation: Arg(type=utf8, help=(
            'An optional sequence of transformations to be applied to each match. '
            'Each transformation produces one output in the order in which they   '
            'are given. The default transformation is {0}, i.e. the entire match.  '
        )),
        unicode: Arg.Switch('-u', help='Also find unicode strings.') = False,
        unique: Arg.Switch('-q', help='Yield every (transformed) match only once.') = False,
        multiline=False, ignorecase=False, min=1, max=None, len=None, stripspace=False,
        longest=False, take=None
    ):
        super().__init__(
            regex=regex,
            transformation=transformation,
            unicode=unicode,
            unique=unique,
            multiline=multiline,
            ignorecase=ignorecase,
            min=min,
            max=max,
            len=len,
            stripspace=stripspace,
            longest=longest,
            take=take,
            utf16=unicode,
            ascii=True,
            duplicates=not unique
        )

    def process(self, data):
        meta = metavars(data)
        self.log_debug('regular expression:', getattr(self.regex, 'pattern', self.regex))
        transformations = []
        specs: List[bytes] = list(self.args.transformation)
        if not specs:
            specs.append(B'{0}')
        for spec in specs:
            def transformation(match: Match, s=spec.decode(self.codec)):
                symb: dict = match.groupdict()
                args: list = [match.group(0), *match.groups()]
                used = set()
                for key, value in symb.items():
                    if value is None:
                        symb[key] = B''
                symb[_FORWARD_VAR] = data
                item = meta.format(s, self.codec, args, symb, True, True, used)
                used.update(key for key, value in symb.items() if not value)
                used.add(_FORWARD_VAR)
                for variable in used:
                    symb.pop(variable, None)
                symb.update(offset=match.start())
                chunk = Chunk(item)
                chunk.meta.update(meta)
                chunk.meta.update(symb)
                return chunk
            transformations.append(transformation)
        yield from self.matches_filtered(memoryview(data), self.regex, *transformations)

class rijndael (key, iv=b'', block_size=16, *, aad=None, tag=None, segment_size=0, little_endian=False, raw=False, mode=None, padding=None)

Rijndael encryption and decryption. Note that there is also a aes unit which has much better performance because it calls into the PyCryptodome library. You would have to use this specific Rijndael unit only if Rijndael is used with a block size that is different from 16 bytes, in which case it is equivalent to AES.

Expand source code Browse git

class rijndael(StandardBlockCipherUnit, cipher=BlockCipherFactory(Rijndael)):
    """
    Rijndael encryption and decryption. Note that there is also a `refinery.aes` unit which has
    much better performance because it calls into the PyCryptodome library. You would have to
    use this specific Rijndael unit only if Rijndael is used with a block size that is different
    from 16 bytes, in which case it is equivalent to AES.
    """
    def __init__(
        self, key, iv=b'',
        block_size: Arg.Number('-b', help='Cipher block size, default is {default}. Valid choices are 16, 24, and 32.') = 16,
        **more
    ):
        return super().__init__(key, iv=iv, block_size=block_size, **more)

    @property
    def block_size(self):
        return self.args.block_size

    def _new_cipher(self, **optionals) -> CipherInterface:
        return super()._new_cipher(block_size=self.args.block_size, **optionals)

class ripemd128 (reps=1, text=False)

Returns the RIPEMD-128 hash of the input data.

Expand source code Browse git

class ripemd128(HashUnit):
    """
    Returns the RIPEMD-128 hash of the input data.
    """
    def _algorithm(self, data):
        from refinery.lib.ripemd128 import ripemd128
        return ripemd128(data)

class ripemd160 (reps=1, text=False)

Returns the RIPEMD160 hash of the input data.

class rmv (*names)

Short for "ReMove Variable": Removes meta variables that were created in the current frame. If no variable names are given, the unit removes all of them. Note that this can recover variables from outer frames that were previously shadowed.

Expand source code Browse git

class rmv(Unit):
    """
    Short for "ReMove Variable": Removes meta variables that were created in the current frame. If no
    variable names are given, the unit removes all of them. Note that this can recover variables from
    outer frames that were previously shadowed.
    """
    def __init__(self, *names: Arg(type=str, metavar='name', help='Name of a variable to be removed.')):
        super().__init__(names=names)

    def process(self, data: Chunk):
        meta = metavars(data)
        keys = self.args.names or list(meta.variable_names())
        for key in keys:
            meta.discard(key)
        return data

class rncrypt (password)

Implements encryption and decryption using the RNCryptor specification. See also: https://github.com/RNCryptor

Expand source code Browse git

class rncrypt(Unit):
    """
    Implements encryption and decryption using the RNCryptor specification.
    See also: https://github.com/RNCryptor
    """
    def __init__(self, password: bytearray):
        super().__init__(password=password)

    def process(self, data: bytes) -> bytes:
        encryption_salt = data[2:10]
        hmac_salt = data[10:18]
        iv = data[18:34]
        cipher_text = data[34:-32]
        hmac_signature = data[-32:]
        encryption_key = self._pbkdf2(self.args.password, encryption_salt)
        hmac_key = self._pbkdf2(self.args.password, hmac_salt)
        if not hmac.compare_digest(self._hmac(hmac_key, data[:-32]), hmac_signature):
            raise ValueError("Failed to verify signature.")
        return unpad(
            self._aes_decrypt(encryption_key, iv, cipher_text),
            block_size=AES.block_size
        )

    def reverse(self, data: bytes) -> bytes:
        prng = Random.new()
        data = pad(data, block_size=AES.block_size)
        encryption_salt = prng.read(8)
        encryption_key = self._pbkdf2(self.args.password, encryption_salt)
        hmac_salt = prng.read(8)
        hmac_key = self._pbkdf2(self.args.password, hmac_salt)
        iv = prng.read(AES.block_size)
        cipher_text = self._aes_encrypt(encryption_key, iv, data)
        new_data = b'\x03\x01' + encryption_salt + hmac_salt + iv + cipher_text
        return new_data + self._hmac(hmac_key, new_data)

    def _aes_encrypt(self, key, iv, text):
        return AES.new(key, AES.MODE_CBC, iv).encrypt(text)

    def _aes_decrypt(self, key, iv, text):
        return AES.new(key, AES.MODE_CBC, iv).decrypt(text)

    def _hmac(self, key, data):
        return hmac.new(key, data, hashlib.sha256).digest()

    def _prf(self, secret, salt):
        return hmac.new(secret, salt, hashlib.sha1).digest()

    def _pbkdf2(self, password, salt, iterations=10000, key_length=32):
        return KDF.PBKDF2(password, salt, dkLen=key_length, count=iterations, prf=self._prf)

class rot (amount=13)

Rotate the characters of the alphabet by the given amount. The default amount is 13, providing the common (and weak) string obfuscation method.

Expand source code Browse git

class rot(Unit):
    """
    Rotate the characters of the alphabet by the given amount. The default
    amount is 13, providing the common (and weak) string obfuscation method.
    """

    def __init__(self, amount: Arg.Number(help='Number of letters to rotate by; Default is 13.') = 13):
        super().__init__(amount=amount)

    def process(self, data: bytearray):
        rot = self.args.amount % 26
        for index, byte in enumerate(data):
            for alphabet in _LCASE, _UCASE:
                if byte in alphabet:
                    zero = alphabet[0]
                    data[index] = zero + (byte - zero + rot) % 26
                    break
        return data

class rotl (*argument, bigendian=False, blocksize=None)

Rotate the bits of each block left.

Expand source code Browse git

class rotl(BinaryOperation):
    """
    Rotate the bits of each block left.
    """
    def operate(self, value, shift):
        shift %= self.fbits
        return (value << shift) | (value >> (self.fbits - shift))

    def inplace(self, value, shift):
        shift %= self.fbits
        lower = value >> (self.fbits - shift)
        value <<= shift
        value |= lower

class rotr (*argument, bigendian=False, blocksize=None)

Rotate the bits of each block right.

Expand source code Browse git

class rotr(BinaryOperation):
    """
    Rotate the bits of each block right.
    """
    def operate(self, value, shift):
        shift %= self.fbits
        return (value >> shift) | (value << (self.fbits - shift))

    def inplace(self, value, shift):
        shift %= self.fbits
        lower = value >> shift
        value <<= self.fbits - shift
        value |= lower

class rsa (key, swapkeys=False, textbook=False, padding=PAD.AUTO, rsautl=False)

Implements single block RSA encryption and decryption. This unit can be used to encrypt and decrypt blocks generated by openssl's rsautl tool when using the mode -verify. When it is executed with a public key for decryption or with a private key for encryption, it will perform a raw RSA operation. The result of these operations are (un)padded using EMSA-PKCS1-v1_5.

Expand source code Browse git

class rsa(Unit):
    """
    Implements single block RSA encryption and decryption. This unit can be used to encrypt
    and decrypt blocks generated by openssl's `rsautl` tool when using the mode `-verify`.
    When it is executed with a public key for decryption or with a private key for encryption,
    it will perform a raw RSA operation. The result of these operations are (un)padded using
    EMSA-PKCS1-v1_5.
    """
    def __init__(
        self,
        key: Arg(help='RSA key in PEM, DER, or Microsoft BLOB format.'),
        swapkeys: Arg.Switch('-s', help='Swap public and private exponent.') = False,
        textbook: Arg.Switch('-t', group='PAD', help='Equivalent to --padding=NONE.') = False,
        padding : Arg.Option('-p', group='PAD', choices=PAD,
            help='Choose one of the following padding modes: {choices}. The default is AUTO.') = PAD.AUTO,
        rsautl  : Arg.Switch('-r', group='PAD',
            help='Act as rsautl from OpenSSH; This is equivalent to --swapkeys --padding=PKCS10') = False,
    ):
        padding = Arg.AsOption(padding, PAD)
        if textbook:
            if padding != PAD.AUTO:
                raise ValueError('Conflicting padding options!')
            padding = padding.NONE
        if rsautl:
            if padding and padding != PAD.PKCS10:
                raise ValueError('Conflicting padding options!')
            swapkeys = True
            padding = PAD.PKCS10

        super().__init__(key=key, textbook=textbook, padding=padding, swapkeys=swapkeys)

        self._key_hash = None
        self._key_data = None

    @property
    def blocksize(self) -> int:
        return self.key.size_in_bytes()

    @property
    def _blocksize_plain(self) -> int:
        # PKCS#1 v1.5 padding is at least 11 bytes.
        return self.blocksize - 11

    @property
    def pub(self):
        return self.key.d if self.args.swapkeys else self.key.e

    @property
    def prv(self):
        return self.key.e if self.args.swapkeys else self.key.d

    def _get_msg(self, data):
        msg = int.from_bytes(data, byteorder='big')
        if msg > self.key.n:
            raise ValueError(F'This key can only handle messages of size {self.blocksize}.')
        return msg

    def _encrypt_raw(self, data):
        return pow(
            self._get_msg(data),
            self.pub,
            self.key.n
        ).to_bytes(self.blocksize, byteorder='big')

    def _decrypt_raw(self, data):
        return pow(
            self._get_msg(data),
            self.prv,
            self.key.n
        ).to_bytes(self.blocksize, byteorder='big')

    def _unpad(self, data, head, padbyte=None):
        if len(data) > self.blocksize:
            raise ValueError(F'This key can only handle messages of size {self.blocksize}.')
        if data.startswith(head):
            pos = data.find(B'\0', 2)
            if pos > 0:
                pad = data[2:pos]
                if padbyte is None or all(b == padbyte for b in pad):
                    return data[pos + 1:]
        raise ValueError('Incorrect padding')

    def _pad(self, data, head, padbyte=None):
        if len(data) > self._blocksize_plain:
            raise ValueError(F'This key can only encrypt messages of size at most {self._blocksize_plain}.')
        pad = self.blocksize - len(data) - len(head) - 1
        if padbyte is not None:
            padding = pad * bytes((padbyte,))
        else:
            padding = bytearray(1)
            while not all(padding):
                padding = bytearray(filter(None, padding))
                padding.extend(get_random_bytes(pad - len(padding)))
        return head + padding + B'\0' + data

    def _unpad_pkcs10(self, data):
        return self._unpad(data, B'\x00\x01', 0xFF)

    def _unpad_pkcs15(self, data):
        return self._unpad(data, B'\x00\x02', None)

    def _pad_pkcs10(self, data):
        return self._pad(data, B'\x00\x01', 0xFF)

    def _pad_pkcs15(self, data):
        return self._pad(data, B'\x00\x02', None)

    def _decrypt_block_OAEP(self, data):
        self.log_debug('Attempting decryption with PyCrypto PKCS1 OAEP.')
        return PKCS1_OAEP.new(self.key).decrypt(data)

    def _encrypt_block_OAEP(self, data):
        self.log_debug('Attempting encryption with PyCrypto PKCS1 OAEP.')
        return PKCS1_OAEP.new(self.key).encrypt(data)

    def _decrypt_block(self, data):
        if self._oaep and self._pads in {PAD.AUTO, PAD.OAEP}:
            try:
                return self._decrypt_block_OAEP(data)
            except ValueError as E:
                if self._pads:
                    raise
                self.log_debug(F'{E!s} No longer attempting OAEP.')
                self._oaep = False

        data = self._decrypt_raw(data)
        return self._unpad_per_argument(data)

    def _unpad_per_argument(self, data):
        if self._pads == PAD.NONE:
            return data
        elif self._pads == PAD.PKCS10:
            return self._unpad_pkcs10(data)
        elif self._pads == PAD.PKCS15:
            return self._unpad_pkcs15(data)
        elif self._pads == PAD.AUTO:
            with suppress(ValueError):
                data = self._unpad_pkcs10(data)
                self.log_info('Detected PKCS1.0 padding.')
                self._pads = PAD.PKCS10
                return data
            with suppress(ValueError):
                data = self._unpad_pkcs15(data)
                self.log_info('Detected PKCS1.5 padding.')
                self._pads = PAD.PKCS15
                return data
            raise RefineryPartialResult('No padding worked, returning raw decrypted blocks.', data)
        else:
            raise ValueError(F'Invalid padding value: {self._pads!r}')

    def _encrypt_block(self, data):
        if self._pads in {PAD.AUTO, PAD.OAEP}:
            try:
                return self._encrypt_block_OAEP(data)
            except ValueError:
                if self._pads: raise
                self.log_debug('PyCrypto primitives for OAEP failed, falling back to PKCS1.5.')
                self._pads = PAD.PKCS15

        if self._pads == PAD.PKCS15:
            data = self._pad_pkcs15(data)
        elif self._pads == PAD.PKCS10:
            data = self._pad_pkcs10(data)

        return self._encrypt_raw(data)

    @property
    def key(self) -> RSA.RsaKey:
        key_blob = self.args.key
        key_hash = hash(key_blob)
        if key_hash != self._key_hash:
            fmt, key_data = normalize_rsa_key(key_blob)
            self.log_info(F'successfully parsed RSA key as {fmt.value}')
            self._key_hash = key_hash
            self._key_data = key_data
        return self._key_data

    def process(self, data):
        self._oaep = True
        self._pads = self.args.padding
        if not self.key.has_private():
            try:
                return self._unpad_per_argument(self._encrypt_raw(data))
            except RefineryPartialResult:
                raise
            except Exception as E:
                raise ValueError(F'A public key was given for decryption and rsautl mode resulted in an error: {E}') from E
        return B''.join(self._decrypt_block(block) for block in splitchunks(data, self.blocksize))

    def reverse(self, data):
        self._pads = self.args.padding
        return B''.join(self._encrypt_block(block) for block in splitchunks(data, self._blocksize_plain))

class rsakey (public=False, output=RSAFormat.PEM)

Parse RSA keys in various formats; PEM, DER, Microsoft BLOB, and W3C-XKMS (XML) format are supported. The same formats are supported for the input format, but you can also specify a key in the following format, where both modulus and exponent have to be hex-encoded: [modulus]:[exponent]

Expand source code Browse git

class rsakey(Unit):
    """
    Parse RSA keys in various formats; PEM, DER, Microsoft BLOB, and W3C-XKMS (XML) format are supported.
    The same formats are supported for the input format, but you can also specify a key in the following
    format, where both modulus and exponent have to be hex-encoded: `[modulus]:[exponent]`
    """
    def __init__(
        self,
        public: Arg.Switch('-p', help='Force public key output even if the input is private.') = False,
        output: Arg.Option(help='Select an output format ({choices}), default is {default}.', choices=RSAFormat) = RSAFormat.PEM
    ):
        super().__init__(public=public, output=Arg.AsOption(output, RSAFormat))

    def _xkms_wrap(self, number: int):
        size, r = divmod(number.bit_length(), 8)
        size += int(bool(r))
        return base64.b64encode(number.to_bytes(size, 'big'))

    def process(self, data):
        from refinery.lib.mscrypto import TYPES, ALGORITHMS
        fmt, key = normalize_rsa_key(data, force_public=self.args.public)
        self.log_info(F'parsing input as {fmt.value} format')
        out = self.args.output
        if out is RSAFormat.PEM:
            yield key.export_key('PEM')
            return
        if out is RSAFormat.DER:
            yield key.export_key('DER')
            return
        if out is RSAFormat.BLOB:
            def le(v: int, s: int):
                return v.to_bytes(s, 'little')
            buffer = bytearray()
            buffer.append(TYPES.PRIVATEKEYBLOB if key.has_private() else TYPES.PUBLICKEYBLOB)
            buffer.extend(le(2, 3))
            buffer.extend(le(ALGORITHMS.CALG_RSA_KEYX, 4))
            buffer.extend(B'RSA2' if key.has_private() else B'RSA1')
            size = 2
            while size < key.n.bit_length():
                size <<= 1
            self.log_info(F'using bit size {size}')
            buffer.extend(le(size, 4))
            size //= 8
            buffer.extend(le(key.e, 4))
            buffer.extend(le(key.n, size))
            if key.has_private():
                exp_1 = key.d % (key.p - 1)
                exp_2 = key.d % (key.q - 1)
                coeff = pow(key.q, -1, key.p)
                half = size // 2
                buffer.extend(le(key.p, half))
                buffer.extend(le(key.q, half))
                buffer.extend(le(exp_1, half))
                buffer.extend(le(exp_2, half))
                buffer.extend(le(coeff, half))
                buffer.extend(le(key.d, size))
            yield buffer
            return
        components = {
            'Modulus' : key.n,
            'Exponent': key.e,
        }
        if key.has_private():
            decoded = DerSequence()
            decoded.decode(key.export_key('DER'))
            it = itertools.islice(decoded, 3, None)
            for v in ('D', 'P', 'Q', 'DP', 'DQ', 'InverseQ'):
                try:
                    components[v] = next(it)
                except StopIteration:
                    break
        if out is RSAFormat.XKMS:
            for tag in components:
                components[tag] = base64.b64encode(number.long_to_bytes(components[tag])).decode('ascii')
            tags = '\n'.join(F'\t<{tag}>{value}</{tag}>' for tag, value in components.items())
            yield F'<RSAKeyPair>\n{tags}\n</RSAKeyPair>'.encode(self.codec)
            return
        components['BitSize'] = key.n.bit_length()
        for tag, value in components.items():
            if value.bit_length() > 32:
                components[tag] = F'{value:X}'
        if out is RSAFormat.JSON:
            yield json.dumps(components, indent=4).encode(self.codec)
            return
        if out is RSAFormat.TEXT:
            table = list(flattened(components))
            for key, value in table:
                value = F'0x{value}' if isinstance(value, str) else str(value)
                value = '\n'.join(F'{L}' for L in textwrap.wrap(value, 80))
                yield F'-- {key + " ":-<77}\n{value!s}'.encode(self.codec)

class rtfc

Implements the RTF compression format. This compression algorithm is used, for example, to compress RTF data in Outlook messages.

Expand source code Browse git

class rtfc(Unit):
    """
    Implements the RTF compression format. This compression algorithm is used, for example, to
    compress RTF data in Outlook messages.
    """
    @Unit.Requires('compressed_rtf', 'formats', 'office', 'default', 'extended')
    def _rtfc():
        import compressed_rtf
        return compressed_rtf

    def process(self, data):
        return self._rtfc.decompress(data)

    def reverse(self, data):
        return self._rtfc.compress(data)

class run (*commandline, stream=False, noinput=False, errors=False, timeout=0.0)

Turns any command into a refinery unit. Data is processed by feeding it to the standard input of a process spawned from the given command line, and then reading the standard output of that process as the result of the operation. The main purpose of this unit is to allow using the syntax from refinery.lib.frame with other command line tools. By default, the unit streams the output from the executed command as individual outputs, but the buffer option can be set to buffer all output of a single execution. The format string expression {} or {0} can be used as one of the arguments passed to the external command to represent the incoming data. In this case, the data will not be sent to the standard input device of the new process.

Expand source code Browse git

class run(Unit):
    """
    Turns any command into a refinery unit. Data is processed by feeding it to the standard input
    of a process spawned from the given command line, and then reading the standard output of that
    process as the result of the operation. The main purpose of this unit is to allow using the
    syntax from `refinery.lib.frame` with other command line tools. By default, the unit streams
    the output from the executed command as individual outputs, but the `buffer` option can be set
    to buffer all output of a single execution. The format string expression `{}` or `{0}` can be
    used as one of the arguments passed to the external command to represent the incoming data. In
    this case, the data will not be sent to the standard input device of the new process.
    """

    _JOIN_TIME = 0.1

    def __init__(
        self, *commandline : Arg(nargs='...', type=str, metavar='(all remaining)', help=(
            'All remaining command line tokens form an arbitrary command line to be executed. Use'
            ' format string syntax to insert meta variables and incoming data chunks.')),
        stream : Arg.Switch('-s',
            help='Stream the command output rather than buffering it.') = False,
        noinput: Arg('-x', help='Do not send any input to the new process.') = False,
        errors : Arg('-m', help=(
            'Merge stdout and stderr. By default, the standard error stream of the coupled command'
            ' is forwarded to the logger, i.e. it is only visible if -v is also specified.'
        )) = False,
        timeout: Arg('-t', metavar='T', help=(
            'Optionally set an execution timeout as a floating point number in seconds.'
        )) = 0.0
    ):
        if not commandline:
            raise ValueError('you need to provide a command line.')
        super().__init__(
            commandline=commandline, errors=errors, noinput=noinput, stream=stream, timeout=timeout)

    def process(self, data):
        def shlexjoin():
            import shlex
            return ' '.join(shlex.quote(cmd) for cmd in commandline)

        meta = metavars(data)
        meta.ghost = True
        used = set()
        commandline = [
            meta.format(cmd, self.codec, [data], None, False, used=used)
            for cmd in self.args.commandline
        ]

        if self.args.noinput:
            self.log_info('sending no input to process stdin')
            data = None

        if not self.log_debug(commandline):
            self.log_info(shlexjoin)

        posix = 'posix' in sys.builtin_module_names
        process = Popen(commandline,
            stdin=PIPE, stdout=PIPE, stderr=PIPE, shell=False, close_fds=posix)

        if not self.args.stream and not self.args.timeout:
            out, err = process.communicate(data)
            for line in err.splitlines():
                self.log_info(line)
            yield out
            return

        from threading import Thread, Event
        from queue import Queue, Empty
        from time import process_time, sleep

        start = 0
        result = None

        qerr = Queue()
        qout = Queue()
        done = Event()

        def adapter(stream, queue: Queue, event: Event):
            while not event.is_set():
                out = stream.read1()
                if out: queue.put(out)
                else: break
            stream.close()

        recvout = Thread(target=adapter, args=(process.stdout, qout, done), daemon=True)
        recverr = Thread(target=adapter, args=(process.stderr, qerr, done), daemon=True)

        recvout.start()
        recverr.start()

        if data:
            process.stdin.write(data)
        process.stdin.close()
        start = process_time()

        if not self.args.stream or self.args.timeout:
            result = MemoryFile()

        def queue_read(q: Queue):
            try: return q.get_nowait()
            except Empty: return None

        errbuf = MemoryFile()

        while True:
            out = queue_read(qout)
            err = None

            if self.args.errors:
                out = out or queue_read(qerr)
            else:
                err = queue_read(qerr)

            if err and self.log_info():
                errbuf.write(err)
                errbuf.seek(0)
                lines = errbuf.readlines()
                errbuf.seek(0)
                errbuf.truncate()
                if lines:
                    if not (done.is_set() or lines[~0].endswith(B'\n')):
                        errbuf.write(lines.pop())
                    for line in lines:
                        msg = line.rstrip(B'\n')
                        if msg: self.log_info(msg)
            if out:
                if not self.args.stream or self.args.timeout:
                    result.write(out)
                if self.args.stream:
                    yield out

            if done.is_set():
                if recverr.is_alive():
                    self.log_warn('stderr receiver thread zombied')
                if recvout.is_alive():
                    self.log_warn('stdout receiver thread zombied')
                break
            elif not err and not out and process.poll() is not None:
                recverr.join(self._JOIN_TIME)
                recvout.join(self._JOIN_TIME)
                done.set()
            elif self.args.timeout:
                if process_time() - start > self.args.timeout:
                    self.log_info('terminating process after timeout expired')
                    done.set()
                    process.terminate()
                    for wait in range(4):
                        if process.poll() is not None:
                            break
                        sleep(self._JOIN_TIME)
                    else:
                        self.log_warn('process termination may have failed')
                    recverr.join(self._JOIN_TIME)
                    recvout.join(self._JOIN_TIME)
                    if not len(result):
                        result = RuntimeError('timeout reached, process had no output')
                    else:
                        result = RefineryPartialResult(
                            'timeout reached, returning all collected output',
                            partial=result.getvalue())

        if isinstance(result, Exception):
            raise result
        elif not self.args.stream:
            yield result.getvalue()

class salsa (key, stateful=False, discard=0, nonce=b'REFINERY', magic=b'', offset=0, rounds=20)

Salsa encryption and decryption. The nonce must be 8 bytes long. When 64 bytes are provided as the key, this data is interpreted as the initial state box and all other parameters are ignored.

Expand source code Browse git

class salsa(LatinCipherUnit):
    """
    Salsa encryption and decryption. The nonce must be 8 bytes long. When 64 bytes are provided
    as the key, this data is interpreted as the initial state box and all other parameters are
    ignored.
    """
    def keystream(self) -> Iterable[int]:
        key = self.args.key
        if len(key) == 64:
            it = SalsaCipher.FromState(key)
        else:
            it = SalsaCipher(
                key,
                self.args.nonce,
                self.args.magic,
                self.args.rounds,
                self.args.offset,
            )
        yield from it

class salsa20 (key, nonce=b'REFINERY')

Salsa20 encryption and decryption. This unit is functionally equivalent to salsa with 20 rounds, but it uses the PyCryptodome library C implementation rather than the pure Python implementation used by salsa.

Expand source code Browse git

class salsa20(LatinCipherStandardUnit, cipher=PyCryptoFactoryWrapper(Salsa20)):
    """
    Salsa20 encryption and decryption. This unit is functionally equivalent to `refinery.salsa`
    with 20 rounds, but it uses the PyCryptodome library C implementation rather than the pure
    Python implementation used by `refinery.salsa`.
    """
    pass

class scope (*slice, visible=True)

After using scope within in a refinery.lib.frame, all the following operations will be applied only to the selected indices. All remaining chunks still exist, they are just not operated on. When the frame closes or the frame is being rescoped by a second application of this unit, they become visible again.

Expand source code Browse git

class scope(FrameSlicer):
    """
    After using `refinery.scope` within in a `refinery.lib.frame`, all the
    following operations will be applied only to the selected indices. All
    remaining chunks still exist, they are just not operated on. When the
    frame closes or the frame is being rescoped by a second application of
    this unit, they become visible again.
    """
    def __init__(self, *slice, visible: Arg.Switch('-n', '--not', off=True, help=(
        'Hide the given chunks instead of making them the only ones visible.')) = True
    ):
        super().__init__(*slice, visible=visible)
        # Sort any slices with negative arguments to the back so we check
        # them last. This delays potential consumption of the chunks iterator
        # as much as possible.
        self.args.slice.sort(
            key=lambda s: (s.start or 0, s.stop or 0), reverse=True)

    def filter(self, chunks):
        it = iter(chunks)
        consumed = None
        size = None

        def buffered() -> Generator[Chunk, None, None]:
            yield from it
            while consumed:
                yield consumed.popleft()

        def shift(offset, default):
            nonlocal consumed, size
            if offset is None:
                return default
            if offset >= 0:
                return offset
            if consumed is None:
                from collections import deque
                self.log_info(F'consuming iterator to compute negative offset {offset}.')
                consumed = deque(it)
                size = len(consumed) + k + 1
            return max(0, offset + size)

        for k, chunk in enumerate(buffered()):
            for s in self.args.slice:
                if k in range(shift(s.start, 0), shift(s.stop, k + 1), s.step or 1):
                    chunk.visible = self.args.visible
                    break
            else:
                chunk.visible = not self.args.visible
            self.log_debug(chunk)
            yield chunk

class seal (key, discard=0, stateful=False)

SEAL encryption and decryption.

Expand source code Browse git

class seal(StreamCipherUnit):
    """
    SEAL encryption and decryption.
    """
    key_size = {20}

    def keystream(self) -> Iterable[bytes]:
        return SEAL_Cipher(self.args.key)

class secstr (key=b'\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10', iv=None)

Implements the AES-based encryption scheme used by the PowerShell commands ConvertFrom-SecureString and ConvertTo-SecureString.

Expand source code Browse git

class secstr(Unit):
    """
    Implements the AES-based encryption scheme used by the PowerShell commands
    `ConvertFrom-SecureString` and `ConvertTo-SecureString`.
    """

    # This is a magic header value used for PowerShell secure strings.
    _MAGIC = bytes((
        0xEF, 0xAE, 0x3D, 0xD9, 0xDD, 0x75, 0xD7, 0xAE, 0xF8, 0xDD, 0xFD, 0x38,
        0xDB, 0x7E, 0x35, 0xDD, 0xBD, 0x7A, 0xD3, 0x9D, 0x1A, 0xE7, 0x7E, 0x39))

    # Secure strings include a decimal number formatted as a string directly
    # following the header. Presumably, this is the PowerShell version.
    _PSVER = 2

    def __init__(
        self, key: Arg(
            help='Secure string encryption 16-byte AES key; the default are the bytes from 1 to 16.'
        ) = bytes(range(1, 17)),
        iv: Arg('-i', help='Optionally specify an IV to use for encryption.') = None
    ):
        super().__init__(key=key, iv=iv)

    @property
    def key(self):
        key = self.args.key
        if len(key) not in (0x10, 0x18, 0x20):
            raise ValueError('The encryption key has to be 16 bytes long.')
        return key

    @property
    def iv(self):
        iv = self.args.iv
        if iv is not None and len(iv) != 0x10:
            raise ValueError('The IV has to be 16 bytes long.')
        return iv

    def reverse(self, data):
        ivec = self.iv or urandom(0x10)
        if len(ivec) != 0x10:
            raise ValueError(self._IVERR)
        cipher = AES.new(self.key, AES.MODE_CBC, ivec)
        data = data.decode('latin-1').encode('utf-16LE')
        data = cipher.encrypt(pad(data, block_size=0x10))
        data = base64.b16encode(data).lower().decode('ascii')
        ivec = base64.b64encode(ivec).decode('ascii')
        data = '|'.join(('%d' % self._PSVER, ivec, data)).encode('utf-16LE')
        return base64.b64encode(self._MAGIC + data)

    def process(self, data):
        head, ivec, data = base64.b64decode(data).split(b'|\0')
        self.log_info('head:', head.hex())
        ivec = base64.b64decode(ivec.decode('utf-16LE'))
        self.log_info('ivec:', ivec.hex())
        data = base64.b16decode(data.decode('utf-16LE'), casefold=True)
        if len(data) % 0x10 != 0:
            self.log_info('data not block-aligned, padding with zeros')
            data += B'\0' * (0x10 - len(data) % 0x10)
        cipher = AES.new(self.key, AES.MODE_CBC, ivec)
        data = cipher.decrypt(data)
        try:
            data = unpad(data, block_size=0x10)
        except Exception:
            self.log_warn('decrypted data does not have PKCS7 padding')
        for p in range(0x10):
            try:
                return data[-p:].decode('utf-16LE').encode('latin-1')
            except UnicodeDecodeError:
                pass
            except UnicodeEncodeError:
                pass
        self.log_warn('result is not a padded unicode string, key is likely wrong')
        return data

class sep (separator=b'\n', scoped=False)

Multiple inputs are joined along a specified separator. If any of the input Chunks is currently out of scope, sep turns makes them visible by default. This can be prevented by using the -s flag.

Expand source code Browse git

class sep(Unit):
    """
    Multiple inputs are joined along a specified separator. If any of the input
    `refinery.lib.frame.Chunk`s is currently out of scope, `refinery.sep` turns
    makes them visible by default. This can be prevented by using the `-s` flag.
    """

    def __init__(
        self, separator: Arg(help='Separator; the default is a line break.') = B'\n',
        scoped: Arg.Switch('-s', help=(
            'Maintain chunk scope; i.e. do not turn all input chunks visible.')) = False
    ):
        super().__init__(separator=separator, scoped=scoped)
        self.separate = False

    def filter(self, chunks):
        it = iter(chunks)
        try:
            chunk = next(it)
        except StopIteration:
            return
        self.separate = True
        for upcoming in it:
            if not self.args.scoped:
                chunk.visible = True
            yield chunk
            chunk = upcoming
        self.separate = False
        yield chunk

    def process(self, data):
        yield data
        if self.separate:
            yield self.args.separator

class serpent (key, iv=b'', padding=None, mode=None, raw=False, swap=False)

Serpent encryption and decryption. Some Serpent implementations read the bytes of each block in one direction, some in the other. When decryption results with this unit do not yield the expected result, try using the --swap (or -s) option to swap the bytes in each block. Furthermore, it is sometimes necessary to swap the bytes of the input key, which can be done by prefixing the input key by the multibin handler snip[::-1].

Expand source code Browse git

class serpent(StandardBlockCipherUnit, cipher=BlockCipherFactory(Serpent)):
    """
    Serpent encryption and decryption. Some Serpent implementations read the bytes of each block
    in one direction, some in the other. When decryption results with this unit do not yield the
    expected result, try using the `--swap` (or `-s`) option to swap the bytes in each block.
    Furthermore, it is sometimes necessary to swap the bytes of the input key, which can be done
    by prefixing the input key by the multibin handler `snip[::-1]`.
    """
    def __init__(
        self, key, iv=b'', padding=None, mode=None, raw=False,
        swap: Arg.Switch('-s', help='Read the bytes in each block in reverse order.') = False
    ):
        super().__init__(key, iv=iv, padding=padding, mode=mode, raw=raw, swap=swap)

    def _new_cipher(self, **optionals) -> CipherInterface:
        instance: Serpent = super()._new_cipher()
        instance.swap = self.args.swap
        return instance

class sha1 (reps=1, text=False)

Returns the SHA1 hash of the input data.

class sha224 (reps=1, text=False)

Returns the SHA224 hash of the input data.

class sha256 (reps=1, text=False)

Returns the SHA256 hash of the input data.

class sha384 (reps=1, text=False)

Returns the SHA384 hash of the input data.

class sha3_224 (reps=1, text=False)

Returns the SHA3-224 hash of the input data.

class sha3_256 (reps=1, text=False)

Returns the SHA3-256 hash of the input data.

class sha3_384 (reps=1, text=False)

Returns the SHA3-384 hash of the input data.

class sha3_512 (reps=1, text=False)

Returns the SHA3-512 hash of the input data.

class sha512 (reps=1, text=False)

Returns the SHA512 hash of the input data.

class shl (*argument, bigendian=False, blocksize=None)

Shift the bits of each block left, filling with zero bits.

Expand source code Browse git

class shl(BinaryOperation):
    """
    Shift the bits of each block left, filling with zero bits.
    """
    @staticmethod
    def operate(a, b): return a << b
    @staticmethod
    def inplace(a, b): a <<= b

class shr (*argument, bigendian=False, blocksize=None)

Shift the bits of each block right, filling with zero bits.

Expand source code Browse git

class shr(BinaryOperation):
    """
    Shift the bits of each block right, filling with zero bits.
    """
    @staticmethod
    def operate(a, b): return a >> b
    @staticmethod
    def inplace(a, b): a >>= b

class sm4 (key, *, iv=b'', padding=None, mode=None, raw=False, little_endian=False, segment_size=0, tag=None, aad=None)

The SM4 symmetric blockcipher algorithm published as GB/T 32907-2016 by the State Cryptography Administration of China (SCA).

Expand source code Browse git

class sm4(StandardBlockCipherUnit, cipher=BlockCipherFactory(SM4)):
    """
    The SM4 symmetric blockcipher algorithm published as GB/T 32907-2016 by the State Cryptography
    Administration of China (SCA).
    """
    pass

class snip (slices=[slice(None, None, None)], length=False, stream=False, remove=False)

Snips the input data based on a Python slice expression. For example, the initialization slice 0::1 1::1 would yield a unit that first extracts every byte at an even position and then, every byte at an odd position. In this case, multiple outputs are produced. The unit can be used in reverse mode, in which case the specified ranges are deleted sequentially from the input.

Expand source code Browse git

class snip(Unit):
    """
    Snips the input data based on a Python slice expression. For example, the
    initialization `slice 0::1 1::1` would yield a unit that first extracts
    every byte at an even position and then, every byte at an odd position. In
    this case, multiple outputs are produced. The unit can be used in reverse
    mode, in which case the specified ranges are deleted sequentially from the
    input.
    """
    def __init__(
        self,
        slices: Arg(help='Specify start:stop:step in Python slice syntax.') = [slice(None, None)],
        length: Arg.Switch('-l', help=(
            'Interpret the end of a slice as a length rather than as an offset.')) = False,
        stream: Arg.Switch('-s', help=(
            'After each slice, consider only the data that follows after it for subsequent '
            'slicing. This mode is incompatible with negative step sizes.')) = False,
        remove: Arg.Switch('-r', help=(
            'Remove the slices from the input rather than selecting them.')) = False,
    ):
        super().__init__(slices=slices, length=length, stream=stream, remove=remove)

    def process(self, data: bytearray):
        slices: list[slice] = list(self.args.slices)
        opt_stream = self.args.stream
        opt_remove = self.args.remove
        opt_length = self.args.length
        view = memoryview(data)
        offset = 0

        if opt_stream and any(b.step or 1 < 0 for b in slices):
            raise RuntimeError('Streaming is incompatible with negative step slices.')

        for k, bounds in enumerate(slices):
            step = bounds.step or 1
            stop = bounds.stop
            start = bounds.start
            if opt_length:
                if stop is not None:
                    if start is None:
                        if step < 0:
                            stop += len(data)
                    else:
                        stop += start
            if opt_stream:
                start = start or 0
                stop = stop or len(data)
                start += offset
                stop += offset

            bounds = slice(start, stop, bounds.step)

            if not opt_remove:
                temp = view[bounds]
            else:
                temp = bytearray(data)
                del temp[bounds]
            yield temp

            if opt_stream:
                offset = stop

class sorted (key=None, ascending=False)

Sorts all elements of the input refinery.lib.frame lexicographically. This unit is a nop on single inputs.

Expand source code Browse git

class sorted(Unit):
    """
    Sorts all elements of the input `refinery.lib.frame` lexicographically.
    This unit is a `refinery.nop` on single inputs.
    """

    def __init__(
        self,
        key: Arg('key', type=str, help='A meta variable expression to sort by instead of sorting the content.') = None,
        ascending: Arg.Switch('-a', help='Sort in ascending order, the default is descending.') = False
    ):
        super().__init__(key=key, ascending=ascending)

    def filter(self, chunks):
        sortbuffer = []
        invisibles = []
        key = self.args.key
        rev = not self.args.ascending

        if key is not None:
            def _key(chunk):
                return expression(metavars(chunk)), chunk
            expression = PythonExpression(key, all_variables_allowed=True)
            key = _key

        def sorted():
            if not sortbuffer:
                return
            sortbuffer.sort(key=key, reverse=rev)
            yield from sortbuffer
            sortbuffer.clear()

        for chunk in chunks:
            if chunk.visible:
                yield from invisibles
                invisibles.clear()
                sortbuffer.append(chunk)
            else:
                yield from sorted()
                invisibles.append(chunk)

        yield from invisibles
        yield from sorted()

class sosemanuk (key, stateful=False, discard=0, nonce=b'')

Expand source code Browse git

class sosemanuk(StreamCipherUnit):

    def __init__(
        self, key, stateful=False, discard=0,
        nonce: Arg(help='The nonce. Default is empty, which is equivalent to 16 null bytes.') = B'',
    ):
        super().__init__(key=key, nonce=nonce, stateful=stateful, discard=discard)

    def keystream(self):
        yield from Sosemanuk(self.args.key, self.args.nonce)

class speck (key, iv=b'', padding=None, mode=None, raw=False, block_size=16, *, aad=None, tag=None, segment_size=0, little_endian=False)

SPECK encryption and decryption. It supports block sizes of 8 and 16 bytes.

Expand source code Browse git

class speck(StandardBlockCipherUnit, cipher=BlockCipherFactory(Speck)):
    """
    SPECK encryption and decryption. It supports block sizes of 8 and 16 bytes.
    """
    def __init__(
        self, key, iv=b'', padding=None, mode=None, raw=False,
        block_size: Arg.Number('-b', help='Cipher block size, default is {default}. Valid choices are 8 and 16.') = 16,
        **more
    ):
        return super().__init__(key, iv=iv, padding=padding, mode=mode, raw=raw, block_size=block_size, **more)

    @property
    def block_size(self):
        return self.args.block_size

    def _new_cipher(self, **optionals) -> CipherInterface:
        return super()._new_cipher(block_size=self.args.block_size, **optionals)

class stego (split=False, parts='RGB')

Decodes the RGBA (red/green/blue/alpha) values of the pixels of a given image file and outputs these values as bytes. By default, the pixels are converted left to right, top to bottom. When the input image is grayscale, the color channels are ignored. Colored images are converted to RGBA mode.

Expand source code Browse git

class stego(Unit):
    """
    Decodes the RGBA (red/green/blue/alpha) values of the pixels of a given image file and outputs
    these values as bytes. By default, the pixels are converted left to right, top to bottom. When
    the input image is grayscale, the color channels are ignored. Colored images are converted to
    RGBA mode.
    """
    def __init__(
        self,
        split: Arg.Switch('-m', help='Emit the individual rows or columns as separate outputs.') = False,
        parts: Arg('parts', nargs='?', type=str, help=(
            'A string containing any ordering of the letters R, G, B, and A (case-insensitive). '
            'These pixel components will be extracted from every pixel in the given order. The '
            'default value is {default}.'
        )) = 'RGB'
    ):
        super().__init__(
            split=split,
            parts=tuple(Arg.AsOption(p, PIXEL_PART) for p in parts)
        )

    @Unit.Requires('Pillow', 'formats')
    def _image():
        from PIL import Image
        return Image

    def process(self, data):
        split = self.args.split
        parts = self.args.parts
        image = self._image.open(MemoryFile(data, read_as_bytes=True))

        grayscale = image.mode.startswith('L')
        bw_bitmap = image.mode.startswith('1')
        no_colors = grayscale or bw_bitmap

        if not no_colors:
            image = image.convert('RGBA')

        width, height = image.size
        chunk_size = 1 if no_colors else len(parts)
        output = MemoryFile()
        buffer = bytearray(chunk_size * width)
        pixels = iter(image.getdata())

        for _ in range(height):
            offset = 0
            for _ in range(width):
                pixel = next(pixels)
                next_offset = offset + chunk_size
                if no_colors:
                    buffer[offset] = pixel
                else:
                    buffer[offset:next_offset] = (pixel[p] for p in parts)
                offset = next_offset
            if split:
                yield buffer
            else:
                output.write(buffer)
        if not split:
            yield output.getvalue()

class stretch (*count)

Stretch the input data by repeating every byte a number of times.

Expand source code Browse git

class stretch(Unit):
    """
    Stretch the input data by repeating every byte a number of times.
    """
    def __init__(self, *count: Arg.Number(metavar='count', help=(
        'The number of times every byte should be repeated. By default,  '
        'every byte is repeated once.'
    ))):
        count = count or (2,)
        if any(k <= 0 for k in count):
            raise ValueError('You can not use a stretching factor of less than 1.')
        super().__init__(count=count or (2,))

    def process(self, data):
        def stretched(it):
            factor = cycle(self.args.count)
            for byte in it:
                yield from repeat(byte, next(factor))
        return bytearray(stretched(iter(data)))

    def reverse(self, data):
        # one-sided inverse
        def clinched(it):
            factor = cycle(self.args.count)
            while True:
                try:
                    take = islice(it, next(factor))
                    yield next(take)
                    for _ in take: pass
                except StopIteration:
                    break
        return bytearray(clinched(iter(data)))

class struct (spec, *outputs, multi=False, count=∞, until=None, format=None, name=None, more=False)

Read structured data from the beginning of a chunk and store the extracted fields in chunk meta variables. The structure format is specified in extended Python struct format, and all remaining arguments to this unit are the names of the variables that receive the values from this struct. The extended struct format supports all field types supported by Python, as well as the following:

a for null-terminated ASCII strings,
u to read encoded, null-terminated UTF16 strings,
w to read decoded, null-terminated UTF16 strings,
g to read Microsoft GUID values,
E to read 7-bit encoded integers.

For example, the string LLxxHaa will read two unsigned 32bit integers, then skip two bytes, then read one unsigned 16bit integer, then two null-terminated ASCII strings. The unit defaults to using native byte order with no alignment. The spec parameter may additionally contain format expressions of the following form:

{name[!alignment]:format}

The alignment parameter is optional. It must be an expression that evaluates to an integer value. The current data pointer is aligned to a multiple of this value before reading the field. The format can either be an integer expression specifying a number of bytes to read, or any format string. If name is specified for an extracted field, its value is made available as a meta variable under the given name. For example, the expression LLxxH{foo:a}{bar:a} would be parsed in the same way as the previous example, but the two ASCII strings would also be stored in meta variables under the names foo and bar, respectively. The format string of a named field is itself parsed as a foramt string expression, where all the previously parsed fields are already available. For example, I{:{}} reads a single 32-bit integer length prefix and then reads as many bytes as that prefix specifies.

A second format string expression is used to specify the output format. For example, the format string LLxxH{foo:a}{bar:a} together with the output format {foo}/{bar} would parse data as before, but the output body would be the concatnation of the field foo, a forward slash, and the field bar. Variables used in the output expression are not included as meta variables. As format fields in the output expression, one can also use {1}, {2} or {-1} to access extracted fields by index. The value {0} represents the entire chunk of structured data. By default, the output format {#} is used, which represents either the last byte string field that was extracted, or the entire chunk of structured data if none of the fields were extracted.

Reverse multibin() expressions can be used to post-process the fields included in any output format. For example, {F:b64:zl} will be the base64-decoded and inflate- decompressed contents of the data that was read as field F.

Finally, it is possible to specify a byte alignment by using the syntax {field!T:a:b:c} where the letter T is either a single digit specifying the alignment, or a single letter variable that holds the byte alignment value in the current metadata. It is also possible to specify the alignment as 0 which instructs the parser to only peek the contents of this field, i.e. the read pointer is not advanced after reading it.

Expand source code Browse git

class struct(Unit):
    """
    Read structured data from the beginning of a chunk and store the extracted fields in chunk meta
    variables. The structure format is specified in extended Python struct format, and all
    remaining arguments to this unit are the names of the variables that receive the values from
    this struct. The extended struct format supports all field types supported by Python, as well
    as the following:

    - `a` for null-terminated ASCII strings,
    - `u` to read encoded, null-terminated UTF16 strings,
    - `w` to read decoded, null-terminated UTF16 strings,
    - `g` to read Microsoft GUID values,
    - `E` to read 7-bit encoded integers.

    For example, the string `LLxxHaa` will read two unsigned 32bit integers, then skip two bytes,
    then read one unsigned 16bit integer, then two null-terminated ASCII strings. The unit defaults
    to using native byte order with no alignment. The `spec` parameter may additionally contain
    format expressions of the following form:

        {name[!alignment]:format}

    The `alignment` parameter is optional. It must be an expression that evaluates to an integer
    value. The current data pointer is aligned to a multiple of this value before reading the field.
    The `format` can either be an integer expression specifying a number of bytes to read, or any
    format string. If `name` is specified for an extracted field, its value is made available as a
    meta variable under the given name. For example, the expression `LLxxH{foo:a}{bar:a}` would be
    parsed in the same way as the previous example, but the two ASCII strings would also be stored
    in meta variables under the names `foo` and `bar`, respectively. The `format` string of a named
    field is itself parsed as a foramt string expression, where all the previously parsed fields
    are already available. For example, `I{:{}}` reads a single 32-bit integer length prefix and
    then reads as many bytes as that prefix specifies.

    A second format string expression is used to specify the output format. For example, the format
    string `LLxxH{foo:a}{bar:a}` together with the output format `{foo}/{bar}` would parse data as
    before, but the output body would be the concatnation of the field `foo`, a forward slash, and
    the field `bar`. Variables used in the output expression are not included as meta variables. As
    format fields in the output expression, one can also use `{1}`, `{2}` or `{-1}` to access
    extracted fields by index. The value `{0}` represents the entire chunk of structured data. By
    default, the output format `{%s}` is used, which represents either the last byte string field
    that was extracted, or the entire chunk of structured data if none of the fields were extracted.

    Reverse `refinery.lib.argformats.multibin` expressions can be used to post-process the fields
    included in any output format. For example, `{F:b64:zl}` will be the base64-decoded and inflate-
    decompressed contents of the data that was read as field `F`.

    Finally, it is possible to specify a byte alignment by using the syntax `{field!T:a:b:c}` where
    the letter `T` is either a single digit specifying the alignment, or a single letter variable
    that holds the byte alignment value in the current metadata. It is also possible to specify the
    alignment as `0` which instructs the parser to only peek the contents of this field, i.e. the
    read pointer is not advanced after reading it.
    """

    def __init__(
        self,
        spec: Arg(type=str, help='Structure format as explained above.'),
        *outputs: Arg(metavar='output', type=str, help='Output format as explained above.'),
        multi: Arg.Switch('-m', help=(
            'Read as many pieces of structured data as possible intead of just one.')) = False,
        count: Arg.Number('-c', help=(
            'A limit on the number of chunks to read in multi mode; default is {default}.')) = INF,
        until: Arg('-u', metavar='E', type=str, help=(
            'An expression evaluated on each chunk in multi mode. New chunks will be parsed '
            'only if the result is nonzero.')) = None,
        format: Arg.String('-f', metavar='F', help=(
            'Optionally specify a format string expression to auto-name extracted fields without a '
            'given name. The format string accepts the field {{c}} for the type code and {{n}} for '
            'the variable index.')) = None,
        name: Arg.String('-n', metavar='VAR', group='FIELDS', help=(
            U'Equivalent to --format=VAR{{n}}.')) = None,
        more: Arg.Switch('-M', help=(
            'After parsing the struct, emit one chunk that contains the data that was left '
            'over in the buffer. If no data was left over, this chunk will be empty.')) = False
    ):
        if name:
            format = format or F'{name}{{n}}'
        outputs = outputs or [F'{{{_REST_MARKER}}}']
        super().__init__(spec=spec, outputs=outputs, until=until, format=format, count=count, multi=multi, more=more)

    def process(self, data: Chunk):
        formatter = string.Formatter()
        field_format: str | None = self.args.format
        until = self.args.until
        until = until and PythonExpression(until, all_variables_allowed=True)
        reader = StructReader(memoryview(data))
        checkpoint = 0
        mainspec = self.args.spec
        byteorder = mainspec[:1]
        if byteorder in '<@=!>':
            mainspec = mainspec[1:]
        else:
            byteorder = '='

        def fixorder(spec):
            if spec[0] not in '<@=!>':
                spec = byteorder + spec
            return spec

        previously_existing_variables = set(metavars(data).variable_names())

        it = itertools.count() if self.args.multi else (0,)
        for index in it:

            field_counter = 0
            checkpoint = reader.tell()

            if reader.eof:
                break
            if index >= self.args.count:
                break

            meta = metavars(data)
            meta.ghost = True
            meta.index = index

            args = []
            last = None
            self.log_debug(F'starting new read at: 0x{checkpoint:08X}')

            try:
                for prefix, name, spec, conversion in formatter.parse(mainspec):
                    name: str
                    spec: str = spec and spec.strip()
                    if prefix:
                        fields = reader.read_struct(fixorder(prefix))
                        if field_format is not None:
                            codes = re.findall('[?cbBhHiIlLqQnNefdspPauwgk]', prefix)
                            if len(codes) != len(fields):
                                codes = 'v' * len(fields)
                            for code, field in zip(codes, fields):
                                code = 'b' if code == '?' else code.lower()
                                v = field_format.format_map({'c': code, 'n': field_counter})
                                meta[v] = field
                                field_counter += 1
                        args.extend(fields)

                    if name is None:
                        continue

                    field_counter += 1

                    if name and not name.isdecimal():
                        check_variable_name(name)

                    if not conversion:
                        peek = False
                    else:
                        alignment = PythonExpression.Evaluate(conversion, meta)
                        if alignment == 0:
                            peek = True
                        else:
                            _aa = reader.tell()
                            reader.byte_align(alignment)
                            _ab = reader.tell()
                            if _aa != _ab:
                                self.log_info(F'aligned from 0x{_aa:X} to 0x{_ab:X}')

                    spec, _, pipeline = spec.partition(':')

                    if spec:
                        spec = meta.format_str(spec, self.codec, args)

                    if spec:
                        try:
                            _exp = PythonExpression.Evaluate(spec, meta)
                        except ParserError:
                            pass
                        else:
                            spec = _exp

                    if spec == '':
                        last = value = reader.read(peek=peek)
                    elif isinstance(spec, int):
                        if spec < 0:
                            spec += reader.remaining_bytes
                        if spec < 0:
                            raise ValueError(F'The specified negative read offset is {-spec} beyond the cursor.')
                        last = value = reader.read_bytes(spec, peek=peek)
                    else:
                        value = reader.read_struct(fixorder(spec), peek=peek)
                        if not value:
                            self.log_debug(F'field {name} was empty, ignoring.')
                            continue
                        if len(value) > 1:
                            self.log_info(F'parsing field {name} produced {len(value)} items reading a tuple')
                        else:
                            value = value[0]

                    if pipeline:
                        value = numseq(pipeline, reverse=True, seed=value)
                    args.append(value)

                    if name == _REST_MARKER:
                        raise ValueError(F'Extracting a field with name {_REST_MARKER} is forbidden.')
                    elif name.isdecimal():
                        index = int(name)
                        limit = len(args) - 1
                        if index > limit:
                            self.log_warn(F'cannot assign index field {name}, the highest index is {limit}')
                        else:
                            args[index] = value
                        continue
                    elif name:
                        meta[name] = value

                if until and until(meta):
                    self.log_info(F'the expression ({until}) evaluated to true; aborting.')
                    break

                with StreamDetour(reader, checkpoint) as detour:
                    full = reader.read(detour.cursor - checkpoint)
                if last is None:
                    last = full

                outputs = []
                symbols = dict(meta)
                symbols[_REST_MARKER] = last

                for template in self.args.outputs:
                    used = set()
                    outputs.append(meta.format(template, self.codec, [full, *args], symbols, True, used=used))
                    for key in used:
                        if key in previously_existing_variables:
                            continue
                        meta.discard(key)

                for output in outputs:
                    chunk = Chunk(output)
                    chunk.meta.update(meta)
                    chunk.set_next_batch(index)
                    yield chunk

            except EOFError:
                break

        leftover = len(reader) - checkpoint

        if not leftover:
            return
        elif self.args.more:
            reader.seekset(checkpoint)
            yield reader.read()
        else:
            leftover = repr(SizeInt(leftover)).strip()
            self.log_info(F'discarding {leftover} left in buffer')

class sub (*argument, bigendian=False, blocksize=None)

Subtract the given argument from each block.

Expand source code Browse git

class sub(BinaryOperationWithAutoBlockAdjustment):
    """
    Subtract the given argument from each block.
    """
    @staticmethod
    def operate(a, b): return a - b
    @staticmethod
    def inplace(a, b): a -= b

class subfiles (memdump=False, recursive=False)

Deploys carvers for ZIP, 7-Zip, PE-File, Windows Shortcuts (LNK files), JSON and XML documents against the input data and generates one output chunk for each successfully carved subfile.

Expand source code Browse git

class subfiles(Unit):
    """
    Deploys carvers for ZIP, 7-Zip, PE-File, Windows Shortcuts (LNK files), JSON and XML documents against
    the input data and generates one output chunk for each successfully carved subfile.
    """

    _MINLENGTH = {
        'json': 300,
        'xml' : 300,
        'rtf' : 100,
    }

    def __init__(
        self,
        memdump  : Unit.Arg.Switch('-m',
            help='Assume that the input is a memdump for PE file carving.') = False,
        recursive: Unit.Arg.Switch('-r',
            help='Extract files that are subfiles of other extracted files as separate chunks.') = False,
    ):
        super().__init__(memdump=memdump, recursive=recursive)

    def process(self, data: bytearray):
        carvers = {
            'zip'  : carve_zip(),
            '7z'   : carve_7z(),
            'pe'   : carve_pe(memdump=self.args.memdump, fileinfo=True, recursive=True, keep_root=True),
            'lnk'  : carve_lnk(),
            'json' : carve_json(dictonly=True),
            'xml'  : carve_xml(),
            'rtf'  : carve_rtf(),
        }

        covered = []

        for extension, unit in carvers.items():
            self.log_info(F'carving {extension} files')
            for chunk in data | unit:
                if len(chunk) < self._MINLENGTH.get(extension, 1):
                    continue
                start = chunk['offset']
                end = start + len(chunk)
                if any(start > left and end < right for left, right in covered):
                    continue
                if not self.args.recursive:
                    covered.append((start, end))
                yield chunk

class swap (src, dst=None)

Swap the contents of an existing variable with the contents of the chunk or with another meta variable. When swapping with the chunk, the variable has to contain a binary string. When swapping with a variable that does not exist, the original variable is cleared, essentially renaming the variable.

Expand source code Browse git

class swap(Unit):
    """
    Swap the contents of an existing variable with the contents of the chunk or with another meta variable.
    When swapping with the chunk, the variable has to contain a binary string. When swapping with a variable
    that does not exist, the original variable is cleared, essentially renaming the variable.
    """
    def __init__(
        self,
        src: Arg(type=str, help='The meta variable name.'),
        dst: Arg(type=str, help='Optional name of the second meta variable.') = None
    ):
        super().__init__(
            src=check_variable_name(src),
            dst=check_variable_name(dst)
        )

    def filter(self, chunks: Iterable[Chunk]):
        src = self.args.src
        dst = self.args.dst
        for chunk in chunks:
            if not chunk.visible:
                pass
            elif dst is None:
                try:
                    value = chunk.meta[src]
                except KeyError:
                    value = bytearray()
                if isinstance(value, str):
                    value = value.encode(self.codec)
                elif not isbuffer(value):
                    raise ValueError(F'Unable to swap data with variable {src} because it has type {type(value).__name__}.')
                if not chunk:
                    chunk.meta.discard(src)
                else:
                    chunk.meta[src] = bytes(chunk)
                chunk[:] = value
            else:
                try:
                    value = chunk.meta.pop(src)
                except KeyError:
                    raise KeyError(F'The variable {src} does not exist.')
                try:
                    swap = chunk.meta.pop(dst)
                except KeyError:
                    chunk.meta[dst] = value
                else:
                    chunk.meta[src], chunk.meta[dst] = swap, value
            yield chunk

class szdd

Extract files from SZDD archives.

Expand source code Browse git

class szdd(Unit):
    """
    Extract files from SZDD archives.
    """
    def process(self, data):
        with StructReader(data) as archive:
            if archive.read(8) != b'SZDD\x88\xF0\x27\x33':
                if not self.args.lenient:
                    raise ValueError('signature missing')
                self.log_warn('the header signature is invalid, this is likely not an SZDD archive')
            if archive.read_byte() != 0x41:
                raise ValueError('Unsupported compression mode')
            # ignore the missing file extension letter:
            archive.seekrel(1)
            output_len = archive.u32()
            window_pos = 0x1000 - 0x10
            output_pos = 0
            output = bytearray(output_len)
            window = bytearray(0x1000)
            for k in range(len(window)):
                window[k] = 0x20
            while not archive.eof:
                control = archive.read_byte()
                for cb in (0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80):
                    if archive.eof:
                        break
                    if control & cb:
                        output[output_pos] = window[window_pos] = archive.read_byte()
                        output_pos += 1
                        window_pos += 1
                        window_pos &= 0xFFF
                    else:
                        match_pos = archive.read_byte()
                        match_len = archive.read_byte()
                        match_pos |= (match_len & 0xF0) << 4
                        match_len = (match_len & 0x0F) + 3
                        match_pos &= 0xFFF
                        for _ in range(match_len):
                            window[window_pos] = window[match_pos]
                            output[output_pos] = window[window_pos]
                            output_pos += 1
                            window_pos += 1
                            match_pos += 1
                            window_pos &= 0xFFF
                            match_pos &= 0xFFF
            return output

    @classmethod
    def handles(self, data: bytearray):
        return data[:4] == B'SZDD'

class tea (key, iv=b'', padding=None, mode=None, raw=False, swap=False, rounds=32)

TEA encryption and decryption.

Expand source code Browse git

class tea(TEAUnit, cipher=BlockCipherFactory(TEA)):
    """
    TEA encryption and decryption.
    """

class termfit (width=0, delta=0, tight=False)

Reformat incoming text data to fit a certain width.

Expand source code Browse git

class termfit(Unit):
    """
    Reformat incoming text data to fit a certain width.
    """

    def __init__(
        self,
        width: Arg('width', help='Optionally specify the width, by default the current terminal width is used.') = 0,
        delta: Arg.Number('-d', help='Subtract this number from the calculated width (0 by default).') = 0,
        tight: Arg.Switch('-t', help='Separate paragraphs by a single line break instead of two.') = False,
    ):
        super().__init__(width=width, delta=delta, tight=tight)

    @unicoded
    def process(self, data: str) -> str:
        parsep = '\n' if self.args.tight else '\n\n'
        return terminalfit(data, self.args.delta, self.args.width, parsep)

class terminate (sentinel=b'\x00', blocksize=None, bigendian=False)

The unit reads data from the incoming chunk in blocks of any given size until the sentinel value is encountered. The output of the unit is all data that was read, excluding the sentinel. The default block size is one and the default sentinel value is zero, which corresponds to reading a null-terminated string from the input. If the sentinel value is not found anywhere in the incoming data, the complete input is returned as output.

Expand source code Browse git

class terminate(BlockTransformationBase):
    """
    The unit reads data from the incoming chunk in blocks of any given size until the
    sentinel value is encountered. The output of the unit is all data that was read,
    excluding the sentinel. The default block size is one and the default sentinel value
    is zero, which corresponds to reading a null-terminated string from the input.
    If the sentinel value is not found anywhere in the incoming data, the complete input
    is returned as output.
    """
    def __init__(
        self,
        sentinel: Arg(help='sentinel value to look for; default is {default}') = B'\0',
        blocksize=None, bigendian=False
    ):
        super().__init__(blocksize=blocksize, bigendian=bigendian, sentinel=sentinel)

    def process(self, data: bytearray):
        sentinel = self.args.sentinel
        position = 0
        blocksize = self.blocksize

        self.log_info('blocksize:', blocksize)
        self.log_debug('separator:', sentinel)

        while position >= 0:
            position = data.find(sentinel, position)
            if position < 0:
                self.log_info(F'The sentinel value {sentinel} was not found.')
                break
            q, r = divmod(position, blocksize)
            if r:
                position = (q + 1) * blocksize
                continue
            else:
                data[position:] = []
                break

        return data

    def reverse(self, data: bytearray):
        sentinel = self.args.sentinel
        position = 0
        while True:
            position = data.find(sentinel, position)
            if position < 0:
                data.extend(sentinel)
                break
            if position % self.blocksize == 0:
                self.log_warn('input string already contains the termination character; returning unmodified input')
                break
            position += 1
        return data

class transpose (padding=b'')

Interprets the chunks in the current frame as rows of a matrix and yields the columns of that matrix. When chunks are not of even length, the matrix is considered to have empty entries in some positions. Optionally, a padding sequence can be provided to pad all rows to the same length.

Expand source code Browse git

class transpose(Unit):
    """
    Interprets the chunks in the current frame as rows of a matrix and yields the columns
    of that matrix. When chunks are not of even length, the matrix is considered to have
    empty entries in some positions. Optionally, a padding sequence can be provided to pad
    all rows to the same length.
    """
    @Unit.Requires('numpy', 'speed', 'default', 'extended')
    def _numpy():
        import numpy
        return numpy

    def __init__(
        self,
        padding: Arg(help='Optional byte sequence to use as padding for incomplete rows.') = B'',
    ):
        super().__init__(bigendian=False, padding=padding)

    def filter(self, chunks: Iterable[Chunk]):
        rows = []
        for chunk in chunks:
            if not chunk.visible:
                yield chunk
                continue
            rows.append(chunk)
        if not rows:
            return
        matrix = rows[0]
        matrix.temp = rows
        yield matrix

    def process(self, data: Chunk):
        chunks: List[Chunk] = data.temp
        if not chunks:
            return
        length = [len(chunk) for chunk in chunks]
        n = min(length)
        m = max(length)
        pad = self.args.padding
        if pad:
            for chunk in chunks:
                while len(chunk) < m:
                    chunk.extend(pad)
                del chunk[m:]
        if n > 0:
            try:
                np = self._numpy
            except ImportError:
                pass
            else:
                t = [chunk[n:] for chunk in chunks if len(chunk) > n]
                for chunk in chunks:
                    del chunk[n:]
                a = np.array(chunks, dtype=np.uint8).transpose()
                for row in a:
                    yield row.tobytes('C')
                m = m - n
                chunks = t
        for i in range(m):
            yield bytes(chunk[i] for chunk in chunks if len(chunk) > i)

class trim (*junk, unpad=False, left=True, right=True, nocase=False)

Removes byte sequences at beginning and end of input data.

Expand source code Browse git

class trim(Unit):
    """
    Removes byte sequences at beginning and end of input data.
    """

    def __init__(
        self, *junk: Arg(help='Binary strings to be removed, default are all whitespace characters.'),
        unpad: Arg.Switch('-u', help='Also trim partial occurrences of the junk string.') = False,
        left: Arg.Switch('-r', '--right-only', group='SIDE', help='Do not trim left.') = True,
        right: Arg.Switch('-l', '--left-only', group='SIDE', help='Do not trim right.') = True,
        nocase: Arg.Switch('-i', help='Ignore capitalization for alphabetic characters.') = False,
    ):
        super().__init__(junk=junk, left=left, right=right, unpad=unpad, nocase=nocase)

    def _trimfast(self, view: memoryview, *junks: bytes, right=False) -> Tuple[bool, memoryview]:
        done = False
        pos = 0
        while not done:
            done = True
            for junk in junks:
                temp = junk
                size = len(junk)
                if right and self.args.unpad:
                    for k in range(size):
                        n = size - k
                        if view[pos:pos + n] == junk[k:]:
                            pos += n
                            done = False
                            break
                if view[pos:pos + size] == temp:
                    m = len(temp)
                    while True:
                        mm = m << 1
                        if view[pos + m:pos + mm] != temp:
                            break
                        temp += temp
                        m = mm
                    temp = memoryview(temp)
                    while m >= size:
                        if view[pos:pos + m] == temp[:m]:
                            done = False
                            pos += m
                        m //= 2
                if right or not self.args.unpad:
                    continue
                while size > 0:
                    if view[pos:pos + size] == temp[:size]:
                        done = False
                        pos += size
                        break
                    size -= 1
        return pos

    def process(self, data: bytearray):
        junk = list(self.args.junk)
        if not junk:
            import string
            space = string.whitespace.encode('ascii')
            junk = [space[k - 1:k] for k in range(1, len(space))]
        lpos = 0
        rpos = 0
        if self.args.nocase:
            work = data.lower()
            junk = [j.lower() for j in junk]
        else:
            work = data
        if self.args.left:
            lpos = self._trimfast(memoryview(work), *junk)
        if self.args.right:
            work.reverse()
            junk = [bytes(reversed(j)) for j in junk]
            rpos = self._trimfast(memoryview(work), *junk, right=True)
            work.reverse()
        view = memoryview(data)
        if lpos:
            view = view[+lpos:]
        if rpos:
            view = view[:-rpos]
        return view

class u16

Encodes and decodes UTF-16 encoded string data.

Expand source code Browse git

class u16(Unit):
    """
    Encodes and decodes UTF-16 encoded string data.
    """

    def reverse(self, data):
        return data.decode(self.codec).encode('utf-16LE')

    def process(self, data):
        return data.decode('utf-16').encode(self.codec)

    @classmethod
    def handles(self, data: bytearray):
        view = memoryview(data)
        if len(view) % 2 != 0:
            return False
        if not any(view[1:0x100:2]):
            return True
        if not any(view[0:0x100:2]):
            return any(view[:4])

class ucrypt (size=13, salt=b'AA')

Implements the classic Unix crypt algorithm.

Expand source code Browse git

class ucrypt(KeyDerivation):
    """
    Implements the classic Unix crypt algorithm.
    """
    def __init__(
        self,
        size: Arg(help='The number of bytes to generate, default is 13.') = 13,
        salt: Arg(help='Salt for the derivation, the default is "AA".') = B'AA'
    ):
        super().__init__(size=size, salt=salt)

    def process(self, data):
        crypted = bytes(UnixCrypt(data, salt=self.args.salt))
        if len(crypted) < self.args.size:
            raise RefineryPartialResult(
                F'unix crypt only provided {len(crypted)} bytes, but {self.args.size} '
                F'were requested.', partial=crypted
            )
        return crypted[:self.args.size]

class url (plus=False, hex=False)

Decodes and encodes URL-encoding, which preserves only alphanumeric characters and the following symbols: _, ., -, ~, \, /. Every other character is escaped by hex-encoding it and prefixing it with a percent symbol.

Expand source code Browse git

class url(Unit):
    """
    Decodes and encodes URL-encoding, which preserves only alphanumeric characters and the
    following symbols: `_`, `.`, `-`, `~`, `\\`, `/`. Every other character is escaped by
    hex-encoding it and prefixing it with a percent symbol.
    """

    def __init__(
        self,
        plus: Arg.Switch('-p', help='also replace plus signs by spaces') = False,
        hex : Arg.Switch('-x', help='hex encode every character in reverse mode') = False
    ):
        super().__init__(plus=plus, hex=hex)

    def process(self, data):
        if self.args.plus:
            data = data.replace(B'+', B' ')
        data = unquote_to_bytes(bytes(data))
        data = re.sub(
            B'%[uU]([0-9a-fA-F]{4})',
            lambda m: int(m[1], 16).to_bytes(2, 'little'),
            data)
        return data

    def reverse(self, data):
        if self.args.hex:
            result = bytearray(len(data) * 3)
            offset = 0
            for byte in data:
                result[offset + 0] = 0x25
                offset += 1
                result[offset:offset + 2] = B'%02X' % byte
                offset += 2
            return result
        elif self.args.plus:
            def replace(m):
                c = m[0][0]
                return b'+' if c == 0x20 else B'%%%02X' % c
        else:
            def replace(m):
                return B'%%%02X' % m[0][0]
        return re.sub(B'[^a-zA-Z0-9_.-~\\/]', replace, data)

class urlfix (meta=False, keep=0)

Removes fragments, query strings, and parameters from input URLs. It also correctly escapes all characters in the URL path component and normalizes the network location part to lowercase. Note that URLs without a scheme will not be recognized as valid URLs; chunks that do not look like a URL will be swallowed and not return any output.

Expand source code Browse git

class urlfix(Unit):
    """
    Removes fragments, query strings, and parameters from input URLs. It also correctly escapes all
    characters in the URL path component and normalizes the network location part to lowercase. Note
    that URLs without a scheme will not be recognized as valid URLs; chunks that do not look like a
    URL will be swallowed and not return any output.
    """
    def __init__(
        self,
        meta: Arg.Switch('-m', help='Extract the query string parameters as metadata.') = False,
        keep: Arg.Counts('-k', help=(
            'If specified once, keeps the it keeps the URL params and query string. If specified '
            'twice, it keeps the URL fragment as well. At this level, the unit still filters out '
            'anything that does not parse as a URL.'
        )) = 0
    ):
        super().__init__(keep=keep, meta=meta)

    def process(self, data):
        def fix(string):
            return quote(unquote(string))
        keep = self.args.keep
        meta = self.args.meta
        parsed = urlparse(data.decode(self.codec))
        if not parsed.scheme or not parsed.netloc:
            return None
        query_dict = {key: unquote(value) for key, value in parse_qsl(parsed.query)}
        query_string = '&'.join(F'{key}={quote(value)}' for key, value in query_dict.items())
        replacements = dict(
            netloc=parsed.netloc.lower(),
            params=fix(parsed.params),
            path=fix(parsed.path),
            query=query_string,
            fragment=fix(parsed.fragment),
        )
        if keep < 2:
            replacements.update(fragment='')
            if keep < 1:
                replacements.update(params='', query='')
        url = urlunparse(parsed._replace(**replacements))
        url = url.encode(self.codec)
        if meta:
            url = self.labelled(url, **query_dict)
        return url

class urlguards

Restores the original URLs from their 'protected' versions as generated by Outlook protection and ProofPoint.

Expand source code Browse git

class urlguards(Unit):
    """
    Restores the original URLs from their 'protected' versions as generated by
    Outlook protection and ProofPoint.
    """

    _PP3RLENC = {
        letter: rl for rl, letter in enumerate(
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
            'abcdefghijklmnopqrstuvwxyz'
            '0123456789-_', 2
        )
    }

    @unguard(r'https?://urldefense(?:\.proofpoint)?\.com/v([12])/url\?([:;/_=!?#&.,\w\%\-\+|]+)')
    def _proofpointV2(self, match):
        version = int(match[1])
        self.log_info('proofpoint match:', version)
        argmatch = re.match(
            R'^u=(.+?)&(?:amp;)?{}='.format('k' if version == 1 else '[dc]'),
            match[2],
            flags=re.DOTALL
        )
        if not argmatch:
            self.log_warn('not able to translate unexpected proofpoint format:', match)
            return match[0]
        encoded = argmatch[1]
        if match[1] == '2':
            encoded = encoded.translate(str.maketrans('-_', '%/'))
        return unescape(unquote(encoded))

    @unguard(r'https?://urldefense(?:\.proofpoint)?\.com/v3/__(.+?)__;(.*?)![-\w!?$]+')
    def _proofpointV3(self, match):
        data = unquote(match[1])
        cmap = match[2] + '=' * (-len(match[2]) % 4)
        cmap = urlsafe_b64decode(cmap).decode('UTF-8')
        cursor = 0
        result = ''
        for k in range(len(cmap)):
            ast = data.find('*', cursor)
            if ast < 0:
                break
            result += data[cursor:ast]
            if data[ast + 1] == '*':
                end = self._PP3RLENC[data[ast + 2]]
                result += cmap[k:end]
                ast += 2
            else:
                result += cmap[k]
            cursor = ast + 1
        self.log_debug(result)
        self.log_debug(data[cursor:])
        return result + data[cursor:]

    @unguard(r'https?://\w+.safelinks\.protection\.outlook\.com/([:;/_=!?#&.,\w\%\-\+|]+)')
    def _outlook(self, match):
        result = match[0]
        self.log_info('outlook match:', result)
        parsed = urlparse(result)
        params = parse_qs(parsed.query)
        try:
            result = unquote(params['url'][0])
        except Exception:
            pass
        return result

    @unguard(r'https?://outlook.office.com/actions/ei\?u=([:;/_=!?#&.,\w\%\-\+|]+)')
    def _outlook_image_proxy(self, match):
        return unquote(match[1])

    @unguard(r'https?://(?:[\w-]+\.)?trendmicro.com(?::\d+)?/wis/clicktime/v[12]/(?:query|clickthrough)[:;/_=!?#&.,\w\%\-\+|]+')
    def _trendmicro(self, match):
        result = match[0]
        self.log_info('trendmicro match:', result)
        parsed = urlparse(result)
        params = parse_qs(parsed.query)
        try:
            result = unquote(params['url'][0])
        except Exception:
            pass
        return result

    @unicoded
    def process(self, data: str) -> str:
        newsize, size = 0, len(data)
        while newsize != size:
            for handler in (
                self._proofpointV2,
                self._proofpointV3,
                self._outlook,
                self._outlook_image_proxy,
                self._trendmicro
            ):
                data = handler(data)
            size = newsize
            newsize = len(data)
        return data

class urn (size='N:N', keep=False, sort=False)

Treat the chunks in the current frame as items in an urn and produce every possible sequence that could occur as a sequence of draws. For example, selecting both -k and -s is equivalent to generating all possible permutations of these chunks.

Expand source code Browse git

class urn(Unit):
    """
    Treat the chunks in the current frame as items in an urn and produce every possible sequence
    that could occur as a sequence of draws. For example, selecting both -k and -s is equivalent
    to generating all possible permutations of these chunks.
    """

    def __init__(self,
        size: Arg.String(metavar='a:b', help=(
            'Generate sequences of length x, where x is in [a:b]. The default value is {default}, '
            'where N is the number of chunks in the current frame.')) = 'N:N',
        keep: Arg.Switch('-k', help=(
            'Chunks are not returned back to the urn after being drawn.')) = False,
        sort: Arg.Switch('-s', help=(
            'The order of items does not matter; for the output, chunks are sorted according to '
            'their original position in the frame.')) = False
    ):
        super().__init__(size=size, keep=keep, sort=sort)

    def process(self, data: Chunk):
        yield from data.temp

    def filter(self, chunks: Iterable[Chunk]):
        it = iter(chunks)
        head = next(it)
        buffer = [bytes(head)]
        buffer.extend(bytes(c) for c in it)
        head = head.copy(meta=True, data=False)
        head.meta['N'] = len(buffer)
        size = sliceobj(self.args.size, head)
        a = size.start or 1
        b = size.stop or len(buffer)
        b = max(b, a + 1)
        c = size.step or 1
        self.log_debug(F'using size [{a}:{b}:{c}]')
        s = 1 if self.args.sort else 0
        k = 1 if self.args.keep else 0
        m = (s << 1) | k
        method = {
            0b00: lambda i, r: product(i, repeat=r),
            0b01: combinations,
            0b10: combinations_with_replacement,
            0b11: permutations
        }[m]
        self.log_info(F'choosing {method.__name__}')
        for n in range(a, b, c):
            self.log_debug(F'generating sequences of length {n}')
            for head.temp in method(buffer, n):
                yield head

class uuenc

Unit for uuencode.

Expand source code Browse git

class uuenc(Unit):
    """
    Unit for uuencode.
    """
    def process(self, data):
        header = re.search(
            B'^begin ([0-7]{3}) (.*?)$', data, flags=re.M)
        if header is None:
            raise ValueError('invalid uu header')
        output = bytearray()
        view = memoryview(data)
        breaks = [m.end() for m in iter(re.finditer(B'^', data, flags=re.M))]
        eol = False
        for k, br in enumerate(itertools.islice(breaks, 1, None)):
            if eol and view[br:br + 3] == b'end':
                path = header[2]
                if path != B'-':
                    output = self.labelled(output, path=path)
                return output
            count = view[br] - 0x20
            if count not in range(0x41):
                raise ValueError(F'Invalid length encoding 0x{view[br]:02X} in line {k}.')
            count %= 0x40
            cursor = len(output)
            q, r = divmod(count, 3)
            q += int(bool(r))
            end = br + 1 + q * 4
            for b in range(br + 1, end, 4):
                chunk = 0
                for j in range(4):
                    character = view[b + j]
                    if character not in range(0x21, 0x61):
                        raise ValueError(F'Invalid character 0x{character:02X} in line {k}.')
                    chunk = ((character - 0x20) % 0x40) | (chunk << 6)
                output.extend(chunk.to_bytes(3, 'big'))
            del output[cursor + count:]
            eol = count == 0
            if len(output) < cursor + count:
                break
        raise RefineryPartialResult(F'Data truncated in line {k}', output)

    def reverse(self, data):
        meta = metavars(data)
        path = meta.get('path', None)
        name = path and pathlib.Path(path).name or '-'
        view = memoryview(data)
        with MemoryFile() as stream:
            stream.write(B'begin 666 ')
            stream.write(name.encode(self.codec))
            for k in range(0, len(view), 45):
                slice = view[k:k + 45]
                stream.write_byte(0x0A)
                stream.write_byte(0x20 + len(slice))
                for chunk in chunks.unpack(slice, 3, bigendian=True, pad=True):
                    for j in range(3, -1, -1):
                        stream.write_byte(0x20 + (((chunk >> j * 6) & 0x3F) or 0x40))
            stream.write(B'\n`\nend\n')
            return stream.getvalue()

    @classmethod
    def handles(self, data):
        if len(data) < 16:
            return False
        if data[:6] == B'begin ':
            return set(data[6:9]) <= set(B'01234567')

class vaddr (*name, base=None)

Converts a metadata variable holding a file offset to a virtual address. This unit only works when the chunk body contains a PE, ELF, or MachO executable. The variable will be substituted in place. If you would like to retain the original value, it is recommended to use the put unit first to create a copy of an already existing variable, and then convert the copy.

Expand source code Browse git

class vaddr(Unit):
    """
    Converts a metadata variable holding a file offset to a virtual address. This unit only works when the
    chunk body contains a PE, ELF, or MachO executable. The variable will be substituted in place. If you
    would like to retain the original value, it is recommended to use the `refinery.put` unit first to create
    a copy of an already existing variable, and then convert the copy.
    """

    def __init__(
        self, *name: Arg(type=str, help='The name of a metadata variable holding an integer.'),
        base : Arg.Number('-b', metavar='ADDR', help='Optionally specify a custom base address B.') = None
    ):
        return super().__init__(names=name, base=base)

    def process(self, data):
        try:
            exe = Executable.Load(data, self.args.base)
        except Exception:
            self.log_warn('unable to parse input as executable; no variable conversion was performed')
            return data
        meta = metavars(data)
        for name in self.args.names:
            value = meta[name]
            meta[name] = exe.location_from_offset(value).virtual.position
        return data

    def reverse(self, data):
        try:
            exe = Executable.Load(data, self.args.base)
        except Exception:
            self.log_warn('unable to parse input as executable; no variable conversion was performed')
            return data
        meta = metavars(data)
        for name in self.args.names:
            value = meta[name]
            meta[name] = exe.location_from_address(value).physical.position
        return data

class vbapc (raw=False)

Extract VBA macro p-code from Office documents. By default, the unit also uses pcode2code to decompile the disassembled p-code. This unit is specifically useful for macro documents that use VBA code stomping, i.e. the embedded macro source code is stomped and does not represent the p-code functionality that the document will actually execute.

Expand source code Browse git

class vbapc(Unit):
    """
    Extract VBA macro p-code from Office documents. By default, the unit also uses pcode2code to
    decompile the disassembled p-code. This unit is specifically useful for macro documents that
    use VBA code stomping, i.e. the embedded macro source code is stomped and does not represent
    the p-code functionality that the document will actually execute.
    """
    def __init__(self, raw: Unit.Arg.Switch('-r', help='Return disassembled p-code, do not try to decompile.') = False):
        super().__init__(raw=raw)

    @Unit.Requires('oletools', 'formats', 'office', 'extended')
    def _pcodedmp():
        with NoLogging(NoLogging.Mode.ALL):
            import pcodedmp.pcodedmp
            return pcodedmp.pcodedmp

    def process(self, data):
        class args:
            disasmOnly = True
            verbose = False
        with io.StringIO() as output:
            with VirtualFileSystem() as vfs:
                vf = vfs.new(data)
                self._pcodedmp.processFile(vf, args, output)
            code = output.getvalue()
            if not self.args.raw:
                from refinery.lib.thirdparty.pcode2code import Parser
                parser = Parser(code)
                parser.parseInput()
                parser.processInput(False)
                code = parser.getOutput()
                code = re.sub(R'(?m)^((?:Sub|Function).*?)$(?!\n[^\s])', r'\n\1', code)
            return code.encode(self.codec)

class vbastr (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

Extract VBA macro variables from Office documents. The items are extracted in a directory hierarchy that specifies their corresponding OLE stream. The stem of their file name is the same as the variable's name. The variable can define a caption, a control tip text, and a value; the unit extracts these with the synthesized file extension "cap", "tip", and "val", respectively.

Expand source code Browse git

class vbastr(PathExtractorUnit):
    """
    Extract VBA macro variables from Office documents. The items are extracted in a directory
    hierarchy that specifies their corresponding OLE stream. The stem of their file name is the
    same as the variable's name. The variable can define a caption, a control tip text, and a
    value; the unit extracts these with the synthesized file extension "cap", "tip", and "val",
    respectively.
    """
    @PathExtractorUnit.Requires('oletools', 'formats', 'office')
    def _olevba():
        from oletools import olevba
        return olevba

    def unpack(self, value):
        try:
            parser = self._olevba.VBA_Parser('.', data=bytes(value), relaxed=True)
        except self._olevba.FileOpenError:
            raise ValueError('Input data not recognized by VBA parser')
        try:
            for path, name, vars in parser.extract_form_strings_extended():
                if not vars:
                    continue
                name = _txt(vars['name'])
                for ext, key in {
                    'cap': 'caption',
                    'tip': 'control_tip_text',
                    'val': 'value',
                }.items():
                    value = _bin(vars.get(key))
                    if not value:
                        continue
                    yield UnpackResult(F'{path!s}/{name!s}/{name}.{ext}', value)
        except self._olevba.oleform.OleFormParsingError as error:
            from collections import Counter
            self.log_debug(str(error))
            self.log_info('extended form extraction failed with error; falling back to simple method')
            form_strings = list(parser.extract_form_strings())
            name_counter = Counter(name for _, name, _ in form_strings)
            dedup = Counter()
            for path, name, string in form_strings:
                if string is None:
                    continue
                if name_counter[name] > 1:
                    dedup[name] += 1
                    name = F'{name!s}.v{dedup[name]}'
                yield UnpackResult(F'{path!s}/{name!s}.val', _bin(string))

class vigenere (key, alphabet=b'abcdefghijklmnopqrstuvwxyz', operator='add', case_sensitive=False, ignore_unknown=False)

Encryption and decryption using the Vigenère-Bellaso polyalphabetic cipher.

Expand source code Browse git

class vigenere(Unit):
    """
    Encryption and decryption using the Vigenère-Bellaso polyalphabetic cipher.
    """

    def __init__(
        self,
        key: Arg(help='The encryption key'),
        alphabet: Arg(
            help='The alphabet, by default the Latin one is used: "{default}"'
        ) = b'abcdefghijklmnopqrstuvwxyz',
        operator: Arg.Choice('-:', choices=['add', 'sub', 'xor'], metavar='OP', help=(
            'Choose the vigenere block operation. The default is {default}, and the available options are: {choices}')) = 'add',
        case_sensitive: Arg.Switch('-c', help=(
            'Unless this option is set, the key will be case insensitive. Uppercase letters from the input are transformed '
            'using the same shift as would be the lowercase variant, but case is retained.')) = False,
        ignore_unknown: Arg.Switch('-i', help=(
            'Unless this option is set, the key stream will be iterated even '
            'for letters that are not contained in the alphabet.'
        )) = False
    ):
        if not callable(operator):
            operator = {
                'add': __add__,
                'sub': __sub__,
                'xor': __xor__,
            }.get(operator.lower(), None)
            if operator is None:
                raise ValueError(F'The value {operator!r} is not valid as an operator.')
        self.superinit(super(), **vars())

    def _tabula_recta(self, data, reverse=True):
        key: str = self.args.key.decode(self.codec)
        alphabet: str = self.args.alphabet.decode(self.codec)
        operator = self.args.operator
        case_sensitive: bool = self.args.case_sensitive
        ignore_unknown: bool = self.args.ignore_unknown
        if not case_sensitive:
            key = key.lower()
            alphabet = alphabet.lower()
            if len(set(alphabet)) != len(alphabet):
                raise ValueError('Duplicate entries detected in alphabet.')
        if not set(key) <= set(alphabet):
            diff = set(key) - set(alphabet)
            diff = ', '.join(diff)
            raise ValueError(F'key contains letters which are not from the given alphabet: {diff}')
        self.log_info(F'using key {key} and alphabet {alphabet}')
        keystream = cycle(key)
        alph_size = len(alphabet)
        if reverse:
            operator = _opeator_inverse[operator]
        for letter in data:
            uppercase = not case_sensitive and letter.isupper()
            if uppercase:
                letter = letter.lower()
            try:
                position = alphabet.index(letter)
            except ValueError:
                yield letter
                if not ignore_unknown:
                    next(keystream)
                continue
            shift = alphabet.index(next(keystream))
            result = alphabet[operator(position, shift) % alph_size]
            yield result.upper() if uppercase else result

    @unicoded
    def process(self, data):
        return ''.join(self._tabula_recta(data, True))

    @unicoded
    def reverse(self, data):
        return ''.join(self._tabula_recta(data, False))

class vmemref (*address, take=None, base=None, deref_count=1, deref_depth=2)

The unit expects an executable as input (PE/ELF/MachO) and scans a function at a given virtual address for memory references. For each memory reference, the unit looks up the corresponding section and file offset for the reference. It then returns all data from that section starting at the given offset.

Expand source code Browse git

class vmemref(Unit):
    """
    The unit expects an executable as input (PE/ELF/MachO) and scans a function at a given virtual
    address for memory references. For each memory reference, the unit looks up the corresponding
    section and file offset for the reference. It then returns all data from that section starting
    at the given offset.
    """

    @Unit.Requires('smda<2.0', 'all')
    def _smda():
        import datetime
        datetime.UTC = datetime.timezone.utc
        import smda
        import smda.Disassembler
        import smda.DisassemblyResult
        return smda

    def _memory_references(
        self,
        exe: Executable,
        function: SmdaFunction,
        codes: Container[Range],
        max_dereference_depth: int,
        max_dereference_count: int,
        references: dict,
    ):
        def is_valid_data_address(address):
            if not isinstance(address, int):
                return False
            if address not in exe:
                return False
            if address in instructions:
                return False
            for code in codes:
                if address in code:
                    return False
            return True

        def dereference(address):
            return int.from_bytes(exe[address:address + pointer_size], exe.byte_order().value)

        pointer_size = exe.pointer_size // 8

        with NoLogging():
            instructions = {op.offset: op for op in function.getInstructions()}

        for op in instructions.values():
            try:
                with NoLogging():
                    refs = list(op.getDataRefs())
            except Exception:
                continue
            for address in refs:
                try:
                    address = int(address)
                except Exception:
                    continue
                addresses = deque([address])
                while addresses:
                    address = addresses.pop()
                    if not is_valid_data_address(address):
                        continue
                    if (count := references.get(address, 0)) > max_dereference_depth:
                        continue
                    elif not count:
                        yield address
                    references[address] = count + 1
                    for _ in range(max_dereference_count):
                        try:
                            point = dereference(address)
                        except Exception:
                            pass
                        else:
                            addresses.appendleft(point)
                        finally:
                            address += pointer_size

    def __init__(
        self,
        *address: Arg.Number(metavar='ADDR', help=(
            'Specify the address of a function to scan. If no argument is given, the unit will scan'
            ' all functions for memory references.')),
        take: Arg.Number('-t', metavar='SIZE', help=(
            'Optionally specify the number of bytes to read from each reference; by default, all '
            'data until the end of the section is returned.')) = None,
        base: Arg.Number('-b', metavar='ADDR',
            help='Optionally specify a custom base address B.') = None,
        deref_count: Arg.Number('-c', help=(
            'Optionally specify the number of items to inspect at a discovered memory address as '
            'as a potential pointer. The default is {default}.')) = 1,
        deref_depth: Arg.Number('-d', help=(
            'Optionally specify the maximum number of times that referenced data is dereferenced '
            'as a pointer, potentially leading to another referenced memory location. The default '
            'is {default}.')) = 2,
    ):
        super().__init__(
            address=address,
            take=take,
            base=base,
            deref_count=deref_count,
            deref_depth=deref_depth,
        )

    def process(self, data):
        smda = self._smda
        take = self.args.take
        exe = Executable.Load(data, self.args.base)
        fmt = exe.pointer_size // 4
        addresses = self.args.address

        self.log_info('disassembling and exploring call graph using smda')
        with NoLogging():
            cfg = smda.Disassembler.SmdaConfig()
            cfg.CALCULATE_SCC = False
            cfg.CALCULATE_NESTING = False
            cfg.TIMEOUT = 600
            dsm = smda.Disassembler.Disassembler(cfg)
            _input = data
            if not isinstance(_input, bytes):
                _input = bytes(data)
            graph = dsm.disassembleUnmappedBuffer(_input)

        self.log_info('collecting code addresses for memory reference exclusion list')
        visits = {}
        avoid = set()

        for symbol in exe.symbols():
            if not symbol.code:
                continue
            avoid.add(exe.location_from_address(symbol.address).virtual.box)

        if addresses:
            reset = visits.clear
        else:
            def reset():
                pass
            self.log_info('scanning executable for functions')
            with NoLogging():
                addresses = [pfn.offset for pfn in graph.getFunctions()]
                addresses.sort()

        for a in addresses:
            reset()
            address, function = min(
                graph.xcfg.items(), key=lambda t: (abs(t[0] - a), t[0] >= a))
            self.log_debug(F'scanning function: 0x{address:0{fmt}X}')
            refs = list(self._memory_references(
                exe,
                function,
                avoid,
                self.args.deref_depth,
                self.args.deref_count,
                visits,
            ))
            refs.sort(reverse=True)
            last_start = None
            for ref in refs:
                try:
                    box = exe.location_from_address(ref)
                    end = box.physical.box.upper
                    if take is not None:
                        end = min(box.physical.position + take, end)
                    if last_start is not None:
                        end = min(last_start, end)
                    last_start = box.physical.position
                except CompartmentNotFound:
                    self.log_info(F'memory reference could not be resolved: 0x{ref:0{fmt}X}')
                else:
                    yield exe.data[last_start:end]

class vsect (*paths, meta=False, synthetic=False, path=b'path', regex=False, exact=False, fuzzy=0, drop_path=False, join_path=False, list=False)

Extract sections/segments from PE, ELF, and MachO executables.

Expand source code Browse git

class vsect(PathExtractorUnit):
    """
    Extract sections/segments from PE, ELF, and MachO executables.
    """
    def __init__(
        self, *paths,
        meta: Arg.Switch('-m', help=(
            'Populates the metadata variables vaddr and vsize containing the virtual address and size '
            'of each section, respectively.')) = False,
        synthetic: Arg.Switch('-s', help=(
            'Include synthesized sections: These represent data regions that are outside the sections '
            'as listed by the executable metadata, such as headers and overlays.')) = False,
        **keywords
    ):
        super().__init__(*paths, meta=meta, synthetic=synthetic, **keywords)

    def unpack(self, data):
        exe = Executable.Load(data)
        mv = memoryview(data)
        for k, section in enumerate(exe.sections()):
            if section.synthetic and not self.args.synthetic:
                continue
            start = section.physical.lower
            end = section.physical.upper
            va = section.virtual.lower
            vs = len(section.virtual)
            kwargs = {'offset': start}
            if self.args.meta:
                if va is not None:
                    kwargs['vaddr'] = va
                if vs is not None:
                    kwargs['vsize'] = vs
            name = section.name
            if not name:
                addr = F'{section.virtual.lower:0{exe.pointer_size // 4}X}'
                self.log_warn(F'section {k} had no name, synthesizing name from virtual address 0x{addr}')
                name = F'.{addr}'
            yield UnpackResult(name, mv[start:end], **kwargs)

class vsnip (*addresses, ascii=False, utf16=False, until=b'', base=None)

Extract data from PE, ELF, and MachO files based on virtual offsets.

Expand source code Browse git

class vsnip(Unit):
    """
    Extract data from PE, ELF, and MachO files based on virtual offsets.
    """

    def __init__(
        self, *addresses: Arg.Bounds(metavar='start:count:align', help=(
            'Use Python slice syntax to describe an area of virtual memory to read. If a chunksize is '
            'specified, then the unit will always read a multiple of that number of bytes')),
        ascii: Arg.Switch('-a', group='END', help='Read ASCII strings; equivalent to -th:00') = False,
        utf16: Arg.Switch('-u', group='END', help='Read UTF16 strings; equivalent to -th:0000 (also sets chunksize to 2)') = False,
        until: Arg.Binary('-t', group='END', help='Read until sequence {varname} is read.') = B'',
        base : Arg.Number('-b', metavar='ADDR', help='Optionally specify a custom base address B.') = None,
    ):
        if sum(1 for t in (until, utf16, ascii) if t) > 1:
            raise ValueError('Only one of utf16, ascii, and until can be specified.')
        return super().__init__(addresses=addresses, utf16=utf16, ascii=ascii, until=until, base=base)

    def process(self, data: bytearray):
        until = self.args.until
        addrs = self.args.addresses
        if self.args.ascii:
            until = B'\0'
        if self.args.utf16:
            until = B'\0\0'
            addrs = (slice(a.start, a.stop, 2) for a in addrs)

        exe = Executable.Load(data, self.args.base)

        for addr in addrs:
            area = MemoryArea(addr)
            location = exe.location_from_address(area.start)
            offset = location.physical.position
            max_offset = location.physical.box.upper
            if not until:
                end = max_offset
            else:
                end = offset - 1
                align = area.align
                while True:
                    end = data.find(until, end + 1)
                    if end not in range(offset, max_offset):
                        raise EndOfStringNotFound
                    if (end - offset) % align == 0:
                        break

            if area.count:
                end = min(end, offset + area.count)

            yield self.labelled(data[offset:end], offset=offset)

class vstack (*address, stop=None, base=None, arch=Arch.X32, engine=_engine.unicorn, se=False, ic=False, uc=False, meta_registers=False, timeout=None, patch_range=slice(5, None, None), write_range=slice(1, None, None), wait=20, wait_calls=False, skip_calls=0, stack_size=65536, stack_push=None, log_api_calls=False, block_size=4096, max_visits=65536, log_writes_in_calls=False, log_stack_addresses=False, log_other_addresses=False, log_zero_overwrites=False, log_stack_cookies=False)

The unit emulates instructions at a given address in the input executable (PE/ELF/MachO) and extracts data patches that are written to memory during emulation. The unit can also be used to emulate shellcode blobs, in which case it defaults to emulating 32bit x86 instructions.

Emulation is halted as soon as a certain number of instructions have not performed any memory writes, or when an error occurs. By default, most registers are set to the current location in the emulated stack. If you want to initialize some of them differently, the -r switch maes the unit initialize register values from meta variables:

emit shellcode [| put eax 0x2000 | vstack -r ]

In this pipeline, the eax register is set to 0x2000 before emulation begins.

Expand source code Browse git

class vstack(Unit):
    """
    The unit emulates instructions at a given address in the input executable (PE/ELF/MachO) and
    extracts data patches that are written to memory during emulation. The unit can also be used
    to emulate shellcode blobs, in which case it defaults to emulating 32bit x86 instructions.

    Emulation is halted as soon as a certain number of instructions have not performed any memory
    writes, or when an error occurs. By default, most registers are set to the current location in
    the emulated stack. If you want to initialize some of them differently, the `-r` switch maes
    the unit initialize register values from meta variables:

        emit shellcode [| put eax 0x2000 | vstack -r ]

    In this pipeline, the eax register is set to `0x2000` before emulation begins.
    """
    @Unit.Requires('capstone', 'default', 'extended')
    def _capstone():
        import capstone
        return capstone

    @Unit.Requires('unicorn==2.0.1.post1', 'default', 'extended')
    def _unicorn():
        import importlib
        importlib.import_module('setuptools')
        with NoLogging():
            import unicorn
            return unicorn

    @Unit.Requires('speakeasy-emulator-refined', 'extended')
    def _speakeasy():
        import speakeasy
        return speakeasy

    @Unit.Requires('icicle-emu', 'all')
    def _icicle():
        import icicle
        return icicle

    def __init__(
        self,
        *address: Arg.NumSeq(metavar='start', help='Specify the (virtual) addresses of a stack string instruction sequences.'),
        stop: Arg.Number('-s', metavar='stop', help='Optional: Stop when reaching this address.') = None,
        base: Arg.Number('-b', metavar='Addr', help='Optionally specify a custom base address B.') = None,
        arch: Arg.Option('-a', help='Specify for blob inputs: {choices}', choices=Arch) = Arch.X32,
        engine: Arg.Option('-e', group='EMU', choices=_engine, metavar='E',
            help='The emulator engine. The default is {default}, options are: {choices}') = _engine.unicorn,
        se: Arg.Switch(group='EMU', help='Equivalent to --engine=speakeasy') = False,
        ic: Arg.Switch(group='EMU', help='Equivalent to --engine=icicle') = False,
        uc: Arg.Switch(group='EMU', help='Equivalent to --engine=unicorn') = False,
        meta_registers: Arg.Switch('-r', help=(
            'Consume register initialization values from the chunk\'s metadata. If the value is a byte string, '
            'the data will be mapped.')) = False,
        timeout: Arg.Number('-t', help='Optionally stop emulating after a given number of instructions.') = None,
        patch_range: Arg.Bounds('-p', metavar='MIN:MAX',
            help='Extract only patches that are in the given range, default is {default}.') = slice(5, None),
        write_range: Arg.Bounds('-n', metavar='MIN:MAX',
            help='Log only writes whose size is in the given range, default is {default}.') = slice(1, None),
        wait: Arg.Number('-w', help=(
            'When this many instructions did not write to memory, emulation is halted. The default is {default}.')) = 20,
        wait_calls: Arg.Switch('-c', group='CALL',
            help='Wait indefinitely when inside a function call.') = False,
        skip_calls: Arg.Counts('-C', group='CALL',
            help='Skip function calls entirely. Use twice to treat each call as allocating memory.') = 0,
        stack_size: Arg.Number('-S', help='Optionally specify the stack size. The default is 0x{default:X}.') = 0x10000,
        stack_push: Arg('-u', action='append', type=str, metavar='REG',
            help='Push the value of a register to the stack before beginning emulation; implies -r.') = None,
        log_api_calls: Arg.Switch('-A', help='Log API calls when using Speakeasy.') = False,
        block_size: Arg.Number('-B', help='Standard memory block size for the emulator, 0x{default:X} by default.') = 0x1000,
        max_visits: Arg.Number('-V', help='Maximum number of times a code address is visited. Default is {default}.') = 0x10000,
        log_writes_in_calls: Arg.Switch('-W', help='Log writes of values that occur in functions calls.') = False,
        log_stack_addresses: Arg.Switch('-X', help='Log writes of values that are stack addresses.') = False,
        log_other_addresses: Arg.Switch('-Y', help='Log writes of values that are addresses to mapped segments.') = False,
        log_zero_overwrites: Arg.Switch('-Z', help='Log writes of zeros to memory that contained nonzero values.') = False,
        log_stack_cookies  : Arg.Switch('-E', help='Log writes that look like stack cookies.') = False,
    ):
        if sum((se, uc, ic)) > 1:
            raise ValueError('Too many emulators selected.')
        elif se:
            engine = _engine.speakeasy
        elif ic:
            engine = _engine.icicle
        elif uc:
            engine = _engine.unicorn

        super().__init__(
            address=address,
            stop=stop,
            base=base,
            arch=Arg.AsOption(arch, Arch),
            engine=Arg.AsOption(engine, _engine),
            meta_registers=meta_registers,
            timeout=timeout,
            patch_range=patch_range,
            write_range=write_range,
            wait=wait,
            stack_size=stack_size,
            stack_push=stack_push,
            wait_calls=wait_calls,
            skip_calls=skip_calls,
            block_size=block_size,
            max_visits=max_visits,
            log_api_calls=log_api_calls,
            log_writes_in_calls=log_writes_in_calls,
            log_stack_addresses=log_stack_addresses,
            log_other_addresses=log_other_addresses,
            log_zero_overwrites=log_zero_overwrites,
            log_stack_cookies=log_stack_cookies
        )

    def process(self, data):
        meta = metavars(data)
        args = self.args

        engine: _engine = args.engine
        flags = Hook.Default
        self.log_debug(F'attempting to use {engine.name}')
        getattr(self, F'_{engine.name}')

        if engine is _engine.speakeasy:
            flags |= Hook.ApiCall

        class Emu(engine.value, VStackEmulatorMixin):
            pass

        emu = Emu(
            data,
            args.base,
            args.arch,
            flags,
            args.block_size,
            args.stack_size,
        )

        cfg = EmuConfig(
            args.wait_calls,
            args.skip_calls,
            args.write_range,
            args.wait,
            args.block_size,
            args.stack_size,
            args.max_visits,
            args.log_stack_cookies,
            args.log_writes_in_calls,
            args.log_stack_addresses,
            args.log_other_addresses,
            args.log_zero_overwrites,
            args.log_api_calls,
        )

        register_values = {}
        emu.reset(None)

        if args.meta_registers or args.stack_push:
            for var, value in list(meta.items()):
                try:
                    register = emu.lookup_register(var)
                except LookupError:
                    continue
                meta.discard(var)
                register_values[register] = var, value

        def parse_address(a: Union[int, bytes]):
            if isinstance(a, int):
                return a
            a = a.decode(self.codec)
            if m := re.fullmatch('(?i)(?:sub_|fun_|0x)?([A-F0-9]+)H?', a):
                return int(m[1], 16)
            try:
                return PythonExpression.Evaluate(a, meta)
            except ParserVariableMissing:
                pass
            symbols = list(emu.exe.symbols())
            for filter in [
                lambda s: s.get_name().casefold() == a.casefold(),
                lambda s: s.name == a,
                lambda s: s.code,
                lambda s: s.exported
            ]:
                symbols = [s for s in symbols if filter(s)]
                if len(symbols) == 1:
                    return symbols[0].address
            if len(symbols) > 1:
                raise RuntimeError(F'there are {len(symbols)} exported function symbol named "{a}", please specify the address')
            if not symbols:
                raise LookupError(F'no symbol with name "{a}" was found')

        addresses = [parse_address(a) for a in args.address]

        if not addresses:
            for symbol in emu.exe.symbols():
                if symbol.name is None:
                    addresses.append(symbol.address)
                    break

        for cursor in addresses:
            state = EmuState(cfg, cursor, emu.exe.pointer_size // 4, stop=args.stop)
            emu.reset(state)

            for reg in emu.general_purpose_registers():
                if reg not in register_values:
                    state.init_registers.append(reg)

            for reg, (var, value) in register_values.items():
                if isinstance(value, int):
                    self.log_info(F'setting {var} to integer value 0x{value:X}')
                    emu.set_register(reg, value)
                    continue
                if isinstance(value, str):
                    value = value.encode()
                if isbuffer(value):
                    start = emu.malloc(len(value))
                    emu.mem_write(start, bytes(value))
                    emu.set_register(reg, start)
                    self.log_info(F'setting {var} to mapped buffer of size 0x{len(value):X}')
                    continue
                _tn = value.__class__.__name__
                self.log_warn(F'canot interpret value of type {_tn} for register {var}')

            if push := args.stack_push:
                for reg in push:
                    emu.push_register(reg)

            timeout = args.timeout
            if timeout is not None:
                self.log_info(F'setting timeout of {timeout} steps')
                state.ticks = timeout

            try:
                emu.emulate(
                    emu.base_exe_to_emu(cursor),
                    emu.base_exe_to_emu(args.stop),
                )
            except EmulationError:
                pass

            for patch, api in state.synthesized.items():
                chunk = self.labelled(patch, src=api)
                yield chunk

            valid = bounds[args.patch_range]
            for base, patch in state.memory:
                if len(patch) not in valid or not any(patch):
                    continue
                self.log_info(F'memory patch at {state.fmt(base)} of size {len(patch)}')
                chunk = self.labelled(patch, src=base)
                yield chunk

class winreg (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

Extract values from a Windows registry hive or from a registry export (.reg file).

Expand source code Browse git

class winreg(PathExtractorUnit):
    """
    Extract values from a Windows registry hive or from a registry export (.reg file).
    """
    @PathExtractorUnit.Requires('python-registry', 'formats')
    def _registry():
        import Registry
        import Registry.Registry
        import Registry.RegistryParse
        return Registry

    @staticmethod
    def _walk(patterns: List[PathPattern], key: RegistryKey, *path: str):
        here = '/'.join(path)
        if not any(p.reach(here) for p in patterns):
            winreg.log_debug(F'pruning search at {here}')
            return
        for value in key.values():
            def raw(v: RegistryValue = value):
                return v.raw_data()
            vpath = here
            vname = value.name()
            if vname != '(default)':
                vpath = F'{vpath}/{vname}'
            yield UnpackResult(vpath, raw)
        for subkey in key.subkeys():
            yield from winreg._walk(patterns, subkey, *path, subkey.name())

    def _unpack_hive(self, data: bytearray):
        try:
            with MemoryFile(data) as stream:
                root = self._registry.Registry.Registry(stream).root()
                yield from self._walk(self._patterns, root, root.name())
        except self._registry.RegistryParse.ParseException:
            raise ParseException

    def _decode_registry_export(self, data: str):
        def REG_BINARY(data: str) -> bytes:
            return bytes.fromhex(re.sub('[^a-f0-9]+', '', data))

        def REG_SZ(data: str) -> bytes:
            return data.encode(self.codec) | esc(quoted=True) | bytes

        def REG_EXPAND_SZ(data: str):
            return REG_BINARY(data).decode('UTF-16LE').rstrip('\0').encode(self.codec)

        def REG_MULTI_SZ(data: str):
            data = REG_BINARY(data).decode('UTF-16LE').split('\0')
            for string in data:
                if string:
                    yield string.encode(self.codec)

        def REG_DWORD(data: str):
            value = int(data, 16)
            return F'0x{value:X}'.encode(self.codec)

        def REG_QWORD(data: str):
            value = int.from_bytes(REG_BINARY(data), 'little')
            return F'0x{value:X}'.encode(self.codec)

        class Missing:
            def __init__(self, name: str): self.name = name
            def __str__(self): return self.name

        REG_NONE = REG_EXPAND_SZ
        REG_DWORD_BIG_ENDIAN = Missing('REG_DWORD_BIG_ENDIAN')
        REG_LINK = Missing('REG_LINK')
        REG_RESOURCE_LIST = Missing('REG_RESOURCE_LIST')
        REG_FULL_RESOURCE_DESCRIPTOR = Missing('REG_FULL_RESOURCE_DESCRIPTOR')
        REG_RESOURCE_REQUIREMENTS_LIST = Missing('REG_RESOURCE_REQUIREMENTS_LIST')

        prefix, _, encoded = data.partition(':')

        try:
            decoder = {
                'hex(0)' : REG_NONE,
                'hex(1)' : REG_SZ,
                'hex(2)' : REG_EXPAND_SZ,
                'hex(3)' : REG_BINARY,
                'hex'    : REG_BINARY,
                'hex(4)' : REG_DWORD,
                'dword'  : REG_DWORD,
                'hex(5)' : REG_DWORD_BIG_ENDIAN,
                'hex(6)' : REG_LINK,
                'hex(7)' : REG_MULTI_SZ,
                'hex(8)' : REG_RESOURCE_LIST,
                'hex(9)' : REG_FULL_RESOURCE_DESCRIPTOR,
                'hex(a)' : REG_RESOURCE_REQUIREMENTS_LIST,
                'hex(b)' : REG_QWORD,
            }[prefix]
        except KeyError:
            decoder = REG_SZ
            encoded = data

        if isinstance(decoder, Missing):
            self.log_warn(F'Found registry type {decoder!s}; no decoder implemented.')
            return
        self.log_debug(F'decoding as {decoder.__name__}: {encoded}')
        it = decoder(encoded)
        if not inspect.isgenerator(it):
            it = (it,)
        yield from it

    def _unpack_file(self, data: bytearray):
        for codec in ('utf16', 'utf-16le', 'utf8'):
            try:
                reg = data.decode(codec).splitlines(keepends=True)
            except UnicodeError:
                continue
            if reg[0].startswith('Windows Registry Editor'):
                break
        else:
            raise ParseException
        config = WinRegFileParser()
        config.read_string(''.join(reg[1:]))
        for key in config.sections():
            self.log_debug(key)
            for value in config[key]:
                name = next(iter(shlex.split(value)))
                path = Path(key)
                if name != '@':
                    path = path / Path(name)
                data = config[key][value]
                decoded = list(self._decode_registry_export(data))
                if len(decoded) == 1:
                    yield UnpackResult(str(path), decoded[0])
                    continue
                for k, d in enumerate(decoded):
                    yield UnpackResult(F'{path!s}.{k}', d)

    def unpack(self, data):
        with contextlib.suppress(ParseException):
            yield from self._unpack_hive(data)
            return
        yield from self._unpack_file(data)

    @classmethod
    def handles(cls, data):
        if data[:4] == B'regf':
            return True
        if data[:31] == b'Windows Registry Editor Version':
            return True
        return False

class wshenc (marker=True)

Windows Scripting Host encoding and decoding of VBScript (VBS/VBE) and JScript (JS/JSE).

Expand source code Browse git

class wshenc(Unit):
    """
    Windows Scripting Host encoding and decoding of VBScript (VBS/VBE) and JScript (JS/JSE).
    """

    _MARKER_INIT = RB'#@~^BINREF=='
    _MARKER_STOP = RB'BINREF==^#~@'

    _CHUNKS = (
        0x57, 0x6E, 0x7B, 0x4A, 0x4C, 0x41, 0x0B, 0x0B, 0x0B, 0x0C, 0x0C, 0x0C, 0x4A, 0x4C, 0x41,
        0x0E, 0x0E, 0x0E, 0x0F, 0x0F, 0x0F, 0x10, 0x10, 0x10, 0x11, 0x11, 0x11, 0x12, 0x12, 0x12,
        0x13, 0x13, 0x13, 0x14, 0x14, 0x14, 0x15, 0x15, 0x15, 0x16, 0x16, 0x16, 0x17, 0x17, 0x17,
        0x18, 0x18, 0x18, 0x19, 0x19, 0x19, 0x1A, 0x1A, 0x1A, 0x1B, 0x1B, 0x1B, 0x1C, 0x1C, 0x1C,
        0x1D, 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F, 0x2E, 0x2D, 0x32, 0x47, 0x75, 0x30,
        0x7A, 0x52, 0x21, 0x56, 0x60, 0x29, 0x42, 0x71, 0x5B, 0x6A, 0x5E, 0x38, 0x2F, 0x49, 0x33,
        0x26, 0x5C, 0x3D, 0x49, 0x62, 0x58, 0x41, 0x7D, 0x3A, 0x34, 0x29, 0x35, 0x32, 0x36, 0x65,
        0x5B, 0x20, 0x39, 0x76, 0x7C, 0x5C, 0x72, 0x7A, 0x56, 0x43, 0x7F, 0x73, 0x38, 0x6B, 0x66,
        0x39, 0x63, 0x4E, 0x70, 0x33, 0x45, 0x45, 0x2B, 0x6B, 0x68, 0x68, 0x62, 0x71, 0x51, 0x59,
        0x4F, 0x66, 0x78, 0x09, 0x76, 0x5E, 0x62, 0x31, 0x7D, 0x44, 0x64, 0x4A, 0x23, 0x54, 0x6D,
        0x75, 0x43, 0x71, 0x4A, 0x4C, 0x41, 0x7E, 0x3A, 0x60, 0x4A, 0x4C, 0x41, 0x5E, 0x7E, 0x53,
        0x40, 0x4C, 0x40, 0x77, 0x45, 0x42, 0x4A, 0x2C, 0x27, 0x61, 0x2A, 0x48, 0x5D, 0x74, 0x72,
        0x22, 0x27, 0x75, 0x4B, 0x37, 0x31, 0x6F, 0x44, 0x37, 0x4E, 0x79, 0x4D, 0x3B, 0x59, 0x52,
        0x4C, 0x2F, 0x22, 0x50, 0x6F, 0x54, 0x67, 0x26, 0x6A, 0x2A, 0x72, 0x47, 0x7D, 0x6A, 0x64,
        0x74, 0x39, 0x2D, 0x54, 0x7B, 0x20, 0x2B, 0x3F, 0x7F, 0x2D, 0x38, 0x2E, 0x2C, 0x77, 0x4C,
        0x30, 0x67, 0x5D, 0x6E, 0x53, 0x7E, 0x6B, 0x47, 0x6C, 0x66, 0x34, 0x6F, 0x35, 0x78, 0x79,
        0x25, 0x5D, 0x74, 0x21, 0x30, 0x43, 0x64, 0x23, 0x26, 0x4D, 0x5A, 0x76, 0x52, 0x5B, 0x25,
        0x63, 0x6C, 0x24, 0x3F, 0x48, 0x2B, 0x7B, 0x55, 0x28, 0x78, 0x70, 0x23, 0x29, 0x69, 0x41,
        0x28, 0x2E, 0x34, 0x73, 0x4C, 0x09, 0x59, 0x21, 0x2A, 0x33, 0x24, 0x44, 0x7F, 0x4E, 0x3F,
        0x6D, 0x50, 0x77, 0x55, 0x09, 0x3B, 0x53, 0x56, 0x55, 0x7C, 0x73, 0x69, 0x3A, 0x35, 0x61,
        0x5F, 0x61, 0x63, 0x65, 0x4B, 0x50, 0x46, 0x58, 0x67, 0x58, 0x3B, 0x51, 0x31, 0x57, 0x49,
        0x69, 0x22, 0x4F, 0x6C, 0x6D, 0x46, 0x5A, 0x4D, 0x68, 0x48, 0x25, 0x7C, 0x27, 0x28, 0x36,
        0x5C, 0x46, 0x70, 0x3D, 0x4A, 0x6E, 0x24, 0x32, 0x7A, 0x79, 0x41, 0x2F, 0x37, 0x3D, 0x5F,
        0x60, 0x5F, 0x4B, 0x51, 0x4F, 0x5A, 0x20, 0x42, 0x2C, 0x36, 0x65, 0x57)
    _OFFSETS = (
        0, 1, 2, 0, 1, 2, 1, 2, 2, 1, 2, 1, 0, 2, 1, 2, 0, 2, 1, 2, 0, 0, 1, 2, 2, 1, 0, 2, 1, 2, 2, 1,
        0, 0, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 2, 0, 2, 1, 0, 2, 1, 2, 0, 0, 1, 2, 2, 0, 0, 1, 2, 0, 2, 1)
    _ENCODER = {
        0x09 : [0x37, 0x69, 0x64], 0x0B : [0x0B, 0x0B, 0x0B], 0x0C : [0x0C, 0x0C, 0x0C],
        0x0E : [0x0E, 0x0E, 0x0E], 0x0F : [0x0F, 0x0F, 0x0F], 0x10 : [0x10, 0x10, 0x10],
        0x11 : [0x11, 0x11, 0x11], 0x12 : [0x12, 0x12, 0x12], 0x13 : [0x13, 0x13, 0x13],
        0x14 : [0x14, 0x14, 0x14], 0x15 : [0x15, 0x15, 0x15], 0x16 : [0x16, 0x16, 0x16],
        0x17 : [0x17, 0x17, 0x17], 0x18 : [0x18, 0x18, 0x18], 0x19 : [0x19, 0x19, 0x19],
        0x1A : [0x1A, 0x1A, 0x1A], 0x1B : [0x1B, 0x1B, 0x1B], 0x1C : [0x1C, 0x1C, 0x1C],
        0x1D : [0x1D, 0x1D, 0x1D], 0x1E : [0x1E, 0x1E, 0x1E], 0x1F : [0x1F, 0x1F, 0x1F],
        0x20 : [0x7E, 0x2C, 0x50], 0x21 : [0x5A, 0x65, 0x22], 0x22 : [0x45, 0x72, 0x4A],
        0x23 : [0x3A, 0x5B, 0x61], 0x24 : [0x79, 0x66, 0x5E], 0x25 : [0x59, 0x75, 0x5D],
        0x26 : [0x27, 0x4C, 0x5B], 0x27 : [0x76, 0x45, 0x42], 0x28 : [0x63, 0x76, 0x60],
        0x29 : [0x62, 0x2A, 0x23], 0x2A : [0x4D, 0x43, 0x65], 0x2B : [0x51, 0x33, 0x5F],
        0x2C : [0x53, 0x42, 0x7E], 0x2D : [0x52, 0x20, 0x4F], 0x2E : [0x20, 0x63, 0x52],
        0x2F : [0x26, 0x4A, 0x7A], 0x30 : [0x54, 0x5A, 0x21], 0x31 : [0x71, 0x38, 0x46],
        0x32 : [0x2B, 0x79, 0x20], 0x33 : [0x66, 0x32, 0x26], 0x34 : [0x2A, 0x57, 0x63],
        0x35 : [0x58, 0x6C, 0x2A], 0x36 : [0x7F, 0x2B, 0x76], 0x37 : [0x7B, 0x46, 0x47],
        0x38 : [0x30, 0x52, 0x25], 0x39 : [0x31, 0x4F, 0x2C], 0x3A : [0x6C, 0x3D, 0x29],
        0x3B : [0x49, 0x70, 0x69], 0x3D : [0x78, 0x7B, 0x27], 0x3F : [0x5F, 0x51, 0x67],
        0x40 : [0x40, 0x40, 0x40], 0x41 : [0x29, 0x7A, 0x62], 0x42 : [0x24, 0x7E, 0x41], # noqa
        0x43 : [0x2F, 0x3B, 0x5A], 0x44 : [0x39, 0x47, 0x66], 0x45 : [0x33, 0x41, 0x32],
        0x46 : [0x6F, 0x77, 0x73], 0x47 : [0x21, 0x56, 0x4D], 0x48 : [0x75, 0x5F, 0x43],
        0x49 : [0x28, 0x26, 0x71], 0x4A : [0x42, 0x78, 0x39], 0x4B : [0x46, 0x6E, 0x7C],
        0x4C : [0x4A, 0x64, 0x53], 0x4D : [0x5C, 0x74, 0x48], 0x4E : [0x48, 0x67, 0x31],
        0x4F : [0x36, 0x7D, 0x72], 0x50 : [0x4B, 0x68, 0x6E], 0x51 : [0x7D, 0x35, 0x70],
        0x52 : [0x5D, 0x22, 0x49], 0x53 : [0x6A, 0x55, 0x3F], 0x54 : [0x50, 0x3A, 0x4B],
        0x55 : [0x69, 0x60, 0x6A], 0x56 : [0x23, 0x6A, 0x2E], 0x57 : [0x09, 0x71, 0x7F],
        0x58 : [0x70, 0x6F, 0x28], 0x59 : [0x65, 0x49, 0x35], 0x5A : [0x74, 0x5C, 0x7D],
        0x5B : [0x2C, 0x5D, 0x24], 0x5C : [0x77, 0x27, 0x2D], 0x5D : [0x44, 0x59, 0x54],
        0x5E : [0x3F, 0x25, 0x37], 0x5F : [0x6D, 0x7C, 0x7B], 0x60 : [0x7C, 0x23, 0x3D],
        0x61 : [0x43, 0x6D, 0x6C], 0x62 : [0x38, 0x28, 0x34], 0x63 : [0x5E, 0x31, 0x6D],
        0x64 : [0x5B, 0x39, 0x4E], 0x65 : [0x6E, 0x7F, 0x2B], 0x66 : [0x57, 0x36, 0x30],
        0x67 : [0x4C, 0x54, 0x6F], 0x68 : [0x34, 0x34, 0x74], 0x69 : [0x72, 0x62, 0x6B],
        0x6A : [0x25, 0x4E, 0x4C], 0x6B : [0x56, 0x30, 0x33], 0x6C : [0x73, 0x5E, 0x56],
        0x6D : [0x68, 0x73, 0x3A], 0x6E : [0x55, 0x09, 0x78], 0x6F : [0x47, 0x4B, 0x57],
        0x70 : [0x32, 0x61, 0x77], 0x71 : [0x35, 0x24, 0x3B], 0x72 : [0x2E, 0x4D, 0x44],
        0x73 : [0x64, 0x6B, 0x2F], 0x74 : [0x4F, 0x44, 0x59], 0x75 : [0x3B, 0x21, 0x45],
        0x76 : [0x2D, 0x37, 0x5C], 0x77 : [0x41, 0x53, 0x68], 0x78 : [0x61, 0x58, 0x36],
        0x79 : [0x7A, 0x48, 0x58], 0x7A : [0x22, 0x2E, 0x79], 0x7B : [0x60, 0x50, 0x09],
        0x7C : [0x6B, 0x2D, 0x75], 0x7D : [0x4E, 0x29, 0x38], 0x7E : [0x3D, 0x3F, 0x55],
        0x7F : [0x67, 0x2F, 0x51]
    }

    _ESCAPE = {
        0x40: B'@$',
        0x3C: B'@!',
        0x3E: B'@*',
        0x0D: B'@#',
        0x0A: B'@&',
    }

    _UNESCAPE = {
        B'@$': B'@',
        B'@!': B'<',
        B'@*': B'>',
        B'@#': B'\r',
        B'@&': B'\n',
    }

    def __init__(
        self,
        marker: Arg.Switch('-m', '--no-marker', off=True, help=(
            'Do not require magic marker when encoding and do not search for '
            'marker when decoding.')
        ) = True
    ):
        super().__init__(marker=marker)

    @classmethod
    def _chunk(cls, byte, index):
        k = byte - 9
        c = cls._CHUNKS[k * 3 : k * 3 + 3]
        return c[cls._OFFSETS[index % 64]]

    def _escape(self, iterable):
        if self.args.marker:
            yield from self._MARKER_INIT
        for byte in iterable:
            try:
                yield from self._ESCAPE[byte]
            except KeyError:
                yield byte
        if self.args.marker:
            yield from self._MARKER_STOP

    def _unescape(self, data):
        def unescaper(m): return self._UNESCAPE[m[0]]
        return re.sub(RB'@[$!*#&]', unescaper, data)

    @classmethod
    def _decoded(cls, data):
        index = -1
        for byte in data:
            if byte < 128:
                index += 1
            if byte == 9 or byte in range(32, 128) and byte not in (60, 62, 64):
                byte = cls._chunk(byte, index)
            yield byte

    @classmethod
    def _encoded(cls, data):
        for i, byte in enumerate(data):
            try:
                sequence = cls._ENCODER[byte]
            except KeyError:
                yield byte
            else:
                offset = cls._OFFSETS[i % 0x40]
                yield sequence[offset]

    def reverse(self, data):
        return bytearray(self._escape(self._encoded(data)))

    def process(self, data):
        if self.args.marker:
            match = formats.wshenc.search(data)
            if not match:
                raise ValueError('Encoded script marker was not found.')
            data = match[0][12:-12]
        return bytearray(self._decoded(self._unescape(data)))

    @classmethod
    def handles(cls, data: bytearray):
        view = memoryview(data)
        if not re.search(BR'#@~\^[!-~]{6}==', view[:+64]):
            return None
        if not re.search(BR'[!-~]{6}==\^#~@', view[-64:]):
            return None
        return True

class xchacha (key, stateful=False, discard=0, nonce=b'REFINERY', magic=b'', offset=0, rounds=20)

XChaCha encryption and decryption. The nonce must be 24 bytes long.

Expand source code Browse git

class xchacha(LatinCipherUnit):
    """
    XChaCha encryption and decryption. The nonce must be 24 bytes long.
    """
    def keystream(self) -> Iterable[int]:
        kdp, kdn, nonce = struct.unpack('<Q8s8s', self.args.nonce)
        yield from LatinX(
            ChaChaCipher,
            (0, 1, 2, 3, 12, 13, 14, 15),
            self.args.key,
            kdn,
            kdp,
            nonce,
            self.args.magic,
            self.args.rounds,
            self.args.offset,
        )

class xfcc (variable='count', relative=False)

The cross frame chunk count unit! It computes the number of times a chunk occurs across several frames of input. It consumes all frames at its current level of the frame tree and counts the number of times each item occurs in each of them. It converts a frame tree of depth 2 into a new frame tree of depth 2 where the parent of every leaf has this leaf as its only child. The leaves of this tree have been enriched with a meta variable containing the number of times the corresponding chunk has occurred in the input frame tree. The variable that stores this information is scoped at the first layer of this subtree, which means that a frame can be closed once after invocation of xfcc and the variable remains accessible. This unit can be used to compute set intersections across frames as follows:

(1) [| (2) [| dedup | xfcc -r t ]| iff t==1 | (3) ]

A sequence of chunks is emitted at (1), each of which has chunks extracted at (2). It is then important to use dedup before calling xfcc, since xfcc performs an absolute count. The frame at (3) contains the intersection of all datasets that were extracted at (2).

Expand source code Browse git

class xfcc(Unit):
    """
    The cross frame chunk count unit! It computes the number of times a chunk occurs across several frames
    of input. It consumes all frames at its current level of the frame tree and counts the number of times
    each item occurs in each of them. It converts a frame tree of depth 2 into a new frame tree of depth 2
    where the parent of every leaf has this leaf as its only child. The leaves of this tree have been
    enriched with a meta variable containing the number of times the corresponding chunk has occurred in
    the input frame tree. The variable that stores this information is scoped at the first layer of this
    subtree, which means that a frame can be closed once after invocation of xfcc and the variable remains
    accessible. This unit can be used to compute set intersections across frames as follows:

        (1) [| (2) [| dedup | xfcc -r t ]| iff t==1 | (3) ]

    A sequence of chunks is emitted at (1), each of which has chunks extracted at (2). It is then important
    to use dedup before calling xfcc, since xfcc performs an absolute count. The frame at (3) contains the
    intersection of all datasets that were extracted at (2).
    """
    def __init__(
        self,
        variable: Arg(help='The variable which is used as the accumulator') = 'count',
        relative: Arg.Switch('-r', help='Normalize the accumulator to a number between 0 and 1.') = False
    ):
        super().__init__(variable=variable, relative=relative)
        self._trunk = None
        self._store: Dict[Chunk, int] = defaultdict(int)

    def finish(self):
        vn = self.args.variable
        rc = self.args.relative
        if rc and self._store:
            maximum = max(self._store.values())
        for index, (chunk, count) in enumerate(self._store.items()):
            if rc:
                count /= maximum
            chunk.path[-2] = 0
            chunk.path[-1] = index
            chunk.meta[vn] = count
            chunk.meta.set_scope(vn, chunk.scope - 1)
            yield chunk
        self._store.clear()

    def _getcount(self, chunk):
        try:
            count = int(chunk.meta[self.args.variable])
        except (AttributeError, KeyError, TypeError):
            return 1
        else:
            return count

    def filter(self, chunks: Iterable[Chunk]):
        it = iter(chunks)
        try:
            head = next(it)
        except StopIteration:
            return
        if len(head.path) < 2:
            self.log_warn(F'the current frame is nested {len(head.path)} layers deep, at least two layers are required.')
            yield head
            yield from it
            return
        trunk = head.path[:-2]
        store = self._store
        if trunk != self._trunk:
            yield from self.finish()
            self._trunk = trunk
        store[head] += self._getcount(head)
        for chunk in it:
            store[chunk] += self._getcount(chunk)

class xj0 (fmt='', all=False, one=False, raw=False)

Extracts a single field from a JSON document at depth 0. By default, the unit applies a heuristic to extract remaining fields as metadata: String values are extracted only if they do not exceed 80 characters in length and do not contain any line breaks. Floating-point, integer, boolean values, and lists of the latter are also extracted.

Expand source code Browse git

class xj0(Unit):
    """
    Extracts a single field from a JSON document at depth 0. By default, the unit applies a heuristic to
    extract remaining fields as metadata: String values are extracted only if they do not exceed 80
    characters in length and do not contain any line breaks. Floating-point, integer, boolean values, and
    lists of the latter are also extracted.
    """
    def __init__(
        self,
        fmt: Unit.Arg.String(help=(
            'Format expression for the output chunk; may use previously extracted JSON items. The default '
            'is {default}, which represents the input data.')) = '',
        all: Unit.Arg.Switch('-a', group='META', help='Extract all other fields as metadata regardless of length and type.') = False,
        one: Unit.Arg.Switch('-x', group='META', help='Do not extract any other fields as metadata.') = False,
        raw: Unit.Arg.Switch('-r', help='Disable conversion of JSON strings to binary strings in metadata') = False,
    ):
        super().__init__(fmt=fmt, one=one, raw=raw, all=all)

    def process(self, data: Chunk):

        def convert(value, iskey=False):
            if self.args.raw:
                return value
            if isinstance(value, (float, int, bool)):
                return value
            if isinstance(value, str):
                return value.encode(self.codec)
            if iskey:
                raise TypeError
            if isinstance(value, dict):
                return {convert(k): convert(v) for k, v in value.items()}
            if isinstance(value, list):
                return [convert(k) for k in value]

        def acceptable(key, value, nested=False, convert=False):
            if not is_valid_variable_name(key):
                self.log_info(F'rejecting item with invalid name {key}')
                return None
            if isinstance(value, (float, int, bool)):
                return value
            if isinstance(value, dict):
                if not self.args.all:
                    self.log_info(F'rejecting item {key} with dictionary value')
                    return False
                return True
            if isinstance(value, list):
                if nested:
                    self.log_info(F'rejecting item {key} containing a doubly nested list')
                    return False
                return all(acceptable(key, t, True) for t in value)
            if isinstance(value, str):
                if not self.args.all:
                    if len(value) not in range(1, 80):
                        self.log_info(F'rejecting string item {key} because {len(value)} exceeds the length limit')
                        return False
                    if '\n' in value:
                        self.log_info(F'rejecting string item {key} because it contains line breaks')
                        return False
                return True
            return False

        jdoc: dict = json.loads(data)
        if not isinstance(jdoc, dict):
            raise ValueError('The input must be a JSON dictionary.')
        meta = metavars(data)
        args = {k: convert(v) for k, v in jdoc.items() if acceptable(k, v)}
        used = set()
        data[:] = meta.format_bin(self.args.fmt, self.codec, [data], args, used)
        for u in used:
            args.pop(u, None)
        if not self.args.one:
            data.meta.update(args)
        return data

class xjl

Returns all JSON elements from a JSON iterable as individual outputs. When reversed, the unit collects all chunks in the frame and wraps them as a JSON list.

Expand source code Browse git

class xjl(Unit):
    """
    Returns all JSON elements from a JSON iterable as individual outputs. When reversed, the unit
    collects all chunks in the frame and wraps them as a JSON list.
    """

    def process(self, data):
        try:
            doc: Union[list, dict] = json.loads(data)
        except Exception:
            from refinery.units.pattern.carve_json import carve_json
            doc = data | carve_json | json.loads
        try:
            it = doc.values()
        except AttributeError:
            it = doc
        for item in it:
            yield json.dumps(item, indent=4).encode(self.codec)

    def reverse(self, data):
        return json.dumps(data.temp).encode(self.codec)

    def filter(self, chunks: Iterable[Chunk]):
        if not self.args.reverse:
            yield from chunks

        from refinery.lib.tools import begin

        if it := begin(chunks):
            head, rest = it
            collected = [head.decode(self.codec)]
            collected.extend(chunk.decode(self.codec) for chunk in rest)
            head.temp = collected
            yield head

class xkey (range=slice(1, 32, None), plaintext=None, searchpos=slice(0, None, None), alph=False, crib=False, freq=False)

The unit expects encrypted input which was encrypted byte-wise with a polyalphabetic key. For both bit-wise and byte-wise addition, it can attempt do determine this key by three methods:

Known plaintext cribs: The unit contains a library of file signatures that are expected to occur at specific offsets. It uses these to attempt a known-plaintext attack against the input. If a key is found that is at most half the size of such a crib, it is returned.
Known alphabets: For each given key length, the input is split into slices that would have been encrypted with a single byte for keys of that length. Each such slice undergoes a character frequency analysis. If the histogram indicates that an alphabet of a small size was used (i.e. base64), then the unit attempts to determine the key based on this.
Known high frequency glyph: Works if the plaintext contains one letter that occurs with very high frequency, i.e. zero padding in PE or ELF files, and the space character in text. Based on this assumption, the unit computes the most likely key. This method will work best on uncompressed files that were encrypted with a short key.

Expand source code Browse git

class xkey(Unit):
    """
    The unit expects encrypted input which was encrypted byte-wise with a polyalphabetic key. For
    both bit-wise and byte-wise addition, it can attempt do determine this key by three methods:

    1. Known plaintext cribs: The unit contains a library of file signatures that are expected to
       occur at specific offsets. It uses these to attempt a known-plaintext attack against the
       input. If a key is found that is at most half the size of such a crib, it is returned.
    2. Known alphabets: For each given key length, the input is split into slices that would have
       been encrypted with a single byte for keys of that length. Each such slice undergoes a
       character frequency analysis. If the histogram indicates that an alphabet of a small size
       was used (i.e. base64), then the unit attempts to determine the key based on this.
    3. Known high frequency glyph: Works if the plaintext contains one letter that occurs with
       very high frequency, i.e. zero padding in PE or ELF files, and the space character in text.
       Based on this assumption, the unit computes the most likely key. This method will work best
       on uncompressed files that were encrypted with a short key.

    When no option is set, the unit uses all the above methods by default. When at least one of
    the methods is selected, it will attempt only selected methods. When a custom plaintext is given,
    the other methods are disabled by default.
    """

    _CRIBS: dict[range, dict[str, bytes | tuple[bytes | tuple[bytes, ...], ...]]] = {
        range(0, 64, 4): {
            'ZIP'           : (B'PK\x03\x04', (B'\x14\x00', B'\x0A\x00'), (B'\x08\x00', B'\x00\x00')),
            'RAR'           : (B'Rar!\x1A\x07', (B'\x01\x00', B'\x00')),
            'ZPAQ'          : (B'\x37\x6B\x53\x74\xA0\x31\x83\xD3\x8C\xB2\x28\xB0\xD3\x7A\x50\x51'),
            'ZSTD'          : (B'\x28\xB5\x2F\xFD'),
            'ZZip'          : (B'7z\xBC\xAF\x27\x1C', (B'\x00\x02', B'\x00\x03', B'\x00\x04')),
            'APLib'         : (B'AP32\x18\0\0\0'),
            'BZip'          : (B'BZh'),
            'DDS'           : (B'\x00\x00\x00\x01Bud1'),
            'ELF'           : (B'\x7FELF'),
            'JavaClass'     : (B'\xCA\xFE\xBA\xBE'),
            'LZIP'          : (B'LZIP'),
            'SZDD'          : (B'SZDD\x88\xF0\x27\x33'),
            'LZMA'          : (B'\x5D\x00\x00\x00'),
            'LZMA/XZ'       : (B'\xFD7zXZ'),
            'LZO'           : (B'\x89\x4c\x5a\x4f\x00\x0d\x0a\x1a\x0a'),
            'MachO/BE'      : (B'\xCA\xFE\xBA\xBE'),
            'MachO/LE'      : (B'\xBE\xBA\xFE\xCA'),
            'MSCF'          : (B'\x0A\x51\xE5\xC0'),
            'OleDocument'   : (B'\xD0\xCF\x11\xE0', (B'', B'\xA1\xB1\x1A\xE1'), (B'', B'\0\0\0\0\0\0\0\0')),
            'PdfDocument'   : (B'%PDF-', _S(B'12'), (B'.'), _S(B'0123456789'), _S(B'\r\n')),
            'SQLite'        : (B'SQLite format 3\0'),
            'GIF'           : (B'GIF87a', B'GIF89a'),
            'PNG'           : (B'\x89PNG\r\n\x1A\n'),
            'DEX'           : (B'dex\n035\0'),
            'JPG'           : (B'\xFF\xD8\xFF', _S(B'\xE0\xE1\xEE'), (B'\x00\x10\x4A\x46\x49\x46\x00\x01', B'')),
            'OneNote'       : (B'\xE4\x52\x5C\x7B\x8C\xD8\xA7\x4D\xAE\xB1\x53\x78\xD0\x29\x96\xD3'),
            'A3xScript'     : (B'\xA3\x48\x4B\xBE\x98\x6C\x4A\xA9\x99\x4C\x53\x0A\x86\xD6\x48\x7D'),
            'RTFDocument'   : (B'{\\rtf1', (B'\\adeflang', B'\\ansi', B'')),
            'CallToPop'     : (B'\xE8\0\0\0\0', (
                               B'\x41\x58', B'\x41\x59', B'\x41\x5A', B'\x41\x5B',
                                   B'\x58',     B'\x59',     B'\x5A',     B'\x5B',   # noqa
                                   B'\x5C',     B'\x5D',     B'\x5E',     B'\x5F',   # noqa
                               )),
            'Cert'          : (B'-----BEGIN CERTIFICATE-----'),
            'PrivateKey'    : (B'-----BEGIN PRIVATE KEY-----'),
            'PrivateKeyDSA' : (B'-----BEGIN DSA PRIVATE KEY-----'),
            'PrivateKeyRSA' : (B'-----BEGIN RSA PRIVATE KEY-----'),
            'PrivateKeySSH' : (B'-----BEGIN OPENSSH PRIVATE KEY-----'),
            'PEM'           : (B'-----BEGIN '),
            'PuTTY-Key'     : (B'PuTTY-User-Key-File-', (B'2:', B'3:')),
            'MsAccess'      : (B'\0\01\0\0Standard ', (B'ACE', B'Jet'), B' DB'),
        },
        range(0x10, 0x11): {
            'ASAR'          : (B'{"files":{"'),
        },
        range(0x10): {
            'DocTypeLower'  : (B'<!doctype\x20'),
            'DocTypeUpper'  : (B'<!DOCTYPE\x20'),
            'HTMLLower'     : (B'<html>'),
            'HTMLUpper'     : (B'<HTML>'),
            'XML'           : (B'<?xml version="'),
            'Ace'           : (B'**ACE**'),
        },
        range(0x36, 0x41): {
            'PEStub': (
                B'\0\x0E\x1F\xBA\x0E\x00\xB4\x09\xCD\x21\xB8\x01\x4C\xCD\x21'
                B'This program cannot be run in DOS mode.\r'
            ),
            'PEDelphiStub': (
                B'\0\xBA\x10\x00\x0E\x1F\xB4\x09\xCD\x21\xB8\x01\x4C\xCD\x21\x90\x90'
                B'This program must be run under Win', (B'32', B'64'), B'\x0D\x0A'
            ),
        },
        range(0x48, 0x60): {
            'PEStubMsg'      : (B'This program cannot be run in DOS mode.\r'),
            'PEDelphiStubMsg': (B'This program must be run under Win', (B'32', B'64'), B'\x0D\x0A'),
        },
        range(0xD0, 0xD1): {
            'Tar'           : (B'\x00' * 0x30 + B'ustar', (B'\x20\x20\x00', B'\x00\x30\x30')),
        },
    }

    _ENC_ALPHABETS = [
        B'0123456789,',
        B'0123456789;',
        B'0123456789ABCDEF',
        B'0123456789abcdef',
        B'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567',
        B'abcdefghijklmnopqrstuvwxyz234567',
        B'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ+/',
        B'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_',
    ]

    _WSH_ALPHABET = bytes(set(range(0x20, 0x80)) - {0x3C, 0x3E} | {0x09})

    class _rt(enum.IntEnum):
        crib = 0
        alph = 1
        freq = 2

    class _result(NamedTuple):
        key: bytes
        how: xkey._rt
        xor: Optional[bool] = None

    def __init__(
        self,
        range: Arg.Bounds(help='range of length values to try in Python slice syntax, the default is {default}.') = slice(1, 32),
        plaintext: Arg.Binary('-p', help='Provide a buffer of known plaintext.') = None,
        searchpos: Arg.Bounds('-s', metavar='S:E', help=(
            'Only used when a known plaintext buffer is provided; In this case it narrows the search range '
            'for the offset of that data to between S and E.')) = slice(0, None),
        alph: Arg.Switch('-a', help='enable search for keys via known encoder alphabets') = False,
        crib: Arg.Switch('-c', help='enable search for keys via known plaintext cribs') = False,
        freq: Arg.Switch('-f', help='enable search for keys via frequency analysis') = False,
    ):
        if not any((alph, crib, freq)) and plaintext is None:
            alph = crib = freq = True
        super().__init__(
            range=range,
            plaintext=plaintext,
            searchpos=searchpos,
            alph=alph,
            crib=crib,
            freq=freq,
        )

    def process(self, data: bytearray):
        for result in self._attack(data):
            out = result.key
            if how := result.how:
                out = self.labelled(out, method=how)
            return out

    def _attack(self, data: bytearray):
        bounds: slice = self.args.range
        view = memoryview(data)
        length = len(view)

        if length <= 1:
            return

        if length >= 0x100:
            view = view[:-4]

        start = bounds.start or 1
        stop = min(bounds.stop or length, length)

        if (step := bounds.step) is None:
            step = 1
        elif bounds.start is None:
            start *= step

        self.log_debug(
            F'received input range [{bounds.start}:{bounds.stop}:{bounds.step}], '
            F'using [{start}:{stop}:{step}]')

        criblist: list[tuple[range, dict[str, bytes | tuple[bytes | tuple[bytes, ...], ...]]]] = []

        if p := self.args.plaintext:
            pos: slice = self.args.searchpos
            end = len(data) - len(p) if pos.stop is None else pos.stop
            criblist.append((range(pos.start or 0, end + 1), {'Plaintext': [p]}))
        if self.args.crib:
            for r, byname in self._CRIBS.items():
                compiled = {
                    name: list(_generate_cribs(cribs))
                    for name, cribs in byname.items()
                }
                criblist.append((r, compiled))

        if self.args.alph:
            alphabets: dict[int, list[bytes]] = {}
            for alphabet in self._ENC_ALPHABETS:
                for suffix in (B'', B'\x20', B'\x0A', B'\x20\x0A'):
                    a = alphabet + suffix
                    alphabets.setdefault(len(a), []).append(a)
            alphabets[len(self._WSH_ALPHABET)] = self._WSH_ALPHABET
        else:
            alphabets = None

        for xor in (True, False):
            if key := self._process_crib(view, xor, criblist):
                yield self._result(key, self._rt.crib, xor)

        hist = {}
        freq = []

        for xor in (True, False):
            result = self._process_freq(view, (start, stop, step), alphabets, xor, hist)
            if result is None or not result.key:
                continue
            if result.how == self._rt.freq:
                freq.append(result)
                continue
            yield result

        yield from freq

    def _process_crib(
        self,
        view: memoryview,
        xor: bool,
        criblist: list[tuple[range, dict[str, list[bytes]]]]
    ):
        for offsets, cribs_by_type in criblist:
            for name, cribs in cribs_by_type.items():
                for crib in cribs:
                    cn = len(crib)
                    for offset in offsets:
                        test = view[offset:offset + cn]
                        if len(test) != cn:
                            continue
                        key = strxor(test, crib) if xor else bytes(
                            a - b & 0xFF for a, b in zip(test, crib))
                        if key := _cyclic_base(key):
                            self.log_info(F'found key via crib {name}:', crib, clip=True)
                            shift = -offset % len(key)
                            return key[shift:] + key[:shift]

    def _process_freq(
        self,
        view: memoryview,
        bounds: tuple[int, int, int],
        alphabets: dict[int, list[bytes]] | None,
        xor: bool,
        hist: dict[int, tuple[list[bytes], list[Counter]]],
    ):
        n = len(view)
        start, stop, step = bounds
        score = 0
        guess = None
        first = not hist

        for keylen in range(start, stop + 1, step):
            try:
                cached = hist[keylen]
            except KeyError:
                patches = [view[j::keylen] for j in range(keylen)]
                histograms = [Counter(p) for p in patches]
                hist[keylen] = patches, histograms
            else:
                patches, histograms = cached

            if alphabets is not None:
                hlc = Counter(len(h) for h in histograms)
                base, coverage = hlc.most_common(1)[0]

                if coverage * 2 > keylen and base in alphabets:
                    self.log_debug(F'solving for potential plaintext alphabet of size 0x{base:02X} at {keylen}')
                    keys: dict[bytes, bytes] = {}
                    for alphabet in alphabets[base]:
                        key = bytearray(keylen)
                        for k, patch in enumerate(patches):
                            keybyte = set(range(0x100))
                            for c in patch:
                                keybyte &= (
                                    {c ^ p & 0xFF for p in alphabet}
                                ) if xor else (
                                    {c - p & 0xFF for p in alphabet}
                                )
                                if len(keybyte) == 1:
                                    key[k] = next(iter(keybyte))
                                    break
                            else:
                                key = None
                                break
                        if key is not None:
                            keys[alphabet] = key
                    if len(keys) == 1:
                        self.log_debug(F'discovered plaintext alphabet of size 0x{base:02X} at {keylen}')
                        alphabet, key = keys.popitem()
                        return self._result(bytes(key), self._rt.alph, xor)

            if not first or not self.args.freq:
                continue

            _guess = [h.most_common(1)[0] for h in histograms]
            _score = sum(letter_count for _, letter_count in _guess) / n
            # This scaling accounts for the smaller probability of larger keys. No proper statistical analysis has been
            # conducted to derive it; there might be plenty of room for improvement here.
            _score = _score * ((n - keylen) / (n - 1)) ** keylen

            logmsg = F'got score {_score * 100:05.2f}% for length {keylen}'
            if _score > score:
                self.log_info(logmsg)
                score = _score
                guess = bytes(value for value, _ in _guess)
            else:
                self.log_debug(logmsg)

        if guess is not None:
            return self._result(guess, self._rt.freq)

class xlmdeobf (extract_only=False, sort_formulas=False, with_ms_excel=False, day=-1, output_formula_format='CELL:[[CELL-ADDR]], [[STATUS]], [[INT-FORMULA]]', extract_formula_format='CELL:[[CELL-ADDR]], [[CELL-FORMULA]], [[CELL-VALUE]]', no_indent=False, start_point='', password='', output_level=0, timeout=0)

Wrapper around XLMMacroDeobfuscator to decode obfuscated Excel v4.0 (XLM) macros.

Expand source code Browse git

class xlmdeobf(Unit):
    """
    Wrapper around XLMMacroDeobfuscator to decode obfuscated Excel v4.0 (XLM) macros.
    """

    def __init__(
        self,
        extract_only: Unit.Arg.Switch(
            '-x', help='Only extract cells without any emulation.'
        ) = False,
        sort_formulas: Unit.Arg.Switch(
            '-s', '--sort-formulas',
            help='Sort extracted formulas based on their cell address (implies -x).',
        ) = False,
        with_ms_excel: Unit.Arg.Switch(
            '-X', '--with-ms-excel', help='Use MS Excel to process XLS files.'
        ) = False,
        day: Unit.Arg.Number(
            '-d',
            '--day',
            help='Specify the day of month',
        ) = -1,
        output_formula_format: Unit.Arg(
            '-O', '--output-format',
            type=str,
            metavar='FMT',
            help='Specify the format for output formulas (using [[CELL-ADDR]], [[INT-FORMULA]], and [[STATUS]])',
        ) = 'CELL:[[CELL-ADDR]], [[STATUS]], [[INT-FORMULA]]',
        extract_formula_format: Unit.Arg(
            '-E', '--extract-format',
            metavar='FMT',
            type=str,
            help='Specify the format for extracted formulas (using [[CELL-ADDR]], [[CELL-FORMULA]], and [[CELL-VALUE]])',
        ) = 'CELL:[[CELL-ADDR]], [[CELL-FORMULA]], [[CELL-VALUE]]',
        no_indent: Unit.Arg.Switch(
            '-I', '--no-indent',
            help='Do not show indent before formulas',
        ) = False,
        start_point: Unit.Arg(
            '-c', '--start-point',
            type=str,
            help='Start interpretation from a specific cell address',
            metavar='CELL',
        ) = '',
        password: Unit.Arg(
            '-p',
            '--password',
            type=str,
            help='Password to decrypt the protected document',
        ) = '',
        output_level: Unit.Arg.Number(
            '-o',
            '--output-level',
            help=(
                'Set the level of details to be shown (0:all commands, 1: commands no jump 2:important '
                'commands 3:strings in important commands).'
            ),
        ) = 0,
        timeout: Unit.Arg.Number(
            '-t',
            '--timeout',
            help='Stop emulation after N seconds (0: not interruption N>0: stop emulation after N seconds)',
        ) = 0,
    ):
        extract_only = sort_formulas or extract_only
        self.superinit(super(), **vars())

    @Unit.Requires('XLMMacroDeobfuscator', 'formats', 'office')
    def _process_file():
        with NoLogging(NoLogging.Mode.ALL):
            from XLMMacroDeobfuscator.configs import settings
            settings.SILENT = True
            from XLMMacroDeobfuscator.deobfuscator import process_file
            return process_file

    def process(self, data: bytearray):
        with VirtualFileSystem() as vfs, NoLogging(NoLogging.Mode.ALL):
            result = self._process_file(
                file=vfs.new(data),
                noninteractive=True,
                return_deobfuscated=True,
                extract_only=self.args.extract_only,
                silent=True,
                sort_formulas=self.args.sort_formulas,
                defined_names=False,
                with_ms_excel=self.args.with_ms_excel,
                start_with_shell=False,
                day=self.args.day,
                output_formula_format=self.args.output_formula_format,
                extract_formula_format=self.args.extract_formula_format,
                no_indent=self.args.no_indent,
                start_point=self.args.start_point,
                password=self.args.password,
                output_level=self.args.output_level,
                timeout=self.args.timeout,
            )
        return '\n'.join(result).encode(self.codec)

class xlxtr (*references)

Extract data from Microsoft Excel documents, both Legacy and new XML type documents. A sheet reference is of the form B1 or 1.2, both specifying the first cell of the second column. A cell range can be specified as B1:C12, or 1.2:C12, or 1.2:12.3. Finally, the unit will always refer to the first sheet in the document and to change this, specify the sheet name or index separated by a hashtag, i.e. sheet#B1:C12 or 1#B1:C12. Note that indices are 1-based. To get all elements of one sheet, use sheet#. The unit If parsing a sheet reference fails, the script will assume that the given reference specifies a sheet.

Expand source code Browse git

class xlxtr(_ExcelUnit):
    """
    Extract data from Microsoft Excel documents, both Legacy and new XML type documents. A sheet
    reference is of the form `B1` or `1.2`, both specifying the first cell of the second column.
    A cell range can be specified as `B1:C12`, or `1.2:C12`, or `1.2:12.3`. Finally, the unit will
    always refer to the first sheet in the document and to change this, specify the sheet name or
    index separated by a hashtag, i.e. `sheet#B1:C12` or `1#B1:C12`. Note that indices are
    1-based. To get all elements of one sheet, use `sheet#`. The unit If parsing a sheet reference
    fails, the script will assume that the given reference specifies a sheet.
    """
    def __init__(self, *references: Arg(metavar='reference', type=SheetReference, help=(
        'A sheet reference to be extracted. '
        'If no sheet references are given, the unit lists all sheet names.'
    ))):
        if not references:
            references = [SheetReference('*')]
        super().__init__(references=references)

    def process(self, data):
        try:
            wb = Workbook(data, self)
        except ImportError:
            raise
        except Exception as E:
            raise ValueError('Input not recognized as Excel document.') from E
        for ref in self.args.references:
            ref: SheetReference
            for k, name in enumerate(wb.sheets()):
                if not ref.match(k, name):
                    continue
                try:
                    data = wb.get_sheet_data(name)
                except Exception as error:
                    self.log_info(F'error reading sheet {name}:', error)
                    continue
                for r, row in enumerate(data, 1):
                    for c, value in enumerate(row, 1):
                        if (r, c) not in ref:
                            continue
                        if value is None:
                            continue
                        yield self.labelled(
                            str(value).encode(self.codec),
                            row=r,
                            col=c,
                            ref=_rc2ref(r, c),
                            sheet=name
                        )

class xor (*argument, bigendian=False, blocksize=None)

Form the exclusive or of the input data with the given argument.

Expand source code Browse git

class xor(BinaryOperationWithAutoBlockAdjustment):
    """
    Form the exclusive or of the input data with the given argument.
    """
    @staticmethod
    def operate(a, b): return a ^ b
    @staticmethod
    def inplace(a, b): a ^= b

    def _fastblock(self, data):
        try:
            return super()._fastblock(data)
        except FastBlockError as E:
            try:
                from Cryptodome.Util.strxor import strxor
            except ModuleNotFoundError:
                raise E
            else:
                from itertools import islice
                size = len(data)
                arg0 = self._normalize_argument(*self._argument_parse_hook(self.args.argument[0]))
                take = len(data) // self.blocksize + 1
                argb = self.unchunk(islice(arg0, take))
                del argb[size:]
                return strxor(data, argb)

class xsalsa (key, stateful=False, discard=0, nonce=b'REFINERY', magic=b'', offset=0, rounds=20)

XSalsa encryption and decryption. The nonce must be 24 bytes long.

Expand source code Browse git

class xsalsa(LatinCipherUnit):
    """
    XSalsa encryption and decryption. The nonce must be 24 bytes long.
    """
    def keystream(self) -> Iterable[int]:
        kdn, kdp, nonce = struct.unpack('<8sQ8s', self.args.nonce)
        yield from LatinX(
            SalsaCipher,
            (0, 5, 10, 15, 6, 7, 8, 9),
            self.args.key,
            kdn,
            kdp,
            nonce,
            self.args.magic,
            self.args.rounds,
            self.args.offset,
        )

class xt (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

This unit generically extracts files from archives. It attempts to identify the archive format and use the corresponding specific extractor from among the ones implemented in refinery.

This unit is a path extractor which extracts data from a hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it a meta variable that contains its path within the source structure. The positional arguments to the command are patterns that can be used to filter the extracted items by their path. To view only the paths of all chunks, use the listing switch:

emit something | xt --list

Otherwise, extracted items are written to the standard output port and usually require a frame to properly process. In order to dump all extracted data to disk, the following pipeline can be used:

emit something | xt [| dump {path} ]

Expand source code Browse git

class xt(ArchiveUnit, docs='{0}{p}{PathExtractorUnit}'):
    """
    This unit generically extracts files from archives. It attempts to identify the archive format
    and use the corresponding specific extractor from among the ones implemented in refinery.
    """
    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        out = False
        for engine in cls.handlers():
            engine_verdict = engine.handles(data)
            if engine_verdict is True:
                return True
            if engine_verdict is None:
                out = None
        return out

    @staticmethod
    def handlers():
        """
        Returns all archive handlers supported by the unit.
        """
        # units that check fixed offsets
        from refinery.units.formats.archive.xttar import xttar        ; yield xttar     # noqa
        from refinery.units.formats.archive.xtiso import xtiso        ; yield xtiso     # noqa
        from refinery.units.formats.office.xtrtf import xtrtf         ; yield xtrtf     # noqa
        from refinery.units.formats.archive.xtcab import xtcab        ; yield xtcab     # noqa
        from refinery.units.formats.archive.xtace import xtace        ; yield xtace     # noqa
        from refinery.units.formats.archive.xtmacho import xtmacho    ; yield xtmacho   # noqa
        from refinery.units.formats.archive.xtasar import xtasar      ; yield xtasar    # noqa
        from refinery.units.formats.pdf import xtpdf                  ; yield xtpdf     # noqa
        from refinery.units.formats.winreg import winreg              ; yield winreg    # noqa
        from refinery.units.formats.archive.xtgz import xtgz          ; yield xtgz      # noqa
        from refinery.units.formats.archive.xtcpio import xtcpio      ; yield xtcpio    # noqa
        # units that use fixed offsets + file magic
        from refinery.units.formats.msi import xtmsi                  ; yield xtmsi     # noqa
        # units that search for markers
        from refinery.units.formats.archive.xt7z import xt7z          ; yield xt7z      # noqa
        from refinery.units.formats.office.xtdoc import xtdoc         ; yield xtdoc     # noqa
        from refinery.units.formats.archive.xtzip import xtzip        ; yield xtzip     # noqa
        from refinery.units.formats.pe.dotnet.dnsfx import dnsfx      ; yield dnsfx     # noqa
        from refinery.units.formats.archive.xtinno import xtinno      ; yield xtinno    # noqa
        from refinery.units.formats.archive.xtiss import xtiss        ; yield xtiss     # noqa
        from refinery.units.formats.archive.xtnsis import xtnsis      ; yield xtnsis    # noqa
        from refinery.units.formats.archive.xtpyi import xtpyi        ; yield xtpyi     # noqa
        from refinery.units.formats.a3x import a3x                    ; yield a3x       # noqa
        from refinery.units.formats.archive.xtnode import xtnode      ; yield xtnode    # noqa
        from refinery.units.formats.archive.xtzpaq import xtzpaq      ; yield xtzpaq    # noqa
        from refinery.units.formats.email import xtmail               ; yield xtmail    # noqa
        from refinery.units.formats.office.xtone import xtone         ; yield xtone     # noqa
        # units that implement more complex parsing / searching:
        from refinery.units.formats.archive.xtsim import xtsim        ; yield xtsim     # noqa
        from refinery.units.formats.archive.xtnuitka import xtnuitka  ; yield xtnuitka  # noqa
        # fallbacks that have to be attempted last
        from refinery.units.formats.json import xtjson                ; yield xtjson    # noqa
        from refinery.units.formats.exe.vsect import vsect            ; yield vsect     # noqa

    def unpack(self, data):
        fallback: List[Type[PathExtractorUnit]] = []
        errors = {}
        pos_args = self.args.paths
        key_args = dict(
            list=self.args.list,
            path=self.args.path,
            date=self.args.date,
            join_path=self.args.join,
            drop_path=self.args.drop,
        )
        if self.args.pwd:
            key_args.update(pwd=self.args.pwd)
        if self.args.regex:
            key_args.update(regex=self.args.regex)

        class unpacker:
            unit = self

            def __init__(self, handler: Type[PathExtractorUnit], fallback: bool):
                self.success = False
                self.handler = handler
                self.fallback = fallback
                self.count = 0

            def __iter__(self):
                handler = self.handler
                if self.fallback:
                    verdict = True
                else:
                    verdict = handler.handles(data)
                if verdict is False:
                    self.unit.log_info(F'rejected: {handler.name}')
                elif verdict is True:
                    if not self.fallback:
                        self.unit.log_info(F'accepted: {handler.name}')
                    try:
                        unit = handler(*pos_args, **key_args)
                        unit.args.lenient = self.unit.args.lenient
                        unit.args.quiet = self.unit.args.quiet
                        unit.log_level = self.unit.log_level
                    except TypeError as error:
                        self.unit.log_debug('handler construction failed:', error)
                        return
                    try:
                        test_unpack = not self.unit.args.list
                        for item in unit.unpack(data):
                            if test_unpack:
                                item.get_data()
                                test_unpack = False
                            self.count += 1
                            yield item
                    except Exception as error:
                        if not self.fallback:
                            errors[handler.name] = error
                        if isinstance(error, MultipleArchives):
                            self.unit.log_warn(error)
                        else:
                            self.unit.log_debug('handler unpacking failed:', error)
                    else:
                        self.success = True
                elif verdict is None:
                    fallback.append(handler)

        extracted = 0

        for handler in self.handlers():
            self.CustomPathSeparator = handler.CustomPathSeparator
            it = unpacker(handler, fallback=False)
            yield from it
            if it.success:
                extracted += it.count
                if extracted != 0:
                    break
                self.log_debug('handler extracted zero items, continuing')

        if extracted > 0:
            return

        self.log_debug('fallback order:', lambda: ', '.join(h.name for h in fallback))

        for handler in fallback:
            it = unpacker(handler, fallback=True)
            yield from it
            if it.success:
                return

        if not errors:
            raise ValueError('input data did not match any known archive format')
        for name, error in errors.items():
            self.log_info(F'error when trying to unpack with {name}:', error)
        raise RefineryException('none of the available unpackers could handle this data')

class xt7z (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

Extract files from a 7zip archive. This unit is a path extractor which extracts data from a hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it a meta variable that contains its path within the source structure. The positional arguments to the command are patterns that can be used to filter the extracted items by their path. To view only the paths of all chunks, use the listing switch:

emit something | xt7z --list

Otherwise, extracted items are written to the standard output port and usually require a frame to properly process. In order to dump all extracted data to disk, the following pipeline can be used:

emit something | xt7z [| dump {path} ]

Expand source code Browse git

class xt7z(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extract files from a 7zip archive.
    """
    @ArchiveUnit.Requires('py7zr', 'arc', 'default', 'extended')
    def _py7zr():
        import py7zr
        import py7zr.exceptions
        return py7zr

    def unpack(self, data: bytearray):
        for match in re.finditer(re.escape(B'7z\xBC\xAF\x27\x1C'), data):
            start = match.start()
            if start != 0:
                self.log_info(F'found a header at offset 0x{start:X}, trying to extract from there.')
            try:
                yield from self._unpack_from(data, start)
            except self._py7zr.Bad7zFile:
                continue
            else:
                break

    def _unpack_from(self, data: bytearray, zp: int = 0):
        def mk7z(**keywords):
            return self._py7zr.SevenZipFile(MemoryFile(mv[zp:]), **keywords)

        pwd = self.args.pwd
        mv = memoryview(data)

        def test(archive: SevenZipFile):
            if self.args.list:
                archive.list()
                return False
            return archive.testzip()

        if pwd:
            try:
                archive = mk7z(password=pwd.decode(self.codec))
            except self._py7zr.Bad7zFile:
                raise ValueError('corrupt archive; the password is likely invalid.')
        else:
            def passwords():
                yield None
                yield from self._COMMON_PASSWORDS
            for pwd in passwords():
                if pwd is None:
                    self.log_debug(U'trying empty password')
                else:
                    self.log_debug(F'trying password: {pwd}')
                try:
                    archive = mk7z(password=pwd)
                    problem = test(archive)
                except self._py7zr.PasswordRequired:
                    problem = True
                except self._py7zr.UnsupportedCompressionMethodError as E:
                    raise ValueError(E.message)
                except self._py7zr.exceptions.InternalError:
                    # ignore internal errors during testzip
                    break
                except SystemError:
                    problem = True
                except Exception:
                    if pwd is None:
                        raise
                    problem = True
                if not problem:
                    break
            else:
                raise ValueError('a password is required and none of the default passwords worked.')

        has_read_method = hasattr(archive, 'read')

        for info in archive.list():
            if has_read_method:
                def extract(archive: SevenZipFile = archive, name: str = info.filename):
                    archive.reset()
                    io = archive.read([name])
                    io = io[name]
                    io.seek(0)
                    return io.read()
            else:
                def extract(archive: SevenZipFile = archive, name: str = info.filename):
                    io = _IOFactory()
                    archive.reset()
                    archive.extract(None, [name], factory=io)
                    return io.buffer.getvalue()

            if info.is_directory:
                continue

            yield self._pack(
                info.filename,
                info.creationtime,
                extract,
                crc32=info.crc32,
                uncompressed=info.uncompressed
            )

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        return B'7z\xBC\xAF\x27\x1C' in data

class xtace (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

Extract files from an ACE archive. This unit is a path extractor which extracts data from a hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it a meta variable that contains its path within the source structure. The positional arguments to the command are patterns that can be used to filter the extracted items by their path. To view only the paths of all chunks, use the listing switch:

emit something | xtace --list

Otherwise, extracted items are written to the standard output port and usually require a frame to properly process. In order to dump all extracted data to disk, the following pipeline can be used:

emit something | xtace [| dump {path} ]

Expand source code Browse git

class xtace(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extract files from an ACE archive.
    """
    def unpack(self, data):
        ace = acefile.open(MemoryFile(data, read_as_bytes=True))
        for member in ace.getmembers():
            member: acefile.AceMember
            comment = {} if not member.comment else {'comment': member.comment}
            yield self._pack(
                member.filename,
                member.datetime,
                lambda a=ace, m=member: a.read(m, pwd=self.args.pwd),
                **comment
            )

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        return b'**ACE**' in data[:0x100]

class xtasar (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

Extract files from a ASAR archive. This unit is a path extractor which extracts data from a hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it a meta variable that contains its path within the source structure. The positional arguments to the command are patterns that can be used to filter the extracted items by their path. To view only the paths of all chunks, use the listing switch:

emit something | xtasar --list

Otherwise, extracted items are written to the standard output port and usually require a frame to properly process. In order to dump all extracted data to disk, the following pipeline can be used:

emit something | xtasar [| dump {path} ]

Expand source code Browse git

class xtasar(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extract files from a ASAR archive.
    """
    def unpack(self, data: bytearray):
        def _unpack(dir: JSONDict, *path):
            for name, listing in dir.get('files', {}).items():
                yield from _unpack(listing, *path, name)
            try:
                offset = dir['offset']
                size = dir['size']
            except KeyError:
                return
            try:
                offset = int(offset) + header.base
                end = int(size) + offset
            except TypeError:
                self.log_warn(F'unable to convert offset "{offset}" and size "{size}" to integers')
                return
            if not path:
                self.log_warn(F'not processing item at root with offset {offset} and size {size}')
                return
            yield UnpackResult(
                '/'.join(path),
                lambda a=offset, b=end: data[a:b],
                offset=offset
            )

        header = AsarHeader(data)
        self.log_debug(F'header read successfully, base offset is {header.base}.')
        yield from _unpack(header.directory)

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        return data[:4] == b'\04\0\0\0' and data[0x10:0x18] == B'{"files"'

class xtcab (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

Extract files from CAB (cabinet) archives. Multi-volume archives can be extracted if all required disks are present as chunks within the current frame.

emit something | xtcab --list

Otherwise, extracted items are written to the standard output port and usually require a frame to properly process. In order to dump all extracted data to disk, the following pipeline can be used:

emit something | xtcab [| dump {path} ]

Expand source code Browse git

class xtcab(ArchiveUnit, docs='{0}{p}{PathExtractorUnit}'):
    """
    Extract files from CAB (cabinet) archives. Multi-volume archives can be extracted if all
    required disks are present as chunks within the current frame.
    """
    def unpack(self, data: Chunk):
        arc: Cabinet = data.temp
        arc.check()
        arc.process()
        one = len(arc.files) == 1
        self.log_info(F'processing CAB with {len(arc)} disks')
        for id, files in arc.files.items():
            for file in files:
                path = file.name
                if not one:
                    path = F'CAB{id:04X}/{path}'
                yield self._pack(path, file.timestamp, lambda f=file: f.decompress())

    def filter(self, inputs):
        box = None
        cab = Cabinet()
        for chunk in inputs:
            if box is None:
                box = chunk
                box.temp = cab
            if cab.needs_more_disks():
                cab.append(chunk)
            else:
                yield box
                box = chunk
                cab = box.temp = Cabinet()
        if box:
            yield box

    @classmethod
    def handles(cls, data: bytearray):
        return data.startswith(CabDisk.MAGIC)

class xtcpio (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

Extract files from a CPIO archive. This unit is a path extractor which extracts data from a hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it a meta variable that contains its path within the source structure. The positional arguments to the command are patterns that can be used to filter the extracted items by their path. To view only the paths of all chunks, use the listing switch:

emit something | xtcpio --list

Otherwise, extracted items are written to the standard output port and usually require a frame to properly process. In order to dump all extracted data to disk, the following pipeline can be used:

emit something | xtcpio [| dump {path} ]

Expand source code Browse git

class xtcpio(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extract files from a CPIO archive.
    """
    def unpack(self, data):
        def cpio():
            with suppress(EOF):
                return CPIOEntry(reader)
        reader = StructReader(memoryview(data))
        for entry in iter(cpio, None):
            if entry.name == 'TRAILER!!!':
                break
            yield self._pack(entry.name, entry.mtime, entry.data)

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        return data[:6] == B'070701'

class xtdoc (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

Extract files from an OLE document such as a Microsoft Word DOCX file.

Expand source code Browse git

class xtdoc(PathExtractorUnit):
    """
    Extract files from an OLE document such as a Microsoft Word DOCX file.
    """

    @PathExtractorUnit.Requires('olefile', 'formats', 'office', 'extended')
    def _olefile():
        import olefile
        return olefile

    def unpack(self, data):
        with MemoryFile(data) as stream:
            try:
                oledoc = self._olefile.OleFileIO(stream)
            except OSError as error:
                self.log_info(F'error, {error}, treating input as zip file')
                yield from xtzip().unpack(data)
                return
            for item in oledoc.listdir():
                if not item or not item[-1]:
                    continue
                path = '/'.join(item)
                olestream = oledoc.openstream(path)
                c0 = ord(item[-1][:1])
                if c0 < 20:
                    item[-1] = F'[{c0:d}]{item[-1][1:]}'
                    path = '/'.join(item)
                path = convert_msi_name(path)
                self.log_debug('exploring:', path)
                yield UnpackResult(path, olestream.read())

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        if data[:4] == B'\xD0\xCF\x11\xE0':
            return True
        if xtzip.handles(data):
            return sum(1 for marker in [
                B'[Content_Types].xml',
                B'word/document.xml',
                B'docProps/core.xml',
            ] if marker in data) >= 2

class xtea (key, iv=b'', padding=None, mode=None, raw=False, swap=False, rounds=32)

XTEA encryption and decryption.

Expand source code Browse git

class xtea(TEAUnit, cipher=BlockCipherFactory(XTEA)):
    """
    XTEA encryption and decryption.
    """
    pass

class xtgz (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

Extract a file from a GZip archive.

Expand source code Browse git

class xtgz(ArchiveUnit):
    """
    Extract a file from a GZip archive.
    """
    def unpack(self, data: bytearray):
        archive = GzipHeader(data)
        path = archive.name
        date = archive.mtime
        date = date and datetime.fromtimestamp(date) or None
        if path is None:
            try:
                meta = metavars(data)
                path = Path(meta['path'])
            except KeyError:
                path = 'ungz'
            else:
                self.log_warn(path)
                suffix = path.suffix
                if suffix.lower() == '.gz':
                    path = path.with_suffix('')
                else:
                    path = path.with_suffix(F'{suffix}.ungz')
                path = path.as_posix()
        yield self._pack(path, date, archive.data)

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        return data.startswith(B'\x1F\x8B')

class xthtml (*paths, outer=False, attributes=False, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

The unit processes an HTML document and extracts the contents of all elemnts in the DOM of the given tag. The main purpose is to extract scripts from HTML documents.

Expand source code Browse git

class xthtml(XMLToPathExtractorUnit):
    """
    The unit processes an HTML document and extracts the contents of all elemnts in the DOM of the
    given tag. The main purpose is to extract scripts from HTML documents.
    """
    def __init__(
        self, *paths,
        outer: Arg.Switch('-o', help='Include the HTML tags for an extracted element.') = False,
        attributes: Arg.Switch('-a', help='Populate chunk metadata with HTML tag attributes.') = False,
        list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False,
        path=b'path'
    ):
        super().__init__(
            *paths,
            outer=outer,
            attributes=attributes,
            format='{tag}',
            path=path,
            list=list,
            join_path=join_path,
            drop_path=drop_path,
            fuzzy=fuzzy,
            exact=exact,
            regex=regex,
        )

    def unpack(self, data):
        try:
            text = data.decode(self.codec)
        except UnicodeDecodeError:
            text = data.decode('latin1')

        html = HTMLTreeParser()
        html.feed(text)
        root = html.tos
        root.reindex()

        meta = metavars(data)
        path = self._make_path_builder(meta, root)

        while root.parent:
            self.log_info(F'tag was not closed: {root.tag}')
            root = root.parent

        while len(root.children) == 1:
            child, = root.children
            if child.tag != root.tag:
                break
            root = child

        def tree(root: HTMLNode, *parts: str):

            def outer(root: HTMLNode = root):
                return root.recover(inner=False).encode(self.codec)

            def inner(root: HTMLNode = root):
                return root.recover().encode(self.codec)

            tagpath = '/'.join(parts)
            meta = {}

            if self.args.attributes:
                meta.update(root.attributes)

            if root.root:
                yield UnpackResult(tagpath, inner, **meta)
            elif self.args.outer:
                yield UnpackResult(tagpath, outer, **meta)
            else:
                yield UnpackResult(tagpath, inner, **meta)

            for child in root.children:
                if child.textual:
                    continue
                yield from tree(child, *parts, path(child))

        yield from tree(root, path(root))

    @classmethod
    def handles(self, data: bytearray):
        from refinery.lib import mime
        info = mime.get_cached_file_magic_info(data)
        if info.extension == 'html':
            return True
        if info.mime.endswith('html'):
            return True
        return False

class xtinno (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

Extract files from InnoSetup archives: This unit is a path extractor which extracts data from a hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it a meta variable that contains its path within the source structure. The positional arguments to the command are patterns that can be used to filter the extracted items by their path. To view only the paths of all chunks, use the listing switch:

emit something | xtinno --list

Otherwise, extracted items are written to the standard output port and usually require a frame to properly process. In order to dump all extracted data to disk, the following pipeline can be used:

emit something | xtinno [| dump {path} ]

Note: This unit generates the following synthetic metadata files under the "meta" directory:

setup.bin contains the raw bytes for the setup metadata
setup.template contains the raw and unprocessed metadata in JSON format
setup.json contains the setup metadata with all format fields expanded

Similarly, there are files.bin, files.template, and files.json that contain the metadata of the archived files. The files that are extracted under the "embedded" directory are usually parts of the InnoSetup installer and not user data. All archived files are extracted within the directory named "data".

Expand source code Browse git

class xtinno(ArchiveUnit, _ps, docs='{0} {PathExtractorUnit}{p}{_ps}'):
    """
    Extract files from InnoSetup archives:
    """
    def unpack(self, data: bytearray):
        def post_process_json(doc):
            if isinstance(doc, dict):
                return {key: post_process_json(val) for key, val in doc.items()}
            if isinstance(doc, list):
                return [post_process_json(entry) for entry in doc]
            if not isinstance(doc, str):
                return doc
            try:
                return inno.emulator.reset().expand_constant(doc)
            except Exception:
                return doc

        inno = InnoArchive(data, self)

        password: bytes = self.args.pwd
        password = password.decode(self.codec) if password else None

        if any(file.encrypted for file in inno.files) and password is None:
            self.log_info('some files are password-protected and no password was given')

        with BytesAsArrayEncoder as encoder:
            yield self._pack('meta/setup.bin', None, inno.streams.TSetup.data)
            doc = inno.setup_info.json()
            yield self._pack('meta/setup.template', None, encoder.dumps(doc).encode(self.codec))
            doc = post_process_json(doc)
            yield self._pack('meta/setup.json', None, encoder.dumps(doc).encode(self.codec))

        with BytesAsArrayEncoder as encoder:
            yield self._pack('meta/files.bin', None, inno.streams.TData.data)
            doc = inno.setup_data.json()
            yield self._pack('meta/files.template', None, encoder.dumps(doc).encode(self.codec))
            doc = post_process_json(doc)
            yield self._pack('meta/files.json', None, encoder.dumps(doc).encode(self.codec))

        def _uninstaller(i=inno):
            return i.read_stream(i.streams.Uninstaller)
        yield self._pack('embedded/uninstaller.exe', None, _uninstaller)

        if license := inno.setup_info.Header.get_license():
            yield self._pack('embedded/license.rtf', None, license.encode(self.codec))

        if script := inno.setup_info.Header.get_script():
            yield self._pack('embedded/script.bin', None, script)
            yield self._pack('embedded/script.ps', None,
                lambda i=inno: i.ifps.disassembly().encode(self.codec))

        if dll := inno.setup_info.get_decompress_dll():
            yield self._pack(F'embedded/decompress.{magic(dll).extension}', None, dll)

        if dll := inno.setup_info.get_decryption_dll():
            yield self._pack(F'embedded/decryption.{magic(dll).extension}', None, dll)

        for size, images in (
            ('small', inno.setup_info.get_wizard_images_small()),
            ('large', inno.setup_info.get_wizard_images_large()),
        ):
            _formatting = len(str(len(images) + 1))
            for k, img in enumerate(images, 1):
                yield self._pack(F'embedded/images/{size}{k:0{_formatting}d}.{magic(img).extension}', None, img)

        for file in inno.files:
            if file.dupe:
                continue

            def _read(inno=inno, file=file, pwd=password):
                if pwd is None:
                    inno.guess_password(10)
                if self.leniency > 0:
                    return inno.read_file(file, pwd)
                try:
                    return inno.read_file_and_check(file, pwd)
                except InvalidPassword:
                    raise
                except Exception as E:
                    raise ValueError(F'{E!s} [ignore this check with -L]') from E

            yield self._pack(file.path, file.date, _read,
                tags=[t.name for t in SetupFileFlags if t & file.tags])

    @classmethod
    def handles(cls, data):
        if data[:2] != B'MZ':
            return False
        if re.search(re.escape(InnoArchive.ChunkPrefix), data) is None:
            return False
        for marker in [
            B'Inno Setup Setup Data',
            B'Inno Setup Messages',
            B'<description>Inno Setup</description>',
            B'InnoSetupLdrWindow',
            B'This installation was built with Inno Setup',
        ]:
            if re.search(re.escape(marker), data):
                return True
        for marker in TSetupMagicToVersion.keys():
            if re.search(re.escape(marker), data) is not None:
                return True
        return False

class xtiso (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', fs='auto')

Extract files from a ISO archive. This unit is a path extractor which extracts data from a hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it a meta variable that contains its path within the source structure. The positional arguments to the command are patterns that can be used to filter the extracted items by their path. To view only the paths of all chunks, use the listing switch:

emit something | xtiso --list

Otherwise, extracted items are written to the standard output port and usually require a frame to properly process. In order to dump all extracted data to disk, the following pipeline can be used:

emit something | xtiso [| dump {path} ]

Expand source code Browse git

class xtiso(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extract files from a ISO archive.
    """
    def __init__(
        self,
        *paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False,
        path=b'path', date=b'date',
        fs: Arg.Choice('-s', metavar='TYPE', choices=_ISO_FILE_SYSTEMS, help=(
            'Specify a file system ({choices}) extension to use. The default setting {default} will automatically '
            'detect the first of the other available options and use it.')) = 'auto'
    ):
        if fs not in _ISO_FILE_SYSTEMS:
            raise ValueError(F'invalid file system {fs}: must be udf, joliet, rr, iso, or auto.')
        super().__init__(
            *paths,
            list=list,
            join_path=join_path,
            drop_path=drop_path,
            fuzzy=fuzzy,
            exact=exact,
            regex=regex,
            path=path,
            date=date,
            fs=fs
        )

    @ArchiveUnit.Requires('pycdlib', 'arc', 'default', 'extended')
    def _pycdlib():
        import pycdlib
        import pycdlib.dates

        def fixed_parse(self, datestr):
            datestr = datestr[:-3] + b'00\0'
            return original_parse(self, datestr)

        original_parse = pycdlib.dates.VolumeDescriptorDate.parse
        pycdlib.dates.VolumeDescriptorDate.parse = fixed_parse
        return pycdlib

    @staticmethod
    def _strip_revision(name: str):
        base, split, revision = name.partition(';')
        return base if split and revision.isdigit() else name

    def unpack(self, data):
        if not self.handles(data):
            self.log_warn('The data does not look like an ISO file.')
        with MemoryFile(data, read_as_bytes=True) as stream:
            iso = self._pycdlib.PyCdlib()
            iso.open_fp(stream)
            fs = self.args.fs
            if fs != 'auto':
                mkfacade = {
                    'iso'    : iso.get_iso9660_facade,
                    'udf'    : iso.get_udf_facade,
                    'joliet' : iso.get_joliet_facade,
                    'rr'     : iso.get_rock_ridge_facade,
                }
                facade = mkfacade[fs]()
            elif iso.has_udf():
                self.log_info('using format: udf')
                facade = iso.get_udf_facade()
            elif iso.has_joliet():
                self.log_info('using format: joliet')
                facade = iso.get_joliet_facade()
            elif iso.has_rock_ridge():
                self.log_info('using format: rr')
                facade = iso.get_rock_ridge_facade()
            else:
                self.log_info('using format: iso')
                facade = iso.get_iso9660_facade()

            for root, _, files in facade.walk('/'):
                root = root.rstrip('/')
                for name in files:
                    name = name.lstrip('/')
                    path = F'{root}/{name}'
                    try:
                        info = facade.get_record(path)
                        date = info.date
                    except Exception:
                        info = None
                        date = None
                    else:
                        date = datetime.datetime(
                            date.years_since_1900 + 1900,
                            date.month,
                            date.day_of_month,
                            date.hour,
                            date.minute,
                            date.second,
                            tzinfo=datetime.timezone(datetime.timedelta(minutes=15 * date.gmtoffset))
                        )

                    def extract(info=info, path=path):
                        if info:
                            buffer = MemoryFile(bytearray(info.data_length))
                        else:
                            buffer = MemoryFile(bytearray())
                        facade.get_file_from_iso_fp(buffer, path)
                        return buffer.getvalue()

                    yield self._pack(self._strip_revision(path), date, extract)

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        return any(data[k] == B'CD001' for k in (
            slice(0x8001, 0x8006),
            slice(0x8801, 0x8806),
            slice(0x9001, 0x9006),
        ))

class xtiss (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

Extracts files from Install Shield Setup files. This unit is a path extractor which extracts data from a hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it a meta variable that contains its path within the source structure. The positional arguments to the command are patterns that can be used to filter the extracted items by their path. To view only the paths of all chunks, use the listing switch:

emit something | xtiss --list

Otherwise, extracted items are written to the standard output port and usually require a frame to properly process. In order to dump all extracted data to disk, the following pipeline can be used:

emit something | xtiss [| dump {path} ]

Expand source code Browse git

class xtiss(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extracts files from Install Shield Setup files.
    """
    def unpack(self, data: bytearray):
        offset = max(data.rfind(magic) for magic in ISSReader.MAGIC)
        if offset < 0:
            raise ValueError('ISS magic not found.')
        data[:offset] = []

        reader = ISSReader(data)
        count = reader.iss_archive_header()

        self.log_info(F'archive contains {count} files according to header')

        for _ in range(count):
            name, data = reader.iss_file()
            yield self._pack(name, None, data)

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        return data.startswith(B'MZ') and any(data.find(m) > 0 for m in ISSReader.MAGIC)

class xtjson (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

Extract values from a JSON document.

Expand source code Browse git

class xtjson(PathExtractorUnit):
    """
    Extract values from a JSON document.
    """
    CustomPathSeparator = '.'

    def unpack(self, data):

        sep = self.CustomPathSeparator

        def crawl(path, cursor):
            if isinstance(cursor, dict):
                for key, value in cursor.items():
                    yield from crawl(F'{path}{sep}{key}', value)
            elif isinstance(cursor, list):
                for key, value in enumerate(cursor):
                    yield from crawl(F'{path}{sep}{key:d}', value)
            if path:
                yield path, cursor, cursor.__class__.__name__

        if not isinstance(data, (dict, list)):
            data = json.loads(data)

        for path, item, typename in crawl('', data):
            def extract(item=item):
                if isinstance(item, (list, dict)):
                    dumped = json.dumps(item, indent=4)
                else:
                    dumped = str(item)
                try:
                    return dumped.encode('latin1')
                except UnicodeEncodeError:
                    return dumped.encode('utf8')

            yield UnpackResult(path, extract, type=typename)

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        from refinery.units.pattern.carve_json import JSONCarver
        carver = JSONCarver(data)
        try:
            _, c = next(carver)
        except Exception:
            return False
        try:
            _, _ = next(carver)
        except StopIteration:
            return c == data.strip()
        else:
            return False

class xtmacho (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

Extract the individual executables from a MachO universal binary (sometimes called a MachO fat file)."

Expand source code Browse git

class xtmacho(ArchiveUnit):
    """
    Extract the individual executables from a MachO universal binary (sometimes called a MachO fat file)."
    """
    _SIGNATURE_BE = B'\xCA\xFE\xBA\xBE'
    _SIGNATURE_LE = B'\xBE\xBA\xFE\xCA'

    def unpack(self, data: bytearray):
        view = memoryview(data)
        signature = bytes(view[:4])
        try:
            reader = StructReader(view, bigendian={
                self._SIGNATURE_BE: True,
                self._SIGNATURE_LE: False,
            }[signature])
        except KeyError as KE:
            raise ValueError('Not a MachO universal binary; invalid magic header bytes.') from KE
        else:
            reader.seekset(4)
        count = reader.u32()
        self.log_info(F'reading {count} embedded executables')
        while count > 0:
            fa = FatArch(reader)
            self.log_info(F'reading item of size 0x{len(fa.data):08X}, arch {fa.cputype.name}')
            yield self._pack(fa.cputype.name, None, fa.data)
            count -= 1

    @classmethod
    def handles(cls, data: bytearray):
        return data[:4] in (
            cls._SIGNATURE_BE,
            cls._SIGNATURE_LE,
        )

class xtmagtape

Extract files from SIMH magtape files.

Expand source code Browse git

class xtmagtape(Unit):
    """
    Extract files from SIMH magtape files.
    """
    def process(self, data: bytearray):
        reader = StructReader(data)

        for r in itertools.count():
            buffer = MemoryFile()

            for k in itertools.count():
                try:
                    head = reader.peek(4)
                    size = reader.read_integer(24)
                    mark = reader.read_byte()
                except EOFError:
                    self.log_info('end of file while reading chunk header, terminating')
                    return
                if not any(head):
                    if k == 0:
                        return
                    break
                if mark != 0:
                    self.log_warn(F'error code 0x{mark:02X} in record {r}.{k}')
                buffer.write(reader.read(size))
                if reader.peek(4) != head:
                    if reader.tell() % 2 and reader.peek(5)[1:] == head:
                        padding = reader.read_byte()
                        if padding != 0:
                            self.log_info(F'nonzero padding byte in record {r}.{k}')
                    else:
                        raise ValueError('Invalid footer, data is corrupted.')
                reader.seekrel(4)

            yield buffer.getvalue()

class xtmail (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

Extract files and body from EMail messages. The unit supports both the Outlook message format and regular MIME documents.

Expand source code Browse git

class xtmail(PathExtractorUnit):
    """
    Extract files and body from EMail messages. The unit supports both the Outlook message format
    and regular MIME documents.
    """
    def _get_headparts(self, head: dict[str, str]):
        mw = mimewords()
        mw = partial(_unwrap(mw.process), mw)
        jh: dict[str, list[str]] = defaultdict(list)
        for key, value in head:
            jh[key].append(mw(''.join(re.sub(R'\A\s+', '\x20', t) for t in value.splitlines(False))))
        jh = {k: v[0] if len(v) == 1 else [t for t in v if t] for k, v in jh.items()}
        yield UnpackResult('headers.txt',
            lambda h=head: '\n'.join(F'{k}: {v}' for k, v in h).encode(self.codec))
        received = []
        for recv in jh.get('Received', []):
            if not recv.startswith('from '):
                received = None
                break
            recv = recv[5:]
            src, _, rest = recv.partition(' by ')
            dst, _, rest = rest.partition(' with ')
            received.append({
                'Target': src.partition('\x20')[0],
                'Source': dst.partition('\x20')[0],
            })
        if received:
            received.reverse()
            jh['ReceivedTrace'] = received
        yield UnpackResult('headers.json',
            lambda jsn=jh: json.dumps(jsn, indent=4).encode(self.codec))

    @PathExtractorUnit.Requires('extract-msg<=0.54.0', 'formats', 'office', 'default', 'extended')
    def _extract_msg():
        import extract_msg.enums
        return extract_msg

    def _get_parts_outlook(self, data):
        def ensure_bytes(data: bytes | str):
            return data if isinstance(data, bytes) else data.encode(self.codec)

        def make_message(name, msg: Message):
            bodies = msg.detectedBodies
            BT = self._extract_msg.enums.BodyTypes
            if bodies & BT.HTML:
                def htm(msg=msg):
                    with suppress(Exception), NoLogging():
                        return ensure_bytes(msg.htmlBody)
                yield UnpackResult(F'{name}.htm', htm)
            if bodies & BT.PLAIN:
                def txt(msg=msg):
                    with suppress(Exception), NoLogging():
                        return ensure_bytes(msg.body)
                yield UnpackResult(F'{name}.txt', txt)
            if bodies & BT.RTF:
                def rtf(msg=msg):
                    with suppress(Exception), NoLogging():
                        return ensure_bytes(msg.rtfBody)
                yield UnpackResult(F'{name}.rtf', rtf)

        msgcount = 0

        with NoLogging():
            class ForgivingMessage(self._extract_msg.Message):
                """
                If parsing the input bytes fails early, the "__open" private attribute may not
                yet exist. This hack prevents an exception to occur in the destructor.
                """
                def __getattr__(self, key: str):
                    if key.endswith('_open'):
                        return False
                    raise AttributeError(key)
            msg = ForgivingMessage(bytes(data))

        yield from self._get_headparts(msg.header.items())
        yield from make_message('body', msg)

        def attachments(msg):
            for attachment in getattr(msg, 'attachments', ()):
                yield attachment
                if attachment.type == 'data':
                    continue
                yield from attachments(attachment.data)

        for attachment in attachments(msg):
            at = attachment.type
            if at is self._extract_msg.enums.AttachmentType.MSG:
                msgcount += 1
                yield from make_message(F'attachments/msg_{msgcount:d}', attachment.data)
                continue
            if not isbuffer(attachment.data):
                self.log_warn(F'unknown attachment of type {at}, please report this!')
                continue
            path = attachment.longFilename or attachment.shortFilename
            yield UnpackResult(F'attachments/{path}', attachment.data)

    @PathExtractorUnit.Requires('chardet', 'default', 'extended')
    def _chardet():
        import chardet
        return chardet

    def _get_parts_regular(self, data: bytes):
        try:
            info = self._chardet.detect(data)
            msg = data.decode(info['encoding'])
        except UnicodeDecodeError:
            raise ValueError('This is not a plaintext email message.')
        else:
            msg = Parser().parsestr(msg)

        yield from self._get_headparts(msg.items())

        for k, part in enumerate(msg.walk()):
            path = part.get_filename()
            elog = None
            if path is None:
                extension = file_extension(part.get_content_type(), 'txt')
                path = F'body.{extension}'
            else:
                path = path | mimewords | str
                path = F'attachments/{path}'
            try:
                data = part.get_payload(decode=True)
            except Exception as E:
                try:
                    data = part.get_payload(decode=False)
                except Exception as E:
                    elog = str(E)
                    data = None
                else:
                    from refinery import carve
                    self.log_warn(F'manually decoding part {k}, data might be corrupted: {path}')
                    if isinstance(data, str):
                        data = data.encode('latin1')
                    if isbuffer(data):
                        data = next(data | carve('b64', stripspace=True, single=True, decode=True))
                    else:
                        elog = str(E)
                        data = None
            if not data:
                if elog is not None:
                    self.log_warn(F'could not get content of message part {k}: {elog!s}')
                continue
            yield UnpackResult(path, data)

    def unpack(self, data):
        try:
            yield from self._get_parts_outlook(data)
        except Exception:
            self.log_debug('failed parsing input as Outlook message')
            yield from self._get_parts_regular(data)

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        markers = [
            b'\nReceived:\x20from'
            b'\nSubject:\x20',
            b'\nTo:\x20',
            b'\nFrom:\x20',
            B'\nMessage-ID:\x20',
            b'\nBcc:\x20',
            b'\nContent-Transfer-Encoding:\x20',
            b'\nContent-Type:\x20',
            b'\nReturn-Path:\x20',
        ]
        if data.startswith(B'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'):
            markers = [marker.decode('latin1').encode('utf-16le') for marker in markers]
        counter = 0
        for marker in markers:
            if re.search(re.escape(marker), data, flags=re.IGNORECASE):
                counter += 1
            if counter >= 3:
                return True
        return False

class xtmsi (*paths, list=False, path=b'path', join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, nocab=False)

Extract files and metadata from Microsoft Installer (MSI) archives. The synthetic file MsiTables.json contains parsed MSI table information, similar to the output of the Orca tool. Binary streams are placed in a virtual folder called "Binary", and extracted scripts from custom actions are separately extracted in a virtual folder named "Action".

Expand source code Browse git

class xtmsi(xtdoc):
    """
    Extract files and metadata from Microsoft Installer (MSI) archives. The synthetic file {FN} contains
    parsed MSI table information, similar to the output of the Orca tool. Binary streams are placed in a
    virtual folder called "Binary", and extracted scripts from custom actions are separately extracted in
    a virtual folder named "Action".
    """

    _SYNTHETIC_STREAMS_FILENAME = 'MsiTables.json'
    _SYNTHETIC_STREAMS_TOPLEVEL = 'MsiTables'

    # https://learn.microsoft.com/en-us/windows/win32/msi/summary-list-of-all-custom-action-types
    _CUSTOM_ACTION_TYPES = {
        0x01: 'DLL file stored in a Binary table stream.',
        0x02: 'EXE file stored in a Binary table stream.',
        0x05: 'JScript file stored in a Binary table stream.',
        0x06: 'VBScript file stored in a Binary table stream.',
        0x11: 'DLL file that is installed with a product.',
        0x12: 'EXE file that is installed with a product.',
        0x13: 'Displays a specified error message and returns failure, terminating the installation.',
        0x15: 'JScript file that is installed with a product.',
        0x16: 'VBScript file that is installed with a product.',
        0x22: 'EXE file having a path referencing a directory.',
        0x23: 'Directory set with formatted text.',
        0x25: 'JScript text stored in this sequence table.',
        0x26: 'VBScript text stored in this sequence table.',
        0x32: 'EXE file having a path specified by a property value.',
        0x33: 'Property set with formatted text.',
        0x35: 'JScript text specified by a property value.',
        0x36: 'VBScript text specified by a property value.',
    }

    def __init__(
            self, *paths,
            list=False, path=b'path', join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False,
            nocab: Arg.Switch('-N', help='Do not list and extract embedded CAB archives.') = False, **kw,
    ):
        super().__init__(
            *paths,
            list=list,
            path=path,
            join_path=join_path,
            drop_path=drop_path,
            nocab=nocab,
            fuzzy=fuzzy,
            exact=exact,
            regex=regex,
            **kw,
        )

    def unpack(self, data):
        streams = {result.path: result for result in super().unpack(data)}

        def stream(name: str):
            return streams.pop(name).get_data()

        def column_formats(table: Dict[str, MSITableColumnInfo]) -> str:
            return ''.join(v.struct_format for v in table.values())

        def stream_to_rows(data: ByteStr, row_format: str):
            row_size = struct.calcsize(F'<{row_format}')
            row_count = int(len(data) / row_size)
            reader = StructReader(data)
            columns = [reader.read_struct(F'<{sc * row_count}') for sc in row_format]
            for i in range(row_count):
                yield [c[i] for c in columns]

        tables: Dict[str, Dict[str, MSITableColumnInfo]] = collections.defaultdict(collections.OrderedDict)
        strings = MSIStringData(stream('!_StringData'), stream('!_StringPool'))

        for tbl_name_id, col_number, col_name_id, col_attributes in stream_to_rows(stream('!_Columns'), 'HHHH'):
            tbl_name = strings.ref(tbl_name_id)
            col_name = strings.ref(col_name_id)
            tables[tbl_name][col_name] = MSITableColumnInfo(col_number, col_attributes)

        table_names_given = {strings.ref(k) for k in chunks.unpack(stream('!_Tables'), 2, False)}
        table_names_known = set(tables)

        for name in table_names_known - table_names_given:
            self.log_warn(F'table name known but not given: {name}')
        for name in table_names_given - table_names_known:
            self.log_warn(F'table name given but not known: {name}')

        class ScriptItem(NamedTuple):
            row_index: int
            extension: Optional[str]

        processed_table_data: Dict[str, List[Dict[str, str]]] = {}
        tbl_properties: Dict[str, str] = {}
        tbl_files: Dict[str, str] = {}
        tbl_components: Dict[str, str] = {}
        postprocessing: List[ScriptItem] = []

        def format_string(string: str):
            # https://learn.microsoft.com/en-us/windows/win32/msi/formatted
            def _replace(match: re.Match[str]):
                _replace.done = False
                prefix, name = match.groups()
                if not prefix:
                    tbl = tbl_properties
                elif prefix in '%':
                    name = name.rstrip('%').upper()
                    return F'%{name}%'
                elif prefix in '!#':
                    tbl = tbl_files
                elif prefix in '$':
                    tbl = tbl_components
                else:
                    raise ValueError
                return tbl.get(name, '')
            while True:
                _replace.done = True
                string = re.sub(R'''(?x)
                    \[             # open square bracket
                      (?![~\\])    # not followed by escapes
                      ([%$!#]?)    # any of the valid prefix characters
                      ([^[\]{}]+)  # no brackets or braces
                    \]''', _replace, string)
                if _replace.done:
                    break
            string = re.sub(r'\[\\(.)\]', r'\1', string)
            string = string.replace('[~]', '\0')
            return string

        for table_name, table in tables.items():
            stream_name = F'!{table_name}'
            if stream_name not in streams:
                continue
            processed = []
            info = list(table.values())
            keys = list(table.keys())
            temp = [k.strip('_') for k in keys]
            if len(set(keys)) == len(set(temp)):
                keys = temp
            for r, row in enumerate(stream_to_rows(stream(stream_name), column_formats(table))):
                values = []
                for index, value in enumerate(row):
                    vt = info[index].type
                    if vt is MsiType.Long:
                        if value != 0:
                            value -= 0x80000000
                    elif vt is MsiType.Short:
                        if value != 0:
                            value -= 0x8000
                    elif value in strings:
                        value = strings.ref(value)
                    elif not info[index].is_integer:
                        value = ''
                    values.append(value)
                if table_name == 'Property':
                    tbl_properties[values[0]] = values[1]
                if table_name == 'File':
                    tbl_properties[values[0]] = values[2]
                if table_name == 'Component':
                    tbl_properties[values[0]] = F'%{values[2]}%'
                entry = dict(zip(keys, values))
                einfo = {t: i for t, i in zip(keys, info)}
                if table_name == 'MsiFileHash':
                    entry['Hash'] = struct.pack(
                        '<IIII',
                        row[2] ^ 0x80000000,
                        row[3] ^ 0x80000000,
                        row[4] ^ 0x80000000,
                        row[5] ^ 0x80000000,
                    ).hex()
                if table_name == 'CustomAction':
                    code = row[1] & 0x3F
                    try:
                        entry['Comment'] = self._CUSTOM_ACTION_TYPES[code]
                    except LookupError:
                        pass
                    t = einfo.get('Target')
                    c = {0x25: 'js', 0x26: 'vbs', 0x33: None}
                    if code in c and t and not t.is_integer:
                        postprocessing.append(ScriptItem(r, c[code]))
                processed.append(entry)
            if processed:
                processed_table_data[table_name] = processed

        ca = processed_table_data.get('CustomAction', None)
        for item in postprocessing:
            entry = ca[item.row_index]
            try:
                path: str = entry['Action']
                data: str = entry['Target']
            except KeyError:
                continue
            root = F'Action/{path}'
            if item.extension:
                path = F'{root}.{item.extension}'
                streams[path] = UnpackResult(path, data.encode(self.codec))
                continue
            data = format_string(data)
            parts = [part.partition('\x02') for part in data.split('\x01')]
            if not all(part[1] == '\x02' for part in parts):
                continue
            for name, _, script in parts:
                if not name.lower().startswith('script'):
                    continue
                if not script:
                    continue
                path = F'{root}.{name}'
                streams[path] = UnpackResult(path, script.encode(self.codec))

        for ignored_stream in [
            '[5]SummaryInformation',
            '[5]DocumentSummaryInformation',
            '[5]DigitalSignature',
            '[5]MsiDigitalSignatureEx'
        ]:
            streams.pop(ignored_stream, None)

        inconsistencies = 0
        w1 = len(str(len(strings)))
        w2 = len(str(max(max(strings.computed_ref_count), max(strings.provided_ref_count))))
        for k in range(len(strings)):
            c = strings.computed_ref_count[k]
            p = strings.provided_ref_count[k]
            if c != p and not self.log_debug(F'string {k:0{w1}d} reference count computed={c:0{w2}d} provided={p:0{w2}d}'):
                inconsistencies += 1
        if inconsistencies:
            self.log_info(F'found {inconsistencies} incorrect string reference counts')

        def fix_msi_path(path: str):
            prefix, dot, name = path.partition('.')
            if dot == '.' and prefix in processed_table_data:
                path = F'{prefix}/{name}'
            return path

        if self.args.nocab:
            cabs = {}
        else:
            def _iscab(path):
                return media_info and any(item.get('Cabinet', '') == F'#{path}' for item in media_info)
            media_info: List[JSONDict] = processed_table_data.get('Media', [])
            cabs: Dict[str, UnpackResult] = {
                path: item for path, item in streams.items() if _iscab(path)}
            for cab in cabs:
                self.log_info(F'found cab file: {cab}')
        if cabs:
            file_names: Dict[str, JSONDict] = {}

            for file_info in processed_table_data.get('File', []):
                try:
                    src_name = file_info['File']
                    dst_name = file_info['FileName']
                except KeyError:
                    continue
                _, _, long = dst_name.partition('|')
                dst_name = long or dst_name
                file_names[src_name] = dst_name

            for path, cab in cabs.items():
                try:
                    _cabinet = Cabinet(cab.get_data())
                    unpacked = _cabinet.process().get_files()
                except Exception as e:
                    self.log_info(F'unable to extract embedded cab file: {e!s}')
                    continue
                base, dot, ext = path.rpartition('.')
                if dot == '.' and ext.lower() == 'cab':
                    path = base
                else:
                    del streams[path]
                    cab.path = F'{path}.cab'
                    streams[cab.path] = cab
                for result in unpacked:
                    sub_path = file_names.get(result.name, result.name)
                    sub_path = self._get_path_separator().join((path, sub_path))
                    streams[sub_path] = UnpackResult(sub_path, lambda r=result: r.decompress())

        streams = {fix_msi_path(path): item for path, item in streams.items()}
        ds = UnpackResult(self._SYNTHETIC_STREAMS_FILENAME,
                json.dumps(processed_table_data, indent=4).encode(self.codec))
        streams[ds.path] = ds

        converter = csv()
        for key, data in processed_table_data.items():
            sk = key.strip('_')
            if sk not in processed_table_data:
                key = sk
            try:
                tbl = UnpackResult(F'{self._SYNTHETIC_STREAMS_TOPLEVEL}/{key}.csv', converter.json_to_csv(data))
            except Exception:
                continue
            streams[tbl.path] = tbl

        for path in sorted(streams):
            streams[path].path = path
            yield streams[path]

    @classmethod
    def handles(cls, data: bytearray):
        if not data[:4] == B'\xD0\xCF\x11\xE0':
            return False
        return FileMagicInfo(data).extension == 'msi'

class xtnode (*paths, entry=False, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date')

Extracts and decompiles files from compiled Node.Js applications. Supports both nexe and pkg, two utilities that are commonly used to generate stand-alone executables.

emit something | xtnode --list

Otherwise, extracted items are written to the standard output port and usually require a frame to properly process. In order to dump all extracted data to disk, the following pipeline can be used:

emit something | xtnode [| dump {path} ]

Expand source code Browse git

class xtnode(ArchiveUnit, docs='{0}{p}{PathExtractorUnit}'):
    """
    Extracts and decompiles files from compiled Node.Js applications. Supports both nexe and pkg, two
    utilities that are commonly used to generate stand-alone executables.
    """

    _NEXE_SENTINEL = B'<nexe~~sentinel>'
    _PKG_PAYLOAD_P = B'PAYLOAD_POSITION'
    _PKG_PAYLOAD_S = B'PAYLOAD_SIZE'
    _PKG_PRELUDE_P = B'PRELUDE_POSITION'
    _PKG_PRELUDE_S = B'PRELUDE_SIZE'
    _PKG_COMMON_JS = B'sourceMappingURL=common.js.map'

    def __init__(
        self, *paths, entry: Arg.Switch('-u', help='Only extract the entry point.') = False,
        list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False,
        path=b'path', date=b'date',
    ):
        super().__init__(*paths, entry=entry,
            list=list, join_path=join_path, drop_path=drop_path, fuzzy=fuzzy, exact=exact, regex=regex,
            path=path, date=date)

    def unpack(self, data: ByteStr) -> Iterable[UnpackResult]:
        if self._is_nexe(data):
            self.log_info('unpacking as nexe')
            yield from self._unpack_nexe(data)
            return
        if self._is_pkg(data):
            self.log_info('unpacking as pkg')
            yield from self._unpack_pkg(data)
            return

    def _unpack_nexe(self, data: ByteStr):
        try:
            ep = re.compile(
                RB"entry\s*=\s*path\.resolve\(path\.dirname\(process\.execPath\),\s*(%s)\)" % formats.string)
            ep, = ep.finditer(data)
        except Exception:
            ep = None
            self.log_info('could not identify entry point')
        else:
            ep = ep.group(1) | esc(quoted=True) | str
            self.log_info(F'entry point: {ep}')
        view = memoryview(data)
        for marker in re.finditer(re.escape(self._NEXE_SENTINEL), data):
            end = marker.end() + 16
            sizes = data[marker.end():end]
            if sizes.startswith(b"')"):
                continue
            reader = StructReader(sizes)
            code_size = int(reader.f64())
            blob_size = int(reader.f64())
            start = marker.start() - code_size - blob_size
            try:
                reader = StructReader(view[start:end])
                code = reader.read_exactly(code_size)
                blob = reader.read_exactly(blob_size)
            except EOFError:
                self.log_debug(F'found marker at 0x{marker.start():X}, but failed to read data')
                continue
            else:
                self.log_debug(F'found marker at 0x{marker.start():X}, data start at {start:X}')
            for rsrc in re.finditer(RB'process\.__nexe\s*=', code):
                rsrc = JSONReader(code[rsrc.end():])
                rsrc = rsrc.read_json()
                if len(rsrc) == 1:
                    _, rsrc = rsrc.popitem()
                for path, (offset, length) in rsrc.items():
                    end = offset + length
                    if ep and self.args.entry and path != ep:
                        continue
                    yield UnpackResult(path, blob[offset:end])

    def _unpack_pkg(self, data: ByteStr):
        def _extract_coordinates(*v: bytes):
            for name in v:
                pattern = name + BR'''\s{0,3}=\s{0,3}(['"])([\s\d]+)\1'''
                value, = re.finditer(pattern, data)
                yield int(value.group(2).decode('utf8').strip(), 0)

        def _extract_data(*v: bytes):
            try:
                offset, length = _extract_coordinates(*v)
            except Exception:
                return None
            return data[offset:offset + length]

        payload = _extract_data(self._PKG_PAYLOAD_P, self._PKG_PAYLOAD_S)
        if not payload:
            raise ValueError('unable to extract payload')
        prelude = _extract_data(self._PKG_PRELUDE_P, self._PKG_PRELUDE_S)
        if not prelude:
            raise ValueError('unable to extract prelude')
        mapping = re.search(re.escape(self._PKG_COMMON_JS) + BR'\s*\},\s*\{', prelude)
        if not mapping:
            raise ValueError('unable to find common.js mapping')

        reader = JSONReader(prelude[mapping.end() - 1:])

        files: Dict[str, dict] = reader.read_json()

        if files is None:
            raise ValueError('failed to read file list')

        entry = reader.skip_comma().read_string()
        links = reader.skip_comma().read_json()

        # _unknown1 = reader.skip_comma().read_json()
        # _unknown2 = reader.skip_comma().read_terminated_array(B')').strip()

        root = next(iter(files))
        skip = 0
        view = memoryview(payload)

        for k in range(len(root) + 1):
            test = root[:k].rstrip('/').rstrip('\\')
            if not all(path.startswith(test) for path in files):
                root = test[:-1]
                skip = k - 1
                break

        entry = entry[skip:]
        self.log_info(F'detected root directory {root}, entry point is {entry}')

        for src, dst in links.items():
            new_files = {}
            self.log_info('link src:', src[skip:])
            self.log_info('link dst:', dst[skip:])
            for path, location in files.items():
                if not path.startswith(src):
                    continue
                new_path = dst + path[len(src):]
                new_files[new_path] = location
                self.log_debug('synthesizing linked file:', new_path)
            files.update(new_files)

        for path, location in files.items():
            path = path[skip:]
            if entry and self.args.entry and path != entry:
                continue
            data = None
            for kind, (offset, length) in location.items():
                stop = offset + length
                if kind == '3':  # metadata
                    continue
                if kind == '2':  # unknown
                    continue
                if kind in '01':
                    data = view[offset:stop]
            if data is not None:
                yield UnpackResult(path, data)

    @classmethod
    def _is_nexe(cls, data: ByteStr) -> bool:
        return cls._NEXE_SENTINEL in data

    @classmethod
    def _is_pkg(cls, data: ByteStr) -> bool:
        if cls._PKG_PAYLOAD_P not in data:
            return False
        if cls._PKG_PAYLOAD_S not in data:
            return False
        if cls._PKG_PRELUDE_P not in data:
            return False
        if cls._PKG_PRELUDE_S not in data:
            return False
        if cls._PKG_COMMON_JS not in data:
            return False
        return True

    @classmethod
    def handles(cls, data: ByteStr) -> Optional[bool]:
        return cls._is_nexe(data) or cls._is_pkg(data)

class xtnsis (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

Extract files from NSIS archives. This unit is a path extractor which extracts data from a hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it a meta variable that contains its path within the source structure. The positional arguments to the command are patterns that can be used to filter the extracted items by their path. To view only the paths of all chunks, use the listing switch:

emit something | xtnsis --list

Otherwise, extracted items are written to the standard output port and usually require a frame to properly process. In order to dump all extracted data to disk, the following pipeline can be used:

emit something | xtnsis [| dump {path} ]

Expand source code Browse git

class xtnsis(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extract files from NSIS archives.
    """

    @classmethod
    def _find_archive_offset(cls, data: bytearray, before: int = -1, flawmax=2):
        def signatures(*magics):
            for changes in range(flawmax + 1):
                for magic in magics:
                    if not changes:
                        yield 0, magic
                        continue
                    for positions in itertools.permutations(range(len(magic)), r=changes):
                        signature = bytearray(magic)
                        for p in positions:
                            signature[p] = 0x2E
                        yield changes, bytes(signature)
        best_guess = None
        search_space = memoryview(data)
        for flaws, sig in signatures(*NSArchive.MAGICS):
            if flaws > 1:
                search_space = search_space[:0x20_000]
            matches = [m.start() - 4 for m in re.finditer(sig, search_space, flags=re.DOTALL)]
            if before >= 0:
                matches = [match for match in matches if match < before]
            matches.reverse()
            archive = None
            for match in matches:
                if match % 0x200 == 0:
                    archive = match
                    break
            if not archive:
                if matches and not best_guess:
                    best_guess = matches[-1]
            else:
                msg = F'Archive signature was found at offset 0x{archive:X}'
                if flaws > 0:
                    msg = F'{msg}; it has {flaws} imperfections and was likely modified'
                cls.log_info(F'{msg}.')
                return archive
        if best_guess:
            cls.log_info(F'A signature was found at offset 0x{best_guess:08X}; it is not properly aligned.')
            return best_guess
        return None

    def unpack(self, data):
        memory = memoryview(data)
        before = -1
        _error = None
        while True:
            offset = self._find_archive_offset(data, before)
            if offset is None:
                _error = _error or ValueError('Unable to find an NSIS archive marker.')
                raise _error
            try:
                arc = NSArchive(memory[offset:])
            except Exception as e:
                _error = e
                before = offset
            else:
                break

        def info():
            yield F'{arc.header.type.name} archive'
            yield F'compression type {arc.method.value}'
            yield F'mystery value 0x{arc.header.unknown_value:X}'
            yield 'solid archive' if arc.solid else 'fragmented archive'
            yield '64-bit header' if arc.header.is64bit else '32-bit header'
            yield 'unicode' if arc.header.unicode else 'ascii'

        self.log_info(', '.join(info()))

        for item in arc.header.items:
            yield self._pack(item.path, item.mtime, lambda i=item: arc._extract_item(i).data)

        yield self._pack('setup.bin', None, arc.header_data)
        yield self._pack('setup.nsis', None, arc.script.encode(self.codec))

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        return any(magic in data for magic in NSArchive.MAGICS)

class xtnuitka (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

Extracts files packed by Nuitka using the –onefile option.

Expand source code Browse git

class xtnuitka(PathExtractorUnit):
    """
    Extracts files packed by Nuitka using the --onefile option.
    """
    _MAGIC = B'KA'

    @PathExtractorUnit.Requires('pyzstd', 'arc')
    def _pyzstd():
        import pyzstd
        return pyzstd

    def unpack(self, data: ByteStr) -> Iterable[UnpackResult]:
        class NuitkaData(Struct):
            unit = self

            def __init__(self, reader: StructReader):
                self.magic = reader.read_exactly(2)
                self.compression_flag = reader.read_exactly(1)
                if self.compressed:
                    zd = self.unit._pyzstd.ZstdDecompressor()
                    reader = StructReader(zd.decompress(reader.read()))
                self.files = {}
                self.truncated = False
                while not reader.eof:
                    path = reader.read_w_string('utf-16')
                    if not path:
                        break
                    size = reader.u64()
                    data = reader.read(size)
                    if len(data) == size:
                        self.files[path] = data
                    else:
                        self.truncated = True

            @property
            def compressed(self):
                return self.compression_flag == b'Y'

        if data.startswith(b'MZ'):
            arcs = list(self._pe_candidates(data))
        else:
            arcs = [data]

        for arc in arcs:
            archive = NuitkaData(arc)
            if archive.truncated:
                self.log_warn('the archive is truncated')
            if archive.magic != self._MAGIC:
                self.log_warn('the archive data does not start with the correct magic sequence')
            for path, data in archive.files.items():
                yield UnpackResult(path, data)

    @classmethod
    def handles(cls, data: ByteStr) -> Optional[bool]:
        if data[:2] == b'MZ':
            try:
                next(cls._pe_candidates(data))
            except StopIteration:
                return False
        else:
            return data[:2] == cls._MAGIC

    @classmethod
    def _pe_candidates(cls, data: ByteStr):

        from refinery.units.formats.pe.peoverlay import peoverlay
        blob = data | peoverlay | bytearray
        if blob.startswith(cls._MAGIC):
            yield blob

        from refinery.units.formats.pe.perc import perc
        for blob in data | perc:
            if blob.startswith(cls._MAGIC):
                yield blob

class xtone (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

Extract embedded files from OneNote documents.

Expand source code Browse git

class xtone(PathExtractorUnit):
    """
    Extract embedded files from OneNote documents.
    """
    @PathExtractorUnit.Requires('pyonenote', 'formats', 'office', 'extended')
    def _pyOneNote():
        import pyOneNote
        import pyOneNote.OneDocument
        return pyOneNote.OneDocument

    def unpack(self, data: bytearray):
        with MemoryFile(memoryview(data)) as stream:
            one = self._pyOneNote.OneDocment(stream)
        for guid, file in one.get_files().items():
            chunk = file['content']
            try:
                extension = file['extension']
            except KeyError:
                extension = F'.{get_cached_file_magic_info(chunk).extension}'
            yield UnpackResult(F'{guid}{extension}', chunk)

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        return UUID('e4525c7b-8cd8-a74d-aeb1-5378d02996d3').bytes in data

class xtp (*pattern, filter=0, min=1, max=None, len=None, stripspace=False, duplicates=False, longest=False, take=None)

Extract Patterns: Uses regular expressions to extract indicators from the input data and optionally filters these results heuristically. The unit is designed to extract indicators such as domain names and IP addresses, see below for a complete list. To extract data formats such as hex-encoded data, use carve.

Expand source code Browse git

class xtp(PatternExtractor):
    """
    Extract Patterns: Uses regular expressions to extract indicators from the input data and
    optionally filters these results heuristically. The unit is designed to extract indicators
    such as domain names and IP addresses, see below for a complete list. To extract data
    formats such as hex-encoded data, use `refinery.carve`.
    """

    def __init__(
        self,
        *pattern: Arg('pattern', type=str,
            default=(
                indicators.hostname.name,
                indicators.url.name,
                indicators.email.name,
            ), help=(
                'Choose the pattern to extract. The unit uses {{default}} by default. Use an '
                'asterix character to select all available patterns. The available patterns '
                'are: {}'.format(', '.join(p.display for p in indicators))
            )
        ),
        filter: Arg('-f', dest='filter', action='count',
            help=(
                'If this setting is enabled, the xtp unit will attempt to reduce the number '
                'of false positives by certain crude heuristics. Specify multiple times to '
                'make the filtering more aggressive.'
            )
        ) = 0,
        min=1, max=None, len=None, stripspace=False, duplicates=False, longest=False, take=None
    ):
        self.superinit(super(), **vars(), ascii=True, utf16=True)

        patterns = {
            p for name in pattern for p in indicators if fnmatch(p.display, name)
        }
        # if indicators.hostname in patterns:
        #     patterns.remove(indicators.hostname)
        #     patterns.add(indicators.ipv4)
        #     patterns.add(indicators.domain)
        patterns = [F'(?P<{p.name}>{p.value})' for p in patterns]
        if not patterns:
            raise RefineryCriticalException('The given mask does not match any known indicator pattern.')
        pattern = '|'.join(patterns)
        self.args.pattern = re.compile(pattern.encode(self.codec), flags=re.DOTALL)
        self.args.filter = filter

    _ALPHABETIC = ascii_letters.encode('ASCII')

    _LEGITIMATE_HOSTS = {
        'acm.org'                 : 1,
        'adobe.com'               : 1,
        'aka.ms'                  : 1,
        'android.com'             : 1,
        'apache.org'              : 1,
        'apple.com'               : 1,
        'archive.org'             : 2,
        'azure.com'               : 1,
        'baidu.com'               : 2,
        'bootstrapcdn.com'        : 2,
        'cdnjs.cloudflare.com'    : 4,
        'comodo.net'              : 1,
        'comodoca.com'            : 1,
        'curl.haxx.se'            : 1,
        'curl.se'                 : 1,
        'digicert.com'            : 1,
        'dublincore.org'          : 1,
        'facebook.com'            : 4,
        'fontawesome.com'         : 1,
        'github.com'              : 3,
        'globalsign.com'          : 1,
        'globalsign.net'          : 1,
        'godaddy.com'             : 1,
        'golang.org'              : 1,
        'google.com'              : 4,
        'googleapis.com'          : 5,
        'googleusercontent.com'   : 5,
        'gov'                     : 2,
        'gstatic.com'             : 2,
        'iana.org'                : 1,
        'intel.com'               : 1,
        'jquery.com'              : 1,
        'jsdelivr.net'            : 2,
        'libssh.org'              : 1,
        'live.com'                : 1,
        'microsoft.com'           : 1,
        'mozilla.org'             : 1,
        'msdn.com'                : 1,
        'msn.com'                 : 1,
        'newtonsoft.com'          : 3, # json.net
        'nuget.org'               : 3,
        'office.com'              : 1,
        'office365.com'           : 2,
        'openssl.org'             : 1,
        'openssh.com'             : 1,
        'openxmlformats.org'      : 1,
        'oracle.com'              : 1,
        'purl.org'                : 1,
        'python.org'              : 1,
        'schema.org'              : 2,
        'sectigo.com'             : 1,
        'skype.com'               : 1,
        'sourceforge.net'         : 4,
        'stackoverflow.com'       : 1,
        'sun.com'                 : 1,
        'sway-cdn.com'            : 1,
        'sway-extensions.com'     : 1,
        'symantec.com'            : 1,
        'symauth.com'             : 1,
        'symcb.com'               : 1,
        'symcd.com'               : 1,
        'sysinternals.com'        : 3,
        'thawte.com'              : 1,
        'unicode.org'             : 2,
        'usertrust.com'           : 1,
        'verisign.com'            : 1,
        'w3.org'                  : 1,
        'wikipedia.org'           : 1,
        'wolfram.com'             : 1,
        'xml.org'                 : 1,
        'xmlsoap.org'             : 1,
        'yahoo.com'               : 1,
    }

    for _ext in [
        'build',
        'data',
        'do',
        'help',
        'java',
        'md',
        'mov',
        'name',
        'py',
        'so',
        'sys',
        'zip',
    ]:
        _LEGITIMATE_HOSTS[_ext] = 4

    _DOMAIN_WHITELIST = {
        'system.net',
        'wscript.shell',
    }

    _BRACKETING = {
        B"'"[0]: B"'",
        B'"'[0]: B'"',
        B'('[0]: B')',
        B'{'[0]: B'}',
        B'['[0]: B']',
        B'<'[0]: B'>',
    }

    def _check_host(self, host: str, text: str):
        hl = host.lower()
        if hl in self._DOMAIN_WHITELIST:
            self.log_info(F'excluding indicator because domain {hl} is forcefully ignored: {text}')
            return False
        for white, level in self._LEGITIMATE_HOSTS.items():
            if self.args.filter >= level and (hl == white or hl.endswith(F'.{white}')):
                self.log_info(F'excluding indicator because domain {hl} is whitelisted: {text}', clip=True)
                self.log_debug(F'reduce level below {level} to allow, current level is {self.args.filter}')
                return False
        return True

    def _check_match(self, data: Union[memoryview, bytearray], pos: int, name: str, value: bytes):
        term = self._BRACKETING.get(data[pos - 1], None)
        text = value.decode(self.codec)
        if term:
            pos = value.find(term)
            if pos > 0:
                value = value[:pos]
        if not self.args.filter:
            return value
        if name == indicators.hostname.name:
            if all(part.isdigit() for part in value.split(B'.')):
                name = indicators.ipv4.name
            elif B'.' not in value:
                name = indicators.ipv6.name
            else:
                name = indicators.domain.name
        if name == indicators.ipv4.name:
            ocets = [int(x) for x in value.split(B'.')]
            if ocets.count(0) >= 3:
                self.log_info(F'excluding ipv4 because it contains many zeros: {text}')
                return None
            if self.args.filter > 2 and sum(ocets) < 10:
                self.log_info(F'excluding ipv4 because of low value ocets: {text}')
                return None
            if ocets[0] <= 5 * self.args.filter:
                for area in (
                    bytes(data[pos - 20 : pos + 20]),
                    bytes(data[pos * 2 - 40 : pos * 2 + 40 : 2]),
                    bytes(data[pos * 2 - 41 : pos * 2 + 39 : 2]),
                ):
                    check = area.lower()
                    if B'version' in check or b'build' in check:
                        self.log_info(F'excluding ipv4 because it might be a version: {text}')
                        return None
            ip = ip_address(text)
            if not ip.is_global:
                if self.args.filter >= 3 or not ip.is_private:
                    self.log_info(F'excluding ipv4 because it is not global: {text}')
                    return None
        elif name in {
            indicators.url.name,
            indicators.socket.name,
            indicators.hostname.name,
            indicators.domain.name,
            indicators.subdomain.name
        }:
            if self.args.filter >= 2:
                if LetterWeights.IOC(value) < 0.6:
                    self.log_info(F'excluding indicator because with low score: {text}', clip=True)
                    return None
                if name != indicators.url.name and len(value) > 0x100:
                    self.log_info(F'excluding indicator because it is too long: {text}', clip=True)
                    return None
            ioc = text
            if '://' not in ioc:
                ioc = F'tcp://{ioc}'
            parts = urlparse(ioc)
            host, _, _ = parts.netloc.partition(':')
            if not self._check_host(host, text):
                return None
            if name == indicators.url.name:
                scheme = parts.scheme.lower()
                for p in ('http', 'https', 'ftp', 'file', 'mailto'):
                    if scheme.endswith(p):
                        pos = scheme.find(p)
                        value = value[pos:]
                        break
            if name in {
                indicators.hostname.name,
                indicators.domain.name,
                indicators.subdomain.name
            }:
                if data[pos - 1] in b'/\\' and self.args.filter >= 2:
                    return None
                hostparts = host.split('.')
                if self.args.filter >= 2:
                    if not all(p.isdigit() for p in hostparts) and all(len(p) < 4 for p in hostparts):
                        self.log_info(F'excluding host with too many short parts: {text}')
                        return None
                if self.args.filter >= 3:
                    if len(hostparts) <= sum(3 for p in hostparts if p != p.lower() and p != p.upper()):
                        self.log_info(F'excluding host with too many mixed case parts: {text}')
                        return None
                # These heuristics attempt to filter out member access to variables in
                # scripts which can be mistaken for domains because of the TLD inflation
                # we've had.
                uppercase = sum(1 for c in host if c.isalpha() and c.upper() == c)
                lowercase = sum(1 for c in host if c.isalpha() and c.lower() == c)
                if lowercase and uppercase:
                    caseratio = uppercase / lowercase
                    if 0.1 < caseratio < 0.9:
                        self.log_info(F'excluding indicator with too much uppercase letters: {text}')
                        return None
                if all(x.isidentifier() for x in hostparts):
                    if len(hostparts) == 2 and hostparts[0] in ('this', 'self'):
                        self.log_info(F'excluding host that looks like a code snippet: {text}')
                        return None
                    if len(hostparts[-2]) < 3:
                        self.log_info(F'excluding host with too short root domain name: {text}')
                        return None
                    if any(x.startswith('_') for x in hostparts):
                        self.log_info(F'excluding host with underscores: {text}')
                        return None
                    if len(hostparts[-1]) > 3:
                        prefix = '.'.join(hostparts[:-1])
                        seen_before = len(set(re.findall(
                            R'{}(?:\.\w+)+'.format(prefix).encode('ascii'), data)))
                        if seen_before > 2:
                            self.log_debug(F'excluding indicator that was already seen: {text}')
                            return None
        elif name == indicators.email.name:
            _, _, host = value.partition(B'@')
            host = host.decode(self.codec)
            if not self._check_host(host, text):
                return None
            at = value.find(B'@')
            ix = 0
            while value[ix] not in self._ALPHABETIC:
                ix += 1
            return None if at - ix < 3 else value[ix:]
        elif name in (
            indicators.path.name,
            indicators.winpath.name,
            indicators.nixpath.name,
        ):
            if len(value) < 8:
                self.log_info(F'excluding path because it is too short: {text}')
                return None
            if len(value) > 16 and len(re.findall(RB'\\x\d\d', value)) > len(value) // 10:
                self.log_info(F'excluding long path containign hex: {text}', clip=True)
                return None
            try:
                path_string = text
            except Exception:
                self.log_debug(F'excluding path which did not decode: {value!r}', clip=True)
                return None
            try:
                path = Path(path_string)
            except Exception as E:
                self.log_debug(F'error parsing path "{path}": {E!s}')
                return None
            path_likeness = sum(v for v, x in [
                (1, path.suffix),
                (1, path_string.startswith('/')),
                (2, path_string.startswith('%')),
                (2, path_string.startswith('\\\\')),
                (2, path_string[1:3] == ':\\'),
            ] if x)
            if 2 + path_likeness < min(self.args.filter, 2):
                self.log_info(F'excluding long path because it has no characteristic parts: {text}')
                return None
            bad_parts = 0
            all_parts = len(path.parts)
            if self.args.filter >= 1:
                date_likeness = sum(1
                    for t in ['yyyy', 'yy', 'mm', 'dd', 'hh', 'ss']
                    if t in path.parts or t.upper() in path.parts)
                if len(value) < 20 and date_likeness >= all_parts - 1:
                    self.log_info(F'excluding path that looks like a date format: {text}', clip=True)
                    return None
            if self.args.filter >= 2:
                for k, part in enumerate(path.parts):
                    if not k:
                        drive, colon, slash = part.partition(':')
                        if colon and len(drive) == 1 and len(slash) <= 1:
                            continue
                        if part[0] == part[~0] == '%':
                            continue
                        if len(part) == 1:
                            continue
                    if (
                        LetterWeights.Path(part) < 0.5 + (min(self.args.filter, 4) * 0.1)
                        or (self.args.filter >= 2 and LetterWeights.Path(part[:1]) < 0.5)
                    ):
                        bad_parts += 1
                        self.log_debug(F'bad part {k + 1} in path: {part}')
            for filter_limit in (2, 3, 4):
                bad_ratio = 2 ** (filter_limit - 1)
                if self.args.filter >= filter_limit and bad_parts * bad_ratio >= all_parts:
                    self.log_info(F'excluding path with bad parts: {text}', clip=True)
                    return None
        return value

    def process(self, data):
        whitelist = set()

        def check(match: re.Match):
            for name, value in match.groupdict().items():
                if value is not None:
                    break
            else:
                raise RefineryCriticalException('Received empty match.')
            if value in whitelist:
                return None
            result = self._check_match(match.string, match.start(), name, value)
            if result is not None:
                return self.labelled(result, pattern=name)
            whitelist.add(value)

        transforms = [check]
        yield from self.matches_filtered(memoryview(data), self.args.pattern, *transforms)

class xtpdf (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

Extract objects from PDF documents.

Expand source code Browse git

class xtpdf(PathExtractorUnit):
    """
    Extract objects from PDF documents.
    """
    @PathExtractorUnit.Requires('pypdf>=3.1.0', 'formats', 'default', 'extended')
    def _pypdf2():
        import pypdf
        import pypdf.generic
        return pypdf

    def _walk(self, blob, memo: Optional[Set[int]] = None, *path):
        while isinstance(blob, self._pypdf2.generic.IndirectObject):
            try:
                blob = blob.get_object()
            except Exception:
                break
        if memo is None:
            memo = {id(blob)}
        elif id(blob) in memo:
            return
        else:
            memo.add(id(blob))
        try:
            name = blob['/F']
            blob = blob['/EF']['/F']
        except Exception:
            pass
        else:
            def unhex(match):
                return bytes.fromhex(match[1]).decode('latin1')
            name = re.sub('#([0-9a-fA-F]{2})', unhex, name)
            path = *path[:-1], F'/{name}'
        try:
            def extract():
                with NoLogging():
                    return get_data()
            if TYPE_CHECKING:
                blob = cast(EncodedStreamObject, blob)
            get_data = blob.get_data
        except AttributeError:
            pass
        else:
            yield UnpackResult(''.join(path), extract, kind='object')
            return

        if isinstance(blob, self._pypdf2.generic.ByteStringObject):
            yield UnpackResult(''.join(path), blob, kind='bytes')
            return
        if isinstance(blob, self._pypdf2.generic.TextStringObject):
            yield UnpackResult(''.join(path), blob.encode(self.codec), kind='string')
            return

        if isinstance(blob, (
            self._pypdf2.generic.BooleanObject,
            self._pypdf2.generic.ByteStringObject,
            self._pypdf2.generic.FloatObject,
            self._pypdf2.generic.NameObject,
            self._pypdf2.generic.NullObject,
            self._pypdf2.generic.NumberObject,
            self._pypdf2.generic.RectangleObject,
        )):
            # unhandled PDF objects
            return

        if isinstance(blob, self._pypdf2.generic.TreeObject):
            blob = list(blob)

        pdf = self._pypdf2.generic.PdfObject

        if isinstance(blob, list):
            if (
                len(blob) % 2 == 0
                and all(isinstance(key, str) for key in islice(iter(blob), 0, None, 2))
                and all(isinstance(key, pdf) for key in islice(iter(blob), 1, None, 2))
            ):
                blob = dict(zip(*([iter(blob)] * 2)))
            else:
                for key, value in enumerate(blob):
                    yield from self._walk(value, memo, *path, F'/{key}')
                return

        if not isdict(blob):
            return

        for key, value in blob.items():
            if not isinstance(key, str):
                continue
            if not key.startswith('/'):
                key = F'/{key}'
            yield from self._walk(value, memo, *path, key)

    def unpack(self, data):
        with MemoryFile(data, read_as_bytes=True) as stream:
            with NoLogging():
                pdf = self._pypdf2.PdfReader(stream)
                catalog = pdf.trailer['/Root']
                yield from self._walk(catalog)

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        return data.startswith(B'%PDF-')

class xtpyi (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', decompile=False, user_code=False, unmarshal=0)

Extracts and decompiles files from a Python Installer (aka PyInstaller) archive. This unit is a path extractor which extracts data from a hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it a meta variable that contains its path within the source structure. The positional arguments to the command are patterns that can be used to filter the extracted items by their path. To view only the paths of all chunks, use the listing switch:

emit something | xtpyi --list

Otherwise, extracted items are written to the standard output port and usually require a frame to properly process. In order to dump all extracted data to disk, the following pipeline can be used:

emit something | xtpyi [| dump {path} ]

Expand source code Browse git

class xtpyi(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extracts and decompiles files from a Python Installer (aka PyInstaller) archive.
    """
    def __init__(
        self, *paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False,
        path=b'path', date=b'date',
        decompile: Arg.Switch('-c', help='Attempt to decompile PYC files.') = False,
        user_code: Arg.Switch('-u', group='FILTER', help=(
            'Extract only source code files from the root of the archive. These usually implement '
            'the actual domain logic. This implies the --decompile option.')) = False,
        unmarshal: Arg('-y', action='count', group='FILTER', help=(
            '(DANGEROUS) Unmarshal embedded PYZ archives. Warning: Maliciously crafted packages can '
            'potentially exploit this to execute code. It is advised to only use this option inside '
            'an isolated environment. Specify twice to decompile unmarshalled Python bytecode.'
        )) = 0
    ):
        super().__init__(
            *paths,
            list=list,
            join_path=join_path,
            drop_path=drop_path,
            fuzzy=fuzzy,
            exact=exact,
            regex=regex,
            path=path,
            date=date,
            decompile=decompile,
            unmarshal=unmarshal,
            user_code=user_code,
        )

    @ArchiveUnit.Requires('xdis', 'arc', 'python', 'extended')
    def _xdis():
        import xdis.load
        import xdis.magics
        import xdis.marsh
        import xdis.op_imports
        import xdis.version_info
        import xdis
        A, B, C, *_ = sys.version_info
        version = F'{A}.{B}.{C}'
        canonic = F'{A}.{B}'
        if version not in xdis.magics.canonic_python_version:
            import importlib.util
            magic = importlib.util.MAGIC_NUMBER
            xdis.magics.add_magic_from_int(xdis.magics.magic2int(magic), version)
            xdis.magics.by_magic.setdefault(magic, set()).add(version)
            xdis.magics.by_version[version] = magic
            xdis.magics.magics[canonic] = magic
            xdis.magics.canonic_python_version[canonic] = canonic
            xdis.magics.add_canonic_versions(version, canonic)
            xdis.op_imports.op_imports.setdefault(canonic,
                next(iter(reversed(xdis.op_imports.op_imports.values()))))
        del A, B, C, version
        import xdis.std
        return xdis

    @ArchiveUnit.Requires('uncompyle6', 'arc', 'python', 'extended')
    def _uncompyle6():
        import uncompyle6
        import uncompyle6.main
        return uncompyle6

    @ArchiveUnit.Requires('decompyle3', 'arc', 'python')
    def _decompyle3():
        import decompyle3
        import decompyle3.main
        return decompyle3

    def unpack(self, data):
        view = memoryview(data)
        positions = [m.start() for m in re.finditer(re.escape(PyInstallerArchiveEpilogue.MagicSignature), view)]
        mode = Unmarshal(min(2, int(self.args.unmarshal)))
        self.log_debug(F'unmarshal mode: {mode.name}')
        if not positions:
            raise LookupError('unable to find PyInstaller signature')
        if len(positions) > 2:
            # first position is expected to be the sentinel value in the unpacker stub
            width = max(len(F'{p:X}') for p in positions)
            for position in positions:
                self.log_info(F'magic signature found at offset 0x{position:0{width}X}')
            self.log_warn(F'found {len(positions) - 1} potential PyInstaller epilogue markers; using last one.')
        decompile = self.args.decompile
        uc_target = PiType.USERCODE if decompile else PiType.SOURCE
        archive = PyInstallerArchiveEpilogue(view, positions[-1], mode, decompile)
        for name, file in archive.files.items():
            if self.args.user_code:
                if file.type != uc_target:
                    continue
                if name.startswith('pyiboot'):
                    continue
            yield self._pack(name, None, file.data, type=file.type.name)

    @classmethod
    def handles(cls, data: ByteStr) -> Optional[bool]:
        return PyInstallerArchiveEpilogue.MagicSignature in data

class xtrtf (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

Extract embedded objects in RTF documents.

Expand source code Browse git

class xtrtf(PathExtractorUnit):
    """
    Extract embedded objects in RTF documents.
    """
    @PathExtractorUnit.Requires('oletools', 'formats', 'office', 'extended')
    def _oletools():
        import oletools
        import oletools.rtfobj
        import oletools.oleobj
        return oletools

    def unpack(self, data):
        parser = self._oletools.rtfobj.RtfObjParser(data)
        parser.parse()
        width = len(str(len(parser.objects)))
        for k, item in enumerate(parser.objects):
            item: RtfObject
            path = item.filename or F'carve{k:0{width}}.bin'
            data = item.rawdata
            meta = {}
            if item.is_ole:
                if item.format_id == self._oletools.oleobj.OleObject.TYPE_EMBEDDED:
                    meta['ole_type'] = 'EMBEDDED'
                elif item.format_id == self._oletools.oleobj.OleObject.TYPE_LINKED:
                    meta['ole_type'] = 'LINKED'
                if item.is_package:
                    meta['src_path'] = item.src_path
                    meta['tmp_path'] = item.temp_path
                if item.clsid is not None:
                    meta['ole_info'] = item.clsid_desc
                    meta['ole_guid'] = item.clsid
                meta['ole_name'] = item.class_name
            if item.oledata:
                data = item.oledata
                pos = item.rawdata.find(data)
                if pos > 0:
                    meta['raw_header'] = item.rawdata[:pos]
                if item.olepkgdata:
                    data = item.olepkgdata
                    pos = item.oledata.find(data)
                    if pos >= 0:
                        meta['ole_header'] = item.oledata[:pos]
            yield UnpackResult(path, data, **meta)

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        import re
        return bool(re.search(bR'^\s{0,500}\{\\rtf', memoryview(data)[:505]))

class xtsim (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

Extract files from Smart Install Maker (SIM) executables. This unit is a path extractor which extracts data from a hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it a meta variable that contains its path within the source structure. The positional arguments to the command are patterns that can be used to filter the extracted items by their path. To view only the paths of all chunks, use the listing switch:

emit something | xtsim --list

Otherwise, extracted items are written to the standard output port and usually require a frame to properly process. In order to dump all extracted data to disk, the following pipeline can be used:

emit something | xtsim [| dump {path} ]

Expand source code Browse git

class xtsim(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extract files from Smart Install Maker (SIM) executables.
    """

    _RUNTIME_MAPPING = {
        '4.tmp'  : 'header.png',
        '5.tmp'  : 'wizard.bmp',
        '6.tmp'  : 'background.bmp',
        '7.tmp'  : 'folder.png',
        '8.tmp'  : 'group.png',
        '9.tmp'  : 'password.png',
        '15.tmp' : 'license1.rtf',
        '16.tmp' : 'information.rtf',
        '20.tmp' : 'license2.rtf',
    }

    _DIRECTORY_MASKS = {
        '@$&%01': 'ProgramFiles',
        '@$&%02': 'WindowsDir',
        '@$&%03': 'SystemDir',
        '@$&%04': 'InstallPath',
        '@$&%05': 'TempDir',
        '@$&%06': 'Desktop',
        '@$&%07': 'QuickLaunch',
        '@$&%08': 'ProgramsDir',
        '@$&%09': 'StartMenu',
        '@$&%10': 'MyDocuments',
        '@$&%11': 'Favorites',
        '@$&%12': 'SendTo',
        '@$&%13': 'UserProfile',
        '@$&%14': 'StartUp',
        '@$&%15': 'FontsDir',
        '@$&%16': 'CommonFiles',
        '@$&%17': 'SystemDrive',
        '@$&%18': 'CurrentDirectory',
        '@$&%20': 'UserName',
        '@$&%21': 'Language',
        '@$&%22': 'ComputerName',
        '@$&%26': 'AppData',
        '@$&%27': 'CommonAppData',
        '@$&%28': 'CommonDesktop',
        '@$&%29': 'CommonDocuments',
        '@$&%30': 'CommonFavourites',
        '@$&%31': 'CommonPrograms',
        '@$&%32': 'CommonStartMenu',
        '@$&%33': 'CommonStartup',
        '@$&%34': 'Templates',
        '@$&%35': 'CommonTemplates',
        '@$&%36': 'ProgramFiles64',
    }

    def unpack(self, data):
        mem = memoryview(data)
        sim = self.get_offsets(data)

        if sim is None:
            return B''

        strings = StructReader(mem[sim.strings_offset:sim.runtime_offset])
        runtime = StructReader(mem[sim.runtime_offset:sim.content_offset])
        content = StructReader(mem[sim.content_offset:sim.archive_end])

        header = [strings.read_c_string() for _ in range(sim.nr_of_strings)]
        tables: dict[str, list[list[str | int]]] = {}
        unknown_tables: dict[str, list[list[str | int]]] = {}

        def sc(k: int):
            return int(header[k])

        for size, index, name in [
            (4, 98, None),
            (7, 50, 'registry'),    # (2=HKLM/1=HKCU,key)
            (3, 96, None),
            (2, 31, 'fonts'),
            (8, 54, 'shortcuts'),   # (?,0=Menu/1=Desktop,filename,target_path,comment,icon_path1,icon_path2)
            (3, 67, 'filenames'),
            (2, 93, None),
            (6, 40, 'install'),     #
            (6, 25, 'uninstall'),
            (6, 24, 'ini'),         # 34991da998ece07d4a941394c6630ce74955fb4800e5915f6766180d12a8dc61
            (2, 45, None),
            (2, 20, None),
            (4, 26, 'languages'),
        ]:
            count = sc(index)
            if not count:
                continue
            table = [[
                strings.read_c_string() for _ in range(size)
            ] for _ in range(count)]
            if name is None:
                unknown_tables[F'T{index}'] = table
            else:
                tables[name] = table

        unknown_marker = strings.read_c_string()

        language_count = sc(26)
        message_matrix = [[
            strings.read_c_string() for _ in range(sc(57))
        ] for _ in range(language_count)]

        len_chunks = sc(117)
        chunk_size = sc(95)
        chunk_rest = sc(118)

        def check_empty_reader(r: StructReader, name: str):
            if _c := r.remaining_bytes:
                self.log_warn(F'{name} reader had 0x{_c:08X} bytes remaining:', r.peek(), clip=True)

        check_empty_reader(strings, 'strings')

        lngid = tables['languages'][0]
        if not lngid[2].isdigit():
            lname: bytes = lngid[1]
            lname = lname.decode('latin1')
            lngid = max(LCID, key=lambda k: longest_common_substring(LCID[k], lname))
        else:
            lngid = int(lngid[2])

        codec = DEFAULT_CODEPAGE.get(lngid, 'latin1')

        def decode(cell: bytes, codec: str):
            try:
                cell = cell.decode(codec)
            except UnicodeDecodeError:
                cell = cell.decode('latin1')
                self.log_debug('failed to decode string:', cell, clip=True)
            if cell.isdigit():
                return int(cell)
            if not cell:
                return None
            for key, val in self._DIRECTORY_MASKS.items():
                cell = cell.replace(key, F'${val}')
            return cell

        header[:] = [decode(s, codec) for s in header]

        for t in (tables, unknown_tables):
            for name, table in t.items():
                for row in table:
                    row[:] = [decode(cell, codec) for cell in row]

        messages = {}

        for array, lng in zip(message_matrix, tables['languages']):
            lng_codec = DEFAULT_CODEPAGE.get(lng[2], 'latin1')
            messages[lng[1]] = [decode(cell, lng_codec) for cell in array]

        tables['messages'] = messages
        tables['header'] = header

        if unknown_tables:
            tables['unknown_tables'] = unknown_tables
        if unknown_marker:
            tables['unknown_marker'] = decode(unknown_marker, codec)

        yield self._pack('setup.json', None,
            json.dumps(tables, indent=4).encode(self.codec))

        def runtime_path(name: str):
            root, backslash, temp = name.rpartition('\\')
            if backslash and root == '$inst' and (t := self._RUNTIME_MAPPING.get(temp)):
                name = t
            return F'runtime/{name}'

        if sim.runtime_is_cab:
            runtime_cab = Cabinet(runtime.read(), no_magic=True)
            for file in runtime_cab.process().get_files():
                yield self._pack(runtime_path(file.name), file.timestamp, lambda f=file: f.decompress())
        else:
            for _ in range(sim.nr_of_runtime):
                name = decode(runtime.read_c_string(), codec)
                path = runtime_path(name)
                size = int(runtime.read_c_string())
                yield self._pack(path, None, runtime.read(size))
            check_empty_reader(runtime, 'runtime')

        def no_abs_path(p: str):
            drive, d, rest = p.partition(':\\')
            if d and len(drive) == 1:
                return F'$Drive{drive.upper()}\\{rest}'
            return p

        if len_chunks + chunk_rest == 0:
            for file in tables['filenames']:
                path = no_abs_path(file[1])
                content.u32() # unknown
                size = content.u32()
                content.u32() # unknown
                content.u32() # unknown
                content.u32() # unknown
                content.u32() # unknown
                yield self._pack(F'data/{path}', None, content.read(size))
        else:
            content_cab = Cabinet(no_magic=True)
            content_cab.extend(content.read(chunk_size) for _ in range(len_chunks))
            if chunk_rest > 0:
                content_cab.append(content.read(chunk_rest))
            for file in content_cab.process().get_files():
                try:
                    path = tables['filenames'][int(file.name)][1]
                except Exception:
                    path = file.name
                path = F'content/{no_abs_path(path)}'
                yield self._pack(path, file.timestamp, lambda f=file: f.decompress())

        check_empty_reader(content, 'content')

    @classmethod
    def get_offsets(cls, data: bytearray) -> Optional[SIMOffsets]:
        if len(data) < 0x1000:
            return None

        def sane(offsets: SIMOffsets):
            if offsets.sim_signature != _SIGBYTE:
                return False
            for offset in (
                offsets.strings_offset,
                offsets.runtime_offset,
                offsets.content_offset,
            ):
                if offset not in range(0x1000, 0x100000000):
                    return False
            if offsets.strings_offset >= offsets.runtime_offset:
                return False
            return offsets.content_offset >= offsets.runtime_offset + offsets.runtime_length

        end = len(data) - 0x24
        offsets = SIMOffsets(end, *struct.unpack('<QQQQ?BBB', data[end:]))

        if sane(offsets):
            pos = offsets.strings_offset
            end = pos + len(_SIMNAME)
            if data[pos:end] == _SIMNAME:
                return offsets
            pos = data.rfind(_SIMNAME)
            if pos > 0:
                return offsets.rebase(pos)

        view = memoryview(data)

        for stub in re.finditer(rb'MZ.{78}This program must be run under Win', data):
            pos_zero = stub.start()
            pos_data = data.find(_SIMNAME, pos_zero)
            if pos_data < 0:
                continue
            pattern = re.escape((pos_data - pos_zero).to_bytes(8, 'little')) + B'.{27}\\xF1'
            if match := re.search(pattern, view[pos_zero:]):
                end = match.start()
                offsets = SIMOffsets(end, *struct.unpack('<QQQQ?BBB', match[0]))
                if sane(offsets):
                    return offsets.rebase(pos_zero)

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        return cls.get_offsets(data) is not None

class xtsql (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

Extract files from SQLite3 databases.

Expand source code Browse git

class xtsql(PathExtractorUnit):
    """
    Extract files from SQLite3 databases.
    """
    def unpack(self, data: bytearray):
        def _json(object):
            with BytesAsStringEncoder as encoder:
                return encoder.dumps(object).encode(self.codec)

        if sys.version_info[:2] < (3, 11):
            raise NotImplementedError(F'python 3.11 is required to use {self.__class__.__name__}.')

        database = sqlite3.connect(':memory:')
        database.deserialize(data)
        cursor = database.cursor()
        result: dict[str, list[dict[str, int | float | str | bytes]]] = {}

        listing: list[tuple[str, str]] = cursor.execute(
            "SELECT name, sql FROM sqlite_master WHERE type='table';").fetchall()

        for table, spec in listing:
            result[table] = t = []
            ct, _table, names = spec.partition(table)
            names = names.strip()
            if (
                table != _table
                or ct.strip().upper().split() != ['CREATE', 'TABLE']
                or not names.endswith(')')
                or not names.startswith('(')
            ):
                raise ValueError(F'Unexpeted SQL statement in master table: {spec}')
            names = [next(iter(name.strip().split()))
                for name in names[1:-1].split(',')]
            for row in cursor.execute(F'SELECT * FROM {table}').fetchall():
                t.append(dict(zip(names, row)))

        yield UnpackResult('db', functools.partial(_json, result))

        for table, rows in result.items():

            yield UnpackResult(F'db/{table}', functools.partial(_json, rows))

            for k, row in enumerate(rows):

                root = F'db/{table}/{k}'
                yield UnpackResult(root, functools.partial(_json, row))

                for name, value in row.items():
                    path = F'{root}/{name}'
                    if value is None:
                        continue
                    if isinstance(value, (int, float)):
                        value = str(value)
                    if isinstance(value, str):
                        value = value.encode(self.codec)
                    if isinstance(value, bytes):
                        yield UnpackResult(path, value)

    @classmethod
    def handles(cls, data: bytearray):
        return memoryview(data)[:15] == B'SQLite format 3'

class xttar (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

Extract files from a Tar archive. This unit is a path extractor which extracts data from a hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it a meta variable that contains its path within the source structure. The positional arguments to the command are patterns that can be used to filter the extracted items by their path. To view only the paths of all chunks, use the listing switch:

emit something | xttar --list

Otherwise, extracted items are written to the standard output port and usually require a frame to properly process. In order to dump all extracted data to disk, the following pipeline can be used:

emit something | xttar [| dump {path} ]

Expand source code Browse git

class xttar(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extract files from a Tar archive.
    """
    def unpack(self, data: bytearray):
        with MemoryFile(data) as stream:
            try:
                archive = tarfile.open(fileobj=stream)
            except Exception:
                ustar = data.find(B'ustar')
                if ustar < 257:
                    raise
                stream.seek(ustar - 257)
                archive = tarfile.open(fileobj=stream)
        for info in archive.getmembers():
            if not info.isfile():
                continue
            extractor = archive.extractfile(info)
            if extractor is None:
                continue
            date = datetime.datetime.fromtimestamp(info.mtime)
            yield self._pack(info.name, date, lambda e=extractor: e.read())

    @classmethod
    def handles(cls, data: bytearray) -> bool:
        if data[257:262] != B'ustar':
            return False
        return data[5:8] in (B'\x00\x30\x30', B'\x20\x20\x00')

class xtvba (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

Extract VBA macro code from Office documents.

Expand source code Browse git

class xtvba(PathExtractorUnit):
    """
    Extract VBA macro code from Office documents.
    """
    @PathExtractorUnit.Requires('oletools', 'formats', 'office', 'extended')
    def _olevba():
        with NoLogging(NoLogging.Mode.ALL):
            import oletools.olevba
            return oletools.olevba

    def unpack(self, data):
        sentinel = str(uuid4())
        try:
            parser = self._olevba.VBA_Parser(sentinel, data=bytes(data), relaxed=True)
        except self._olevba.FileOpenError:
            raise ValueError('Input data not recognized by VBA parser')
        for p1, stream_path, p2, code in parser.extract_all_macros():
            code: str
            if not stream_path:
                if p1 == sentinel:
                    continue
                if p2 == sentinel:
                    continue
            yield UnpackResult(stream_path, code.encode(self.codec))

    @classmethod
    def handles(cls, data: bytearray):
        if data.startswith(b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'):
            return True
        if data[:2] == B'PK':
            return B'xl/vbaProject.bin' in data
        return any(ns in data for ns in [
            b'http://schemas.microsoft.com/office/word/2003/wordml',
            b'http://schemas.microsoft.com/office/2006/xmlPackage',
        ])

class xtw (stripspace=False, duplicates=False, longest=False, take=None)

Extract Wallets: Extracts anything that looks like a cryptocurrency wallet address. This works similar to the xtp unit.

Expand source code Browse git

class xtw(PatternExtractor):
    """
    Extract Wallets: Extracts anything that looks like a cryptocurrency wallet address.
    This works similar to the `refinery.xtp` unit.
    """

    def __init__(self, stripspace=False, duplicates=False, longest=False, take=None):
        self.superinit(super(), **vars(), ascii=True, utf16=True)

    def process(self, data):
        pattern = '|'.join(FR'(?P<{p.name}>\b{p.value}\b)' for p in wallets)
        pattern = FR'\b{pattern}\b'.encode('latin1')

        def check(match: re.Match[bytes]):
            for name, value in match.groupdict().items():
                if value is not None:
                    break
            else:
                raise RefineryCriticalException('Received empty match.')
            return self.labelled(value, kind=name)

        yield from self.matches_filtered(memoryview(data), pattern, check)

class xtxml (*paths, format=None, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

Extract values from an XML document.

Expand source code Browse git

class xtxml(XMLToPathExtractorUnit):
    """
    Extract values from an XML document.
    """
    def unpack(self, data):
        root = xml.parse(data.strip())
        meta = metavars(data)
        path = self._make_path_builder(meta, root)

        def walk(node: xml.XMLNode, *parts: str):
            def extract(node: xml.XMLNode = node):
                if not node.children:
                    return node.content.encode(self.codec)
                with MemoryFile() as stream:
                    node.write(stream)
                    return bytes(stream.getvalue() | ppxml)

            attributes = {
                self._normalize_key(k): self._normalize_val(v)
                for k, v in node.attributes.items()
            }

            if not all(is_valid_variable_name(k) for k in attributes):
                attributes = {F'_{k}': v for k, v in attributes.items()}

            yield UnpackResult('/'.join(parts), extract, **attributes)

            for child in node.children:
                yield from walk(child, *parts, path(child))

        yield from walk(root, path(root))

    @classmethod
    def handles(self, data):
        return xml.is_xml(data)

class xtxs (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path')

Extract data from Microsoft Access Databases.

Expand source code Browse git

class xtxs(PathExtractorUnit):
    """
    Extract data from Microsoft Access Databases.
    """

    @PathExtractorUnit.Requires('access-parser', 'formats', 'office', 'extended')
    def _access_parser():
        import access_parser
        return access_parser

    def unpack(self, data):

        with VirtualFileSystem() as vfs:
            file = vfs.new(data, 'accdb')
            xsdb = self._access_parser.AccessParser(file.path)

        for name in xsdb.catalog:
            with NoLogging():
                table = xsdb.parse_table(name)
            if not table:
                continue
            length = max(len(cells) for cells in table.values())
            for k in range(length):
                for header, column in table.items():
                    try:
                        entry = column[k]
                    except IndexError:
                        continue
                    if entry is None:
                        continue

                    if isinstance(entry, (int, float)):
                        entry = str(entry)
                    if isinstance(entry, str):
                        entry = entry.encode(self.codec)
                    if isinstance(entry, bytes):
                        yield UnpackResult(F'{name}/{k}/{header}', entry)

    @classmethod
    def handles(self, data: bytearray) -> Optional[bool]:
        view = memoryview(data)
        if b'Standard ACE DB' in view[:20]:
            return True

class xtzip (*paths, list=False, join_path=False, drop_path=False, fuzzy=0, exact=False, regex=False, path=b'path', date=b'date', pwd=b'')

Extract files from a Zip archive. This unit is a path extractor which extracts data from a hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it a meta variable that contains its path within the source structure. The positional arguments to the command are patterns that can be used to filter the extracted items by their path. To view only the paths of all chunks, use the listing switch:

emit something | xtzip --list

Otherwise, extracted items are written to the standard output port and usually require a frame to properly process. In order to dump all extracted data to disk, the following pipeline can be used:

emit something | xtzip [| dump {path} ]

Expand source code Browse git

class xtzip(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extract files from a Zip archive.
    """
    @ArchiveUnit.Requires('chardet', 'default', 'extended')
    def _chardet():
        import chardet
        return chardet

    @ArchiveUnit.Requires('pyzipper', 'arc', 'default', 'extended')
    def _pyzipper():
        import pyzipper
        return pyzipper

    @classmethod
    def _carver(cls):
        return carve_zip

    def unpack(self, data: bytearray):
        from zipfile import ZipInfo, ZipFile, BadZipFile

        def password_invalid(password: Optional[bytes]):
            nonlocal archive, fallback
            if password:
                archive.setpassword(password)
            try:
                archive.testzip()
                files = (t for t in archive.infolist() if t.filename and not t.is_dir())
                files = sorted(files, key=lambda info: info.file_size)
                for info in files:
                    self.log_debug('testing password against:', info.filename)
                    try:
                        with archive.open(info.filename, "r") as test:
                            while test.read(1024):
                                pass
                    except BadZipFile:
                        continue
                    else:
                        break
            except NotImplementedError:
                if fallback:
                    raise
                self.log_debug('compression method unsupported, switching to pyzipper')
                archive = self._pyzipper.AESZipFile(MemoryFile(data))
                fallback = True
                return password_invalid(password)
            except RuntimeError as E:
                if 'password' not in str(E):
                    raise
                return True
            else:
                if password:
                    self.log_debug('using password:', password)
                return False

        password = bytes(self.args.pwd)
        fallback = False
        archive = ZipFile(MemoryFile(data))
        passwords = [password]

        if not password:
            passwords.extend(p.encode(self.codec) for p in self._COMMON_PASSWORDS)
        for p in passwords:
            if not password_invalid(p):
                break
        else:
            raise RuntimeError('Archive is password-protected.')

        for info in archive.infolist():
            def xt(archive: ZipFile = archive, info: ZipInfo = info):
                try:
                    return archive.read(info.filename)
                except RuntimeError as E:
                    if 'password' not in str(E):
                        raise
                    if not password:
                        raise RuntimeError('archive is password-protected')
                    else:
                        raise RuntimeError(F'invalid password: {password.decode(self.codec)}') from E
            if info.filename:
                if info.is_dir():
                    continue

            # courtesy of https://stackoverflow.com/a/37773438/9130824
            filename = info.filename
            if info.flag_bits & ZIP_FILENAME_UTF8_FLAG == 0:
                filename_bytes = filename.encode('437')
                try:
                    guessed_encoding = self._chardet.detect(filename_bytes)['encoding']
                except ImportError:
                    guessed_encoding = None
                guessed_encoding = guessed_encoding or 'cp1252'
                filename = filename_bytes.decode(guessed_encoding, 'replace')

            try:
                date = datetime(*info.date_time)
            except Exception as e:
                self.log_info(F'{e!s} - unable to determine date from tuple {info.date_time} for: {filename}')
                date = None

            yield self._pack(filename, date, xt)

    @classmethod
    def handles(cls, data: bytearray):
        return data.rfind(ZipEndOfCentralDirectory.SIGNATURE) > 0 and data[:2] == B'PK'

class xtzpaq (*paths, index=False, pwd=b'', date=b'date', path=b'path', regex=False, exact=False, fuzzy=0, drop_path=False, join_path=False, list=False)

Extract files from a ZPAQ archive. This unit is a path extractor which extracts data from a hierarchical structure. Each extracted item is emitted as a separate chunk and has attached to it a meta variable that contains its path within the source structure. The positional arguments to the command are patterns that can be used to filter the extracted items by their path. To view only the paths of all chunks, use the listing switch:

emit something | xtzpaq --list

Otherwise, extracted items are written to the standard output port and usually require a frame to properly process. In order to dump all extracted data to disk, the following pipeline can be used:

emit something | xtzpaq [| dump {path} ]

Expand source code Browse git

class xtzpaq(ArchiveUnit, docs='{0}{s}{PathExtractorUnit}'):
    """
    Extract files from a ZPAQ archive.
    """

    _MAGIC = B'\x37\x6B\x53\x74\xA0\x31\x83\xD3\x8C\xB2\x28\xB0\xD3\x7A\x50\x51'

    def __init__(
        self, *paths,
        index: Arg.Switch('-i', help='Archive is an index (no d-blocks).') = False,
        **more
    ):
        for _code, _size in {
            _TCU32: 4,
            _TCI32: 4,
            _TCU16: 2,
            _TCI16: 2,
        }.items():
            _item_size = array(_code).itemsize
            if _item_size == _size:
                continue
            raise RuntimeError(
                F'Expected array type "{_code}" to have entries of size {_size}, but the API '
                F'reports a size of {_item_size}.')

        super().__init__(*paths, index=index, **more)

    @classmethod
    def handles(cls, data: bytearray) -> Optional[bool]:
        return cls._MAGIC in data

    def unpack(self, archive: bytearray):
        def mkdate(date) -> datetime:
            date = int(date)
            year = date // 1000000 // 10000
            month = date // 100000000 % 100
            day = date // 1000000 % 100
            hour = date // 10000 % 100
            minute = date // 100 % 100
            second = date % 100
            return datetime(year, month, day, hour, minute, second, 0)

        @dataclass
        class DT:
            date: int = 0
            attr: int = 0
            name: str = ""
            frag: List[int] = field(default_factory=list)

            @property
            def dt(self) -> Optional[datetime]:
                if self.date > 0:
                    return mkdate(self.date)

        # TODO: implement password-protected archives
        # key = self.args.pwd
        index = self.args.index
        bsize: Dict[int, int] = {}  # frag ID -> d block compressed size
        dt: Dict[str, DT] = {}      # filename -> date, attr, frags
        frag: List[bytes] = []      # ID -> hash[20] size[4] data
        csize = 0                   # expected offset of next non d block
        streaming = False
        journaling = False

        done = False
        dc = Decompressor()
        src = dc.set_input(archive)

        while not done and dc.read_block():
            while not done:
                filename = dc.read_filename()
                if filename is None:
                    break
                self.log_info('reading file', filename)
                comment = dc.read_comment()
                jsize = 0
                if len(comment) >= 4 and comment[-4:] == "jDC\x01":
                    num = re.search('^\\d+', comment)
                    if not num:
                        raise RuntimeError('missing size in comment')
                    jsize = int(num[0])
                    if streaming:
                        raise RuntimeError('journaling block after streaming one')
                    journaling = True
                    self.log_info('archive type is journaling')
                else:
                    if journaling:
                        raise RuntimeError('streaming block after journaling one')
                    if index:
                        raise RuntimeError('streaming block in index')
                    streaming = True
                    self.log_info('archive type is streaming')

                # Test journaling filename. The format must be
                # jDC[YYYYMMDDHHMMSS][t][NNNNNNNNNN]
                # where YYYYMMDDHHMMSS is the date, t is the type {c,d,h,i}, and
                # NNNNNNNNNN is the 10 digit first fragment ID for types c,d,h.
                # They must be in ascending lexicographical order.

                frag_id = 0
                block_type = None

                if journaling:
                    if len(filename) != 28:
                        raise RuntimeError('filename size not 28')
                    if filename[:3] != 'jDC':
                        raise RuntimeError('filename not jDC')
                    block_type = filename[17]
                    if block_type not in 'cdhi':
                        raise RuntimeError('type not c,d,h,i')
                    try:
                        mkdate(filename[3:17])
                    except Exception as E:
                        raise RuntimeError('invalid date') from E
                    frag_id = int(filename[18:28])
                    if not 1 <= frag_id <= 4294967295:
                        raise RuntimeError('fragment ID out of range')

                seg = MemoryFile(size_limit=jsize)
                dc.set_output(seg)
                sha1 = hashlib.sha1()
                dc.set_hasher(sha1)
                dc.decompress_data()

                if journaling and len(seg) != jsize:
                    raise RuntimeError('incomplete output')

                checksum = dc.read_segment_end()
                if checksum is None:
                    self.log_debug('no checksum')
                elif checksum != sha1.digest():
                    raise RuntimeError('SHA1 mismatch')

                # check csize at first non-d block
                if csize and block_type in 'chi':
                    if csize != offset:
                        raise RuntimeError(F'csize={csize} does not point to offset={offset}')
                    csize = 0

                # get csize from c block
                seglen = len(seg)
                seg = StructReader(seg.getvalue())
                if block_type == 'c':
                    if seglen < 8:
                        raise RuntimeError("c block too small")
                    csize = seg.u64()
                    offset = src.tell() + 1
                    self.log_debug(F'csize={csize} at offset={offset}')
                    if csize >> 63:
                        self.log_warn('incomplete transaction at end of archive')
                        done = True
                    elif index and csize != 0:
                        raise RuntimeError('nonzero csize in index')
                    # Set csize to expected offset of first non d block
                    # assuming 1 more byte for unread end of block marker.
                    csize += offset

                if block_type == 'd':
                    if index:
                        raise RuntimeError('d block in index')
                    bsize[frag_id] = src.tell() + 1 - offset  # compressed size
                    self.log_debug(F' {bsize[frag_id]} -> {len(seg)}')
                    # Test frag size list at end. The format is f[id..id+n-1] fid n
                    # where fid may be id or 0. sizes must sum to the rest of block.
                    if seglen < 8:
                        raise RuntimeError('d block too small')
                    seg.seekset(-8)
                    fid = seg.u32() or frag_id
                    n = seg.u32()
                    if fid != frag_id:
                        raise RuntimeError('missing ID')
                    if n > (seglen - 8) // 4:
                        raise RuntimeError('frag list too big')
                    fragsum = 0  # computed sum of frag sizes
                    seg.seekset(-4 * (n + 2))
                    for _ in range(n):
                        fragsum += seg.u32()
                    if fragsum + n * 4 + 8 != seglen:
                        raise RuntimeError('bad frag size list')
                    # Save frag hashes and sizes. For output, save data too.
                    seg.seekset(fragsum)
                    data = seg.getvalue()
                    assert seg.remaining_bytes == n * 4 + 8
                    for i in range(n):
                        while len(frag) <= frag_id + i:
                            frag.append(B'')
                        if frag[frag_id + i]:
                            raise RuntimeError('duplicate frag ID')
                        f = seg.u32()
                        h = hashlib.sha1(data[:f]).digest()
                        frag[frag_id + i] = h + f.to_bytes(4, 'little') + data[:f]
                        data = data[f:]

                    assert len(data) == n * 4 + 8
                    assert seg.remaining_bytes == 8

                # Test and save h block. Format is: bsize (sha1[20] size)...
                # where bsize is the compressed size of the d block with the same id,
                # and each size corresonds to a fragment in that block. The list
                # must match the list in the d block if present.

                if block_type == 'h':
                    if seglen % 24 != 4:
                        raise RuntimeError('bad h block size')
                    b = seg.u32()
                    self.log_debug(F'[{frag_id}..{frag_id + seglen // 24}[ {b}')
                    fragsum = 0 # uncompressed size of all frags
                    for i in range(seglen // 24):
                        fd = seg.read(24)
                        if index:
                            while len(frag) <= frag_id + i:
                                frag.append(B'')
                            if frag[frag_id + i]:
                                raise RuntimeError('data in index')
                            frag[frag_id + i] = fd
                        elif frag_id + i >= len(frag) or len(frag[frag_id + i]) < 24:
                            raise RuntimeError('no matching d block')
                        elif frag[frag_id + i][:24] != fd:
                            raise RuntimeError('frag size or hash mismatch')
                        fragsum += int.from_bytes(fd[20:24], 'little')

                # Test i blocks and save files to extract. Format is:
                #   date filename 0 na attr[0..na) ni ptr[0..ni)   (to update)
                #   0    filename                                  (to delete)
                # Date is 64 bits in YYYYMMDDHHMMSS format.

                if block_type == 'i':
                    while not seg.eof:
                        f = DT(seg.u64())
                        f.name = seg.read_c_string('utf8')
                        if f.date > 0:
                            na = seg.u32()
                            if na > 65535:
                                raise ValueError('attr size > 65535')
                            f.attr = seg.read_integer(na * 8)
                            ni = seg.u32()
                            for i in range(ni):
                                a = seg.u32()
                                f.frag.append(a)
                                if index:
                                    continue
                                elif not 1 <= a < len(frag):
                                    raise RuntimeError('frag ID out of range')
                                elif not frag[a]:
                                    raise LookupError('missing frag data')
                        dt[f.name] = f

                if streaming:
                    yield self._pack(filename, None, seg.getvalue())

            offset = src.tell()

        self.log_debug(F'{offset} bytes of archive tested')

        if not journaling:
            return

        for name, f in dt.items():
            if not f.date:
                continue
            size = sum(
                int.from_bytes(frag[fp][20:24], 'little')
                for fp in f.frag
                if 0 < fp < len(frag) and len(frag[fp]) >= 24
            )
            out = MemoryFile()
            for fp in f.frag:
                if fp < len(frag):
                    out.write(memoryview(frag[fp])[24:])
            if len(out) != size:
                self.log_warn('invalid size during unpacking')
            yield self._pack(name, f.dt, out.getvalue())

class xxh (seed=0, text=False)

Implements the xxHash hashing algorithm.

Expand source code Browse git

class xxh(HashUnit):
    """
    Implements the xxHash hashing algorithm.
    """
    def __init__(
        self,
        seed: HashUnit.Arg.Number(metavar='seed', help='specify the seed value; the default is {default}') = 0,
        text=False
    ):
        super().__init__(text, seed=seed)

    def _algorithm(self, data):
        return xxhash(data, self.args.seed).digest()

class xxtea (key, iv=b'', padding=None, mode=None, raw=False, swap=False, block_size=1)

Expand source code Browse git

class xxtea(TEAUnit, cipher=BlockCipherFactory(XXTEA)):

    block_size: int = 8

    def __init__(
        self, key, iv=b'', padding=None, mode=None, raw=False, swap=False,
        block_size: Arg.Number('-b', help=(
            'Cipher block size in 32-bit words. The default value {default} implies that the input '
            'is treated as a single block, which is common behaviour of many implementations.')) = 1
    ):
        super().__init__(
            key, iv=iv, padding=padding, mode=mode, raw=raw, swap=swap, block_size=block_size)

    def _prepare_block(self, data: bytes):
        if self.args.block_size <= 1:
            blocks, remainder = divmod(len(data), 4)
            if remainder:
                blocks += 1
            self.block_size = blocks * 4
        else:
            self.block_size = self.args.block_size * 4

    def encrypt(self, data: bytes) -> bytes:
        self._prepare_block(data)
        return super().encrypt(data)

    def decrypt(self, data: bytes) -> bytes:
        self._prepare_block(data)
        return super().decrypt(data)

    def _new_cipher(self, **optionals) -> CipherInterface:
        return StandardBlockCipherUnit._new_cipher(self,
            big_endian=self.args.swap, block_size=self.block_size, **optionals)

class zl (level=9, window=15, zlib_header=False, gzip_header=False)

ZLib compression and decompression.

Expand source code Browse git

class zl(Unit):
    """
    ZLib compression and decompression.
    """

    def __init__(
        self,
        level  : Arg.Number('-l', bound=(0, 0X9), help='Specify a compression level between 0 and 9.') = 9,
        window : Arg.Number('-w', bound=(8, 0XF), help='Manually specify the window size between 8 and 15.') = 15,
        zlib_header: Arg.Switch('-z', group='MODE', help='Use a ZLIB header.') = False,
        gzip_header: Arg.Switch('-g', group='MODE', help='Use a GZIP header.') = False
    ):
        if zlib_header and gzip_header:
            raise ValueError('You can only specify one header type (ZLIB or GZIP).')
        return super().__init__(level=level, window=window, zlib_header=zlib_header, gzip_header=gzip_header)

    def _decompress_data(self, data, mode: int, step: int):
        zl = zlib.decompressobj(mode)
        memory = memoryview(data)
        result = bytearray()
        while not zl.eof:
            read = min(step, len(memory))
            try:
                chunk = zl.decompress(memory[:read])
            except zlib.error as e:
                raise RefineryPartialResult(exception_to_string(e), result) from e
            else:
                result.extend(chunk)
                consumed = read - len(zl.unused_data)
                if not memory or consumed == 0:
                    break
                memory = memory[consumed:]
        return result, memory

    def process(self, data):
        if data[0] == 0x78 or data[0:2] == B'\x1F\x8B' or self.args.zlib_header or self.args.gzip_header:
            modes = [self.args.window | 0x20, -self.args.window]
        else:
            modes = [-self.args.window, self.args.window | 0x20]
        modes.extend([0x10 | self.args.window, 0])
        view = memoryview(data)
        step = 32 if self.leniency > 0 else len(data)
        for k in itertools.count(1):
            error = None
            rest = view
            for mode in modes:
                try:
                    out, rest = self._decompress_data(view, mode, step)
                except Exception as e:
                    error = error or e
                else:
                    self.log_info(F'used mode {mode} to decompress chunk {k}')
                    yield out
                    error = None
                    break
            if error:
                raise error
            if not rest:
                break
            if len(rest) == len(view):
                break
            if len(rest) > len(view):
                raise RuntimeError('Decompressor returned more tail data than input data.')
            yield out
            view = rest
        if k <= 0:
            raise ValueError('Could not detect any zlib stream.')

    def reverse(self, data):
        mode = -self.args.window
        if self.args.zlib_header:
            mode = -mode
        if self.args.gzip_header:
            mode = -mode | 0x10
        self.log_debug(F'using mode {mode:+2d} for compression')
        zl = zlib.compressobj(self.args.level, zlib.DEFLATED, mode)
        zz = zl.compress(data)
        return zz + zl.flush(zlib.Z_FINISH)

    @classmethod
    def handles(self, data: bytearray):
        for sig in (
            B'\x1F\x8B',  # gzip header
            B'\x78\x01',  # zlib low compression
            B'\x78\x9C',  # zlib medium compression
            B'\x78\xDA',  # zlib high compression
        ):
            if data[:2] == sig:
                return True

class zstd

ZStandard (ZSTD) compression and decompression.

Expand source code Browse git

class zstd(Unit):
    """
    ZStandard (ZSTD) compression and decompression.
    """
    @Unit.Requires('pyzstd', 'all')
    def _pyzstd():
        import pyzstd
        return pyzstd

    def process(self, data):
        zd = self._pyzstd.ZstdDecompressor()
        out = zd.decompress(data)
        if zd.needs_input:
            raise RefineryPartialResult('Incomplete ZSTD stream.', out)
        return out

    def reverse(self, data):
        zc = self._pyzstd.ZstdCompressor()
        return zc.compress(data) + zc.flush()

    @classmethod
    def handles(self, data: bytearray) -> bool:
        return data[:4] == B'\x28\xB5\x2F\xFD'