Module refinery.units.misc.autoxor

Expand source code Browse git
from __future__ import annotations

from refinery.lib.id import get_executable_type, get_structured_data_type
from refinery.units.blockwise.sub import sub
from refinery.units.blockwise.xor import xor
from refinery.units.misc.xkey import xkey


class autoxor(xkey, docs='{0}{p}{1}'):
    """
    Assumes input that was encrypted with a polyalphabetic block cipher, like XOR-ing each byte
    with successive bytes from a key or by subtracting the respective key byte value from each
    input byte. It uses the `refinery.xkey` unit to attack the cipher and attempts to recover the
    plaintext automatically.
    """
    def process(self, data: bytearray):
        # Overview of the strategy implemented below:
        #  1. Take the first result of the inherited key attack; without one, the input is
        #     returned unchanged.
        #  2. Decode the input with every candidate unit (xor and/or sub) using that key.
        #  3. Return immediately if a known data format is recognized in the decoded buffer.
        #  4. Otherwise test whether shifting the buffer by the space character produces text.
        #  5. If nothing is recognized, return the first decoding that was computed.
        import re

        try:
            result = next(self._attack(data))
        except StopIteration:
            self.log_warn('no key was found; returning original data')
            return data

        key = result.key

        # result.xor is three-valued: True selects xor only, False selects sub only, and any
        # other value (e.g. None) leaves both candidates in play, with xor attempted first.
        units: list[type[xor] | type[sub]] = []
        if result.xor is not False:
            units.append(xor)
        if result.xor is not True:
            units.append(sub)

        fallback: tuple[bytes, bytearray] | None = None

        for unit in units:
            self.log_debug(F'attempting {unit.name} for detected key')

            name = unit.name
            decoded = data | unit(key) | bytearray
            view = memoryview(decoded)

            # Offset zero is tested for any structured data type; the remaining offsets in the
            # first 0x1000 bytes are only tested for executable formats.
            check = get_structured_data_type
            for k in range(0x1000):
                if t := check(view[k:]):
                    self.log_info(F'method {name} resulted in non-blob data ({t.mnemonic}) at offset 0x{k:X}; returning buffer')
                    return self.labelled(decoded, key=key, method=name)
                if k == 0:
                    check = get_executable_type

            # Only the first decoding is remembered as the result of last resort.
            if fallback is None:
                fallback = key, decoded

            # A buffer that consists exclusively of zero bytes cannot be meaningful text.
            if not any(decoded):
                continue

            # The value that the unit maps a zero byte to when applied with 0x20. Applying the
            # unit with this value moves zero bytes in the decoded buffer to the space character.
            space = B'\0' | unit(0x20) | bytes
            as_text = decoded | unit(space) | bytearray

            try:
                text = as_text.decode('utf8')
            except UnicodeDecodeError:
                is_text = False
            else:
                is_text = bool(re.fullmatch(r'[\s\w!-~]+', text))

            if is_text:
                self.log_info('detected likely text input; automatically shifting towards space character')
                # Report the key that reproduces as_text when applied to the input with the same
                # unit. For xor this is key ^ 0x20 and for sub it is key - 0x20; applying the unit
                # with 0x20 to the key yields the correct value in both cases. Computing it the
                # other way around (0x20 with the key applied) is only correct for xor, since
                # subtraction is not symmetric.
                shifted_key = key | unit(0x20) | bytes
                return self.labelled(as_text, key=shifted_key, method=name)

        # The list of candidate units is never empty, so a fallback exists at this point.
        assert fallback is not None
        fallback_key, fallback_data = fallback

        if result.how == self._rt.freq and result.score < 8:
            self.log_warn(
                F'unrecognized format, no confirmed crib, low score ({result.score:.2f}%); '
                'the output is likely junk'
            )

        return self.labelled(fallback_data, key=fallback_key)

Classes

class autoxor (range=slice(1, 32, None), plaintext=b'', searchpos=slice(0, None, None), alph=False, crib=False, freq=False)

Assumes input that was encrypted with a polyalphabetic block cipher, like XOR-ing each byte with successive bytes from a key or by subtracting the respective key byte value from each input byte. It uses the xkey unit to attack the cipher and attempts to recover the plaintext automatically.

The unit expects encrypted input which was encrypted byte-wise with a polyalphabetic key. For both bit-wise and byte-wise addition, it can attempt to determine this key by three methods:

  1. Known plaintext cribs: The unit contains a library of file signatures that are expected to occur at specific offsets. It uses these to attempt a known-plaintext attack against the input. If a key is found that is at most half the size of such a crib, it is returned.
  2. Known alphabets: For each given key length, the input is split into slices that would have been encrypted with a single byte for keys of that length. Each such slice undergoes a character frequency analysis. If the histogram indicates that an alphabet of a small size was used (e.g. base64), then the unit attempts to determine the key based on this.
  3. Known high frequency glyph: Works if the plaintext contains one letter that occurs with very high frequency, e.g. zero padding in PE or ELF files, and the space character in text. Based on this assumption, the unit computes the most likely key. This method will work best on uncompressed files that were encrypted with a short key.

When no option is set, the unit uses all the above methods by default. When at least one of the methods is selected, it will attempt only selected methods. When a custom plaintext is given, the other methods are disabled by default.

Expand source code Browse git
class autoxor(xkey, docs='{0}{p}{1}'):
    """
    Assumes input that was encrypted with a polyalphabetic block cipher, like XOR-ing each byte
    with successive bytes from a key or by subtracting the respective key byte value from each
    input byte. It uses the `refinery.xkey` unit to attack the cipher and attempts to recover the
    plaintext automatically.
    """
    def process(self, data: bytearray):
        # Overview of the strategy implemented below:
        #  1. Take the first result of the inherited key attack; without one, the input is
        #     returned unchanged.
        #  2. Decode the input with every candidate unit (xor and/or sub) using that key.
        #  3. Return immediately if a known data format is recognized in the decoded buffer.
        #  4. Otherwise test whether shifting the buffer by the space character produces text.
        #  5. If nothing is recognized, return the first decoding that was computed.
        import re

        try:
            result = next(self._attack(data))
        except StopIteration:
            self.log_warn('no key was found; returning original data')
            return data

        key = result.key

        # result.xor is three-valued: True selects xor only, False selects sub only, and any
        # other value (e.g. None) leaves both candidates in play, with xor attempted first.
        units: list[type[xor] | type[sub]] = []
        if result.xor is not False:
            units.append(xor)
        if result.xor is not True:
            units.append(sub)

        fallback: tuple[bytes, bytearray] | None = None

        for unit in units:
            self.log_debug(F'attempting {unit.name} for detected key')

            name = unit.name
            decoded = data | unit(key) | bytearray
            view = memoryview(decoded)

            # Offset zero is tested for any structured data type; the remaining offsets in the
            # first 0x1000 bytes are only tested for executable formats.
            check = get_structured_data_type
            for k in range(0x1000):
                if t := check(view[k:]):
                    self.log_info(F'method {name} resulted in non-blob data ({t.mnemonic}) at offset 0x{k:X}; returning buffer')
                    return self.labelled(decoded, key=key, method=name)
                if k == 0:
                    check = get_executable_type

            # Only the first decoding is remembered as the result of last resort.
            if fallback is None:
                fallback = key, decoded

            # A buffer that consists exclusively of zero bytes cannot be meaningful text.
            if not any(decoded):
                continue

            # The value that the unit maps a zero byte to when applied with 0x20. Applying the
            # unit with this value moves zero bytes in the decoded buffer to the space character.
            space = B'\0' | unit(0x20) | bytes
            as_text = decoded | unit(space) | bytearray

            try:
                text = as_text.decode('utf8')
            except UnicodeDecodeError:
                is_text = False
            else:
                is_text = bool(re.fullmatch(r'[\s\w!-~]+', text))

            if is_text:
                self.log_info('detected likely text input; automatically shifting towards space character')
                # Report the key that reproduces as_text when applied to the input with the same
                # unit. For xor this is key ^ 0x20 and for sub it is key - 0x20; applying the unit
                # with 0x20 to the key yields the correct value in both cases. Computing it the
                # other way around (0x20 with the key applied) is only correct for xor, since
                # subtraction is not symmetric.
                shifted_key = key | unit(0x20) | bytes
                return self.labelled(as_text, key=shifted_key, method=name)

        # The list of candidate units is never empty, so a fallback exists at this point.
        assert fallback is not None
        fallback_key, fallback_data = fallback

        if result.how == self._rt.freq and result.score < 8:
            self.log_warn(
                F'unrecognized format, no confirmed crib, low score ({result.score:.2f}%); '
                'the output is likely junk'
            )

        return self.labelled(fallback_data, key=fallback_key)

Ancestors

Subclasses

Class variables

var reverse

Inherited members