Module refinery.lib.id
This module contains functions to identify certain file formats; some of these functions are used
by units that operate on the same file format to implement the Unit.handles() method.
The method get_structured_data_type() is used to determine whether an unknown blob
is a known data format. Units like decompress or autoxor use this as part of
their heuristics to determine that a high quality output has been generated.
Expand source code Browse git
"""
This module contains functions to identify certain file formats; some of these functions are used
by units that operate on the same file format to implement the `refinery.units.Unit.handles` method.
The method `refinery.lib.id.get_structured_data_type` is used to determine whether an unknown blob
is a known data format. Units like `refinery.decompress` or `refinery.autoxor` use this as part of
their heuristics to determine that a high quality output has been generated.
"""
from __future__ import annotations
import enum
import re
from typing import Callable, NamedTuple
from refinery.lib.tools import entropy
from refinery.lib.types import buf
# Maps a file extension (without the leading dot) to the MIME type that is reported for it.
MimeByExtension = {
    'bin'   : 'application/octet-stream',
    'exe'   : 'application/exe',
    'sys'   : 'application/exe',
    'dll'   : 'application/exe',
    'elf'   : 'application/x-elf-executable',
    'macho' : 'application/x-mach-binary',
    'class' : 'application/java-byte-code',
    'pdf'   : 'application/pdf',
    'djvu'  : 'image/vnd.djvu',
    'pcap'  : 'application/vnd.tcpdump.pcap',
    'db'    : 'application/x-sqlite3',
    'mdb'   : 'application/x-msaccess',
    'doc'   : 'application/msword',
    'xls'   : 'application/vnd.ms-excel',
    'ppt'   : 'application/vnd.ms-powerpoint',
    'msg'   : 'application/vnd.ms-outlook',
    'msi'   : 'application/x-msi',
    'docx'  : 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'pptx'  : 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    'xlsx'  : 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    'txt'   : 'text/plain',
    'json'  : 'application/json',
    'xml'   : 'application/xml',
    'html'  : 'text/html',
    'rtf'   : 'application/rtf',
    'vbe'   : 'text/plain',
    'eml'   : 'message/rfc822',
    'ico'   : 'image/vnd.microsoft.icon',
    'gif'   : 'image/gif',
    'tif'   : 'image/tiff',
    'jpg'   : 'image/jpeg',
    'png'   : 'image/png',
    'bmp'   : 'image/bmp',
    'ogg'   : 'audio/ogg',
    'wav'   : 'audio/wav',
    'avi'   : 'video/x-msvideo',
    'mp3'   : 'audio/mpeg',
    'm3u'   : 'text/plain',
    'mp4'   : 'video/mp4',
    'mpg'   : 'video/mpeg',
    'mid'   : 'audio/midi',
    'mkv'   : 'video/x-matroska',
    'swf'   : 'application/x-shockwave-flash',
    'tar'   : 'application/x-tar',
    '7z'    : 'application/x-7z-compressed',
    'zip'   : 'application/zip',
    'rar'   : 'application/vnd.rar',
    'cab'   : 'application/vnd.ms-cab-compressed',
    'bz'    : 'application/x-bzip',
    'bz2'   : 'application/x-bzip2',
    'gz'    : 'application/gzip',
    'xz'    : 'application/x-xz',
    'zstd'  : 'application/x-zstd',
    'zlib'  : 'application/zlib',
}
class Format:
    """
    Describes a file format by its category, file extension, mnemonic, a human-readable details
    string, and a MIME type. Instances compare equal and hash equal when all five of these
    values agree; iterating an instance yields them in the order category, extension, mnemonic,
    details, mime.
    """
    __slots__ = 'category', 'extension', 'mime', 'mnemonic', 'details'

    def __hash__(self):
        return hash(tuple(self))

    def __eq__(self, other):
        if not isinstance(other, Format):
            return False
        return all(a == b for a, b in zip(self, other))

    def __iter__(self):
        yield self.category
        yield self.extension
        yield self.mnemonic
        yield self.details
        yield self.mime

    def __init__(
        self,
        category: FormatCategory,
        extension: str | None = None,
        mnemonic: str | None = None,
        details: str | None = None,
        mime: str | None = None,
    ) -> None:
        """
        Every optional argument falls back to a value derived from the previous one: the
        extension defaults to `bin`, the mnemonic to the upper-cased extension, and the details
        to the mnemonic. When no MIME type is given, it is looked up by extension; unknown
        extensions are reported as `text/plain` for text formats and as
        `application/octet-stream` otherwise.
        """
        self.category = category
        self.extension = extension or 'bin'
        self.mnemonic = mnemonic or self.extension.upper()
        self.details = details or self.mnemonic
        if mime is None:
            try:
                mime = MimeByExtension[self.extension]
            except KeyError:
                if category == FormatCategory.Text:
                    mime = 'text/plain'
                else:
                    mime = 'application/octet-stream'
        self.mime = mime
class FormatCategory(enum.IntEnum):
    """
    The broad class of data that a known file format belongs to.
    """
    Executable  = 1 # noqa
    Text        = 2 # noqa
    Document    = 3 # noqa
    Image       = 4 # noqa
    Binary      = 5 # noqa
    Media       = 6 # noqa
    Archive     = 7 # noqa
    Compression = 8 # noqa
    Serialized  = 9 # noqa


# Short alias that keeps the format table readable.
FC = FormatCategory
# A verbose-mode bytes pattern that is matched (via fullmatch) against the first four bytes of a
# buffer by get_misc_binary_formats. It accepts either one of the bytes 02/03 followed by
# 99 99 00, or one of the listed two-byte values followed by CR LF. The two-byte values are
# presumably the version-specific magic numbers of compiled Python files - TODO confirm against
# the interpreter sources before extending the list.
PycMagicPattern = re.compile(br'''(?x)
[\x02\x03]\x99\x99\x00
|(?:
| \xca\xfe
| \x89\x2e
| \x04\x17
| \x99\x4e
| \xfc\xc4
| \x87\xc6
| \x65\x34
| \x31\x61
| \x2a\xeb
| \x2d\xed
|[\x3b\x45\x59\x63\x6d\x77\x81\x8b\x8c\x95\x9f\xa9\xb3\xb7\xc7\xd1\xdb\xe5\xef\xf9]\xf2
|[\x03\x0a]\xf3
| \x61\x0a
|[\xb8\xc2\xcc\xd6\xe0\xea\xf4\xf5\xff]\x0b
|[\x09\x13\x1d\x1f\x27\x3b\x45\x4f\x58\x62\x6c\x73\x76\x80\x94\x8a\x9e\xb2\xbc\xc6\xd0\xda\xe4\xee\xf8]\x0c
|[\x02\x0c\x16\x17\x20\x21\x2a-\x2d\x2f-\x33\x3e-\x42\x48\x49\x52-\x55\x5c-\x61\x66-\x6f\x7a-\xa7\xac-\xcb\xde-\xf3]\x0d
|[\x10-\x18\x1a\x1b\x1d-\x29\x2b\x47]\x0e
|[\x30\x40\x70\xa0\xc0\xe0\xf0]\x00
|[\x00\x40\x50\x80\xa0]\x01
| \x61\x32
| \x61\x31
| \x9e\x52
|[\x20\x2a]\x53
| \xf3\x03
| \x7a\x56
) \x0D\x0A
''')
class Fmt(Format, enum.Enum):
    """
    An enumeration of all known file formats that can be returned by
    `refinery.lib.id.get_structured_data_type`. Each member is defined by the positional
    arguments of `refinery.lib.id.Format`: category, extension, mnemonic, details, and MIME
    type; omitted values are derived by the `refinery.lib.id.Format` constructor.
    """
    PE32GUI   = (FC.Executable, 'exe', 'PE/32/GUI') # noqa
    PE32CUI   = (FC.Executable, 'exe', 'PE/32/CUI') # noqa
    PE32DLL   = (FC.Executable, 'dll', 'PE/32/DLL') # noqa
    PE32SYS   = (FC.Executable, 'sys', 'PE/32/SYS') # noqa
    PE64GUI   = (FC.Executable, 'exe', 'PE/64/GUI') # noqa
    PE64CUI   = (FC.Executable, 'exe', 'PE/64/CUI') # noqa
    PE64DLL   = (FC.Executable, 'dll', 'PE/64/DLL') # noqa
    PE64SYS   = (FC.Executable, 'sys', 'PE/64/SYS') # noqa
    ELF32LE   = (FC.Executable, 'elf', 'ELF/32/LE') # noqa
    ELF64LE   = (FC.Executable, 'elf', 'ELF/64/LE') # noqa
    ELF32BE   = (FC.Executable, 'elf', 'ELF/32/BE') # noqa
    ELF64BE   = (FC.Executable, 'elf', 'ELF/64/BE') # noqa
    MACHOuvLE = (FC.Executable, 'macho', 'MachO/Fat/LE') # noqa
    MACHOuvBE = (FC.Executable, 'macho', 'MachO/Fat/BE') # noqa
    MACHO32LE = (FC.Executable, 'macho', 'MachO/32/LE') # noqa
    MACHO64LE = (FC.Executable, 'macho', 'Macho/64/LE') # noqa
    MACHO32BE = (FC.Executable, 'macho', 'MachO/32/BE') # noqa
    MACHO64BE = (FC.Executable, 'macho', 'Macho/64/BE') # noqa
    JAVA      = (FC.Executable, 'class', 'JavaClass') # noqa
    DEX       = (FC.Executable, 'dex', 'Dalvik') # noqa
    WASM      = (FC.Executable, 'wasm', 'WASM', 'Web Assembly') # noqa
    LUAC      = (FC.Executable, 'luac', 'LUAC', 'LUA Bytecode') # noqa
    PYC       = (FC.Executable, 'pyc', 'PYC', 'Python Bytecode') # noqa

    PDF       = (FC.Document, 'pdf', 'PDF', 'PDF Document') # noqa
    CHM       = (FC.Document, 'chm', 'CHM', 'Microsoft Windows HtmlHelp Data') # noqa
    DJV       = (FC.Document, 'djvu') # noqa

    PCAP      = (FC.Binary, 'pcap', 'PCAP', 'Network Packet Capture') # noqa
    PCAPNG    = (FC.Binary, 'pcapng', 'PCAP/NG', 'Next-Generation Network Packet Capture') # noqa
    SSP       = (FC.Binary, 'ssp', 'SmartSniff', 'SmartSniff Packets File') # noqa
    SQLITE    = (FC.Binary, 'db', 'SQLite', 'SQLite Database') # noqa
    DSS       = (FC.Binary, 'DS_Store', 'DSS', 'MacOS DS Store') # noqa
    A3X       = (FC.Binary, 'a3x', 'A3X', 'Compiled AutoIt3') # noqa
    IFPS      = (FC.Binary, 'ifps', 'IFPS', 'InnerFuse PascalScript') # noqa
    PPK       = (FC.Binary, 'ppk', 'PuTTY', 'PuTTY Private Key File') # noqa
    WIM       = (FC.Binary, 'wim', 'WIM', 'Windows Imaging Format') # noqa
    EVT       = (FC.Binary, 'evt', 'EVT', 'Windows Event Viewer') # noqa
    EVTX      = (FC.Binary, 'evtx', 'EVTX', 'Windows Event Viewer XML') # noqa
    LNK       = (FC.Binary, 'lnk', 'LNK', 'Windows Shortcut') # noqa
    # The hive is a binary container and uses the default MIME type; only the textual export
    # produced by the registry editor is reported as plain text.
    REG_HIVE  = (FC.Binary, 'reg', 'WinReg/Hive', 'Windows Registry Hive File') # noqa
    REG_TEXT  = (FC.Binary, 'reg', 'WinReg/Text', 'Windows Registry Script', 'text/plain') # noqa

    MDB       = (FC.Document, 'accdb', 'MDB', 'Microsoft Access Database') # noqa
    DOC       = (FC.Document, 'doc') # noqa
    ONE       = (FC.Document, 'one') # noqa
    XLS       = (FC.Document, 'xls') # noqa
    PPT       = (FC.Document, 'ppt') # noqa
    MSG       = (FC.Document, 'msg') # noqa
    MSI       = (FC.Archive, 'msi') # noqa
    CFF       = (FC.Binary, 'ole', 'Compound File Format') # noqa
    DOCX      = (FC.Document, 'docx') # noqa
    XLSX      = (FC.Document, 'xlsx') # noqa
    PPTX      = (FC.Document, 'pptx') # noqa

    ASCII     = (FC.Text, 'txt', 'PlainText', 'Single-Byte, Plain Text Encoding') # noqa
    UTF16     = (FC.Text, 'txt', 'UTF16') # noqa
    UTF32     = (FC.Text, 'txt', 'UTF32') # noqa
    JSON      = (FC.Text, 'json') # noqa
    XML       = (FC.Text, 'xml') # noqa
    HTM       = (FC.Text, 'html') # noqa
    RTF       = (FC.Text, 'rtf', 'RTF') # noqa
    VBE       = (FC.Text, 'vbe', 'VBE', 'Encoded VBScript') # noqa
    EML       = (FC.Text, 'eml', 'EML', 'Plain-Text EMail Document') # noqa

    HIC       = (FC.Image, 'heic', 'HEIC', 'High Efficiency Image Container') # noqa
    ICO       = (FC.Image, 'ico', 'ICO', 'Icon') # noqa
    GIF       = (FC.Image, 'gif', 'GIF', 'Graphics Interchange Format') # noqa
    TIF       = (FC.Image, 'tif', 'TIF', 'Tagged Image File Format') # noqa
    CIN       = (FC.Image, 'cin', 'CIN', 'Kodak Cineon Image') # noqa
    NUI       = (FC.Image, 'nui', 'NUI', 'Nuru ASCI/ANSI Image or Palette') # noqa
    DPX       = (FC.Image, 'dpx', 'DPX', 'SMPTE DPX Image') # noqa
    BPG       = (FC.Image, 'bpg', 'BPG', 'Better Portable Graphics') # noqa
    EXR       = (FC.Image, 'exr', 'EXR', 'OpenEXR Image') # noqa
    JPG       = (FC.Image, 'jpg', 'JPG', 'Joint Photographic Experts Group Image') # noqa
    JP2       = (FC.Image, 'jp2', 'JP2', 'JPEG 2000') # noqa
    QOI       = (FC.Image, 'qoi', 'QOI', 'Quite OK Image Format') # noqa
    IFF       = (FC.Image, 'iff', 'IFF', 'IFF or Amiga Image') # noqa
    PNG       = (FC.Image, 'png', 'PNG', 'Portable Network Graphics') # noqa
    PSD       = (FC.Image, 'psd', 'PSD', 'Adobe Photoshop Document') # noqa
    BMP       = (FC.Image, 'bmp', 'BMP', 'Bitmap') # noqa
    FIF       = (FC.Image, 'flif', 'FLIF', 'Free Lossless Image Format') # noqa
    LEP       = (FC.Image, 'lep', 'LEP', 'Lepton Compressed JPEG Image') # noqa
    HDR       = (FC.Image, 'hdr', 'HDR', 'Radiance High Dynamic Range Image') # noqa

    OGG       = (FC.Media, 'ogg') # noqa
    WAV       = (FC.Media, 'wav') # noqa
    AVI       = (FC.Media, 'avi') # noqa
    MP3       = (FC.Media, 'mp3') # noqa
    M3U       = (FC.Media, 'm3u', 'M3U', 'Multimedia Playlist') # noqa
    MP4       = (FC.Media, 'mp4') # noqa
    MPG       = (FC.Media, 'mpg') # noqa
    FLC       = (FC.Media, 'flac') # noqa
    MID       = (FC.Media, 'mid') # noqa
    MKV       = (FC.Media, 'mkv') # noqa
    SWF       = (FC.Media, 'swf') # noqa
    SIL       = (FC.Media, 'sil') # noqa

    ACE       = (FC.Archive, 'ace') # noqa
    ASAR      = (FC.Archive, 'asar') # noqa
    VHD       = (FC.Archive, 'vhd') # noqa
    VMDK      = (FC.Archive, 'vmdk') # noqa
    ISO       = (FC.Archive, 'iso') # noqa
    ISZ       = (FC.Archive, 'isz', 'ISZ', 'Compressed ISO Image') # noqa
    DMG       = (FC.Archive, 'dmg') # noqa
    XAR       = (FC.Archive, 'xar', 'XAR', 'eXtensible ARchive Format') # noqa
    TAR       = (FC.Archive, 'tar') # noqa
    OAR       = (FC.Archive, 'oar') # noqa
    ZIP7      = (FC.Archive, '7z', '7Zip') # noqa
    ZIP       = (FC.Archive, 'zip') # noqa
    RAR       = (FC.Archive, 'rar') # noqa
    CAB       = (FC.Archive, 'cab') # noqa
    CPIO      = (FC.Archive, 'cpio') # noqa
    ZPQ       = (FC.Archive, 'zpq') # noqa

    S_JAV     = (FC.Serialized, 'bin', 'SerializedJava') # noqa
    S_DOT     = (FC.Serialized, 'bin', 'SerializedDotNet') # noqa
    S_PHP     = (FC.Serialized, 'bin', 'SerializedPHP') # noqa

    APLIB     = (FC.Compression, 'ap', 'apLib') # noqa
    BZ2       = (FC.Compression, 'bz2', 'BZIP') # noqa
    JCALG     = (FC.Compression, 'bin', 'jcAlg') # noqa
    LZMA      = (FC.Compression, 'lzma') # noqa
    LZF       = (FC.Compression, 'lzf') # noqa
    LZH       = (FC.Compression, 'lzh') # noqa
    LZG       = (FC.Compression, 'lzg') # noqa
    RNC       = (FC.Compression, 'rnc', 'RNC', 'Rob Northern Compression') # noqa
    LZIP      = (FC.Compression, 'lzip') # noqa
    LZO       = (FC.Compression, 'lzo') # noqa
    LZ4       = (FC.Compression, 'lz4') # noqa
    LZW       = (FC.Compression, 'lzw') # noqa
    LZFSE     = (FC.Compression, 'lzfse') # noqa
    MSCF      = (FC.Compression, 'mscf') # noqa
    SZDD      = (FC.Compression, 'szdd') # noqa
    GZIP      = (FC.Compression, 'gz') # noqa
    XZ        = (FC.Compression, 'xz', 'XZ/LZMA2') # noqa
    ZLIB0     = (FC.Compression, 'zlib', 'ZLIB/0') # noqa
    ZLIB1     = (FC.Compression, 'zlib', 'ZLIB/1') # noqa
    ZLIB2     = (FC.Compression, 'zlib', 'ZLIB/2') # noqa
    ZLIB3     = (FC.Compression, 'zlib', 'ZLIB/3') # noqa
    ZLIB4     = (FC.Compression, 'zlib', 'ZLIB/4') # noqa
    ZLIB5     = (FC.Compression, 'zlib', 'ZLIB/5') # noqa
    ZLIB6     = (FC.Compression, 'zlib', 'ZLIB/6') # noqa
    ZLIB7     = (FC.Compression, 'zlib', 'ZLIB/7') # noqa
    ZSTD      = (FC.Compression, 'zstd') # noqa
# Lookup table from the mnemonic of each known format to the corresponding enumeration member.
FormatDetails = {member.mnemonic: member for member in Fmt}
# All functions registered via the _structural_check decorator, in registration order; this is
# the list that get_structured_data_type iterates.
StructuralChecks: list[Callable[[buf], Fmt | None]] = []


def _structural_check(fn: Callable[[buf], Fmt | None]) -> Callable[[buf], Fmt | None]:
    """
    Decorator that appends the function to `StructuralChecks` and returns it unchanged.
    """
    StructuralChecks.append(fn)
    return fn
@_structural_check
def get_pe_type(data: buf):
    """
    Get the correct file type for a PE file, or None if the input is unlikely to be a portable
    executable in the first place. The function checks the MZ magic, follows the (16-bit) value
    at offset 0x3C to the NT header, verifies the PE signature, and then classifies the file
    based on the machine type, the DLL characteristic, and the subsystem. Truncated headers
    yield None rather than raising an exception.
    """
    if data[:2] != B'MZ':
        return None
    nt = data[0x3C:0x3E]
    if len(nt) < 2:
        return None
    nt = int.from_bytes(nt, 'little')
    if data[nt:nt + 4] != B'PE\0\0':
        return None
    arch = data[nt + 4:nt + 6]
    if arch == B'\x4C\x01':
        # IMAGE_FILE_MACHINE_I386 (0x014C)
        dll = Fmt.PE32DLL
        sub = (
            Fmt.PE32SYS,
            Fmt.PE32GUI,
            Fmt.PE32CUI,
        )
    elif arch == B'\x64\x86':
        # IMAGE_FILE_MACHINE_AMD64 (0x8664)
        dll = Fmt.PE64DLL
        sub = (
            Fmt.PE64SYS,
            Fmt.PE64GUI,
            Fmt.PE64CUI,
        )
    else:
        return None
    # The 16-bit characteristics field is stored little endian at nt+0x16; the DLL flag
    # (0x2000) therefore lives in the high byte at nt+0x17.
    if len(data) <= nt + 0x17:
        return None
    if data[nt + 0x17] & 0x20:
        return dll
    # The subsystem is located at offset 0x44 of the optional header, which starts at nt+0x18.
    if len(data) <= nt + 0x5C:
        return None
    # Subsystem values 1, 2, 3 are native, GUI, and console, respectively.
    subsystem = data[nt + 0x5C] - 1
    if not 0 <= subsystem <= 2:
        return None
    return sub[subsystem]
@_structural_check
def get_elf_type(data: buf):
    """
    Get arch and byte order information of an ELF file or return None if the input is unlikely
    to be one. The decision is based on the identification bytes following the magic: the class
    byte at offset 4 (1 for 32-bit, 2 for 64-bit), the data encoding byte at offset 5 (1 for
    little endian, 2 for big endian), and the version byte at offset 6, which must be 1.
    """
    if not data[:4] == b'\x7FELF':
        return None
    abo = data[4:6]
    if len(data) < 0x40:
        return None
    elif data[6] != 1: # EI_VERSION
        return None
    elif abo == B'\x01\x01':
        return Fmt.ELF32LE
    elif abo == B'\x01\x02':
        return Fmt.ELF32BE
    elif abo == B'\x02\x01':
        return Fmt.ELF64LE
    elif abo == B'\x02\x02':
        return Fmt.ELF64BE
    return None
@_structural_check
def get_macho_type(data: buf):
    """
    Get arch and byte order information of a MachO file or return None if the input is unlikely
    to be one. The first four bytes are read as a little endian integer; a byte-swapped magic
    therefore indicates a big endian file. As a plausibility check, the CPU type field has to
    contain one of the known values. For fat binaries, this is the CPU type of the first
    architecture entry.
    """
    order = 'little'
    magic = int.from_bytes(data[:4], order)
    isfat = False
    if len(data) < 30:
        return None
    elif magic == 0xCE_FAEDFE:
        order = 'big'
        mtype = Fmt.MACHO32BE
    elif magic == 0xCF_FAEDFE:
        order = 'big'
        mtype = Fmt.MACHO64BE
    elif magic == 0xFEEDFACE:
        mtype = Fmt.MACHO32LE
    elif magic == 0xFEEDFACF:
        mtype = Fmt.MACHO64LE
    elif magic == 0xCAFEBABE:
        mtype = Fmt.MACHOuvLE
        isfat = True
    elif magic == 0xBEBAFECA:
        # The bytes CA FE BA BE on disk: all header fields are stored in big endian order.
        order = 'big'
        mtype = Fmt.MACHOuvBE
        isfat = True
    else:
        return None
    if isfat:
        # skip the magic and the architecture count
        cpu = int.from_bytes(data[8:0xC], order)
    else:
        cpu = int.from_bytes(data[4:0x8], order)
    if cpu in (
        0x00000001, # vax
        0x00000002, # ROMP
        0x00000004, # NS32032
        0x00000005, # NS32332
        0x00000006, # mc680x0
        0x00000007, # x32
        0x01000007, # x64
        0x00000008, # mips
        0x00000009, # NS32352
        0x0000000A, # mc98000
        0x0000000B, # hppa
        0x0000000C, # arm32
        0x0100000C, # arm64
        0x0000000D, # mc880000
        0x0000000E, # sparc
        0x0000000F, # i860
        0x00000010, # alpha
        0x00000011, # RS/6000
        0x00000012, # ppc32
        0x01000012, # ppc64
    ):
        return mtype
    return None
def get_executable_type(data: buf):
    """
    Identify an executable: the PE, ELF, and MachO checks are tried in this order and the first
    positive result is returned. The result is None when none of them recognizes the input.
    """
    for probe in (get_pe_type, get_elf_type, get_macho_type):
        detected = probe(data)
        if detected:
            return detected
    return None
def is_likely_pe(data: buf):
    """
    Returns whether `refinery.lib.id.get_pe_type` recognizes the input as a PE file, which
    involves checking the first two bytes and the magic bytes at the beginning of what should
    be the NT header.
    """
    pe_type = get_pe_type(data)
    return pe_type is not None
def buffer_offset(haystack: buf, needle: bytes, start: int = 0, end: int | None = None):
    """
    Performs a substring search of `needle` in `haystack`, restricted to the slice from `start`
    to `end`. If `haystack` is a `bytes` or `bytearray` object, it uses the standard `find`
    method. Otherwise (e.g. for a `memoryview`), it uses a regular expression search. Returns
    the offset of the first occurrence relative to the beginning of `haystack`, or `-1` if
    there is none.
    """
    if isinstance(haystack, (bytes, bytearray)):
        return haystack.find(needle, start, end)
    # Resolve negative or out-of-range bounds the same way that slicing does, so that the
    # returned offset is always relative to the beginning of the buffer.
    lower, upper, _ = slice(start, end).indices(len(haystack))
    if m := re.search(re.escape(needle), haystack[lower:upper]):
        return lower + m.start()
    return -1
def buffer_contains(haystack: buf, needle: bytes):
    """
    Determines whether `haystack` contains `needle`. An occurrence at the very beginning of the
    buffer (offset zero) counts as a match.
    """
    return buffer_offset(haystack, needle) >= 0
def is_likely_pe_dotnet(data: buf):
    """
    Tests whether the input data is likely a .NET PE file: The input has to pass the check
    `refinery.lib.id.is_likely_pe` and it must contain each of the characteristic strings
    `BSJB`, `#Strings`, and `#Blob`.
    """
    if not is_likely_pe(data):
        return False
    return all(
        buffer_contains(data, marker)
        for marker in (b'BSJB', b'#Strings', b'#Blob')
    )
@_structural_check
def get_reg_export_type(data: buf):
    """
    Recognizes Windows registry data by its leading bytes: the magic `regf` identifies a hive
    file, the banner of the registry editor identifies a textual registry script. Any other
    input yields None.
    """
    known_prefixes = (
        (b'regf', Fmt.REG_HIVE),
        (b'Windows Registry Editor Version', Fmt.REG_TEXT),
    )
    for prefix, result in known_prefixes:
        if data[:len(prefix)] == prefix:
            return result
    return None
class TextEncoding(NamedTuple):
    """
    The result of `refinery.lib.id.guess_text_encoding`.
    """
    # offset of the first byte after the byte order mark; zero when there is no such mark
    bom: int = 0
    # offset of the first low byte of an encoded character
    lsb: int = 0
    # size of each encoded character in bytes
    step: int = 1
def guess_text_encoding(
    data: buf,
    window_size: int = 0x1000,
    ascii_ratio: float = 0.98,
) -> TextEncoding | None:
    """
    Attempts to determine whether the input data is likely printable text. The return value is None
    if the input is unlikely to be text. Otherwise, the return value is a triple of integers: First
    the offset after the byte order mark (`0` in case there is none), then the offset of the first
    low byte of a character (odd for big endian encodings, even for others) and finally the size of
    each encoded character in bytes.

    The parameter `window_size` is the number of leading bytes that are inspected first; the
    window is doubled until it covers the entire input. When it is zero, only the byte order mark
    and the zero-byte layout are inspected and all statistics over the content are skipped. The
    parameter `ascii_ratio` determines the tolerated fraction of bad bytes, which is computed as
    `1 - ascii_ratio`.
    """
    view = memoryview(data)
    size = window_size
    step = 1
    maxbad = 1 - ascii_ratio
    bom = 0
    lsb = 0
    if data[:3] == B'\xEF\xBB\xBF':
        # BOM: UTF8
        bom = 3
    elif data[:4] == B'\xFF\xFE\0\0':
        step = bom = lsb = 4 # UTF-32LE
    elif data[:2] == B'\xFF\xFE':
        step = bom = lsb = 2 # UTF-16LE
    elif data[:2] == B'\xFE\xFF':
        step, bom, lsb = 2, 2, 3 # UTF-16BE
    elif data[:4] == B'\0\0\xFE\xFF':
        step, bom, lsb = 4, 4, 7 # UTF-32BE
    elif any(data[:4] == bom for bom in (
        b'\x2B\x2F\x76\x38',
        b'\x2B\x2F\x76\x39',
        b'\x2B\x2F\x76\x2B',
        b'\x2B\x2F\x76\x2F',
    )):
        # UTF7 BOM
        bom = 4
    elif len(view) % 2 == 0:
        # No byte order mark: test whether the bytes at odd or at even offsets within the window
        # have a mean value of at most maxbad, i.e. whether they are almost exclusively zero.
        u16le = (win := view[1:size:2]) and sum(win) / len(win) <= maxbad
        u16be = (win := view[0:size:2]) and sum(win) / len(win) <= maxbad
        if u16le:
            if u16be:
                # both halves are (nearly) all zero; this is not text
                return None
            step, lsb = 2, 0
        elif u16be:
            step, lsb = 2, 1
    if step > 1:
        if len(data) % step != 0:
            return None
        # NOTE(review): this window starts at lsb and hence contains the low bytes of each
        # character; for ordinary text their mean is far above maxbad. Confirm that this test
        # is intended to run on the low bytes rather than on the remaining bytes.
        if not (win := view[lsb:size:step]) or sum(win) / len(win) > maxbad:
            return None
    if len(data) <= bom:
        return None
    if not size:
        # a window size of zero skips all content statistics
        return TextEncoding(bom, lsb, step)
    # count how often each byte value occurs between the end of the BOM and the window end
    if isinstance(data, (bytes, bytearray)):
        histogram = [data.count(b, bom, size) for b in range(0x100)]
    else:
        histogram = [0] * 256
        for b in view[bom:size]:
            histogram[b] += 1
    # one flag per byte value that indicates whether it occurs at all
    presence = memoryview(bytes(1 if v else 0 for v in histogram))
    if sum(presence) > 102:
        # 96 printable ASCII characters plus some slack for control bytes or encoding
        return None
    if sum(presence[0x7F:]) > 5:
        # Allow for some control characters or encoding-specific values
        return None
    if sum(presence[:0x20]) > 5:
        # Tab, CR, LF, Null, plus one byte slack
        return None
    # bad bytes are all control and non-ASCII bytes except for CR, LF, and Tab
    bad = sum(histogram[:0x20]) + sum(histogram[0x7F:]) \
        - histogram[0x0D] \
        - histogram[0x0A] \
        - histogram[0x09]
    if step == 2:
        # NOTE(review): only half of the zero bytes are discounted here and nothing is
        # discounted for a step of 4 - confirm that this is the intended compensation for the
        # zero bytes of multi-byte encodings.
        bad -= histogram[0] // 2
    if bad / sum(histogram) > maxbad:
        return None
    while True:
        try:
            win = view[lsb:size:step]
            # total length of all runs of bytes other than Tab, LF, CR, and printable ASCII
            bad = sum(m.end() - m.start()
                for m in re.finditer(BR'[^\t\n\r\x20-\x7E]+', win))
        except TypeError:
            # presumably raised by the regular expression engine for strided views; the
            # window is then skipped without any check - TODO confirm
            pass
        else:
            if bad and bad / len(win) > maxbad:
                return None
        if size >= len(view):
            return TextEncoding(bom, lsb, step)
        # double the window and check again until all of the input is covered
        size <<= 1
def xml_or_html(view: buf):
    """
    Decides between XML and HTML based on the first tag of the document and returns the
    corresponding `refinery.lib.id.Fmt`. The result is None if the input does not begin with
    something that looks like a tag followed by either the end of the tag or an attribute.
    """
    opening = re.search(BR'''(?x)
^ # at the very start of the document
\s{0,10} # allow for some leading white space
< # a tag opens
([?!]? # allow for question or exclamation mark
[-:\w]{3,64}) # the tag name
\s{1,20} # white space after tag name
(/?> # the tag may end here, or:
|[-:\w]{3,32}) # we have an attribute.
''', view)
    if opening is None:
        return None
    name = opening[1].lower()
    rest = opening[2].lower()
    # an XML declaration, i.e. <?xml...
    if name == b'?xml':
        return Fmt.XML
    # <HTML>, <BODY>, or <!DOCTYPE html
    looks_like_html = name in (b'html', b'body') or (
        name == b'!doctype' and rest == b'html')
    if looks_like_html:
        return Fmt.HTM
    # a first attribute like in <project xmlns:xsi=... indicates XML; all else counts as HTML
    return Fmt.XML if rest.startswith(b'xml') else Fmt.HTM
def ascii_view(
    data: buf,
    window_size: int = 0x1000,
    ascii_ratio: float = 0.98,
):
    """
    Returns a memoryview that selects the least significant byte of each encoded letter when
    the input looks like text, and None otherwise. The decision is delegated to
    `refinery.lib.id.guess_text_encoding`, which receives all parameters of this function.
    """
    encoding = guess_text_encoding(
        data, window_size=window_size, ascii_ratio=ascii_ratio)
    if not encoding:
        return None
    letters = slice(encoding.lsb, len(data), encoding.step)
    return memoryview(data)[letters]
def is_likely_eml(
    data: buf,
    window_size: int = 0x10000,
):
    """
    Checks the input for common strings that occur as email headers. If at least two different
    ones are found within the first `window_size` bytes, the function returns True. Each header
    has to occur at the beginning of a line that is not the first line of the input.
    """
    hits = 0
    view = memoryview(data)[:window_size]
    for marker in (
        b'\nReceived:\x20from',
        b'\nSubject:\x20',
        b'\nTo:\x20',
        b'\nFrom:\x20',
        b'\nMessage-ID:\x20',
        b'\nBcc:\x20',
        b'\nContent-Transfer-Encoding:\x20',
        b'\nContent-Type:\x20',
        b'\nReturn-Path:\x20',
    ):
        # a regular expression search is used because memoryview objects have no find method
        if re.search(re.escape(marker), view) is None:
            continue
        if (hits := hits + 1) >= 2:
            return True
    return False
def is_likely_vbe(data: buf):
    """
    Tests for the markers that enclose an encoded Visual Basic script: the start marker has to
    occur within the first 64 bytes, and the end marker within the last 64 bytes of the input.
    """
    view = memoryview(data)
    head = view[:+64]
    tail = view[-64:]
    return (
        re.search(BR'#@~\^[!-~]{6}==', head) is not None
        and re.search(BR'[!-~]{6}==\^#~@', tail) is not None
    )
def is_likely_json(data: buf):
    """
    A fast regular expression based check for whether the input looks like JSON. The expression
    checks whether the input is a sequence of valid JSON tokens: quoted strings, constants,
    integer and floating-point numbers, and control characters. The sequence has to be enclosed
    in either braces or brackets, and it may be surrounded by white space. To be explicit, note
    that this function cannot check for correct nesting, regular expressions are insufficient
    for this.
    """
    _json = RB"""
        \s*((                             # a sequence of the following tokens:
            "([^"\\\r\n]|\\[^\r\n])*"     # a quoted string literal
          | true                          # true
          | false                         # false
          | null                          # null
          | [-+]?([1-9]\d*|0)             # an integer
          | [-+]?\d*\.?\d+([eE][-+]?\d+)? # a float
          | [\{\}\[\]:,]                  # a structural token
        # | //(.*?)\n                     # do not allow comments (line)
        # | /\*.*?\*/                     # do not allow comments (block)
        )\s*)*?
    """
    # The alternation between an object and an array has to be grouped; otherwise, the leading
    # white space would only be permitted for objects and the trailing one only for arrays.
    _json = RB'(?x)\s*(?:\{%s\}|\[%s\])\s*' % (_json, _json)
    return re.fullmatch(_json, data) is not None
@_structural_check
def get_microsoft_format(data: buf):
    """
    Checks for various Microsoft formats. This includes Access Database files and OneNote, but most
    importantly it can distinguish between various compound document formats like MSI, Word, Excel,
    PowerPoint, and Outlook. Input that has the compound file header but matches none of the more
    specific checks is reported as a generic compound file.
    """
    if data[:19] == b'\0\01\0\0Standard ACE DB':
        return Fmt.MDB
    if data[:19] == b'\0\01\0\0Standard Jet DB':
        return Fmt.MDB
    if data[:4] != B'\xD0\xCF\x11\xE0':
        return None
    if data[4:8] != B'\xA1\xB1\x1A\xE1' and any(data[4:12]):
        return None
    # NOTE(review): this check is only reached for input that starts with the compound file
    # magic; confirm whether OneNote files are expected to have that header.
    if buffer_contains(data, b'\xE4\x52\x5C\x7B\x8C\xD8\xA7\x4D\xAE\xB1\x53\x78\xD0\x29\x96\xD3'):
        return Fmt.ONE
    # look for known stream headers at the start of each 512-byte sector
    for k in range(0x200, 0x10000, 0x200):
        mark = int.from_bytes(data[k:k + 4], 'little')
        if mark == 0x00C1A5EC:
            return Fmt.DOC
        if mark == 0x00100809 and data[k + 4:k + 8] == B'\x00\x06\x05\x00':
            return Fmt.XLS
        if mark == 0xF01D46A0:
            return Fmt.PPT
        if mark == 0xF01E6E00:
            return Fmt.PPT
        if mark == 0x03E8000F:
            return Fmt.PPT
    # Stream names are stored as UTF-16LE strings in the directory. Encoding the names here
    # avoids hand-written byte literals, where a digit that follows the escape sequence for a
    # null byte would be absorbed into an octal escape.
    for stream_name, result in (
        ('WordDocument'         , Fmt.DOC), # noqa
        ('PowerPoint'           , Fmt.PPT), # noqa
        ('Workbook'             , Fmt.XLS), # noqa
        ('__substg1.0_'         , Fmt.MSG), # noqa
        ('__nameid_version'     , Fmt.MSG), # noqa
        ('__recip_version'      , Fmt.MSG), # noqa
        ('__properties_version' , Fmt.MSG), # noqa
        ('Book'                 , Fmt.XLS), # noqa
    ):
        if buffer_contains(data, stream_name.encode('utf-16le')):
            return result
    if re.search(b'Property|ProductCode|UpgradeCode|PackageCode|InstallExecuteSequence|Component|Feature|File|Media', data):
        return Fmt.MSI
    if re.search(B'Msi(?:[A-Z][a-z]{2,30}){2,5}', data):
        return Fmt.MSI
    return Fmt.CFF
@_structural_check
def get_office_xml_type(data: buf):
    """
    Checks for known XML-based Office document types like DOCX, XLSX, and PPTX. The input has to
    start with the letters PK and contain the names of the relationship and content type parts;
    the document type is then derived from the name of the main document part. The result is
    None when any of these checks fail.
    """
    if data[:2] != B'PK':
        return None
    if not buffer_contains(data, B'_rels/.rels'):
        return None
    if not buffer_contains(data, B'[Content_Types].xml'):
        return None
    if buffer_contains(data, B'word/document.xml'):
        return Fmt.DOCX
    # the main part of a spreadsheet is the workbook
    if buffer_contains(data, B'xl/workbook.xml'):
        return Fmt.XLSX
    if buffer_contains(data, B'ppt/presentation.xml'):
        return Fmt.PPTX
    return None
@_structural_check
def get_compression_type(
    data: buf,
    entropy_minimum: float = 0.7,
    entropy_look_at: int = 0x2000,
):
    """
    This method looks for any of a number of known magic signatures for compression and archive
    formats. If one is found and the format requires an entropy check, the method selects a data
    window of `entropy_look_at` bytes from the rest of the buffer and computes its entropy. If the
    entropy is at least `entropy_minimum`, the input is identified as the corresponding format.
    The entropy check is skipped for formats that do not require it, for inputs of less than 256
    bytes, and for inputs that are too small to select a window. The result is None when no
    signature leads to an identification.
    """
    size = len(data)
    view = memoryview(data)
    T = True
    F = False
    if data[:4] == b'\04\0\0\0' and data[0x10:0x18] == B'{"files"':
        return Fmt.ASAR
    for fmt, entropy_required, offset, signature in (
        (Fmt.APLIB , T, 0, B'AP32'), # noqa
        (Fmt.ACE   , F, 7, B'**ACE**'), # noqa
        (Fmt.BZ2   , T, 0, B'BZh'), # noqa
        (Fmt.JCALG , T, 0, B'JC'), # noqa
        (Fmt.LZMA  , T, 0, B'\x5D\0\0\0'), # noqa
        (Fmt.LZMA  , T, 0, B'\xFD7zXZ'), # noqa
        (Fmt.RNC   , T, 0, B'RNC\x01'), # noqa
        (Fmt.RNC   , T, 0, B'RNC\x02'), # noqa
        (Fmt.LZF   , T, 0, B'ZV'), # noqa
        (Fmt.LZG   , T, 0, B'LZG'), # noqa
        (Fmt.LZIP  , T, 0, B'LZIP'), # noqa
        (Fmt.LZ4   , T, 0, B'\x04\x22\x4D\x18'), # noqa
        (Fmt.LZO   , F, 0, B'\x89\x4c\x5a\x4f\x00\x0d\x0a\x1a\x0a'), # noqa
        (Fmt.LZH   , T, 0, B'\x1F\xA0'), # noqa
        (Fmt.LZW   , T, 0, B'\x1F\x9D'), # noqa
        (Fmt.GZIP  , T, 0, B'\x1F\x8B'), # noqa
        (Fmt.XZ    , F, 0, B'\xFD\x37\x7A\x58\x5A\x00'), # noqa
        (Fmt.MSCF  , T, 0, B'\x0A\x51\xE5\xC0'), # noqa
        (Fmt.RAR   , T, 0, B'Rar!\x1A\x07'), # noqa
        (Fmt.XAR   , T, 0, B'xar!'), # noqa
        (Fmt.SZDD  , T, 0, B'SZDD'), # noqa
        (Fmt.ZLIB0 , T, 0, B'\x78\x01'), # noqa
        (Fmt.ZLIB1 , T, 0, B'\x78\x5E'), # noqa
        (Fmt.ZLIB2 , T, 0, B'\x78\x9C'), # noqa
        (Fmt.ZLIB3 , T, 0, B'\x78\xDA'), # noqa
        (Fmt.ZLIB4 , T, 0, B'\x78\x20'), # noqa
        (Fmt.ZLIB5 , T, 0, B'\x78\x7D'), # noqa
        (Fmt.ZLIB6 , T, 0, B'\x78\xBB'), # noqa
        (Fmt.ZLIB7 , T, 0, B'\x78\xF9'), # noqa
        (Fmt.LZFSE , T, 0, B'bvx2'), # noqa
        (Fmt.ZSTD  , T, 0, B'\x28\xB5\x2F\xFD'), # noqa
        (Fmt.ZIP7  , T, 0, B'7z\xBC\xAF\x27\x1C'), # noqa
        (Fmt.CAB   , T, 0, B'MSCF'), # noqa
        (Fmt.CHM   , T, 0, B'ITSF'), # noqa
        (Fmt.CPIO  , F, 0, B'070701'), # noqa
        (Fmt.CPIO  , F, 0, B'070702'), # noqa
        (Fmt.CPIO  , F, 0, B'070707'), # noqa
        (Fmt.ZIP   , T, 0, B'PK\x03\x04'), # noqa
        (Fmt.ZIP   , T, 0, B'PK\x05\x06'), # noqa
        (Fmt.ZIP   , T, 0, B'PK\x07\x08'), # noqa
        (Fmt.ISO   , F, 0x8001, B'CD001'), # noqa
        (Fmt.ISO   , F, 0x8801, B'CD001'), # noqa
        (Fmt.ISO   , F, 0x9001, B'CD001'), # noqa
        (Fmt.ISZ   , T, 0, B'IsZ!'), # noqa
        (Fmt.TAR   , F, 257, B'ustar'), # noqa
        (Fmt.TAR   , F, 257, B'ustar'), # noqa
        (Fmt.OAR   , T, 0, B'OAR'), # noqa
        (Fmt.ZPQ   , T, 0, B'7kSt\xA01\x83\xD3\x8C\xB2\x28\xB0\xD3zPQ'), # noqa
        (Fmt.VMDK  , T, 0, B'KDM'), # noqa
        (Fmt.VMDK  , T, 0, B'# Disk Descripto'), # noqa
        (Fmt.VHD   , T, 0, B'conectix'), # noqa
        (Fmt.VHD   , T, 0, B'vhdxfile'), # noqa
        (Fmt.DMG   , T, size - 512, B'koly'), # noqa
    ):
        if view[offset:offset + len(signature)] != signature:
            continue
        if not entropy_required or size < 0x100:
            return fmt
        # Skip as much of the header as the input size permits. The window is a separate
        # variable: the view itself must remain intact because the signatures of all subsequent
        # table entries are compared against it.
        for start in (0x1000, 0x400, 0x200, 0x100, 0x80, 0x40, 0x20, 0x10):
            if size >= start + entropy_look_at:
                window = view[start:start + entropy_look_at]
                break
        else:
            # the input is too small to compute a meaningful entropy
            return fmt
        if entropy(window) >= entropy_minimum:
            return fmt
    return None
@_structural_check
def get_image_format(data: buf):
    """
    Determine an image format based on known magic signatures or return None if there is no
    match. Icons, JPEG images, and IFF containers receive additional plausibility checks; all
    other formats are identified by their leading bytes only, except for HEIC, whose file type
    box identifier is located at offset 4.
    """
    if data[:4] == B'\0\0\x01\0':
        # the fields of the first directory entry are read below, they end at offset 14
        if len(data) < 14:
            return None
        count = int.from_bytes(data[4:6], 'little')
        if not 1 <= count <= 100:
            return None
        w, h, _, r = data[6:10]
        if r != 0:
            return None
        p = int.from_bytes(data[10:12], 'little') # planes
        b = int.from_bytes(data[12:14], 'little') # bit count
        if not any((w == h, p == 1, b in (1, 2, 4, 8, 16, 24, 32, 64, 96, 128, 256))):
            return None
        return Fmt.ICO
    if data[:3] == B'\xFF\xD8\xFF':
        # The start-of-image marker FF D8 is followed by the next marker, whose second byte is
        # located at offset 3.
        if len(data) < 4:
            return None
        marker = data[3]
        if marker in (0xDB, 0xEE, 0xE0):
            return Fmt.JPG
        # An APP1 segment: the marker is followed by a two-byte length and the Exif identifier.
        if marker == 0xE1 and data[6:12] == B'\x45\x78\x69\x66\0\0':
            return Fmt.JPG
        return None
    if data[:4] == b'FORM':
        if data[8:12] in (
            B'ILBM',
            B'8SVX',
            B'ACBM',
            B'ANBM',
            B'ANIM',
            B'FAXX',
            B'FTXT',
            B'SMUS',
            B'CMUS',
            B'YUVN',
            B'FANT',
            B'AIFF',
        ):
            return Fmt.IFF
        else:
            return None
    # the file type box starts with a four-byte size, followed by the identifier and the brand
    if data[4:12] == b'ftypheic':
        return Fmt.HIC
    for fmt, signature in (
        (Fmt.GIF, B'GIF87a'),
        (Fmt.GIF, B'GIF89a'),
        (Fmt.TIF, B'\x49\x49\x2A\x00'),
        (Fmt.TIF, B'\x4D\x4D\x00\x2A'),
        (Fmt.TIF, B'\x49\x49\x2B\x00'),
        (Fmt.TIF, B'\x4D\x4D\x00\x2B'),
        (Fmt.CIN, B'\x80\x2A\x5F\xD7'),
        (Fmt.NUI, B'NURUIMG'),
        (Fmt.NUI, B'NURUPAL'),
        (Fmt.DPX, B'SDPX'),
        (Fmt.DPX, B'XPDS'),
        (Fmt.BPG, B'BPG\xFB'),
        (Fmt.EXR, B'\x76\x2F\x31\x01'),
        (Fmt.JP2, B'\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A'),
        (Fmt.JP2, B'\xFF\x4F\xFF\x51'),
        (Fmt.QOI, B'\x71\x6f\x69\x66'),
        (Fmt.PNG, B'\x89\x50\x4E\x47\x0D\x0A\x1A\x0A'),
        (Fmt.PSD, B'8BPS'),
        (Fmt.BMP, B'BM'),
        (Fmt.FIF, B'FLIF'),
        (Fmt.LEP, B'\xCF\x84\x01'),
        (Fmt.HDR, B'#?RADIANCE\n'),
    ):
        if data[:len(signature)] == signature:
            return fmt
    return None
@_structural_check
def get_media_format(data: buf):
    """
    Determine a multi-media format based on known magic signatures or return None if there is no
    match. Aside from the signature table, the function inspects the form type of RIFF
    containers, the brand of the file type box of MP4 files, and the periodic sync byte of MPEG
    transport streams.
    """
    if data[:4] == B'RIFF':
        if data[8:12] == b'WAVE':
            return Fmt.WAV
        if data[8:12] == b'AVI ':
            return Fmt.AVI
        return None
    for fmt, signature in (
        (Fmt.OGG, B'OggS'),
        (Fmt.MP3, B'\xFF\xFB'),
        (Fmt.MP3, B'\xFF\xF3'),
        (Fmt.MP3, B'\xFF\xF2'),
        (Fmt.MP3, B'ID3'),
        (Fmt.M3U, B'#EXTM3U'),
        (Fmt.MPG, B'\0\0\01\xBA'),
        (Fmt.MPG, B'\0\0\01\xB3'),
        (Fmt.FLC, B'fLaC'),
        (Fmt.MID, B'MThd'),
        (Fmt.MKV, B'\x1A\x45\xDF\xA3'),
        (Fmt.SWF, B'CWS'),
        (Fmt.SWF, B'FWS'),
        (Fmt.SIL, B'#!SILK\n'),
    ):
        if data[:len(signature)] == signature:
            return fmt
    # the brands of the ISO base media file format all identify MP4 containers
    if data[4:12] in (B'ftypisom', B'ftypMSNV'):
        return Fmt.MP4
    if data[4:10] == B'ftypM4':
        return Fmt.MP4
    if len(data) < 0x1000:
        return None
    stop = min(len(data), 0x10000)
    # Transport streams consist of packets of 188 bytes, each of which starts with the byte 47.
    if all(data[i] == 0x47 for i in range(0, stop, 188)):
        # Rule out buffers that consist of nothing but this byte by inspecting the byte before
        # each packet boundary. The first packet has no predecessor and is skipped.
        if any(data[i - 1] != 0x47 for i in range(188, stop, 188)):
            return Fmt.MPG
    return None
@_structural_check
def get_serialization_format(data: buf):
    """
    Checks for known data serialization formats: serialized Java objects are identified by their
    stream magic and version; serialized .NET objects are identified by a fixed header of 17
    bytes that has to be followed by a byte in one of the ranges 0-17 or 20-22. The result is
    None for all other inputs, including input that ends right after the .NET header.
    """
    if data[:4] == B'\xAC\xED\x00\x05':
        return Fmt.S_JAV
    if data[:17] == B'\0\01\0\0\0\xFF\xFF\xFF\xFF\x01\0\0\0\0\0\0\0':
        # the byte after the header is required for the decision
        if len(data) <= 17:
            return None
        record = data[17]
        if record in range(18) or record in range(0x14, 0x17):
            return Fmt.S_DOT
    return None
@_structural_check
def get_misc_binary_formats(data: buf):
    """
    Identifies an assortment of binary formats that no other check in this module is
    responsible for. Compiled Python files are recognized by a known magic in the first four
    bytes plus a plausibility check; everything else is recognized by its leading bytes alone.
    """
    if len(data) >= 0x30 and PycMagicPattern.fullmatch(data[:4]):
        # one of these offsets has to hold the byte 0x63, possibly with the high bit set
        for position in (8, 12, 16):
            if data[position] & 0x7F == 0x63:
                return Fmt.PYC
    known_magics = (
        (Fmt.PDF, B'%PDF-'),
        (Fmt.A3X, B'\xA3\x48\x4B\xBE\x98\x6C\x4A\xA9\x99\x4C\x53\x0A\x86\xD6\x48\x7D\x41\x55\x33\x21'),
        (Fmt.CHM, B'ITSF'),
        (Fmt.DSS, B'\0\0\0\01Bud1'),
        (Fmt.DJV, B'AT&TFORM'),
        (Fmt.DEX, B'dex\n035\0'),
        (Fmt.IFPS, B'IFPS'),
        (Fmt.JAVA, B'\xCA\xFE\xBA\xBE'),
        (Fmt.WASM, B'\0asm'),
        (Fmt.LUAC, B'\x1BLua'),
        (Fmt.LNK, B'L\0\0\0\01\x14\02\0\0\0\0\0\xC0\0\0\0\0\0\0F'),
        (Fmt.PCAP, B'\xD4\xC3\xB2\xA1'),
        (Fmt.PCAP, B'\xA1\xB2\xC3\xD4'),
        (Fmt.PCAP, B'\x4D\x3C\xB2\xA1'),
        (Fmt.PCAP, B'\xA1\xB2\x3C\x4D'),
        (Fmt.PCAPNG, B'\n\r\n\r'),
        (Fmt.SSP, B'SMSNF200'),
        (Fmt.SQLITE, B'SQLite format 3\0'),
        (Fmt.PPK, B'PuTTY-User-Key-File-'),
        (Fmt.WIM, B'MSWIM\0\0\0\xD0\0\0\0\0'),
        (Fmt.EVT, B'LfLe'),
        (Fmt.EVTX, B'ElfFile'),
    )
    for result, magic in known_magics:
        if data[:len(magic)] == magic:
            return result
    return None
@_structural_check
def get_text_format(data: buf):
    """
    Classifies input that `refinery.lib.id.guess_text_encoding` considers to be text. The more
    specific formats are tested first, in this order: encoded VBScript, RTF, XML or HTML,
    emails (single-byte encodings only), and JSON. If none of them match, the result reflects
    the size of an encoded character. The result is None if the input does not look like text.
    """
    encoding = guess_text_encoding(data)
    if encoding is None:
        return None
    width = encoding.step
    letters = memoryview(data)[encoding.lsb:len(data):width]
    if is_likely_vbe(letters):
        return Fmt.VBE
    if re.search(BR'^\s{0,500}\{\\rtf', letters) is not None:
        return Fmt.RTF
    markup = xml_or_html(letters)
    if markup:
        return markup
    if width == 1 and is_likely_eml(data):
        return Fmt.EML
    if is_likely_json(letters):
        return Fmt.JSON
    # plain text; the kind depends on the character size alone
    return {1: Fmt.ASCII, 2: Fmt.UTF16, 4: Fmt.UTF32}.get(width)
def get_structured_data_type(data: buf):
    """
    Runs all registered structural checks against the input, in the order of their registration,
    and returns the first `refinery.lib.id.Fmt` that any of them reports. The result is `None`
    when no check recognizes the input, i.e. when it appears to be a blob without any known
    structure.
    """
    for probe in StructuralChecks:
        detected = probe(data)
        if detected:
            return detected
    return None
def is_likely_xml(data: buf):
    """
    Returns whether the input is text whose first tag identifies it as an XML document.
    """
    view = ascii_view(data, window_size=0)
    if not view:
        return False
    return xml_or_html(view) == Fmt.XML
def is_likely_htm(data: buf):
    """
    Returns whether the input is text whose first tag identifies it as an HTML document.
    """
    view = ascii_view(data, window_size=0)
    if not view:
        return False
    return xml_or_html(view) == Fmt.HTM
def is_likely_msi(data: buf):
    """
    Returns whether `refinery.lib.id.get_microsoft_format` identifies the input as an MSI.
    """
    detected = get_microsoft_format(data)
    return detected == Fmt.MSI
def is_likely_email(data: buf):
    """
    Returns whether the input is likely an email, either as a plain-text document or as an
    Outlook message. The plain-text check runs first.
    """
    return is_likely_eml(data) or get_microsoft_format(data) == Fmt.MSG
def is_likely_doc(data: buf):
    """
    Returns whether the input is likely a Word document, i.e. either a legacy DOC according to
    `refinery.lib.id.get_microsoft_format` or a DOCX according to
    `refinery.lib.id.get_office_xml_type`.
    """
    return (
        get_microsoft_format(data) == Fmt.DOC
        or get_office_xml_type(data) == Fmt.DOCX
    )
Functions
def get_pe_type(data)-
Get the correct file type extension for a PE file, or None if the input is unlikely to be a portable executable in the first place.
Expand source code Browse git
@_structural_check
def get_pe_type(data: buf):
    """
    Get the correct file type extension for a PE file, or None if the input is unlikely to be a
    portable executable in the first place.

    The MZ and PE signatures are verified first. The COFF machine field then selects the 32-bit
    (i386) or 64-bit (AMD64) family; any other machine yields None. If the DLL bit is set in the
    COFF characteristics, the DLL variant is returned. Otherwise, the subsystem field selects
    between the SYS (native), GUI, and CUI variant. Truncated headers yield None.
    """
    if data[:2] != B'MZ':
        return None
    nt = data[0x3C:0x3E]
    if len(nt) < 2:
        return None
    nt = int.from_bytes(nt, 'little')
    if data[nt:nt + 4] != B'PE\0\0':
        return None
    arch = data[nt + 4:nt + 6]
    if arch == B'\x4C\x01':
        # IMAGE_FILE_MACHINE_I386 (0x014C)
        dll = Fmt.PE32DLL
        sub = (
            Fmt.PE32SYS,
            Fmt.PE32GUI,
            Fmt.PE32CUI,
        )
    elif arch == B'\x64\x86':
        # IMAGE_FILE_MACHINE_AMD64 (0x8664)
        dll = Fmt.PE64DLL
        sub = (
            Fmt.PE64SYS,
            Fmt.PE64GUI,
            Fmt.PE64CUI,
        )
    else:
        return None
    # The 16-bit characteristics field starts at offset 0x16 of the NT header. IMAGE_FILE_DLL is
    # 0x2000 and therefore lives in the high byte at offset 0x17; bit 0x20 of the low byte would
    # be IMAGE_FILE_LARGE_ADDRESS_AWARE instead.
    if len(data) <= nt + 0x17:
        return None
    if data[nt + 0x17] & 0x20:
        return dll
    if len(data) <= nt + 0x5C:
        return None
    # Subsystem values 1, 2, 3 are native, Windows GUI, and Windows CUI, respectively.
    subsystem = data[nt + 0x5C] - 1
    if not 0 <= subsystem <= 2:
        return None
    return sub[subsystem]
Get arch and byte order information of an ELF file or return None if the input is unlikely to be one.
Expand source code Browse git
@_structural_check
def get_elf_type(data: buf):
    """
    Get arch and byte order information of an ELF file or return None if the input is unlikely to
    be one.

    The decision is based on the identification bytes: `EI_CLASS` (1 for 32-bit, 2 for 64-bit) and
    `EI_DATA` (1 for little endian, 2 for big endian). Inputs shorter than 0x40 bytes or with an
    `EI_VERSION` other than 1 are rejected.
    """
    if data[:4] != b'\x7FELF':
        return None
    if len(data) < 0x40:
        return None
    if data[6] != 1:
        # EI_VERSION
        return None
    # Keys are the two bytes EI_CLASS and EI_DATA.
    return {
        B'\x01\x01': Fmt.ELF32LE,
        B'\x01\x02': Fmt.ELF32BE,
        B'\x02\x01': Fmt.ELF64LE,
        B'\x02\x02': Fmt.ELF64BE,
    }.get(bytes(data[4:6]))
Get arch and byte order information of a MachO file or return None if the input is unlikely to be one.
Expand source code Browse git
@_structural_check
def get_macho_type(data: buf):
    """
    Get arch and byte order information of a MachO file or return None if the input is unlikely to
    be one.

    The first four bytes are decoded as a little endian integer. If this yields one of the MachO
    magic values directly, the header is little endian; if it yields the byte-swapped value, the
    header is big endian. To reduce false positives, the CPU type field (of the first architecture
    entry for fat binaries) must be one of the known CPU types.
    """
    if len(data) < 30:
        return None
    order = 'little'
    isfat = False
    magic = int.from_bytes(data[:4], 'little')
    if magic == 0xCEFAEDFE:
        order = 'big'
        mtype = Fmt.MACHO32BE
    elif magic == 0xCFFAEDFE:
        order = 'big'
        mtype = Fmt.MACHO64BE
    elif magic == 0xFEEDFACE:
        mtype = Fmt.MACHO32LE
    elif magic == 0xFEEDFACF:
        mtype = Fmt.MACHO64LE
    elif magic == 0xCAFEBABE:
        mtype = Fmt.MACHOuvLE
        isfat = True
    elif magic == 0xBEBAFECA:
        # Byte-swapped magic: the remaining header fields are big endian as well.
        order = 'big'
        mtype = Fmt.MACHOuvBE
        isfat = True
    else:
        return None
    # Fat headers store the architecture count before the first CPU type.
    cpu_offset = 8 if isfat else 4
    cpu = int.from_bytes(data[cpu_offset:cpu_offset + 4], order)
    if cpu in (
        0x00000001,  # vax
        0x00000002,  # ROMP
        0x00000004,  # NS32032
        0x00000005,  # NS32332
        0x00000006,  # mc680x0
        0x00000007,  # x32
        0x01000007,  # x64
        0x00000008,  # mips
        0x00000009,  # NS32352
        0x0000000A,  # mc98000
        0x0000000B,  # hppa
        0x0000000C,  # arm32
        0x0100000C,  # arm64
        0x0000000D,  # mc880000
        0x0000000E,  # sparc
        0x0000000F,  # i860
        0x00000010,  # alpha
        0x00000011,  # RS/6000
        0x00000012,  # ppc32
        0x01000012,  # ppc64
    ):
        return mtype
    return None
Determine the type of an executable.
Expand source code Browse git
def get_executable_type(data: buf):
    """
    Determine the type of an executable by probing for PE, ELF, and MachO, in this order. The
    first match is returned; the result is `None` if no probe matches.
    """
    for probe in (get_pe_type, get_elf_type, get_macho_type):
        if kind := probe(data):
            return kind
Tests whether the input data is likely a PE file by checking the first two bytes and the magic bytes at the beginning of what should be the NT header.
Expand source code Browse git
def is_likely_pe(data: buf):
    """
    Tests whether the input data is likely a PE file. This is the case exactly if
    `refinery.lib.id.get_pe_type` is able to determine a PE type for it.
    """
    pe_type = get_pe_type(data)
    return pe_type is not None
Performs a substring search of needle in haystack. If haystack is a bytes-like object, it uses the standard method. If it is a memoryview, it uses a regular expression search.
Expand source code Browse git
def buffer_offset(haystack: buf, needle: bytes, start: int = 0, end: int | None = None):
    """
    Performs a substring search of `needle` in `haystack`. If `haystack` is a `bytes`-like object,
    it uses the standard method. If it is a `memoryview`, it uses a regular expression search.

    The search is restricted to `haystack[start:end]`. The return value is the offset of the first
    occurrence relative to the start of `haystack`, or `-1` if `needle` does not occur.
    """
    if isinstance(haystack, (bytes, bytearray)):
        return haystack.find(needle, start, end)
    # A memoryview has no find method, but the re module accepts any buffer.
    if m := re.search(re.escape(needle), haystack[start:end]):
        return start + m.start()
    return -1
Determines whether haystack contains needle.
Expand source code Browse git
def buffer_contains(haystack: buf, needle: bytes):
    """
    Determines whether `haystack` contains `needle`. The search is delegated to
    `refinery.lib.id.buffer_offset`.
    """
    # An offset of zero is a valid hit; only negative results signal a miss.
    return buffer_offset(haystack, needle) >= 0
Tests whether the input data is likely a .NET PE file by running is_likely_pe() and also checking for the characteristic strings BSJB, #Strings, and #Blob.
Expand source code Browse git
def is_likely_pe_dotnet(data: buf):
    """
    Tests whether the input data is likely a .NET PE file. The input has to pass
    `refinery.lib.id.is_likely_pe` and must contain each of the characteristic strings `BSJB`,
    `#Strings`, and `#Blob`.
    """
    if not is_likely_pe(data):
        return False
    markers = (b'BSJB', b'#Strings', b'#Blob')
    return all(buffer_contains(data, marker) for marker in markers)
Check whether the input data is a Windows registry file export.
Expand source code Browse git
@_structural_check
def get_reg_export_type(data: buf):
    """
    Check whether the input data is a Windows registry file export. Inputs that start with `regf`
    are reported as registry hives, inputs that start with the registry editor banner are
    reported as textual exports. Anything else yields `None`.
    """
    signatures = (
        (Fmt.REG_HIVE, b'regf'),
        (Fmt.REG_TEXT, b'Windows Registry Editor Version'),
    )
    for kind, magic in signatures:
        if data[:len(magic)] == magic:
            return kind
Attempts to determine whether the input data is likely printable text. The return value is None if the input is unlikely to be text. Otherwise, the return value is a triple of integers: First the offset after the byte order mark (0 in case there is none), then the offset of the first low byte of a character (odd for big endian encodings, even for others) and finally the size of each encoded character in bytes.
Expand source code Browse git
def guess_text_encoding( data: buf, window_size: int = 0x1000, ascii_ratio: float = 0.98, ) -> TextEncoding | None: """ Attempts to determine whether the input data is likely printable text. The return value is None if the input is unlikely to be text. Otherwise, the return value is a triple of integers: First the offset after the byte order mark (`0` in case there is none), then the offset of the first low byte of a character (odd for big endian encodings, even for others) and finally the size of each encoded character in bytes. """ view = memoryview(data) size = window_size step = 1 maxbad = 1 - ascii_ratio bom = 0 lsb = 0 if data[:3] == B'\xEF\xBB\xBF': # BOM: UTF8 bom = 3 elif data[:4] == B'\xFF\xFE\0\0': step = bom = lsb = 4 # UTF-32LE elif data[:2] == B'\xFF\xFE': step = bom = lsb = 2 # UTF-16LE elif data[:2] == B'\xFE\xFF': step, bom, lsb = 2, 2, 3 elif data[:4] == B'\0\0\xFE\xFF': step, bom, lsb = 4, 4, 7 elif any(data[:4] == bom for bom in ( b'\x2B\x2F\x76\x38', b'\x2B\x2F\x76\x39', b'\x2B\x2F\x76\x2B', b'\x2B\x2F\x76\x2F', )): # UTF7 BOM bom = 4 elif len(view) % 2 == 0: u16le = (win := view[1:size:2]) and sum(win) / len(win) <= maxbad u16be = (win := view[0:size:2]) and sum(win) / len(win) <= maxbad if u16le: if u16be: return None step, lsb = 2, 0 elif u16be: step, lsb = 2, 1 if step > 1: if len(data) % step != 0: return None if not (win := view[lsb:size:step]) or sum(win) / len(win) > maxbad: return None if len(data) <= bom: return None if not size: return TextEncoding(bom, lsb, step) if isinstance(data, (bytes, bytearray)): histogram = [data.count(b, bom, size) for b in range(0x100)] else: histogram = [0] * 256 for b in view[bom:size]: histogram[b] += 1 presence = memoryview(bytes(1 if v else 0 for v in histogram)) if sum(presence) > 102: # 96 printable ASCII characters plus some slack for control bytes or encoding return None if sum(presence[0x7F:]) > 5: # Allow for some control characters or encoding-specific values return None if sum(presence[:0x20]) 
> 5: # Tab, CR, LF, Null, plus one byte slack return None bad = sum(histogram[:0x20]) + sum(histogram[0x7F:]) \ - histogram[0x0D] \ - histogram[0x0A] \ - histogram[0x09] if step == 2: bad -= histogram[0] // 2 if bad / sum(histogram) > maxbad: return None while True: try: win = view[lsb:size:step] bad = sum(m.end() - m.start() for m in re.finditer(BR'[^\t\n\r\x20-\x7E]+', win)) except TypeError: pass else: if bad and bad / len(win) > maxbad: return None if size >= len(view): return TextEncoding(bom, lsb, step) size <<= 1 def xml_or_html(view)-
Returns an Fmt indicating either XML or HTML, or None if the data does not look like either of these formats at all.
Expand source code Browse git
def xml_or_html(view: buf):
    """
    Inspects the first tag of the input and returns an `refinery.lib.id.Fmt` indicating either
    XML or HTML. The result is `None` if the input does not begin with something that looks like
    a tag followed by white space.
    """
    opening = re.search(BR'''(?x)
        ^                    # at the very start of the document
        \s{0,10}             # allow for some leading white space
        <                    # a tag opens
        ([?!]?               # allow for question or exclamation mark
        [-:\w]{3,64})        # the tag name
        \s{1,20}             # white space after tag name
        (/?>                 # the tag may end here, or:
        |[-:\w]{3,32})       # we have an attribute.
        ''', view)
    if opening is None:
        return None
    tag = opening[1].lower()
    end = opening[2].lower()
    # <?xml...
    if tag == b'?xml':
        return Fmt.XML
    # <HTML>, <BODY>, or <!DOCTYPE html
    if tag in (b'html', b'body') or (tag == b'!doctype' and end == b'html'):
        return Fmt.HTM
    # <project xmlns:xsi=... counts as XML, any other tag is assumed to be HTML
    return Fmt.XML if end.startswith(b'xml') else Fmt.HTM
If the input data looks like text, get a memoryview of the least significant bytes of each encoded letter. Otherwise, return None. Whether or not the data looks like text is determined using guess_text_encoding(); all parameters are forwarded to this function.
Expand source code Browse git
def ascii_view(
    data: buf,
    window_size: int = 0x1000,
    ascii_ratio: float = 0.98,
):
    """
    Returns a strided memoryview that selects the least significant byte of each encoded letter
    when `refinery.lib.id.guess_text_encoding` considers the input to be text, and `None`
    otherwise. The parameters `window_size` and `ascii_ratio` are forwarded to that function.
    """
    encoding = guess_text_encoding(data, window_size=window_size, ascii_ratio=ascii_ratio)
    if not encoding:
        return None
    return memoryview(data)[encoding.lsb:len(data):encoding.step]
Checks the input for common strings that occur as email headers. If at least two are found, the function returns True.
Expand source code Browse git
def is_likely_eml(
    data: buf,
    window_size: int = 0x10000,
):
    """
    Checks the input for common strings that occur as email headers. If at least two are found,
    the function returns True. Only the first `window_size` bytes of the input are searched, and
    each header has to be preceded by a line break to count.
    """
    view = memoryview(data)[:window_size]
    hits = 0
    for marker in (
        b'\nReceived:\x20from',
        b'\nSubject:\x20',
        b'\nTo:\x20',
        b'\nFrom:\x20',
        b'\nMessage-ID:\x20',
        b'\nBcc:\x20',
        b'\nContent-Transfer-Encoding:\x20',
        b'\nContent-Type:\x20',
        b'\nReturn-Path:\x20',
    ):
        # The re module is used because a memoryview has no substring search of its own.
        if re.search(re.escape(marker), view) is None:
            continue
        hits += 1
        if hits >= 2:
            return True
    return False
Checks whether the input contains the known markers used by encoded Visual Basic scripts.
Expand source code Browse git
def is_likely_vbe(data: buf):
    """
    Checks whether the input contains the known markers used by encoded Visual Basic scripts: the
    opening marker has to occur within the first 64 bytes and the closing marker within the last
    64 bytes.
    """
    raw = memoryview(data)
    head = raw[:64]
    tail = raw[-64:]
    opens = re.search(BR'#@~\^[!-~]{6}==', head) is not None
    return opens and re.search(BR'[!-~]{6}==\^#~@', tail) is not None
A fast regular expression based check for whether the input looks like JSON. The expression checks whether the input is a sequence of valid JSON tokens: quoted strings, constants, integer and floating-point numbers, and control characters. To be explicit, note that this function cannot check for correct nesting, regular expressions are insufficient for this.
Expand source code Browse git
def is_likely_json(data: buf):
    """
    A fast regular expression based check for whether the input looks like JSON. The expression
    checks whether the input is a sequence of valid JSON tokens: quoted strings, constants,
    integer and floating-point numbers, and control characters. To be explicit, note that this
    function cannot check for correct nesting, regular expressions are insufficient for this.

    The whole input has to be a single object or array, optionally surrounded by white space.
    """
    _json = RB"""
        \s*((                              # a sequence of the following tokens:
            "([^"\\\r\n]|\\[^\r\n])*"      # a quoted string literal
          | true                           # true
          | false                          # false
          | null                           # null
          | [-+]?([1-9]\d*|0)              # an integer
          | [-+]?\d*\.?\d+([eE][-+]?\d+)?  # a float
          | [\{\}\[\]:,]                   # a structural token
        # | //(.*?)\n                      # do not allow comments (line)
        # | /\*.*?\*/                      # do not allow comments (block)
        )\s*)*?
    """
    # The alternation is grouped so that the surrounding white space applies to both the object
    # and the array branch; without the group, leading white space would only be permitted for
    # objects and trailing white space only for arrays.
    _json = RB'(?x)\s*(?:(\{%s\})|(\[%s\]))\s*' % (_json, _json)
    return re.fullmatch(_json, data) is not None
Checks for various Microsoft formats. This includes Access Database files and OneNote, but most importantly it can distinguish between various compound document formats like MSI, Word, Excel, PowerPoint, and Outlook.
Expand source code Browse git
@_structural_check
def get_microsoft_format(data: buf):
    """
    Checks for various Microsoft formats. This includes Access Database files and OneNote, but
    most importantly it can distinguish between various compound document formats like MSI, Word,
    Excel, PowerPoint, and Outlook.

    Access databases are recognized by their header. Everything else requires the compound file
    signature; the function then looks for format-specific magic values at 512-byte boundaries,
    for UTF-16 stream names, and finally for MSI table names. A compound file that matches none
    of these is reported as a generic compound file. The result is `None` for all other inputs.
    """
    if data[:19] == b'\0\01\0\0Standard ACE DB':
        return Fmt.MDB
    if data[:19] == b'\0\01\0\0Standard Jet DB':
        return Fmt.MDB
    if data[:4] != B'\xD0\xCF\x11\xE0':
        return None
    if data[4:8] != B'\xA1\xB1\x1A\xE1' and any(data[4:12]):
        return None
    if buffer_contains(data, b'\xE4\x52\x5C\x7B\x8C\xD8\xA7\x4D\xAE\xB1\x53\x78\xD0\x29\x96\xD3'):
        return Fmt.ONE
    # Probe every 512-byte boundary within the first 64k for format-specific magic values.
    for k in range(0x200, 0x10000, 0x200):
        mark = int.from_bytes(data[k:k + 4], 'little')
        if mark == 0x00C1A5EC:
            return Fmt.DOC
        if mark == 0x00100809 and data[k + 4:k + 8] == B'\x00\x06\x05\x00':
            return Fmt.XLS
        if mark == 0xF01D46A0:
            return Fmt.PPT
        if mark == 0xF01E6E00:
            return Fmt.PPT
        if mark == 0x03E8000F:
            return Fmt.PPT
    # Stream names are stored as UTF-16LE. The digits in the __substg1.0_ marker are preceded by
    # \x00 rather than \0 because \01 and \00 would be parsed as octal escapes.
    for kind, name in (
        (Fmt.DOC, b'W\0o\0r\0d\0D\0o\0c\0u\0m\0e\0n\0t\0'),                          # WordDocument
        (Fmt.PPT, b'P\0o\0w\0e\0r\0P\0o\0i\0n\0t\0'),                                # PowerPoint
        (Fmt.XLS, b'W\0o\0r\0k\0b\0o\0o\0k\0'),                                      # Workbook
        (Fmt.MSG, b'_\0_\0s\0u\0b\0s\0t\0g\x001\0.\x000\0_\0'),                      # __substg1.0_
        (Fmt.MSG, b'_\0_\0n\0a\0m\0e\0i\0d\0_\0v\0e\0r\0s\0i\0o\0n\0'),              # __nameid_version
        (Fmt.MSG, b'_\0_\0r\0e\0c\0i\0p\0_\0v\0e\0r\0s\0i\0o\0n\0'),                 # __recip_version
        (Fmt.MSG, b'_\0_\0p\0r\0o\0p\0e\0r\0t\0i\0e\0s\0_\0v\0e\0r\0s\0i\0o\0n\0'),  # __properties_version
        (Fmt.XLS, b'B\0o\0o\0k\0'),                                                  # Book
    ):
        if buffer_contains(data, name):
            return kind
    if re.search(b'Property|ProductCode|UpgradeCode|PackageCode|InstallExecuteSequence|Component|Feature|File|Media', data):
        return Fmt.MSI
    if re.search(B'Msi(?:[A-Z][a-z]{2,30}){2,5}', data):
        return Fmt.MSI
    return Fmt.CFF
Checks for known XML-based Office document types like DOCX, XLSX, and PPTX.
Expand source code Browse git
@_structural_check
def get_office_xml_type(data: buf):
    """
    Checks for known XML-based Office document types like DOCX, XLSX, and PPTX.

    The input has to start with the ZIP signature and has to contain the package part names
    `_rels/.rels` and `[Content_Types].xml`. The document type is then derived from the name of
    the main document part. The result is `None` if any of these conditions is not met.
    """
    if data[:2] != B'PK':
        return None
    if not buffer_contains(data, B'_rels/.rels'):
        return None
    if not buffer_contains(data, B'[Content_Types].xml'):
        return None
    for kind, main_part in (
        (Fmt.DOCX, B'word/document.xml'),
        # The main part of a spreadsheet is the workbook; there is no xl/document.xml.
        (Fmt.XLSX, B'xl/workbook.xml'),
        (Fmt.PPTX, B'ppt/presentation.xml'),
    ):
        if buffer_contains(data, main_part):
            return kind
    return None
This method looks for any of a number of known magic signatures for compression and archive formats. If one is found, the method selects a data window from the rest of the buffer and computes its entropy. If the entropy exceeds the given threshold, the input is identified as a known compression format.
Expand source code Browse git
@_structural_check def get_compression_type( data: buf, entropy_minimum: float = 0.7, entropy_look_at: int = 0x2000, ): """ This method looks for any of a number of known magic signatures for compression and archive formats. If one is find, the method selects a data window from the rest of the buffer and computes its entropy. If the entropy exceeds the given threshold, the input is idenfied as a known compression format. """ size = len(data) view = memoryview(data) T = True F = False if data[:4] == b'\04\0\0\0' and data[0x10:0x18] == B'{"files"': return Fmt.ASAR for format, entropy_required, offset, signature in ( (Fmt.APLIB , T, 0, B'AP32'), # noqa (Fmt.ACE , F, 7, B'**ACE**'), # noqa (Fmt.BZ2 , T, 0, B'BZh'), # noqa (Fmt.JCALG , T, 0, B'JC'), # noqa (Fmt.LZMA , T, 0, B'\x5D\0\0\0'), # noqa (Fmt.LZMA , T, 0, B'\xFD7zXZ'), # noqa (Fmt.RNC , T, 0, B'RNC\x01'), # noqa (Fmt.RNC , T, 0, B'RNC\x02'), # noqa (Fmt.LZF , T, 0, B'ZV'), # noqa (Fmt.LZG , T, 0, B'LZG'), # noqa (Fmt.LZIP , T, 0, B'LZIP'), # noqa (Fmt.LZ4 , T, 0, B'\x04\x22\x4D\x18'), # noqa (Fmt.LZO , F, 0, B'\x89\x4c\x5a\x4f\x00\x0d\x0a\x1a\x0a'), # noqa (Fmt.LZH , T, 0, B'\x1F\xA0'), # noqa (Fmt.LZW , T, 0, B'\x1F\x9D'), # noqa (Fmt.GZIP , T, 0, B'\x1F\x8B'), # noqa (Fmt.XZ , F, 0, B'\xFD\x37\x7A\x58\x5A\x00'), # noqa (Fmt.MSCF , T, 0, B'\x0A\x51\xE5\xC0'), # noqa (Fmt.RAR , T, 0, B'Rar!\x1A\x07'), # noqa (Fmt.XAR , T, 0, B'xar!'), # noqa (Fmt.SZDD , T, 0, B'SZDD'), # noqa (Fmt.ZLIB0 , T, 0, B'\x78\x01'), # noqa (Fmt.ZLIB1 , T, 0, B'\x78\x5E'), # noqa (Fmt.ZLIB2 , T, 0, B'\x78\x9C'), # noqa (Fmt.ZLIB3 , T, 0, B'\x78\xDA'), # noqa (Fmt.ZLIB4 , T, 0, B'\x78\x20'), # noqa (Fmt.ZLIB5 , T, 0, B'\x78\x7D'), # noqa (Fmt.ZLIB6 , T, 0, B'\x78\xBB'), # noqa (Fmt.ZLIB7 , T, 0, B'\x78\xF9'), # noqa (Fmt.LZFSE , T, 0, B'bvx2'), # noqa (Fmt.ZSTD , T, 0, B'\x28\xB5\x2F\xFD'), # noqa (Fmt.ZIP7 , T, 0, B'7z\xBC\xAF\x27\x1C'), # noqa (Fmt.CAB , T, 0, B'MSCF'), # noqa (Fmt.CHM , T, 0, B'ITSF'), # noqa (Fmt.CPIO , F, 0, 
B'070701'), # noqa (Fmt.CPIO , F, 0, B'070702'), # noqa (Fmt.CPIO , F, 0, B'070707'), # noqa (Fmt.ZIP , T, 0, B'PK\x03\x04'), # noqa (Fmt.ZIP , T, 0, B'PK\x05\x06'), # noqa (Fmt.ZIP , T, 0, B'PK\x07\x08'), # noqa (Fmt.ISO , F, 0x8001, B'CD001'), # noqa (Fmt.ISO , F, 0x8801, B'CD001'), # noqa (Fmt.ISO , F, 0x9001, B'CD001'), # noqa (Fmt.ISZ , T, 0, B'IsZ!'), # noqa (Fmt.TAR , F, 257, B'ustar'), # noqa (Fmt.TAR , F, 257, B'ustar'), # noqa (Fmt.OAR , T, 0, B'OAR'), # noqa (Fmt.ZPQ , T, 0, B'7kSt\xA01\x83\xD3\x8C\xB2\x28\xB0\xD3zPQ'), # noqa (Fmt.VMDK , T, 0, B'KDM'), # noqa (Fmt.VMDK , T, 0, B'# Disk Descripto'), # noqa (Fmt.VHD , T, 0, B'conectix'), # noqa (Fmt.VHD , T, 0, B'vhdxfile'), # noqa (Fmt.DMG , T, size - 512, B'koly'), # noqa ): if view[offset:offset + len(signature)] == signature: if not entropy_required or len(data) < 0x100: return format for start in (0x1000, 0x400, 0x200, 0x100, 0x80, 0x40, 0x20, 0x10): if len(view) >= start + entropy_look_at: view = view[start:] break else: return format if entropy(view[:entropy_look_at]) >= entropy_minimum: return format def get_image_format(data)-
Determine an image format based on known magic signatures or return None if there is no match.
Expand source code Browse git
@_structural_check
def get_image_format(data: buf):
    """
    Determine an image format based on known magic signatures or return None if there is no match.

    Icons, JPEG files, and IFF containers receive additional plausibility checks beyond their
    leading signature. All remaining formats are identified by a fixed prefix, except for HEIC
    where the brand follows the four-byte size of the leading `ftyp` box.
    """
    if data[:4] == B'\0\0\x01\0':
        count = int.from_bytes(data[4:6], 'little')
        if not 1 <= count <= 100:
            return None
        if len(data) < 10:
            # Too short to contain the first directory entry.
            return None
        w, h, _, r = data[6:10]
        if r != 0:
            return None
        p = int.from_bytes(data[10:12], 'little')  # planes
        b = int.from_bytes(data[12:14], 'little')  # bit count
        if not any((w == h, p == 1, b in (1, 2, 4, 8, 16, 24, 32, 64, 96, 128, 256))):
            return None
        return Fmt.ICO
    if data[:3] == B'\xFF\xD8\xFF':
        if len(data) < 4:
            return None
        # The byte right after FF D8 FF identifies the first segment.
        marker = data[3]
        if marker in (0xDB, 0xEE, 0xE0):
            return Fmt.JPG
        # An APP1 segment has a two-byte length followed by the Exif identifier.
        if marker == 0xE1 and data[6:12] == B'\x45\x78\x69\x66\0\0':
            return Fmt.JPG
        return None
    if data[:4] == b'FORM':
        if data[8:12] in (
            B'ILBM',
            B'8SVX',
            B'ACBM',
            B'ANBM',
            B'ANIM',
            B'FAXX',
            B'FTXT',
            B'SMUS',
            B'CMUS',
            B'YUVN',
            B'FANT',
            B'AIFF',
        ):
            return Fmt.IFF
        else:
            return None
    for format, signature in (
        (Fmt.HIC, b'ftypheic'),
        (Fmt.GIF, B'GIF87a'),
        (Fmt.GIF, B'GIF89a'),
        (Fmt.TIF, B'\x49\x49\x2A\x00'),
        (Fmt.TIF, B'\x4D\x4D\x00\x2A'),
        (Fmt.TIF, B'\x49\x49\x2B\x00'),
        (Fmt.TIF, B'\x4D\x4D\x00\x2B'),
        (Fmt.CIN, B'\x80\x2A\x5F\xD7'),
        (Fmt.NUI, B'NURUIMG'),
        (Fmt.NUI, B'NURUPAL'),
        (Fmt.DPX, B'SDPX'),
        (Fmt.DPX, B'XPDS'),
        (Fmt.BPG, B'BPG\xFB'),
        (Fmt.EXR, B'\x76\x2F\x31\x01'),
        (Fmt.JP2, B'\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A'),
        (Fmt.JP2, B'\xFF\x4F\xFF\x51'),
        (Fmt.QOI, B'\x71\x6f\x69\x66'),
        (Fmt.PNG, B'\x89\x50\x4E\x47\x0D\x0A\x1A\x0A'),
        (Fmt.PSD, B'8BPS'),
        (Fmt.BMP, B'BM'),
        (Fmt.FIF, B'FLIF'),
        (Fmt.LEP, B'\xCF\x84\x01'),
        (Fmt.HDR, B'#?RADIANCE\n'),
    ):
        if data[:len(signature)] == signature:
            return format
    # The ftyp box is preceded by its own four-byte size field.
    if data[4:12] == b'ftypheic':
        return Fmt.HIC
    return None
Determine a multi-media format based on known magic signatures or return None if there is no match.
Expand source code Browse git
@_structural_check
def get_media_format(data: buf):
    """
    Identifies audio and video formats by their magic signatures and returns `None` when nothing
    matches. RIFF containers are told apart by their form type. The last resort is a test for a
    sync byte at every 188th offset of inputs that are at least 0x1000 bytes long.
    """
    if data[:4] == B'RIFF':
        form = data[8:12]
        if form == b'WAVE':
            return Fmt.WAV
        if form == b'AVI ':
            return Fmt.AVI
        return None
    signatures = (
        (Fmt.OGG, B'OggS'),
        (Fmt.MP3, B'\xFF\xFB'),
        (Fmt.MP3, B'\xFF\xF3'),
        (Fmt.MP3, B'\xFF\xF2'),
        (Fmt.MP3, B'ID3'),
        (Fmt.M3U, B'#EXTM3U'),
        (Fmt.MPG, B'\0\0\01\xBA'),
        (Fmt.MPG, B'\0\0\01\xB3'),
        (Fmt.FLC, B'fLaC'),
        (Fmt.MID, B'MThd'),
        (Fmt.MKV, B'\x1A\x45\xDF\xA3'),
        (Fmt.SWF, B'CWS'),
        (Fmt.SWF, B'FWS'),
        (Fmt.SIL, B'#!SILK\n'),
    )
    for kind, magic in signatures:
        if data[:len(magic)] == magic:
            return kind
    if data[4:12] in (B'ftypisom', B'ftypMSNV'):
        return Fmt.MPG
    if data[4:10] == B'ftypM4':
        return Fmt.MP4
    if len(data) < 0x1000:
        return None
    stop = min(len(data), 0x10000)
    packets = range(0, stop, 188)
    # Every packet has to start with 0x47, but the bytes right before the packets must not all
    # be 0x47 as well; this rules out buffers that merely repeat that byte.
    if all(data[i] == 0x47 for i in packets):
        if any(data[i - 1] != 0x47 for i in packets):
            return Fmt.MPG
Checks for known data serialization formats.
Expand source code Browse git
@_structural_check
def get_serialization_format(data: buf):
    """
    Recognizes the headers of two serialization formats and reports them as `Fmt.S_JAV` and
    `Fmt.S_DOT`, respectively. For the latter, the byte that follows the fixed 17-byte header
    additionally has to be below 18 or between 0x14 and 0x16.
    """
    if data[:4] == B'\xAC\xED\x00\x05':
        return Fmt.S_JAV
    if data[:17] != B'\0\01\0\0\0\xFF\xFF\xFF\xFF\x01\0\0\0\0\0\0\0':
        return None
    record = data[17]
    if record < 18 or 0x14 <= record < 0x17:
        return Fmt.S_DOT
Checks for various other binary formats that are not covered by other methods in this module.
Expand source code Browse git
@_structural_check
def get_misc_binary_formats(data: buf):
    """
    Checks for various other binary formats that are not covered by other methods in this module.

    Compiled Python files are identified by `PycMagicPattern` plus a marker byte at one of three
    possible header offsets. All other formats are identified by a fixed prefix. The result is
    `None` if nothing matches.
    """
    if len(data) >= 0x30 and PycMagicPattern.fullmatch(data[:4]):
        if any(data[offset] & 0x7F == 0x63 for offset in (8, 12, 16)):
            return Fmt.PYC
    for format, signature in (
        (Fmt.PDF, B'%PDF-'),
        (Fmt.A3X, B'\xA3\x48\x4B\xBE\x98\x6C\x4A\xA9\x99\x4C\x53\x0A\x86\xD6\x48\x7D\x41\x55\x33\x21'),
        (Fmt.CHM, B'ITSF'),
        (Fmt.DSS, B'\0\0\0\01Bud1'),
        (Fmt.DJV, B'AT&TFORM'),
        (Fmt.DEX, B'dex\n035\0'),
        (Fmt.IFPS, B'IFPS'),
        (Fmt.JAVA, B'\xCA\xFE\xBA\xBE'),
        (Fmt.WASM, B'\0asm'),
        (Fmt.LUAC, B'\x1BLua'),
        (Fmt.LNK, B'L\0\0\0\01\x14\02\0\0\0\0\0\xC0\0\0\0\0\0\0F'),
        (Fmt.PCAP, B'\xD4\xC3\xB2\xA1'),
        (Fmt.PCAP, B'\xA1\xB2\xC3\xD4'),
        (Fmt.PCAP, B'\x4D\x3C\xB2\xA1'),
        (Fmt.PCAP, B'\xA1\xB2\x3C\x4D'),
        # The pcapng section header block type is 0x0A0D0D0A, a byte palindrome.
        (Fmt.PCAPNG, B'\n\r\r\n'),
        (Fmt.SSP, B'SMSNF200'),
        (Fmt.SQLITE, B'SQLite format 3\0'),
        (Fmt.PPK, B'PuTTY-User-Key-File-'),
        (Fmt.WIM, B'MSWIM\0\0\0\xD0\0\0\0\0'),
        (Fmt.EVT, B'LfLe'),
        (Fmt.EVTX, B'ElfFile'),
    ):
        if data[:len(signature)] == signature:
            return format
    return None
Implements a heuristic check for which text-based format (VBE, RTF, XML/HTML, EML, JSON, or plain text) the input most likely is.
Expand source code Browse git
@_structural_check def get_text_format(data: buf): """ Implements a heuristic check for whether the input is likely XML data. """ encoding = guess_text_encoding(data) if encoding is None: return None step = encoding.step view = memoryview(data)[encoding.lsb:len(data):step] if is_likely_vbe(view): return Fmt.VBE if re.search(BR'^\s{0,500}\{\\rtf', view) is not None: return Fmt.RTF if format := xml_or_html(view): return format if step == 1 and is_likely_eml(data): return Fmt.EML if is_likely_json(view): return Fmt.JSON if step == 1: return Fmt.ASCII if step == 2: return Fmt.UTF16 if step == 4: return Fmt.UTF32 def get_structured_data_type(data)-
Attempts to determine whether the input data is just a meaningless blob or whether it has structure, i.e. adheres to a known file format. Returns an Fmt or None.
Expand source code Browse git
def get_structured_data_type(data: buf): """ Attempts to determine whether the input data is just a meaningless blob or whether it has structure, i.e. adheres to a known file format. Returns an `refinery.lib.id.Fmt` or `None`. """ for check in StructuralChecks: if t := check(data): return t def is_likely_xml(data)-
Checks whether the input data is likely an XML document.
Expand source code Browse git
def is_likely_xml(data: buf): """ Checks whether the input data is likely an XML document. """ if view := ascii_view(data, window_size=0): return xml_or_html(view) == Fmt.XML return False def is_likely_htm(data)-
Checks whether the input data is likely an HTML document.
Expand source code Browse git
def is_likely_htm(data: buf): """ Checks whether the input data is likely an HTML document. """ if view := ascii_view(data, window_size=0): return xml_or_html(view) == Fmt.HTM return False def is_likely_msi(data)-
Checks whether the input data is likely an MSI.
Expand source code Browse git
def is_likely_msi(data: buf): """ Checks whether the input data is likely an MSI. """ return get_microsoft_format(data) == Fmt.MSI def is_likely_email(data)-
Checks whether the input data is likely a plain-text or Outlook email document.
Expand source code Browse git
def is_likely_email(data: buf): """ Checks whether the input data is likely a plain-text or Outlook email document. """ if is_likely_eml(data): return True return get_microsoft_format(data) == Fmt.MSG def is_likely_doc(data)-
Expand source code Browse git
def is_likely_doc(data: buf): if get_microsoft_format(data) == Fmt.DOC: return True if get_office_xml_type(data) == Fmt.DOCX: return True return False
Classes
class Format (category, extension=None, mnemonic=None, details=None, mime=None)-
Expand source code Browse git
class Format:
    """
    Describes a file format by its category, file extension, a short mnemonic, a longer details
    string, and a MIME type. Instances are iterable; iteration yields these five fields in the
    order (category, extension, mnemonic, details, mime), and both equality and hashing are
    based on this sequence.
    """
    __slots__ = 'category', 'extension', 'mime', 'mnemonic', 'details'

    def __hash__(self):
        return hash(tuple(self))

    def __eq__(self, other):
        if not isinstance(other, Format):
            return False
        return all(a == b for a, b in zip(self, other))

    def __iter__(self):
        yield self.category
        yield self.extension
        yield self.mnemonic
        yield self.details
        yield self.mime

    def __init__(
        self,
        category: FormatCategory,
        extension: str | None = None,
        mnemonic: str | None = None,
        details: str | None = None,
        mime: str | None = None,
    ) -> None:
        """
        Every optional argument has a derived default: the extension defaults to `bin`, the
        mnemonic to the upper-cased extension, and the details to the mnemonic. The MIME type is
        looked up in `MimeByExtension`; for unknown extensions it is `text/plain` for the text
        category and `application/octet-stream` otherwise.
        """
        self.category = category
        self.extension = extension or 'bin'
        self.mnemonic = mnemonic or self.extension.upper()
        self.details = details or self.mnemonic
        if mime is None:
            try:
                mime = MimeByExtension[self.extension]
            except KeyError:
                if category == FormatCategory.Text:
                    mime = 'text/plain'
                else:
                    # The registered generic binary type is spelled octet-stream.
                    mime = 'application/octet-stream'
        self.mime = mime
Instance variables
var category-
Expand source code Browse git
class Format: __slots__ = 'category', 'extension', 'mime', 'mnemonic', 'details' def __hash__(self): return hash(tuple(self)) def __eq__(self, other): if not isinstance(other, Format): return False return all(a == b for a, b in zip(self, other)) def __iter__(self): yield self.category yield self.extension yield self.mnemonic yield self.details yield self.mime def __init__( self, category: FormatCategory, extension: str | None = None, mnemonic: str | None = None, details: str | None = None, mime: str | None = None, ) -> None: self.category = category self.extension = extension or 'bin' self.mnemonic = mnemonic or self.extension.upper() self.details = details or self.mnemonic if mime is None: try: mime = MimeByExtension[self.extension] except KeyError: if category == FormatCategory.Text: mime = 'text/plain' else: mime = 'application/ocet-stream' self.mime = mime var details-
Expand source code Browse git
class Format: __slots__ = 'category', 'extension', 'mime', 'mnemonic', 'details' def __hash__(self): return hash(tuple(self)) def __eq__(self, other): if not isinstance(other, Format): return False return all(a == b for a, b in zip(self, other)) def __iter__(self): yield self.category yield self.extension yield self.mnemonic yield self.details yield self.mime def __init__( self, category: FormatCategory, extension: str | None = None, mnemonic: str | None = None, details: str | None = None, mime: str | None = None, ) -> None: self.category = category self.extension = extension or 'bin' self.mnemonic = mnemonic or self.extension.upper() self.details = details or self.mnemonic if mime is None: try: mime = MimeByExtension[self.extension] except KeyError: if category == FormatCategory.Text: mime = 'text/plain' else: mime = 'application/ocet-stream' self.mime = mime var extension-
Expand source code Browse git
class Format: __slots__ = 'category', 'extension', 'mime', 'mnemonic', 'details' def __hash__(self): return hash(tuple(self)) def __eq__(self, other): if not isinstance(other, Format): return False return all(a == b for a, b in zip(self, other)) def __iter__(self): yield self.category yield self.extension yield self.mnemonic yield self.details yield self.mime def __init__( self, category: FormatCategory, extension: str | None = None, mnemonic: str | None = None, details: str | None = None, mime: str | None = None, ) -> None: self.category = category self.extension = extension or 'bin' self.mnemonic = mnemonic or self.extension.upper() self.details = details or self.mnemonic if mime is None: try: mime = MimeByExtension[self.extension] except KeyError: if category == FormatCategory.Text: mime = 'text/plain' else: mime = 'application/ocet-stream' self.mime = mime var mime-
Expand source code Browse git
class Format: __slots__ = 'category', 'extension', 'mime', 'mnemonic', 'details' def __hash__(self): return hash(tuple(self)) def __eq__(self, other): if not isinstance(other, Format): return False return all(a == b for a, b in zip(self, other)) def __iter__(self): yield self.category yield self.extension yield self.mnemonic yield self.details yield self.mime def __init__( self, category: FormatCategory, extension: str | None = None, mnemonic: str | None = None, details: str | None = None, mime: str | None = None, ) -> None: self.category = category self.extension = extension or 'bin' self.mnemonic = mnemonic or self.extension.upper() self.details = details or self.mnemonic if mime is None: try: mime = MimeByExtension[self.extension] except KeyError: if category == FormatCategory.Text: mime = 'text/plain' else: mime = 'application/ocet-stream' self.mime = mime var mnemonic-
Expand source code Browse git
class Format: __slots__ = 'category', 'extension', 'mime', 'mnemonic', 'details' def __hash__(self): return hash(tuple(self)) def __eq__(self, other): if not isinstance(other, Format): return False return all(a == b for a, b in zip(self, other)) def __iter__(self): yield self.category yield self.extension yield self.mnemonic yield self.details yield self.mime def __init__( self, category: FormatCategory, extension: str | None = None, mnemonic: str | None = None, details: str | None = None, mime: str | None = None, ) -> None: self.category = category self.extension = extension or 'bin' self.mnemonic = mnemonic or self.extension.upper() self.details = details or self.mnemonic if mime is None: try: mime = MimeByExtension[self.extension] except KeyError: if category == FormatCategory.Text: mime = 'text/plain' else: mime = 'application/ocet-stream' self.mime = mime
class FormatCategory (*args, **kwds)-
Enum where members are also (and must be) ints
Expand source code Browse git
class FormatCategory(enum.IntEnum):
    """
    Coarse classification of a file format. Members are integers, numbered consecutively from 1
    in declaration order.
    """
    Executable = 1
    Text = 2
    Document = 3
    Image = 4
    Binary = 5
    Media = 6
    Archive = 7
    Compression = 8
    Serialized = 9
# Page heading fused onto the end of this listing in the original text: "Ancestors"
- enum.IntEnum
- builtins.int
- enum.ReprEnum
- enum.Enum
Class variables
var Executable, var Text, var Document, var Image, var Binary, var Media, var Archive, var Compression, var Serialized
class FC (*args, **kwds)-
Enum where members are also (and must be) ints
Expand source code Browse git
class FormatCategory(enum.IntEnum):
    """
    Coarse classification of a file format. Members are integers, numbered consecutively from 1
    in declaration order.
    """
    Executable = 1
    Text = 2
    Document = 3
    Image = 4
    Binary = 5
    Media = 6
    Archive = 7
    Compression = 8
    Serialized = 9
# Page heading fused onto the end of this listing in the original text: "Ancestors"
- enum.IntEnum
- builtins.int
- enum.ReprEnum
- enum.Enum
Class variables
var Executable, var Text, var Document, var Image, var Binary, var Media, var Archive, var Compression, var Serialized
class Fmt (category, extension=None, mnemonic=None, details=None, mime=None)-
An enumeration of all known file formats that can be returned by
get_structured_data_type().
Expand source code Browse git
class Fmt(Format, enum.Enum):
    """
    An enumeration of all known file formats that can be returned by
    `refinery.lib.id.get_structured_data_type`.

    Each member is declared as a tuple `(category, extension, mnemonic, details, mime)` where
    everything after the category is optional; omitted entries take the defaults computed by the
    `Format` initializer.
    """
    PE32GUI = (FC.Executable, 'exe', 'PE/32/GUI')
    PE32CUI = (FC.Executable, 'exe', 'PE/32/CUI')
    PE32DLL = (FC.Executable, 'dll', 'PE/32/DLL')
    PE32SYS = (FC.Executable, 'sys', 'PE/32/SYS')
    PE64GUI = (FC.Executable, 'exe', 'PE/64/GUI')
    PE64CUI = (FC.Executable, 'exe', 'PE/64/CUI')
    PE64DLL = (FC.Executable, 'dll', 'PE/64/DLL')
    PE64SYS = (FC.Executable, 'sys', 'PE/64/SYS')
    ELF32LE = (FC.Executable, 'elf', 'ELF/32/LE')
    ELF64LE = (FC.Executable, 'elf', 'ELF/64/LE')
    ELF32BE = (FC.Executable, 'elf', 'ELF/32/BE')
    ELF64BE = (FC.Executable, 'elf', 'ELF/64/BE')
    MACHOuvLE = (FC.Executable, 'macho', 'MachO/Fat/LE')
    MACHOuvBE = (FC.Executable, 'macho', 'MachO/Fat/BE')
    MACHO32LE = (FC.Executable, 'macho', 'MachO/32/LE')
    MACHO64LE = (FC.Executable, 'macho', 'Macho/64/LE')
    MACHO32BE = (FC.Executable, 'macho', 'MachO/32/BE')
    MACHO64BE = (FC.Executable, 'macho', 'Macho/64/BE')
    JAVA = (FC.Executable, 'class', 'JavaClass')
    DEX = (FC.Executable, 'dex', 'Dalvik')
    WASM = (FC.Executable, 'wasm', 'WASM', 'Web Assembly')
    LUAC = (FC.Executable, 'luac', 'LUAC', 'LUA Bytecode')
    PYC = (FC.Executable, 'pyc', 'PYC', 'Python Bytecode')

    PDF = (FC.Document, 'pdf', 'PDF', 'PDF Document')
    CHM = (FC.Document, 'chm', 'CHM', 'Microsoft Windows HtmlHelp Data')
    DJV = (FC.Document, 'djvu')

    PCAP = (FC.Binary, 'pcap', 'PCAP', 'Network Packet Capture')
    PCAPNG = (FC.Binary, 'pcapng', 'PCAP/NG', 'Next-Generation Network Packet Capture')
    SSP = (FC.Binary, 'ssp', 'SmartSniff', 'SmartSniff Packets File')
    SQLITE = (FC.Binary, 'db', 'SQLite', 'SQLite Database')
    DSS = (FC.Binary, 'DS_Store', 'DSS', 'MacOS DS Store')
    A3X = (FC.Binary, 'a3x', 'A3X', 'Compiled AutoIt3')
    IFPS = (FC.Binary, 'ifps', 'IFPS', 'InnerFuse PascalScript')
    PPK = (FC.Binary, 'ppk', 'PuTTY', 'PuTTY Private Key File')
    WIM = (FC.Binary, 'wim', 'WIM', 'Windows Imaging Format')
    EVT = (FC.Binary, 'evt', 'EVT', 'Windows Event Viewer')
    EVTX = (FC.Binary, 'evtx', 'EVTX', 'Windows Event Viewer XML')
    LNK = (FC.Binary, 'lnk', 'LNK', 'Windows Shortcut')
    # The explicit 'text/plain' MIME type belongs to the textual registry script; it used to be
    # attached to the binary hive instead.
    # NOTE(review): REG_TEXT is categorised as Binary although it is a text script - confirm
    # whether FC.Text is intended.
    REG_HIVE = (FC.Binary, 'reg', 'WinReg/Hive', 'Windows Registry Hive File')
    REG_TEXT = (FC.Binary, 'reg', 'WinReg/Text', 'Windows Registry Script', 'text/plain')

    MDB = (FC.Document, 'accdb', 'MDB', 'Microsoft Access Database')
    DOC = (FC.Document, 'doc')
    ONE = (FC.Document, 'one')
    XLS = (FC.Document, 'xls')
    PPT = (FC.Document, 'ppt')
    MSG = (FC.Document, 'msg')
    MSI = (FC.Archive, 'msi')
    CFF = (FC.Binary, 'ole', 'Compound File Format')
    DOCX = (FC.Document, 'docx')
    XLSX = (FC.Document, 'xlsx')
    PPTX = (FC.Document, 'pptx')

    ASCII = (FC.Text, 'txt', 'PlainText', 'Single-Byte, Plain Text Encoding')
    UTF16 = (FC.Text, 'txt', 'UTF16')
    UTF32 = (FC.Text, 'txt', 'UTF32')
    JSON = (FC.Text, 'json')
    XML = (FC.Text, 'xml')
    HTM = (FC.Text, 'html')
    RTF = (FC.Text, 'rtf', 'RTF')
    VBE = (FC.Text, 'vbe', 'VBE', 'Encoded VBScript')
    EML = (FC.Text, 'eml', 'EML', 'Plain-Text EMail Document')

    HIC = (FC.Image, 'heic', 'HEIC', 'High Efficiency Image Container')
    ICO = (FC.Image, r'ico', r'ICO', 'Icon')
    GIF = (FC.Image, r'gif', r'GIF', 'Graphics Interchange Format')
    TIF = (FC.Image, r'tif', r'TIF', 'Tagged Image File Format')
    CIN = (FC.Image, r'cin', r'CIN', 'Kodak Cineon Image')
    # Details text used to read 'Nuru ASCI/ANSI ...'.
    NUI = (FC.Image, r'nui', r'NUI', 'Nuru ASCII/ANSI Image or Palette')
    DPX = (FC.Image, r'dpx', r'DPX', 'SMPTE DPX Image')
    BPG = (FC.Image, r'bpg', r'BPG', 'Better Portable Graphics')
    EXR = (FC.Image, r'exr', r'EXR', 'OpenEXR Image')
    JPG = (FC.Image, r'jpg', r'JPG', 'Joint Photographic Experts Group Image')
    JP2 = (FC.Image, r'jp2', r'JP2', 'JPEG 2000')
    QOI = (FC.Image, r'qoi', r'QOI', 'Quite OK Image Format')
    IFF = (FC.Image, r'iff', r'IFF', 'IFF or Amiga Image')
    PNG = (FC.Image, r'png', r'PNG', 'Portable Network Graphics')
    PSD = (FC.Image, r'psd', r'PSD', 'Adobe Photoshop Document')
    BMP = (FC.Image, r'bmp', r'BMP', 'Bitmap')
    FIF = (FC.Image, 'flif', 'FLIF', 'Free Lossless Image Format')
    LEP = (FC.Image, r'lep', r'LEP', 'Lepton Compressed JPEG Image')
    HDR = (FC.Image, r'hdr', r'HDR', 'Radiance High Dynamic Range Image')

    OGG = (FC.Media, 'ogg')
    WAV = (FC.Media, 'wav')
    AVI = (FC.Media, 'avi')
    MP3 = (FC.Media, 'mp3')
    M3U = (FC.Media, 'm3u', 'M3U', 'Multimedia Playlist')
    MP4 = (FC.Media, 'mp4')
    MPG = (FC.Media, 'mpg')
    FLC = (FC.Media, 'flac')
    MID = (FC.Media, 'mid')
    MKV = (FC.Media, 'mkv')
    SWF = (FC.Media, 'swf')
    SIL = (FC.Media, 'sil')

    ACE = (FC.Archive, 'ace')
    ASAR = (FC.Archive, 'asar')
    VHD = (FC.Archive, 'vhd')
    VMDK = (FC.Archive, 'vmdk')
    ISO = (FC.Archive, 'iso')
    ISZ = (FC.Archive, 'isz', 'ISZ', 'Compressed ISO Image')
    DMG = (FC.Archive, 'dmg')
    XAR = (FC.Archive, 'xar', 'XAR', 'eXtensible ARchive Format')
    TAR = (FC.Archive, 'tar')
    OAR = (FC.Archive, 'oar')
    ZIP7 = (FC.Archive, '7z', '7Zip')
    ZIP = (FC.Archive, 'zip')
    RAR = (FC.Archive, 'rar')
    CAB = (FC.Archive, 'cab')
    CPIO = (FC.Archive, 'cpio')
    ZPQ = (FC.Archive, 'zpq')

    S_JAV = (FC.Serialized, 'bin', 'SerializedJava')
    S_DOT = (FC.Serialized, 'bin', 'SerializedDotNet')
    S_PHP = (FC.Serialized, 'bin', 'SerializedPHP')

    APLIB = (FC.Compression, 'ap', 'apLib')
    BZ2 = (FC.Compression, 'bz2', 'BZIP')
    JCALG = (FC.Compression, 'bin', 'jcAlg')
    LZMA = (FC.Compression, 'lzma')
    LZF = (FC.Compression, 'lzf')
    LZH = (FC.Compression, 'lzh')
    LZG = (FC.Compression, 'lzg')
    RNC = (FC.Compression, 'rnc', 'RNC', 'Rob Northern Compression')
    LZIP = (FC.Compression, 'lzip')
    LZO = (FC.Compression, 'lzo')
    LZ4 = (FC.Compression, 'lz4')
    LZW = (FC.Compression, 'lzw')
    LZFSE = (FC.Compression, 'lzfse')
    MSCF = (FC.Compression, 'mscf')
    SZDD = (FC.Compression, 'szdd')
    GZIP = (FC.Compression, 'gz')
    XZ = (FC.Compression, 'xz', 'XZ/LZMA2')
    ZLIB0 = (FC.Compression, 'zlib', 'ZLIB/0')
    ZLIB1 = (FC.Compression, 'zlib', 'ZLIB/1')
    ZLIB2 = (FC.Compression, 'zlib', 'ZLIB/2')
    ZLIB3 = (FC.Compression, 'zlib', 'ZLIB/3')
    ZLIB4 = (FC.Compression, 'zlib', 'ZLIB/4')
    ZLIB5 = (FC.Compression, 'zlib', 'ZLIB/5')
    ZLIB6 = (FC.Compression, 'zlib', 'ZLIB/6')
    ZLIB7 = (FC.Compression, 'zlib', 'ZLIB/7')
    ZSTD = (FC.Compression, 'zstd')
# Page heading fused onto the end of this listing in the original text: "Ancestors"
- Format
- enum.Enum
Class variables
var PE32GUI, var PE32CUI, var PE32DLL, var PE32SYS, var PE64GUI, var PE64CUI, var PE64DLL, var PE64SYS, var ELF32LE, var ELF64LE, var ELF32BE, var ELF64BE, var MACHOuvLE, var MACHOuvBE, var MACHO32LE, var MACHO64LE, var MACHO32BE, var MACHO64BE, var JAVA, var DEX, var WASM, var LUAC, var PYC, var PDF, var CHM, var DJV, var PCAP, var PCAPNG, var SSP, var SQLITE, var DSS, var A3X, var IFPS, var PPK, var WIM, var EVT, var EVTX, var LNK, var REG_HIVE, var REG_TEXT, var MDB, var DOC, var ONE, var XLS, var PPT, var MSG, var MSI, var CFF, var DOCX, var XLSX, var PPTX, var ASCII, var UTF16, var UTF32, var JSON, var XML, var HTM, var RTF, var VBE, var EML, var HIC, var ICO, var GIF, var TIF, var CIN, var NUI, var DPX, var BPG, var EXR, var JPG, var JP2, var QOI, var IFF, var PNG, var PSD, var BMP, var FIF, var LEP, var HDR, var OGG, var WAV, var AVI, var MP3, var M3U, var MP4, var MPG, var FLC, var MID, var MKV, var SWF, var SIL, var ACE, var ASAR, var VHD, var VMDK, var ISO, var ISZ, var DMG, var XAR, var TAR, var OAR, var ZIP7, var ZIP, var RAR, var CAB, var CPIO, var ZPQ, var S_JAV, var S_DOT, var S_PHP, var APLIB, var BZ2, var JCALG, var LZMA, var LZF, var LZH, var LZG, var RNC, var LZIP, var LZO, var LZ4, var LZW, var LZFSE, var MSCF, var SZDD, var GZIP, var XZ, var ZLIB0, var ZLIB1, var ZLIB2, var ZLIB3, var ZLIB4, var ZLIB5, var ZLIB6, var ZLIB7, var ZSTD
class TextEncoding (bom=0, lsb=0, step=1)-
TextEncoding(bom, lsb, step)
Expand source code Browse git
class TextEncoding(NamedTuple):
    """
    Named tuple with the three integer fields `bom`, `lsb`, and `step`; all of them have defaults.
    """
    # NOTE(review): what the fields mean is not visible in this listing. The names suggest a
    # byte order mark, a least significant byte position, and a stride - confirm against the
    # code that builds and reads these tuples.
    bom: int = 0
    lsb: int = 0
    step: int = 1
# Page heading fused onto the end of this listing in the original text: "Ancestors"
- builtins.tuple
Instance variables
var bom-
Alias for field number 0
Expand source code Browse git
class TextEncoding(NamedTuple):
    """
    Named tuple with the three integer fields `bom`, `lsb`, and `step`; all of them have defaults.
    """
    # NOTE(review): what the fields mean is not visible in this listing. The names suggest a
    # byte order mark, a least significant byte position, and a stride - confirm against the
    # code that builds and reads these tuples.
    bom: int = 0
    lsb: int = 0
    step: int = 1
# Page heading fused onto the end of this listing in the original text: "var lsb-"
Alias for field number 1
Expand source code Browse git
class TextEncoding(NamedTuple):
    """
    Named tuple with the three integer fields `bom`, `lsb`, and `step`; all of them have defaults.
    """
    # NOTE(review): what the fields mean is not visible in this listing. The names suggest a
    # byte order mark, a least significant byte position, and a stride - confirm against the
    # code that builds and reads these tuples.
    bom: int = 0
    lsb: int = 0
    step: int = 1
# Page heading fused onto the end of this listing in the original text: "var step-"
Alias for field number 2
Expand source code Browse git
class TextEncoding(NamedTuple):
    """
    Named tuple with the three integer fields `bom`, `lsb`, and `step`; all of them have defaults.
    """
    # NOTE(review): what the fields mean is not visible in this listing. The names suggest a
    # byte order mark, a least significant byte position, and a stride - confirm against the
    # code that builds and reads these tuples.
    bom: int = 0
    lsb: int = 0
    step: int = 1