Module refinery.lib.patterns
Library of regular expression patterns.
Expand source code Browse git
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Library of regular expression patterns.
"""
import enum
import functools
import re
from typing import Optional
from refinery.lib.patterns.tlds import tlds
from refinery.lib.tools import cached_property, normalize_to_identifier, normalize_to_display
class pattern:
    """
    A wrapper around a regular expression that is stored both as a string and as
    an ASCII-encoded byte string, so that the same expression can be applied to
    text and to binary data.

    Attributes of `re.Pattern` that this class does not define itself are
    forwarded to the compiled expressions: non-callable attributes are read from
    the compiled byte pattern, while methods are dispatched to either the string
    or the byte pattern depending on the arguments of the call.
    """
    str_pattern: str
    bin_pattern: Optional[bytes]
    bin_compiled: re.Pattern
    str_compiled: re.Pattern

    def __init__(self, pattern: str, flags: int = 0):
        """
        Store the expression `pattern` and the `flags` used to compile it.
        Raises `UnicodeEncodeError` if `pattern` contains non-ASCII characters.
        """
        self.str_pattern = pattern
        self.bin_pattern = pattern.encode('ascii')
        self.regex_flags = flags

    def __bytes__(self):
        return self.bin_pattern

    @cached_property
    def bin_compiled(self):
        # The byte variant is wrapped in one additional capturing group, so its
        # group numbering is shifted by one compared to `str_compiled`.
        return re.compile(B'(%s)' % self.bin_pattern, flags=self.regex_flags)

    @cached_property
    def str_compiled(self):
        return re.compile(self.str_pattern, flags=self.regex_flags)

    def __str__(self):
        return self.str_pattern

    def __getattr__(self, verb):
        """
        Forward any attribute that exists on `re.Pattern` to the compiled
        expressions; every other name raises `AttributeError`.
        """
        if not hasattr(re.Pattern, verb):
            raise AttributeError(verb)
        bin_attr = getattr(self.bin_compiled, verb)
        if not callable(bin_attr):
            return bin_attr
        str_attr = getattr(self.str_compiled, verb)

        def wrapper(*args, **kwargs):
            # Use the string pattern as soon as any argument is a string, no
            # matter whether it was passed positionally or by keyword; all
            # other calls are served by the byte pattern.
            if any(isinstance(value, str) for value in (*args, *kwargs.values())):
                return str_attr(*args, **kwargs)
            return bin_attr(*args, **kwargs)

        functools.update_wrapper(wrapper, bin_attr)
        return wrapper
class alphabet(pattern):
    """
    A pattern for runs of the expression `repeat`, optionally enclosed between
    the expressions `prefix` and `postfix`. The run has at least `at_least`
    repetitions; if `at_most` is given, the run has at most that many and must
    not be followed by another match of `repeat`.
    """
    def __init__(self, repeat, prefix='', postfix='', at_least=1, at_most=None, **kwargs):
        if at_most:
            # Bounded run: the negative lookahead rejects matches that are merely
            # the beginning of a longer run.
            quantifier = F'{{{at_least},{at_most}}}(?!{repeat})'
        elif at_least <= 1:
            quantifier = '+'
        else:
            quantifier = F'{{{at_least},}}'
        expression = F'{prefix}(?:{repeat}){quantifier}{postfix}'
        pattern.__init__(self, expression, **kwargs)
class tokenize(pattern):
    """
    A pattern for at least two tokens matching `token` which are separated by
    matches of `sep`. Each token has to be preceded and followed by a match of
    `bound`, which defaults to the zero-length word boundary assertion. With
    `unique_sep`, the separator matched after the first token has to be repeated
    literally between all following tokens; in that mode, up to 50 whitespace
    characters are permitted on either side of each separator unless
    `sep_ignores_whitespace` is disabled.
    """
    def __init__(self, token, sep, bound='\\b', unique_sep=False, sep_ignores_whitespace=True, **kwargs):
        if not unique_sep:
            template = R'(?:{b}{t}{b}{s})+(?:{b}{t}{b})'
        elif sep_ignores_whitespace:
            template = R'(?:{b}{t}{b}\s{{0,50}}(?P<__sep>{s})\s{{0,50}})(?:(?:{b}{t}{b}\s{{0,50}}(?P=__sep)\s{{0,50}})+{b}{t}{b}|{b}{t}{b})'
        else:
            template = R'(?:{b}{t}{b}(?P<__sep>{s}))(?:(?:{b}{t}{b}(?P=__sep))+{b}{t}{b}|{b}{t}{b})'
        expression = template.format(s=sep, b=bound, t=token)
        pattern.__init__(self, expression, **kwargs)
class PatternEnum(enum.Enum):
    """
    Base class for enumerations whose member values are pattern objects. String
    and byte conversion as well as every attribute that exists on `re.Pattern`
    are delegated to the value of the member.
    """

    @classmethod
    def get(cls, name, default):
        """
        Return the member with the given `name`, or `default` if no member has
        that name.
        """
        try:
            return cls[name]
        except KeyError:
            return default

    def __str__(self):
        return str(self.value)

    def __bytes__(self):
        return bytes(self.value)

    def __repr__(self):
        return F'<pattern {self.name}: {self.value}>'

    def __getattr__(self, name):
        # Only invoked when regular attribute lookup fails; names that belong
        # to the `re.Pattern` interface are looked up on the member value.
        if name in dir(re.Pattern):
            return getattr(self.value, name)
        # Report which attribute was missing, as pattern.__getattr__ does.
        raise AttributeError(name)

    @property
    def display(self):
        """
        The member name after conversion by `normalize_to_display`.
        """
        return normalize_to_display(self.name)
# One of the known top level domains, matched without regard to case. The match
# is rejected when the text that follows it is a lowercase letter, three
# letters, a dot with two word characters, or an opening parenthesis followed
# by a quote, a word character, or a closing parenthesis.
_TLDS = R'(?i:{possible_tld})(?!(?:{dealbreakers}))'.format(
    dealbreakers='|'.join((
        R'[a-z]',
        R'[A-Za-z]{3}',
        R'\.\w\w',
        R'\([\'"\w)]',
    )),
    possible_tld='|'.join(tlds),
)
# Domain name templates; label lengths follow the limits described in
# https://tools.ietf.org/html/rfc2181#section-11
# The placeholder `repeat` is the quantifier for the number of leading labels and
# `tlds` is the expression for the top level domain. The "serrated" template only
# accepts a plain dot between labels, the "defanged" one also accepts `[.]`.
_format_serrated_domain = (
    R'(?:\w[a-zA-Z0-9\-\_]{{0,62}}?\.){repeat}'
    R'\w[a-zA-Z0-9\-\_]{{0,62}}\.{tlds}'
)
_format_defanged_domain = (
    R'(?:\w[a-zA-Z0-9\-\_]{{0,62}}?(?:\[\.\]|\.)){repeat}'
    R'\w[a-zA-Z0-9\-\_]{{0,62}}(?:\[\.\]|\.){tlds}'
)
# Byte sequences of one to four bytes: a lead byte followed by the matching
# number of continuation bytes in the range 0x80-0xBF.
_pattern_utf8 = R'(?:[\x00-\x7F]|[\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF]{2}|[\xF0-\xF7][\x80-\xBF]{3})+'
# Either a single tilde, or pairs of alphabet characters with an optional
# trailing single character.
_pattern_b92 = R'~|(?:[!-_a-}]{2})+[!-_a-}]?'
# Up to 20 leading labels are allowed for a domain; a subdomain requires at
# least one label in front of the label that precedes the top level domain.
_pattern_serrated_domain = _format_serrated_domain.format(repeat='{0,20}', tlds=_TLDS)
_pattern_defanged_domain = _format_defanged_domain.format(repeat='{0,20}', tlds=_TLDS)
_pattern_subdomain = _format_serrated_domain.format(repeat='{1,20}', tlds=_TLDS)
# A decimal number between 0 and 255.
_pattern_octet = R'(?:1\d\d|2[0-4]\d|25[0-5]|[1-9]?\d)'
# The serrated variant must not be preceded by a dot or digit and must not be
# followed by a digit or dot; the defanged variant has no such guards.
_pattern_serrated_ipv4 = R'(?<!\.|\d)(?:{o}\.){{3}}{o}(?![\d\.])'.format(o=_pattern_octet)
_pattern_defanged_ipv4 = R'(?:{o}{d}){{3}}{o}'.format(o=_pattern_octet, d=R'(?:\[\.\]|\.)')
# Taken from: https://stackoverflow.com/a/17871737/9130824
# The trailing comments list example addresses for each alternative.
_pattern_ipv6 = (
    R'('
    R'([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|'  # 1:2:3:4:5:6:7:8
    R'([0-9a-fA-F]{1,4}:){1,7}:|'  # 1:: 1:2:3:4:5:6:7::
    R'([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|'  # 1::8 1:2:3:4:5:6::8 1:2:3:4:5:6::8
    R'([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|'  # 1::7:8 1:2:3:4:5::7:8 1:2:3:4:5::8
    R'([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|'  # 1::6:7:8 1:2:3:4::6:7:8 1:2:3:4::8
    R'([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|'  # 1::5:6:7:8 1:2:3::5:6:7:8 1:2:3::8
    R'([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|'  # 1::4:5:6:7:8 1:2::4:5:6:7:8 1:2::8
    R'[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|'  # 1::3:4:5:6:7:8 1::3:4:5:6:7:8 1::8
    R':((:[0-9a-fA-F]{1,4}){1,7}|:)|'  # ::2:3:4:5:6:7:8 ::2:3:4:5:6:7:8 ::8 ::
    R'fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|'  # fe80::7:8%eth0 fe80::7:8%1 (link-local IPv6 addresses with zone index)
    R'::(ffff(:0{1,4}){0,1}:){0,1}'  #
    R'((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}'  #
    R'(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|'  # ::255.255.255.255 ::ffff:255.255.255.255 ::ffff:0:255.255.255.255
    R'([0-9a-fA-F]{1,4}:){1,4}:'  # (IPv4-mapped IPv6 addresses and IPv4-translated addresses)
    R'((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}'  #
    R'(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])'  # 2001:db8:3:4::192.0.2.33 64:ff9b::192.0.2.33
    R')'  # (IPv4-Embedded IPv6 Address)
)
# A socket is an IPv4 address or domain followed by a colon and 2 to 5 digits;
# a hostname is the same expression with the colon-and-digits part made optional.
_pattern_serrated_socket = '(?:{ip}|{d})(?::\\d{{2,5}})'.format(ip=_pattern_serrated_ipv4, d=_pattern_serrated_domain)
_pattern_defanged_socket = '(?:{ip}|{d})(?::\\d{{2,5}})'.format(ip=_pattern_defanged_ipv4, d=_pattern_defanged_domain)
_pattern_serrated_hostname = _pattern_serrated_socket + '?'
_pattern_defanged_hostname = _pattern_defanged_socket + '?'
# Signed integer literal in binary (0b), hexadecimal (0x), octal (leading zero)
# or decimal notation. The lookahead requires that the literal is followed by an
# integer type suffix, a non-alphanumeric character, or the end of the input.
_pattern_integer = '[-+]?(?:0[bB][01]+|0[xX][0-9a-fA-F]+|0[1-7][0-7]*|[1-9][0-9]*|0)(?=[uU]?[iI]\\d{1,2}|[LlHh]|[^a-zA-Z0-9]|$)'
# Signed decimal number with optional fractional part and optional exponent.
_pattern_float = R'[-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?'
# Either an integer or a float in a single expression. The named groups fp1 and
# fp2 record whether a fractional part was matched: if so, an exponent may
# follow; otherwise the same lookahead as for integers applies. Because of the
# named groups, this expression cannot be repeated within one larger pattern.
_pattern_number = (
    '[-+]?(?:0[bB][01]+|0[xX][0-9a-fA-F]+|0[1-7][0-7]*|(?:[1-9][0-9]*|0)(?P<fp1>\\.[0-9]*)?|(?P<fp2>\\.[0-9]+))'
    '(?(fp1)(?:[eE][-+]?[0-9]+)?|(?(fp2)(?:[eE][-+]?[0-9]+)?|(?=[uU]?[iI]\\d{1,2}|[LlHh]|[^a-zA-Z0-9]|$)))'
)
# Quoted strings where a quote is escaped by doubling it.
_pattern_cmdstr = R'''(?:"(?:""|[^"])*"|'(?:''|[^'])*')'''
# Here-strings delimited by @" "@ or @' '@ as well as single-line quoted strings;
# double-quoted strings additionally allow backtick escapes.
_pattern_ps1str = R'''(?:(?:@"\s*?[\r\n].*?[\r\n]"@)|(?:@'\s*?[\r\n].*?[\r\n]'@)|(?:"(?:`.|""|[^"\n])*")|(?:'(?:''|[^'\n])*'))'''
_pattern_vbastr = R'''"(?:""|[^"])*"'''
# Literals prefixed with &B, &H or &O, or a signed decimal integer.
_pattern_vbaint = R'(?:&[bB][01]+|&[hH][0-9a-fA-F]+|&[oO][0-7]*|[-+]?(?:[1-9][0-9]*|0))(?=\b|$)'
# Quoted strings with backslash escapes; the first variant must not contain
# line breaks, the second one may.
_pattern_string = R'''(?:"(?:[^"\\\r\n]|\\[^\r\n])*"|'(?:[^'\\\r\n]|\\[^\r\n])*')'''
_pattern_string_multiline = R'''(?:"(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*')'''
# Percent-escapes mixed with literal characters; the three variants differ only
# in which literal characters they accept (many, some, none).
_pattern_urlenc_coarse = R'''(?:%[0-9a-fA-F]{2}|[0-9a-zA-Z\-\._~\?!$&=:\/#\[\]@'\(\)\*\+,;])+'''
_pattern_urlenc = R'''(?:%[0-9a-fA-F]{2}|[0-9a-zA-Z\-\._~\?!$&=])+'''
_pattern_urlenc_narrow = R'''(?:%[0-9a-fA-F]{2})+'''
# A bracketed sequence of JSON-like tokens: strings, numbers, brackets, commas,
# colons, and the literals null, true and false. The token `none` is still
# accepted because earlier versions of this expression matched it.
_pattern_json = R'''[\[\{](?:"(?:[^"\\\r\n]|\\[^\r\n])*"(?:\s*[:,])?|(?:null|none|true|false|%s|%s|\]|\})(?:\s*,)?|[,\]\}\[\{\s]+)*[\]\}]''' % (
    _pattern_integer, _pattern_float)
# Data enclosed between the markers #@~^ and ^#~@, each adjoined by six printable
# characters and two equals signs.
_pattern_wshenc = R'''#@~\^[ -~]{6}==(?:.*?)[ -~]{6}==\^#~@'''
# Optional "username:password@" part of a URL; both components are captured in
# named groups and are limited to 256 characters.
_part_url_credentials = (
    R'(?:(?P<url_username>[^"\'\s\x00-\x20\x7E-\xFF]{1,256})?'
    R'(?::(?P<url_password>[^"\'\s\x00-\x20\x7E-\xFF]{0,256})?)?@)?'
)
# Optional protocol followed by two slashes and the optional credentials. The
# defanged prefix also accepts `[:]` in place of the colon after the protocol.
_prefix_serrated_url = R'(?P<url_scheme>(?P<url_protocol>[a-zA-Z]{2,20}:)?\/\/)' + _part_url_credentials
_prefix_defanged_url = R'(?P<url_scheme>(?P<url_protocol>[a-zA-Z]{2,20}(?:\[:\]|:))?\/\/)' + _part_url_credentials
# Optional path, query or fragment; the lookahead stops the path in front of
# anything that looks like the protocol prefix of another URL.
_suffix_combined_url = R'(?P<url_path>[/?#](?:[#/=:;$!?&.,\w\+\%\-\*\'~@()](?![a-zA-Z]{2,20}://))*)?'
_pattern_serrated_url = F'{_prefix_serrated_url}(?P<url_host>{_pattern_serrated_hostname}){_suffix_combined_url}'
_pattern_defanged_url = F'{_prefix_defanged_url}(?P<url_host>{_pattern_defanged_hostname}){_suffix_combined_url}'
# Local part of up to 256 characters, an at sign, and a domain.
_pattern_email = R'(?:[a-zA-Z0-9_\.\+\-]{{1,256}}?)@(?:{})'.format(_pattern_serrated_domain)
# 8-4-4-4-12 hexadecimal digits, delimited by braces or word boundaries.
_pattern_guid = R'(?:\b|\{)[0-9A-Fa-f]{8}(?:\-[0-9A-Fa-f]{4}){3}\-[0-9A-Fa-f]{12}(?:\}|\b)'
# The trailing comment preserves an alternative, exclusion-based character class.
_pattern_pathpart_nospace = R'[-\w+,.;@\]\[{}^`~]+'  # R'[^/\\:"<>|\s\x7E-\xFF\x00-\x1F\xAD]+'
# A path element may contain single spaces between its space-free parts: up to
# four for Windows paths, at most one for Posix paths.
_pattern_win_path_element = R'(?:{n} ){{0,4}}{n}'.format(n=_pattern_pathpart_nospace)
_pattern_nix_path_element = R'(?:{n} ){{0,1}}{n}'.format(n=_pattern_pathpart_nospace)
_pattern_win_env_variable = R'%[a-zA-Z][a-zA-Z0-9_\-\(\)]*%'
# An optional root (one of the alternatives in `s`, a path element, or nothing),
# followed by path elements that are all separated by the same kind of slash;
# the separator matched first is captured as __pathsep and reused.
_pattern_win_path = R'(?:{s}|{p}|)(?P<__pathsep>[\\\/])(?:{p}(?P=__pathsep))*{p}(?:(?P=__pathsep)|\b)'.format(
    s='|'.join([
        _pattern_win_env_variable,  # environment variable
        R'[A-Za-z]:',  # drive letter with colon
        R'\\\\[a-zA-Z0-9_.$@]{1,50}',  # UNC path
        R'HK[A-Z_]{1,30}',  # registry root key
    ]),
    p=_pattern_win_path_element
)
# Either an absolute path with at least one directory, or a relative path with
# at least two directories, followed by a final path element.
_pattern_nix_path = R'(?:/(?:{n}/)+|(?:{n}/){{2,}}){n}'.format(n=_pattern_nix_path_element)
_pattern_any_path = R'(?:{nix})|(?:{win})'.format(
    nix=_pattern_nix_path,
    win=_pattern_win_path
)
# A "begin" line with a three-digit mode and a name, any number of lines that
# start with M and contain 60 further characters, an optional shorter line, and
# the terminating backtick and "end" lines.
_pattern_uuencode = R'begin\s+\d{3}\s+[\x20!-~]+?\r?\n(?:M[\x20-\x60]{60}\r?\n)*(?:.*?\r?\n)?`\r?\nend'
def make_hexline_pattern(blocksize: int) -> str:
    """
    Return the expression for one line of a hex dump in which every block of
    hexadecimal digits encodes `blocksize` bytes. The expression consists of an
    optional prefix followed by whitespace, a captured sequence of hex blocks
    (each with an optional 0x prefix and an optional h suffix) that are separated
    by one to three separator characters, and optional captured text that follows
    the blocks after spaces or tabs.
    """
    hexblocks = tokenize(
        RF'(?:0[xX])?[0-9a-fA-F]{{{2 * blocksize}}}h?',
        sep=R'[- \t\/:;,\\]{1,3}'
    )
    template = R'(?:{s}+\s+)?\s*({h})(?:[ \t]+(.+?))?'
    return template.format(
        s=R'[-\w:;,#\.\$\?!\/\\=\(\)\[\]\{\}]',
        h=hexblocks.str_pattern,
    )
# Hex dump line in which every block encodes a single byte.
_pattern_hexline = make_hexline_pattern(1)
# BEGIN and END marker lines with one or more uppercase alphanumeric words, and
# in between lines of 40 to 100 base64 characters followed by a final line of
# 1 to 100 such characters with up to three padding characters.
_pattern_pem = (
    R'-----BEGIN(?:\s[A-Z0-9]+)+-----{n}'
    R'(?:{b}{{40,100}}{n})*{b}{{1,100}}={{0,3}}{n}'
    R'-----END(?:\s[A-Z0-9]+)+-----'
).format(n=R'(?:\r\n|\n\r|\n)', b=R'[0-9a-zA-Z\+\/]')
# Names exported by a star import of this module.
__all__ = [
    'pattern',
    'alphabet',
    'tokenize',
    'formats',
    'indicators',
    'wallets',
    'defanged'
]
class checks(PatternEnum):
    """
    An enumeration of patterns for coarse checks; it currently has `json` as its
    only member.
    """
    json = pattern(_pattern_json)
    "Data that consists of JSON-like tokens; cannot detect actual JSON data."
class formats(PatternEnum):
    """
    An enumeration of patterns for certain formats.
    """
    integer = pattern(_pattern_integer)
    "Integer expressions"
    float = pattern(_pattern_float)
    "Floating point number expressions"
    number = pattern(_pattern_number)
    "Either an integer or a float"
    string = pattern(_pattern_string)
    "C syntax string literal"
    multiline_string = pattern(_pattern_string_multiline)
    "C syntax string literal that also allows line breaks"
    cmdstr = pattern(_pattern_cmdstr)
    "Windows command line escaped string literal"
    ps1str = pattern(_pattern_ps1str, flags=re.DOTALL)
    "PowerShell escaped string literal"
    vbastr = pattern(_pattern_vbastr)
    "VBS/VBA string literal"
    vbaint = pattern(_pattern_vbaint)
    "VBS/VBA integer literal"
    printable = alphabet(R'[\s!-~]')
    "Any sequence of printable characters"
    urlquote = pattern(_pattern_urlenc)
    "Any sequence of url-encoded characters, default char set"
    urlquote_coarse = pattern(_pattern_urlenc_coarse)
    "Any sequence of url-encoded characters, coarser variant with more characters allowed"
    urlquote_narrow = pattern(_pattern_urlenc_narrow)
    "A hex-encoded buffer using URL escape sequences"
    intarray = tokenize(_pattern_integer, sep=R'[;,]', bound='', unique_sep=True)
    "Sequences of integers, separated by commas or semicolons"
    # The token is repeated several times inside the tokenize expression, and a
    # group name may only be defined once per regular expression. Hence the
    # token is built from the integer and float expressions, which have no named
    # groups, instead of using _pattern_number with its groups fp1 and fp2.
    numarray = tokenize(
        F'(?:(?:{_pattern_integer})|(?:{_pattern_float}))', sep=R'[;,]', bound='', unique_sep=True)
    "Sequences of numbers, separated by commas or semicolons"
    # In a raw string, the word character class is written with one backslash;
    # two backslashes would match a literal backslash followed by the letter w.
    word = alphabet(R'\w')
    "Sequences of word characters"
    letters = alphabet(R'[a-zA-Z]')
    "Sequences of alphabetic characters"
    wshenc = pattern(_pattern_wshenc)
    "Encoded Windows Scripting Host Scripts (JS/VBS)"
    alphanumeric = alphabet(R'[a-zA-Z0-9]')
    "Sequences of alpha-numeric characters"
    # Either all uppercase or all lowercase; the quantifier has to follow the
    # character class in both alternatives.
    b32 = pattern('[A-Z2-7]+|[a-z2-7]+')
    "Base32 encoded strings"
    b64 = alphabet(R'(?:[0-9a-zA-Z\+/]{4})', postfix=R'(?:(?:[0-9a-zA-Z\+/]{2,3})={0,3})?')
    "Base64 encoded strings"
    b85 = alphabet(R'[-!+*()#-&^-~0-9;-Z]')
    "Base85 encoded strings"
    b92 = pattern(_pattern_b92)
    "Base92 encoded strings"
    b64any = alphabet(R'(?:[-\w\+/]{4})', postfix=R'(?:(?:[-\w\+/]{2,3})={0,3})?')
    "Both URL-safe and normal Base64 alphabets."
    b64url = alphabet(R'[-\w]{4}', postfix=R'(?:[-\w]{2,3}={0,3})?')
    "Base64 encoded strings using URL-safe alphabet"
    hex = alphabet(R'[0-9a-fA-F]{2}')
    "Hexadecimal strings"
    uppercase_hex = alphabet(R'[0-9A-F]{2}')
    "Uppercase hexadecimal strings"
    spaced_hex = tokenize(R'[0-9a-fA-F]+', R'\s*', bound='')
    "Hexadecimal strings, optionally separated by whitespace"
    spaced_b64 = alphabet(R'[-\s\w\+/]', postfix=R'(?:={0,3})?')
    "Base64 encoded strings, separated by whitespace"
    spaced_b85 = alphabet(R'[-!+*()#-&^-~0-9;-Z\s]')
    "Base85 encoded string, separated by whitespace"
    utf8 = pattern(_pattern_utf8)
    "A sequence of bytes that can be decoded as UTF8."
    hexdump = tokenize(_pattern_hexline, bound='', sep=R'\s*\n')
    """
    This pattern matches a typical hexdump output where hexadecimally encoded
    bytes are followed by a string which contains dots or printable characters
    from the dump. For example:

        46 4F 4F 0A 42 41 52 0A  FOO.BAR.
        F0 0B AA BA F0 0B        ......
    """
    hexarray = tokenize(R'[0-9A-Fa-f]{2}', sep=R'[;,]', bound='', unique_sep=True)
    "Arrays of hexadecimal strings, separated by commas or semicolons"
    uuencode = pattern(_pattern_uuencode)
    "UUEncoded data"

    @classmethod
    def from_dashname(cls, key):
        """
        Return the member whose name equals `key` after conversion by
        `normalize_to_identifier`; raises `AttributeError` if there is none.
        """
        return getattr(cls, normalize_to_identifier(key))
class wallets(PatternEnum):
    """
    An enumeration of patterns for cryptocurrency wallet addresses. The member
    names appear to be the ticker symbols of the respective currencies.
    """
    # https://gist.github.com/etherx-dev/76559d9e6d916917a960e33ceea91481
    ADA = pattern("addr1[a-z0-9]+")
    ATOM = pattern("cosmos[-\\w\\.]{10,}")
    BCH = pattern("(bitcoincash:)?(q|p)[a-z0-9]{41}|(BITCOINCASH:)?(Q|P)[A-Z0-9]{41}")
    BTC = pattern("(?:[13][a-km-zA-HJ-NP-Z1-9]{25,34}|bc1[a-z0-9]{25,39})")
    BTCP = pattern("5[HJK][1-9A-Za-z][^A-HJ-NP-Za-km-z0-9]{48}")
    DASH = pattern("X[1-9A-HJ-NP-Za-km-z]{33}")
    DOGE = pattern("D{1}[5-9A-HJ-NP-U]{1}[1-9A-HJ-NP-Za-km-z]{32}")
    DOT = pattern("1[0-9a-zA-Z]{47}")
    ETH = pattern("0x[a-fA-F0-9]{40}")
    IOTA = pattern("iota[a-z0-9]{10,}")
    LSK = pattern("[0-9]{19}L")
    LTC = pattern("[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}")
    NEO = pattern("N[0-9a-zA-Z]{33}")
    ONE = pattern("(?:bnb|one)1[a-z0-9]{38}")
    ONT = pattern("A[0-9a-zA-Z]{33}")
    RONIN = pattern("ronin:[a-fA-F0-9]{40}")
    TERRA = pattern("terra1[a-z0-9]{38}")
    XEM = pattern("N[A-Za-z0-9]{4,7}-[A-Za-z0-9]{4,7}-[A-Za-z0-9]{4,7}-[A-Za-z0-9]{4,7}-[A-Za-z0-9]{4,7}-[A-Za-z0-9]{4,7}-[A-Za-z0-9]{4,7}")
    XLM = pattern("G[A-D][A-Z2-7]{54}")
    XMR = pattern("4[0-9AB][1-9A-HJ-NP-Za-km-z]{90,120}")
    XRP = pattern("r[0-9a-zA-Z]{24,34}")
class indicators(PatternEnum):
    """
    An enumeration of patterns for indicators.
    """
    domain = pattern(_pattern_serrated_domain)
    "Domain names"
    email = pattern(_pattern_email)
    "Email addresses"
    guid = pattern(_pattern_guid)
    "Windows GUID strings"
    ipv4 = pattern(_pattern_serrated_ipv4)
    "String representations of IPv4 addresses"
    ipv6 = pattern(_pattern_ipv6)
    "String representations of IPv6 addresses"
    # With equal lower and upper bound, alphabet produces an exact length match
    # that must not be followed by another hexadecimal digit.
    md5 = alphabet('[0-9A-Fa-f]', at_least=32, at_most=32)
    "Hexadecimal strings of length 32"
    sha1 = alphabet('[0-9A-Fa-f]', at_least=40, at_most=40)
    "Hexadecimal strings of length 40"
    sha256 = alphabet('[0-9A-Fa-f]', at_least=64, at_most=64)
    "Hexadecimal strings of length 64"
    hostname = pattern(_pattern_serrated_hostname)
    "Any domain name or IPv4 address, optionally followed by a colon and a port number."
    socket = pattern(_pattern_serrated_socket)
    "Any domain name or IPv4 address followed by a colon and a (port) number"
    subdomain = pattern(_pattern_subdomain)
    "A domain which contains at least three parts, including the top level"
    url = pattern(_pattern_serrated_url)
    "Uniform resource locator addresses"
    # The wallet members below reuse the pattern objects of the wallets enumeration.
    btc = wallets.BTC.value
    "Bitcoin addresses"
    pem = pattern(_pattern_pem)
    "A pattern matching PEM encoded cryptographic parameters"
    xmr = wallets.XMR.value
    "Monero addresses"
    path = pattern(_pattern_any_path)
    "Windows and Linux path names"
    winpath = pattern(_pattern_win_path)
    "Windows path names"
    nixpath = pattern(_pattern_nix_path)
    "Posix path names"
    environment_variable = pattern(_pattern_win_env_variable)
    "Windows environment variables, i.e. something like `%APPDATA%`"

    @classmethod
    def from_dashname(cls, key):
        """
        Return the member whose name equals `key` after conversion by
        `normalize_to_identifier`; raises `AttributeError` if there is none.
        """
        return getattr(cls, normalize_to_identifier(key))
class defanged(PatternEnum):
    """
    An enumeration of patterns for defanged indicators. Used only by the reverse
    operation of `refinery.defang`.
    """
    # Both members are built from the defanged expressions, which accept `[.]`
    # in addition to a plain dot; the URL prefix also accepts `[:]` for the
    # colon after the protocol.
    hostname = pattern(_pattern_defanged_hostname)
    "A defanged `refinery.lib.patterns.indicators.hostname`."
    url = pattern(_pattern_defanged_url)
    "A defanged `refinery.lib.patterns.indicators.url`."
Sub-modules
refinery.lib.patterns.tlds
Classes
class pattern (pattern, flags=0)
-
A wrapper for regular expression pattern objects created from re.compile, allowing combination of several patterns into one via overloaded operators.
Expand source code Browse git
class pattern: """ A wrapper for regular expression pattern objects created from re.compile, allowing combination of several patterns into one via overloaded operators. """ str_pattern: str bin_pattern: Optional[bytes] bin_compiled: re.Pattern str_compiled: re.Pattern def __init__(self, pattern: str, flags: int = 0): self.str_pattern = pattern self.bin_pattern = pattern.encode('ascii') self.regex_flags = flags def __bytes__(self): return self.bin_pattern @cached_property def bin_compiled(self): return re.compile(B'(%s)' % self.bin_pattern, flags=self.regex_flags) @cached_property def str_compiled(self): return re.compile(self.str_pattern, flags=self.regex_flags) def __str__(self): return self.str_pattern def __getattr__(self, verb): if not hasattr(re.Pattern, verb): raise AttributeError(verb) bin_attr = getattr(self.bin_compiled, verb) if not callable(bin_attr): return bin_attr str_attr = getattr(self.str_compiled, verb) def wrapper(*args, **kwargs): for argument in args: if isinstance(argument, str): return str_attr(*args, **kwargs) else: return bin_attr(*args, **kwargs) functools.update_wrapper(wrapper, bin_attr) return wrapper
Subclasses
Class variables
var str_pattern
var bin_pattern
Instance variables
var bin_compiled
-
Expand source code
def __get__(self, instance, owner=None): if instance is None: return self if self.attrname is None: raise TypeError( "Cannot use cached_property instance without calling __set_name__ on it.") try: cache = instance.__dict__ except AttributeError: # not all objects have __dict__ (e.g. class defines slots) msg = ( f"No '__dict__' attribute on {type(instance).__name__!r} " f"instance to cache {self.attrname!r} property." ) raise TypeError(msg) from None val = cache.get(self.attrname, _NOT_FOUND) if val is _NOT_FOUND: with self.lock: # check if another thread filled cache while we awaited lock val = cache.get(self.attrname, _NOT_FOUND) if val is _NOT_FOUND: val = self.func(instance) try: cache[self.attrname] = val except TypeError: msg = ( f"The '__dict__' attribute on {type(instance).__name__!r} instance " f"does not support item assignment for caching {self.attrname!r} property." ) raise TypeError(msg) from None return val
var str_compiled
-
Expand source code
def __get__(self, instance, owner=None): if instance is None: return self if self.attrname is None: raise TypeError( "Cannot use cached_property instance without calling __set_name__ on it.") try: cache = instance.__dict__ except AttributeError: # not all objects have __dict__ (e.g. class defines slots) msg = ( f"No '__dict__' attribute on {type(instance).__name__!r} " f"instance to cache {self.attrname!r} property." ) raise TypeError(msg) from None val = cache.get(self.attrname, _NOT_FOUND) if val is _NOT_FOUND: with self.lock: # check if another thread filled cache while we awaited lock val = cache.get(self.attrname, _NOT_FOUND) if val is _NOT_FOUND: val = self.func(instance) try: cache[self.attrname] = val except TypeError: msg = ( f"The '__dict__' attribute on {type(instance).__name__!r} instance " f"does not support item assignment for caching {self.attrname!r} property." ) raise TypeError(msg) from None return val
class alphabet (repeat, prefix='', postfix='', at_least=1, at_most=None, **kwargs)
-
A pattern object representing strings of letters from a given alphabet, with an optional prefix and postfix.
Expand source code Browse git
class alphabet(pattern): """ A pattern object representing strings of letters from a given alphabet, with an optional prefix and postfix. """ def __init__(self, repeat, prefix='', postfix='', at_least=1, at_most=None, **kwargs): if not at_most: count = '+' if at_least <= 1 else '{{{},}}'.format(at_least) else: count = '{{{},{}}}(?!{})'.format(at_least, at_most, repeat) pattern.__init__(self, R'{b}(?:{r}){c}{a}'.format( r=repeat, b=prefix, c=count, a=postfix ), **kwargs )
Ancestors
Class variables
var str_pattern
var bin_pattern
Instance variables
var bin_compiled
-
Expand source code
def __get__(self, instance, owner=None): if instance is None: return self if self.attrname is None: raise TypeError( "Cannot use cached_property instance without calling __set_name__ on it.") try: cache = instance.__dict__ except AttributeError: # not all objects have __dict__ (e.g. class defines slots) msg = ( f"No '__dict__' attribute on {type(instance).__name__!r} " f"instance to cache {self.attrname!r} property." ) raise TypeError(msg) from None val = cache.get(self.attrname, _NOT_FOUND) if val is _NOT_FOUND: with self.lock: # check if another thread filled cache while we awaited lock val = cache.get(self.attrname, _NOT_FOUND) if val is _NOT_FOUND: val = self.func(instance) try: cache[self.attrname] = val except TypeError: msg = ( f"The '__dict__' attribute on {type(instance).__name__!r} instance " f"does not support item assignment for caching {self.attrname!r} property." ) raise TypeError(msg) from None return val
var str_compiled
-
Expand source code
def __get__(self, instance, owner=None): if instance is None: return self if self.attrname is None: raise TypeError( "Cannot use cached_property instance without calling __set_name__ on it.") try: cache = instance.__dict__ except AttributeError: # not all objects have __dict__ (e.g. class defines slots) msg = ( f"No '__dict__' attribute on {type(instance).__name__!r} " f"instance to cache {self.attrname!r} property." ) raise TypeError(msg) from None val = cache.get(self.attrname, _NOT_FOUND) if val is _NOT_FOUND: with self.lock: # check if another thread filled cache while we awaited lock val = cache.get(self.attrname, _NOT_FOUND) if val is _NOT_FOUND: val = self.func(instance) try: cache[self.attrname] = val except TypeError: msg = ( f"The '__dict__' attribute on {type(instance).__name__!r} instance " f"does not support item assignment for caching {self.attrname!r} property." ) raise TypeError(msg) from None return val
class tokenize (token, sep, bound='\\b', unique_sep=False, sep_ignores_whitespace=True, **kwargs)
-
A pattern representing a sequence of tokens matching the `token` pattern, separated by sequences matching the pattern `sep`. The optional parameter `bound` is required before and after each token; its default value is the regular expression zero-length match for a word boundary.
Expand source code Browse git
class tokenize(pattern): """ A pattern representing a sequence of tokens matching the `token` pattern, separated by sequences matching the pattern `sep`. The optional parameter `bound` is required before and after each token, its default value is the regular expression zero length match for a word boundary. """ def __init__(self, token, sep, bound='\\b', unique_sep=False, sep_ignores_whitespace=True, **kwargs): if unique_sep: if sep_ignores_whitespace: p = R'(?:{b}{t}{b}\s{{0,50}}(?P<__sep>{s})\s{{0,50}})(?:(?:{b}{t}{b}\s{{0,50}}(?P=__sep)\s{{0,50}})+{b}{t}{b}|{b}{t}{b})' else: p = R'(?:{b}{t}{b}(?P<__sep>{s}))(?:(?:{b}{t}{b}(?P=__sep))+{b}{t}{b}|{b}{t}{b})' else: p = R'(?:{b}{t}{b}{s})+(?:{b}{t}{b})' pattern.__init__(self, p.format(s=sep, b=bound, t=token), **kwargs)
Ancestors
Class variables
var str_pattern
var bin_pattern
Instance variables
var bin_compiled
-
Expand source code
def __get__(self, instance, owner=None): if instance is None: return self if self.attrname is None: raise TypeError( "Cannot use cached_property instance without calling __set_name__ on it.") try: cache = instance.__dict__ except AttributeError: # not all objects have __dict__ (e.g. class defines slots) msg = ( f"No '__dict__' attribute on {type(instance).__name__!r} " f"instance to cache {self.attrname!r} property." ) raise TypeError(msg) from None val = cache.get(self.attrname, _NOT_FOUND) if val is _NOT_FOUND: with self.lock: # check if another thread filled cache while we awaited lock val = cache.get(self.attrname, _NOT_FOUND) if val is _NOT_FOUND: val = self.func(instance) try: cache[self.attrname] = val except TypeError: msg = ( f"The '__dict__' attribute on {type(instance).__name__!r} instance " f"does not support item assignment for caching {self.attrname!r} property." ) raise TypeError(msg) from None return val
var str_compiled
-
Expand source code
def __get__(self, instance, owner=None): if instance is None: return self if self.attrname is None: raise TypeError( "Cannot use cached_property instance without calling __set_name__ on it.") try: cache = instance.__dict__ except AttributeError: # not all objects have __dict__ (e.g. class defines slots) msg = ( f"No '__dict__' attribute on {type(instance).__name__!r} " f"instance to cache {self.attrname!r} property." ) raise TypeError(msg) from None val = cache.get(self.attrname, _NOT_FOUND) if val is _NOT_FOUND: with self.lock: # check if another thread filled cache while we awaited lock val = cache.get(self.attrname, _NOT_FOUND) if val is _NOT_FOUND: val = self.func(instance) try: cache[self.attrname] = val except TypeError: msg = ( f"The '__dict__' attribute on {type(instance).__name__!r} instance " f"does not support item assignment for caching {self.attrname!r} property." ) raise TypeError(msg) from None return val
class formats (value, names=None, *, module=None, qualname=None, type=None, start=1)
-
An enumeration of patterns for certain formats.
Expand source code Browse git
class formats(PatternEnum): """ An enumeration of patterns for certain formats. """ integer = pattern(_pattern_integer) "Integer expressions" float = pattern(_pattern_float) "Floating point number expressions" number = pattern(_pattern_number) "Either an integer or a float" string = pattern(_pattern_string) "C syntax string literal" multiline_string = pattern(_pattern_string_multiline) "C syntax string literal that also allows line breaks" cmdstr = pattern(_pattern_cmdstr) "Windows command line escaped string literal" ps1str = pattern(_pattern_ps1str, flags=re.DOTALL) "PowerShell escaped string literal" vbastr = pattern(_pattern_vbastr) "VBS/VBA string literal" vbaint = pattern(_pattern_vbaint) "VBS/VBA integer literal" printable = alphabet(R'[\s!-~]') "Any sequence of printable characters" urlquote = pattern(_pattern_urlenc) "Any sequence of url-encoded characters, default char set" urlquote_coarse = pattern(_pattern_urlenc_coarse) "Any sequence of url-encoded characters, coarser variant with more characters allowed" urlquote_narrow = pattern(_pattern_urlenc_narrow) "A hex-encoded buffer using URL escape sequences" intarray = tokenize(_pattern_integer, sep=R'[;,]', bound='', unique_sep=True) "Sequences of integers, separated by commas or semicolons" numarray = tokenize(_pattern_number, sep=R'[;,]', bound='', unique_sep=True) "Sequences of numbers, separated by commas or semicolons" word = alphabet(R'\\w') "Sequences of word characters" letters = alphabet(R'[a-zA-Z]') "Sequences of alphabetic characters" wshenc = pattern(_pattern_wshenc) "Encoded Windows Scripting Host Scripts (JS/VBS)" alphanumeric = alphabet(R'[a-zA-Z0-9]') "Sequences of alpha-numeric characters" b32 = pattern('[A-Z2-7]+|[a-z2-7+]') "Base32 encoded strings" b64 = alphabet(R'(?:[0-9a-zA-Z\+/]{4})', postfix=R'(?:(?:[0-9a-zA-Z\+/]{2,3})={0,3})?') "Base64 encoded strings" b85 = alphabet(R'[-!+*()#-&^-~0-9;-Z]') "Base85 encoded strings" b92 = pattern(_pattern_b92) "Base92 encoded strings" b64any = 
alphabet(R'(?:[-\w\+/]{4})', postfix=R'(?:(?:[-\w\+/]{2,3})={0,3})?') "Both URL-safe and normal Base64 alphabets." b64url = alphabet(R'[-\w]{4}', postfix=R'(?:[-\w]{2,3}={0,3})?') "Base64 encoded strings using URL-safe alphabet" hex = alphabet(R'[0-9a-fA-F]{2}') "Hexadecimal strings" uppercase_hex = alphabet(R'[0-9A-F]{2}') "Uppercase hexadecimal strings" spaced_hex = tokenize(R'[0-9a-fA-F]+', R'\s*', bound='') "Hexadecimal strings" spaced_b64 = alphabet(R'[-\s\w\+/]', postfix=R'(?:={0,3})?') "Base64 encoded strings, separated by whitespace" spaced_b85 = alphabet(R'[-!+*()#-&^-~0-9;-Z\s]') "Base85 encoded string, separated by whitespace" utf8 = pattern(_pattern_utf8) "A sequence of bytes that can be decoded as UTF8." hexdump = tokenize(_pattern_hexline, bound='', sep=R'\s*\n') """ This pattern matches a typical hexdump output where hexadecimally encoded bytes are followed by a string which contains dots or printable characters from the dump. For example: 46 4F 4F 0A 42 41 52 0A FOO.BAR. F0 0B AA BA F0 0B ...... """ hexarray = tokenize(R'[0-9A-Fa-f]{2}', sep=R'[;,]', bound='', unique_sep=True) "Arrays of hexadecimal strings, separated by commas or semicolons" uuencode = pattern(_pattern_uuencode) "UUEncoded data" @classmethod def from_dashname(cls, key): return getattr(cls, normalize_to_identifier(key))
Ancestors
- refinery.lib.patterns.PatternEnum
- enum.Enum
Class variables
var integer
-
Integer expressions
var float
-
Floating point number expressions
var number
-
Either an integer or a float
var string
-
C syntax string literal
var multiline_string
-
C syntax string literal that also allows line breaks
var cmdstr
-
Windows command line escaped string literal
var ps1str
-
PowerShell escaped string literal
var vbastr
-
VBS/VBA string literal
var vbaint
-
VBS/VBA integer literal
var printable
-
Any sequence of printable characters
var urlquote
-
Any sequence of url-encoded characters, default char set
var urlquote_coarse
-
Any sequence of url-encoded characters, coarser variant with more characters allowed
var urlquote_narrow
-
A hex-encoded buffer using URL escape sequences
var intarray
-
Sequences of integers, separated by commas or semicolons
var numarray
-
Sequences of numbers, separated by commas or semicolons
var word
-
Sequences of word characters
var letters
-
Sequences of alphabetic characters
var wshenc
-
Encoded Windows Scripting Host Scripts (JS/VBS)
var alphanumeric
-
Sequences of alpha-numeric characters
var b32
-
Base32 encoded strings
var b64
-
Base64 encoded strings
var b85
-
Base85 encoded strings
var b92
-
Base92 encoded strings
var b64any
-
Both URL-safe and normal Base64 alphabets.
var b64url
-
Base64 encoded strings using URL-safe alphabet
var hex
-
Hexadecimal strings
var uppercase_hex
-
Uppercase hexadecimal strings
var spaced_hex
-
Hexadecimal strings
var spaced_b64
-
Base64 encoded strings, separated by whitespace
var spaced_b85
-
Base85 encoded string, separated by whitespace
var utf8
-
A sequence of bytes that can be decoded as UTF8.
var hexdump
-
This pattern matches a typical hexdump output where hexadecimally encoded bytes are followed by a string which contains dots or printable characters from the dump. For example:
46 4F 4F 0A 42 41 52 0A  FOO.BAR.
F0 0B AA BA F0 0B        ......
var hexarray
-
Arrays of hexadecimal strings, separated by commas or semicolons
var uuencode
-
UUEncoded data
Static methods
def from_dashname(key)
-
Expand source code Browse git
@classmethod def from_dashname(cls, key): return getattr(cls, normalize_to_identifier(key))
class indicators (value, names=None, *, module=None, qualname=None, type=None, start=1)
-
An enumeration of patterns for indicators.
Expand source code Browse git
class indicators(PatternEnum):
    """
    An enumeration of patterns for indicators.
    """
    # Each member is followed by a bare string literal that serves as its
    # attribute documentation. Member order is the enum's definition order.

    # --- network indicators -------------------------------------------------
    domain = pattern(_pattern_serrated_domain)
    "Domain names"
    email = pattern(_pattern_email)
    "Email addresses"
    guid = pattern(_pattern_guid)
    "Windows GUID strings"
    ipv4 = pattern(_pattern_serrated_ipv4)
    "String representations of IPv4 addresses"
    ipv6 = pattern(_pattern_ipv6)
    "String representations of IPv6 addresses"

    # --- fixed-length hex digests -------------------------------------------
    # at_least == at_most pins the repetition count. Because at_most is given,
    # alphabet() appends a negative lookahead on the same character class, so
    # a match cannot be directly followed by another hex digit.
    # NOTE(review): no prefix is passed, so nothing visible here guards the
    # left side of the match -- confirm whether the tail of a longer hex run
    # is meant to match.
    md5 = alphabet('[0-9A-Fa-f]', at_least=32, at_most=32)
    "Hexadecimal strings of length 32"
    sha1 = alphabet('[0-9A-Fa-f]', at_least=40, at_most=40)
    "Hexadecimal strings of length 40"
    sha256 = alphabet('[0-9A-Fa-f]', at_least=64, at_most=64)
    "Hexadecimal strings of length 64"

    # --- hosts and locators -------------------------------------------------
    hostname = pattern(_pattern_serrated_hostname)
    "Any domain name or IPv4 address, optionally followed by a colon and a port number."
    socket = pattern(_pattern_serrated_socket)
    "Any domain name or IPv4 address followed by a colon and a (port) number"
    subdomain = pattern(_pattern_subdomain)
    "A domain which contains at least three parts, including the top level"
    url = pattern(_pattern_serrated_url)
    "Uniform resource locator addresses"

    # --- wallets and key material -------------------------------------------
    # btc and xmr reuse the values of the corresponding `wallets` members
    # instead of constructing new pattern objects.
    btc = wallets.BTC.value
    "Bitcoin addresses"
    pem = pattern(_pattern_pem)
    "A pattern matching PEM encoded cryptographic parameters"
    xmr = wallets.XMR.value
    "Monero addresses"

    # --- file system and environment ----------------------------------------
    path = pattern(_pattern_any_path)
    "Windows and Linux path names"
    winpath = pattern(_pattern_win_path)
    "Windows path names"
    nixpath = pattern(_pattern_nix_path)
    "Posix path names"
    environment_variable = pattern(_pattern_win_env_variable)
    "Windows environment variables, i.e. something like `%APPDATA%`"

    @classmethod
    def from_dashname(cls, key):
        """
        Return the attribute of this class named by the result of calling
        `normalize_to_identifier` on `key`. `getattr` raises an
        `AttributeError` when no such attribute exists.
        """
        return getattr(cls, normalize_to_identifier(key))
Ancestors
- refinery.lib.patterns.PatternEnum
- enum.Enum
Class variables
var domain
-
Domain names
var email
-
Email addresses
var guid
-
Windows GUID strings
var ipv4
-
String representations of IPv4 addresses
var ipv6
-
String representations of IPv6 addresses
var md5
-
Hexadecimal strings of length 32
var sha1
-
Hexadecimal strings of length 40
var sha256
-
Hexadecimal strings of length 64
var hostname
-
Any domain name or IPv4 address, optionally followed by a colon and a port number.
var socket
-
Any domain name or IPv4 address followed by a colon and a (port) number
var subdomain
-
A domain which contains at least three parts, including the top level
var url
-
Uniform resource locator addresses
var btc
-
Bitcoin addresses
var pem
-
A pattern matching PEM encoded cryptographic parameters
var xmr
-
Monero addresses
var path
-
Windows and Linux path names
var winpath
-
Windows path names
var nixpath
-
Posix path names
var environment_variable
-
Windows environment variables, i.e. something like `%APPDATA%`
Static methods
def from_dashname(key)
-
Expand source code Browse git
@classmethod
def from_dashname(cls, key):
    """
    Return the attribute of this class whose name is obtained by passing
    `key` through `normalize_to_identifier`. An `AttributeError` is raised
    by `getattr` when the class has no attribute of that name.
    """
    # presumably turns a dash-separated name into a Python identifier;
    # the helper is imported from refinery.lib.tools -- verify there.
    identifier = normalize_to_identifier(key)
    return getattr(cls, identifier)
class wallets (value, names=None, *, module=None, qualname=None, type=None, start=1)
-
An enumeration.
Expand source code Browse git
class wallets(PatternEnum):
    # Patterns keyed by ticker-style names; `indicators.btc` and
    # `indicators.xmr` reuse the BTC and XMR values from this enumeration.
    # The expressions are taken from the following collection:
    # https://gist.github.com/etherx-dev/76559d9e6d916917a960e33ceea91481
    #
    # None of the literals below contains anchors or boundary assertions, so
    # each one can match inside a longer run of characters.
    ADA = pattern("addr1[a-z0-9]+")
    # Not a raw string: the doubled backslashes produce the regular
    # expression cosmos[-\w\.]{10,}
    ATOM = pattern("cosmos[-\\w\\.]{10,}")
    # NOTE(review): this expression has a top-level alternation and uses
    # capturing groups -- confirm this is safe wherever the pattern string is
    # embedded into a larger expression.
    BCH = pattern("(bitcoincash:)?(q|p)[a-z0-9]{41}|(BITCOINCASH:)?(Q|P)[A-Z0-9]{41}")
    BTC = pattern("(?:[13][a-km-zA-HJ-NP-Z1-9]{25,34}|bc1[a-z0-9]{25,39})")
    # NOTE(review): the last character class is negated ([^...]), i.e. it
    # requires 48 characters that are NOT in the listed set -- confirm that
    # this is intended and not an inverted class.
    BTCP = pattern("5[HJK][1-9A-Za-z][^A-HJ-NP-Za-km-z0-9]{48}")
    DASH = pattern("X[1-9A-HJ-NP-Za-km-z]{33}")
    # The {1} quantifiers are redundant but harmless.
    DOGE = pattern("D{1}[5-9A-HJ-NP-U]{1}[1-9A-HJ-NP-Za-km-z]{32}")
    DOT = pattern("1[0-9a-zA-Z]{47}")
    ETH = pattern("0x[a-fA-F0-9]{40}")
    IOTA = pattern("iota[a-z0-9]{10,}")
    LSK = pattern("[0-9]{19}L")
    LTC = pattern("[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}")
    NEO = pattern("N[0-9a-zA-Z]{33}")
    ONE = pattern("(?:bnb|one)1[a-z0-9]{38}")
    ONT = pattern("A[0-9a-zA-Z]{33}")
    RONIN = pattern("ronin:[a-fA-F0-9]{40}")
    TERRA = pattern("terra1[a-z0-9]{38}")
    # The letter N followed by seven dash-separated groups of 4 to 7
    # alphanumeric characters.
    XEM = pattern("N[A-Za-z0-9]{4,7}-[A-Za-z0-9]{4,7}-[A-Za-z0-9]{4,7}-[A-Za-z0-9]{4,7}-[A-Za-z0-9]{4,7}-[A-Za-z0-9]{4,7}-[A-Za-z0-9]{4,7}")
    XLM = pattern("G[A-D][A-Z2-7]{54}")
    XMR = pattern("4[0-9AB][1-9A-HJ-NP-Za-km-z]{90,120}")
    XRP = pattern("r[0-9a-zA-Z]{24,34}")
Ancestors
- refinery.lib.patterns.PatternEnum
- enum.Enum
Class variables
var ADA
var ATOM
var BCH
var BTC
var BTCP
var DASH
var DOGE
var DOT
var ETH
var IOTA
var LSK
var LTC
var NEO
var ONE
var ONT
var RONIN
var TERRA
var XEM
var XLM
var XMR
var XRP
class defanged (value, names=None, *, module=None, qualname=None, type=None, start=1)
-
An enumeration of patterns for defanged indicators. Used only by the reverse operation of `defang`.
Expand source code Browse git
class defanged(PatternEnum):
    """
    An enumeration of patterns for defanged indicators. Used only by the reverse operation of
    `refinery.defang`.
    """
    # Each member wraps a module-level pattern string defined elsewhere in
    # this file; the bare string after it is the member's documentation.
    hostname = pattern(_pattern_defanged_hostname)
    "A defanged `refinery.lib.patterns.indicators.hostname`."
    url = pattern(_pattern_defanged_url)
    "A defanged `refinery.lib.patterns.indicators.url`."
Ancestors
- refinery.lib.patterns.PatternEnum
- enum.Enum
Class variables
var hostname
-
A defanged `indicators.hostname`.
var url
-
A defanged `indicators.url`.