Module refinery.lib.ole.pcode
VBA p-code disassembler for Microsoft Office documents.
This module is a port of pcodedmp by Vesselin Bontchev, adapted for the Binary Refinery project. Since then, many bugs have been fixed and improvements made.
The original work is copyright (c) Vesselin Bontchev and licensed under GPL v3. The source code has been modified to fit the code requirements of this project.
Regardless of the license used for the binary refinery, this code file is also subject to the terms and conditions of the GNU General Public License version 3.
References
[pcodedmp] https://github.com/bontchev/pcodedmp
[MS-OVBA] https://docs.microsoft.com/en-us/openspecs/
Expand source code Browse git
"""
VBA p-code disassembler for Microsoft Office documents.
This module is a port of pcodedmp by Vesselin Bontchev, adapted for the Binary Refinery project.
Since then, many bugs have been fixed and improvements made.
The original work is copyright (c) Vesselin Bontchev and licensed under GPL v3. The source code
has been modified to fit the code requirements of this project.
Regardless of the license used for the binary refinery, this code file is also subject to the
terms and conditions of the GNU General Public License version 3.
References:
[pcodedmp] https://github.com/bontchev/pcodedmp
[MS-OVBA] https://docs.microsoft.com/en-us/openspecs/
"""
from __future__ import annotations
import codecs
import logging
import re
import struct as _struct
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, NamedTuple
from refinery.lib.ole.file import OleFile
from refinery.lib.ole.vba import _codepage_to_codec, _find_vba_projects, decompress_stream
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Pre-compiled unpackers for unsigned 16-bit values, keyed by the struct byte-order prefix.
_STRUCT_WORD: dict[str, _struct.Struct] = {
    order: _struct.Struct(order + 'H') for order in ('<', '>')
}
# Pre-compiled unpackers for unsigned 32-bit values, keyed by the struct byte-order prefix.
_STRUCT_DWORD: dict[str, _struct.Struct] = {
    order: _struct.Struct(order + 'L') for order in ('<', '>')
}

# Short type labels; presumably indexed by a variable-type id -- TODO confirm against the users
# of this table, which are outside this excerpt.
_VAR_TYPES_LONG: tuple[str, ...] = (
    'Var',
    '?',
    'Int',
    'Lng',
    'Sng',
    'Dbl',
    'Cur',
    'Date',
    'Str',
    'Obj',
    'Err',
    'Bool',
    'Var',
)

# Display names for special literal values (users are outside this excerpt).
_SPECIALS: tuple[str, ...] = (
    'False',
    'True',
    'Null',
    'Empty',
)

# Display texts for Option statements (users are outside this excerpt).
_OPTIONS: tuple[str, ...] = (
    'Base 0',
    'Base 1',
    'Compare Text',
    'Compare Binary',
    'Explicit',
    'Private Module',
)

# Type names for which a TypeRef is still recorded (with from_suffix=True) when a variable or
# argument has no explicit As clause.
_SUFFIX_TYPES: frozenset[str] = frozenset((
    'Integer',
    'Long',
    'Single',
    'Double',
    'Currency',
    'String',
))

# The same six types expressed as indices into DIM_TYPES.
_SUFFIX_TYPE_IDS: frozenset[int] = frozenset((2, 3, 4, 5, 6, 8))
class Opcode(NamedTuple):
    """
    Static description of one p-code instruction as stored in the OPCODES table.
    """
    # Instruction mnemonic.
    mnem: str
    # Kinds of the fixed arguments, in order (e.g. 'name', '0x', 'imp_').
    # NOTE: this default list object is shared by every instance created without explicit
    # arguments; treat it as read-only.
    args: list[str] = []
    # Set for a handful of opcodes in the table (e.g. LitStr, Rem); presumably marks a trailing
    # variable-length argument -- TODO confirm against the decoder.
    varg: bool = False
@dataclass
class TypeRef:
    """
    A resolved type name attached to a variable or argument.
    """
    # Textual name of the type.
    name: str
    # True when the underlying type descriptor carried the array flag.
    is_array: bool = False
    # True when the type was recorded without an explicit As clause (one of the suffix types).
    from_suffix: bool = False
@dataclass
class VarInfo:
    """
    Decoded variable declaration.
    """
    # Variable name; may carry a trailing '()' for untyped arrays.
    name: str
    # Resolved type, or None when no type was recorded.
    type: TypeRef | None = None
    # True when the declaration carries the New flag.
    has_new: bool = False
    # WithEvents marker; it is set by code outside the variable decoder.
    has_withevents: bool = False
@dataclass
class ArgInfo:
    """
    Decoded procedure argument.
    """
    # Argument name; may carry a trailing '()' for untyped arrays.
    name: str
    # Resolved type, or None when no type was recorded.
    type: TypeRef | None = None
    # Passing-convention and modifier flags decoded from the argument option word.
    is_byval: bool = False
    is_byref: bool = False
    is_optional: bool = False
    is_paramarray: bool = False
    # Source text of the default value of an optional argument, if one could be recovered.
    default_value: str | None = None
@dataclass
class FuncInfo:
    """
    Decoded procedure declaration (including Declare statements).
    """
    # Scope keyword text of the procedure.
    scope: str
    # Whether the procedure carries the Static modifier.
    is_static: bool
    # Procedure kind text.
    kind: str
    # Procedure name.
    name: str
    # Arguments in declaration order; every instance gets its own list.
    args: list[ArgInfo] = field(default_factory=list)
    # Return type, if any.
    return_type: TypeRef | None = None
    # Fields describing an external Declare statement.
    is_declare: bool = False
    is_ptrsafe: bool = False
    lib_name: str | None = None
    alias_name: str | None = None
@dataclass
class DimScope:
    """
    Keyword list attached to a Dim-style opcode.
    """
    # Keywords in order; every instance gets its own list.
    keywords: list[str] = field(default_factory=list)
@dataclass
class CoerceType:
    """
    Wrapper for the short type label of a coercion opcode argument.
    """
    # Short type label text.
    type_short: str
@dataclass
class RecordInfo:
    """
    Wrapper for the rendered text of a record (Type) opcode argument.
    """
    # Rendered text.
    text: str
# Type of a decoded opcode argument. The full union is only visible to static type checkers; at
# runtime the alias is plain `str`.
if TYPE_CHECKING:
    OpcodeArg = (
        str | TypeRef | VarInfo | ArgInfo | FuncInfo | DimScope | CoerceType | RecordInfo
    )
else:
    OpcodeArg = str
class PCodeLine(NamedTuple):
    """
    One disassembled line: the sequence of (mnemonic, arguments) pairs it consists of.
    """
    opcodes: list[tuple[str, list[OpcodeArg]]]
class PCodeModule(NamedTuple):
    """
    Structured representation of a disassembled VBA module.
    """
    # Path of the module this disassembly belongs to.
    path: str
    # Disassembled lines in order.
    lines: list[PCodeLine]
    # Flag recording that identifiers were stripped; it is set by code outside this class.
    identifiers_stripped: bool = False
# VBA7 opcodes; VBA3, VBA5 and VBA6 will be upconverted to these.
# The upconversion of raw opcode numbers is done by `_translate_opcode`. Each entry is
# Op(mnemonic, [argument kinds], varg): the argument-kind strings ('name', '0x', 'imp_', 'func_',
# 'var_', 'rec_', 'type_', 'context_') select how each fixed argument is rendered by the decoder.
# NOTE(review): mnemonics are runtime output and may be matched by consumers; spellings such as
# 'DoUnitil' are therefore left exactly as they are.
Op = Opcode
OPCODES: dict[int, Opcode] = {
0x000: Op('Imp'),
0x001: Op('Eqv'),
0x002: Op('Xor'),
0x003: Op('Or'),
0x004: Op('And'),
0x005: Op('Eq'),
0x006: Op('Ne'),
0x007: Op('Le'),
0x008: Op('Ge'),
0x009: Op('Lt'),
0x00A: Op('Gt'),
0x00B: Op('Add'),
0x00C: Op('Sub'),
0x00D: Op('Mod'),
0x00E: Op('IDiv'),
0x00F: Op('Mul'),
0x010: Op('Div'),
0x011: Op('Concat'),
0x012: Op('Like'),
0x013: Op('Pwr'),
0x014: Op('Is'),
0x015: Op('Not'),
0x016: Op('UMi'),
0x017: Op('FnAbs'),
0x018: Op('FnFix'),
0x019: Op('FnInt'),
0x01A: Op('FnSgn'),
0x01B: Op('FnLen'),
0x01C: Op('FnLenB'),
0x01D: Op('Paren'),
0x01E: Op('Sharp'),
0x01F: Op('LdLHS', ['name']),
0x020: Op('Ld', ['name']),
0x021: Op('MemLd', ['name']),
0x022: Op('DictLd', ['name']),
0x023: Op('IndexLd', ['0x']),
0x024: Op('ArgsLd', ['name', '0x']),
0x025: Op('ArgsMemLd', ['name', '0x']),
0x026: Op('ArgsDictLd', ['name', '0x']),
0x027: Op('St', ['name']),
0x028: Op('MemSt', ['name']),
0x029: Op('DictSt', ['name']),
0x02A: Op('IndexSt', ['0x']),
0x02B: Op('ArgsSt', ['name', '0x']),
0x02C: Op('ArgsMemSt', ['name', '0x']),
0x02D: Op('ArgsDictSt', ['name', '0x']),
0x02E: Op('Set', ['name']),
0x02F: Op('Memset', ['name']),
0x030: Op('Dictset', ['name']),
0x031: Op('Indexset', ['0x']),
0x032: Op('ArgsSet', ['name', '0x']),
0x033: Op('ArgsMemSet', ['name', '0x']),
0x034: Op('ArgsDictSet', ['name', '0x']),
0x035: Op('MemLdWith', ['name']),
0x036: Op('DictLdWith', ['name']),
0x037: Op('ArgsMemLdWith', ['name', '0x']),
0x038: Op('ArgsDictLdWith', ['name', '0x']),
0x039: Op('MemStWith', ['name']),
0x03A: Op('DictStWith', ['name']),
0x03B: Op('ArgsMemStWith', ['name', '0x']),
0x03C: Op('ArgsDictStWith', ['name', '0x']),
0x03D: Op('MemSetWith', ['name']),
0x03E: Op('DictSetWith', ['name']),
0x03F: Op('ArgsMemSetWith', ['name', '0x']),
0x040: Op('ArgsDictSetWith', ['name', '0x']),
0x041: Op('ArgsCall', ['name', '0x']),
0x042: Op('ArgsMemCall', ['name', '0x']),
0x043: Op('ArgsMemCallWith', ['name', '0x']),
0x044: Op('ArgsArray', ['name', '0x']),
0x045: Op('Assert'),
0x046: Op('BoS', ['0x']),
0x047: Op('BoSImplicit'),
0x048: Op('BoL'),
0x049: Op('LdAddressOf', ['name']),
0x04A: Op('MemAddressOf', ['name']),
0x04B: Op('Case'),
0x04C: Op('CaseTo'),
0x04D: Op('CaseGt'),
0x04E: Op('CaseLt'),
0x04F: Op('CaseGe'),
0x050: Op('CaseLe'),
0x051: Op('CaseNe'),
0x052: Op('CaseEq'),
0x053: Op('CaseElse'),
0x054: Op('CaseDone'),
0x055: Op('Circle', ['0x']),
0x056: Op('Close', ['0x']),
0x057: Op('CloseAll'),
0x058: Op('Coerce'),
0x059: Op('CoerceVar'),
0x05A: Op('Context', ['context_']),
0x05B: Op('Debug'),
0x05C: Op('DefType', ['0x', '0x']),
0x05D: Op('Dim'),
0x05E: Op('DimImplicit'),
0x05F: Op('Do'),
0x060: Op('DoEvents'),
0x061: Op('DoUnitil'),
0x062: Op('DoWhile'),
0x063: Op('Else'),
0x064: Op('ElseBlock'),
0x065: Op('ElseIfBlock'),
0x066: Op('ElseIfTypeBlock', ['imp_']),
0x067: Op('End'),
0x068: Op('EndContext'),
0x069: Op('EndFunc'),
0x06A: Op('EndIf'),
0x06B: Op('EndIfBlock'),
0x06C: Op('EndImmediate'),
0x06D: Op('EndProp'),
0x06E: Op('EndSelect'),
0x06F: Op('EndSub'),
0x070: Op('EndType'),
0x071: Op('EndWith'),
0x072: Op('Erase', ['0x']),
0x073: Op('Error'),
0x074: Op('EventDecl', ['func_']),
0x075: Op('RaiseEvent', ['name', '0x']),
0x076: Op('ArgsMemRaiseEvent', ['name', '0x']),
0x077: Op('ArgsMemRaiseEventWith', ['name', '0x']),
0x078: Op('ExitDo'),
0x079: Op('ExitFor'),
0x07A: Op('ExitFunc'),
0x07B: Op('ExitProp'),
0x07C: Op('ExitSub'),
0x07D: Op('FnCurDir'),
0x07E: Op('FnDir'),
0x07F: Op('Empty0'),
0x080: Op('Empty1'),
0x081: Op('FnError'),
0x082: Op('FnFormat'),
0x083: Op('FnFreeFile'),
0x084: Op('FnInStr'),
0x085: Op('FnInStr3'),
0x086: Op('FnInStr4'),
0x087: Op('FnInStrB'),
0x088: Op('FnInStrB3'),
0x089: Op('FnInStrB4'),
0x08A: Op('FnLBound', ['0x']),
0x08B: Op('FnMid'),
0x08C: Op('FnMidB'),
0x08D: Op('FnStrComp'),
0x08E: Op('FnStrComp3'),
0x08F: Op('FnStringVar'),
0x090: Op('FnStringStr'),
0x091: Op('FnUBound', ['0x']),
0x092: Op('For'),
0x093: Op('ForEach'),
0x094: Op('ForEachAs', ['imp_']),
0x095: Op('ForStep'),
0x096: Op('FuncDefn', ['func_']),
0x097: Op('FuncDefnSave', ['func_']),
0x098: Op('GetRec'),
0x099: Op('GoSub', ['name']),
0x09A: Op('GoTo', ['name']),
0x09B: Op('If'),
0x09C: Op('IfBlock'),
0x09D: Op('TypeOf', ['imp_']),
0x09E: Op('IfTypeBlock', ['imp_']),
0x09F: Op('Implements', ['0x', '0x', '0x', '0x']),
0x0A0: Op('Input'),
0x0A1: Op('InputDone'),
0x0A2: Op('InputItem'),
0x0A3: Op('Label', ['name']),
0x0A4: Op('Let'),
0x0A5: Op('Line', ['0x']),
0x0A6: Op('LineCont', [], True),
0x0A7: Op('LineInput'),
0x0A8: Op('LineNum', ['name']),
0x0A9: Op('LitCy', ['0x', '0x', '0x', '0x']),
0x0AA: Op('LitDate', ['0x', '0x', '0x', '0x']),
0x0AB: Op('LitDefault'),
0x0AC: Op('LitDI2', ['0x']),
0x0AD: Op('LitDI4', ['0x', '0x']),
0x0AE: Op('LitDI8', ['0x', '0x', '0x', '0x']),
0x0AF: Op('LitHI2', ['0x']),
0x0B0: Op('LitHI4', ['0x', '0x']),
0x0B1: Op('LitHI8', ['0x', '0x', '0x', '0x']),
0x0B2: Op('LitNothing'),
0x0B3: Op('LitOI2', ['0x']),
0x0B4: Op('LitOI4', ['0x', '0x']),
0x0B5: Op('LitOI8', ['0x', '0x', '0x', '0x']),
0x0B6: Op('LitR4', ['0x', '0x']),
0x0B7: Op('LitR8', ['0x', '0x', '0x', '0x']),
0x0B8: Op('LitSmallI2'),
0x0B9: Op('LitStr', [], True),
0x0BA: Op('LitVarSpecial'),
0x0BB: Op('Lock'),
0x0BC: Op('Loop'),
0x0BD: Op('LoopUntil'),
0x0BE: Op('LoopWhile'),
0x0BF: Op('LSet'),
0x0C0: Op('Me'),
0x0C1: Op('MeImplicit'),
0x0C2: Op('MemRedim', ['name', '0x', 'type_']),
0x0C3: Op('MemRedimWith', ['name', '0x', 'type_']),
0x0C4: Op('MemRedimAs', ['name', '0x', 'type_']),
0x0C5: Op('MemRedimAsWith', ['name', '0x', 'type_']),
0x0C6: Op('Mid'),
0x0C7: Op('MidB'),
0x0C8: Op('Name'),
0x0C9: Op('New', ['imp_']),
0x0CA: Op('Next'),
0x0CB: Op('NextVar'),
0x0CC: Op('OnError', ['name']),
0x0CD: Op('OnGosub', [], True),
0x0CE: Op('OnGoto', [], True),
0x0CF: Op('Open', ['0x']),
0x0D0: Op('Option'),
0x0D1: Op('OptionBase'),
0x0D2: Op('ParamByVal'),
0x0D3: Op('ParamOmitted'),
0x0D4: Op('ParamNamed', ['name']),
0x0D5: Op('PrintChan'),
0x0D6: Op('PrintComma'),
0x0D7: Op('PrintEoS'),
0x0D8: Op('PrintItemComma'),
0x0D9: Op('PrintItemNL'),
0x0DA: Op('PrintItemSemi'),
0x0DB: Op('PrintNL'),
0x0DC: Op('PrintObj'),
0x0DD: Op('PrintSemi'),
0x0DE: Op('PrintSpc'),
0x0DF: Op('PrintTab'),
0x0E0: Op('PrintTabComma'),
0x0E1: Op('PSet', ['0x']),
0x0E2: Op('PutRec'),
0x0E3: Op('QuoteRem', ['0x'], True),
0x0E4: Op('Redim', ['name', '0x', 'type_']),
0x0E5: Op('RedimAs', ['name', '0x', 'type_']),
0x0E6: Op('Reparse', [], True),
0x0E7: Op('Rem', [], True),
0x0E8: Op('Resume', ['name']),
0x0E9: Op('Return'),
0x0EA: Op('RSet'),
0x0EB: Op('Scale', ['0x']),
0x0EC: Op('Seek'),
0x0ED: Op('SelectCase'),
0x0EE: Op('SelectIs', ['imp_']),
0x0EF: Op('SelectType'),
0x0F0: Op('SetStmt'),
0x0F1: Op('Stack', ['0x', '0x']),
0x0F2: Op('Stop'),
0x0F3: Op('Type', ['rec_']),
0x0F4: Op('Unlock'),
0x0F5: Op('VarDefn', ['var_']),
0x0F6: Op('Wend'),
0x0F7: Op('While'),
0x0F8: Op('With'),
0x0F9: Op('WriteChan'),
0x0FA: Op('ConstFuncExpr'),
0x0FB: Op('LbConst', ['name']),
0x0FC: Op('LbIf'),
0x0FD: Op('LbElse'),
0x0FE: Op('LbElseIf'),
0x0FF: Op('LbEndIf'),
0x100: Op('LbMark'),
0x101: Op('EndForVariable'),
0x102: Op('StartForVariable'),
0x103: Op('NewRedim'),
0x104: Op('StartWithExpr'),
0x105: Op('SetOrSt', ['name']),
0x106: Op('EndEnum'),
0x107: Op('Illegal'),
}
# Names of the built-in identifiers. `_get_id` indexes this table with identifier codes below
# 0x100 (after dropping the low bit and applying version-dependent index corrections), so the
# position of every entry is significant: do not reorder, insert or remove entries.
# NOTE(review): some entries look unusual ('InputB' appears twice, 'VB_PredecraredID', ' /' with
# a leading space); they are runtime strings and are deliberately left unchanged here.
INTERNAL_NAMES: list[str] = [
'<crash>',
'0',
'Abs',
'Access',
'AddressOf',
'Alias',
'And',
'Any',
'Append',
'Array',
'As',
'Assert',
'B',
'Base',
'BF',
'Binary',
'Boolean',
'ByRef',
'Byte',
'ByVal',
'Call',
'Case',
'CBool',
'CByte',
'CCur',
'CDate',
'CDec',
'CDbl',
'CDecl',
'ChDir',
'CInt',
'Circle',
'CLng',
'Close',
'Compare',
'Const',
'CSng',
'CStr',
'CurDir',
'CurDir$',
'CVar',
'CVDate',
'CVErr',
'Currency',
'Database',
'Date',
'Date$',
'Debug',
'Decimal',
'Declare',
'DefBool',
'DefByte',
'DefCur',
'DefDate',
'DefDec',
'DefDbl',
'DefInt',
'DefLng',
'DefObj',
'DefSng',
'DefStr',
'DefVar',
'Dim',
'Dir',
'Dir$',
'Do',
'DoEvents',
'Double',
'Each',
'Else',
'ElseIf',
'Empty',
'End',
'EndIf',
'Enum',
'Eqv',
'Erase',
'Error',
'Error$',
'Event',
'WithEvents',
'Explicit',
'F',
'False',
'Fix',
'For',
'Format',
'Format$',
'FreeFile',
'Friend',
'Function',
'Get',
'Global',
'Go',
'GoSub',
'Goto',
'If',
'Imp',
'Implements',
'In',
'Input',
'Input$',
'InputB',
'InputB',
'InStr',
'InputB$',
'Int',
'InStrB',
'Is',
'Integer',
'Left',
'LBound',
'LenB',
'Len',
'Lib',
'Let',
'Line',
'Like',
'Load',
'Local',
'Lock',
'Long',
'Loop',
'LSet',
'Me',
'Mid',
'Mid$',
'MidB',
'MidB$',
'Mod',
'Module',
'Name',
'New',
'Next',
'Not',
'Nothing',
'Null',
'Object',
'On',
'Open',
'Option',
'Optional',
'Or',
'Output',
'ParamArray',
'Preserve',
'Print',
'Private',
'Property',
'PSet',
'Public',
'Put',
'RaiseEvent',
'Random',
'Randomize',
'Read',
'ReDim',
'Rem',
'Resume',
'Return',
'RGB',
'RSet',
'Scale',
'Seek',
'Select',
'Set',
'Sgn',
'Shared',
'Single',
'Spc',
'Static',
'Step',
'Stop',
'StrComp',
'String',
'String$',
'Sub',
'Tab',
'Text',
'Then',
'To',
'True',
'Type',
'TypeOf',
'UBound',
'Unload',
'Unlock',
'Unknown',
'Until',
'Variant',
'WEnd',
'While',
'Width',
'With',
'Write',
'Xor',
'#Const',
'#Else',
'#ElseIf',
'#End',
'#If',
'Attribute',
'VB_Base',
'VB_Control',
'VB_Creatable',
'VB_Customizable',
'VB_Description',
'VB_Exposed',
'VB_Ext_Key',
'VB_HelpID',
'VB_Invoke_Func',
'VB_Invoke_Property',
'VB_Invoke_PropertyPut',
'VB_Invoke_PropertyPutRef',
'VB_MemberFlags',
'VB_Name',
'VB_PredecraredID',
'VB_ProcData',
'VB_TemplateDerived',
'VB_VarDescription',
'VB_VarHelpID',
'VB_VarMemberFlags',
'VB_VarProcData',
'VB_UserMemID',
'VB_VarUserMemID',
'VB_GlobalNameSpace',
',',
'.',
'"',
'_',
'!',
'#',
'&',
"'",
'(',
')',
'*',
'+',
'-',
' /',
':',
';',
'<',
'<=',
'<>',
'=',
'=<',
'=>',
'>',
'><',
'>=',
'?',
'\\',
'^',
':=',
]
# Type names indexed by type id with the flag bits 0xE0 masked off (see `_get_type_name`).
# Empty strings mark ids without a name; lookups treat them as "no type name".
DIM_TYPES: list[str] = [
'', 'Null', 'Integer', 'Long', 'Single', 'Double', 'Currency',
'Date', 'String', 'Object', 'Error', 'Boolean', 'Variant', '',
'Decimal', '', '', 'Byte', '', '', 'LongLong', '', '', '',
'Any',
]
def _get_word(buffer: bytes | bytearray | memoryview, offset: int, endian: str) -> int:
    """
    Read an unsigned 16-bit integer from `buffer` at `offset`; `endian` is '<' or '>'.
    """
    (value,) = _STRUCT_WORD[endian].unpack_from(buffer, offset)
    return value
def _get_dword(buffer: bytes | bytearray | memoryview, offset: int, endian: str) -> int:
    """
    Read an unsigned 32-bit integer from `buffer` at `offset`; `endian` is '<' or '>'.
    """
    (value,) = _STRUCT_DWORD[endian].unpack_from(buffer, offset)
    return value
def _skip_structure(
    buffer: bytes | bytearray | memoryview,
    offset: int,
    endian: str,
    is_length_dw: bool,
    element_size: int,
    check_minus_one: bool,
) -> int:
    """
    Return the offset just past a length-prefixed structure starting at `offset`.

    The length prefix is a dword when `is_length_dw` is set and a word otherwise. The payload
    is `length * element_size` bytes long. When `check_minus_one` is set and the length field
    has all bits set, only the length field itself is skipped.
    """
    if is_length_dw:
        read_length, width, all_ones = _get_dword, 4, 0xFFFFFFFF
    else:
        read_length, width, all_ones = _get_word, 2, 0xFFFF
    length = read_length(buffer, offset, endian)
    after_prefix = offset + width
    if check_minus_one and length == all_ones:
        return after_prefix
    return after_prefix + length * element_size
def _get_var(
    buffer: bytes | bytearray | memoryview,
    offset: int,
    endian: str,
    is_dword: bool,
) -> tuple[int, int]:
    """
    Read a word or dword (selected by `is_dword`) at `offset`.

    Returns the pair `(offset after the value, value)`.
    """
    if is_dword:
        return offset + 4, _get_dword(buffer, offset, endian)
    return offset + 2, _get_word(buffer, offset, endian)
def _get_type_and_length(
    buffer: bytes | bytearray | memoryview,
    offset: int,
    endian: str,
) -> tuple[int, int]:
    """
    Return the two bytes at `offset` as a pair.

    For big-endian data ('>') the bytes are returned in storage order; for any other `endian`
    value they are swapped.
    """
    first, second = buffer[offset], buffer[offset + 1]
    return (first, second) if endian == '>' else (second, first)
def _translate_opcode(opcode: int, vba_ver: int, is_64bit: bool) -> int:
    """
    Map a raw opcode number of the given VBA version onto the VBA7 numbering used by OPCODES.

    Each version has a table of `(highest opcode in band, amount to add)` pairs; opcodes above
    the last band (and negative values) receive the version's overflow shift. For versions other
    than 3 and 5 the mapping only applies to non-64-bit projects; otherwise the opcode is
    returned unchanged.
    """
    if vba_ver == 3:
        bands = (
            (67, 0), (70, 2), (111, 4), (150, 8),
            (164, 9), (166, 10), (169, 11), (238, 12),
        )
        overflow = 24
    elif vba_ver == 5:
        bands = (
            (68, 0), (71, 1), (112, 3), (151, 7),
            (165, 8), (167, 9), (170, 10),
        )
        overflow = 11
    elif not is_64bit:
        bands = ((173, 0), (175, 1), (178, 2))
        overflow = 3
    else:
        return opcode
    if opcode >= 0:
        for upper, shift in bands:
            if opcode <= upper:
                return opcode + shift
    return opcode + overflow
def _get_id(
    id_code: int,
    identifiers: list[str],
    vba_ver: int,
    is_64bit: bool,
) -> str:
    """
    Resolve an identifier code to its name.

    The low bit of `id_code` is dropped. Codes of 0x100 and above refer to the module's own
    `identifiers` list (offset by 4 for VBA7 and by a further 3 for 64-bit projects); lower codes
    refer to INTERNAL_NAMES, with 'PtrSafe' spliced in at 0xE9 for VBA7 and an index correction
    from 0xC3 onwards for VBA6 and later.

    Returns the resolved name, or `id_XXXX` (the original code in hex) when the code does not map
    to a table entry. This includes codes whose corrected index would be negative: those are
    reserved slots and must not wrap around to the end of the table.
    """
    fallback = F'id_{id_code:04X}'
    index = id_code >> 1
    if index >= 0x100:
        table = identifiers
        index -= 0x100
        if vba_ver >= 7:
            index -= 4
            if is_64bit:
                index -= 3
    else:
        table = INTERNAL_NAMES
        if vba_ver >= 7:
            if index == 0xE9:
                return 'PtrSafe'
            if index > 0xE9:
                index -= 1
        if vba_ver >= 6 and index >= 0xC3:
            index -= 1
    if index < 0:
        # A negative index would silently select an unrelated entry from the end of the table.
        return fallback
    try:
        return table[index]
    except (IndexError, KeyError):
        return fallback
def _get_name(
    buffer: bytes | bytearray | memoryview,
    identifiers: list[str],
    offset: int,
    endian: str,
    vba_ver: int,
    is_64bit: bool,
) -> str:
    """
    Read the identifier code stored as a word at `offset` in `buffer` and resolve it to a name.
    """
    return _get_id(_get_word(buffer, offset, endian), identifiers, vba_ver, is_64bit)
def _get_type_name(type_id: int) -> str:
    """
    Translate a type id byte into a type name.

    The bits 0xE0 are flags and are masked off before the DIM_TYPES lookup; ids beyond the table
    yield an empty name. When flag 0x80 is set, 'Ptr' is appended, with 'LongLong' shortened to
    'Long' first (giving 'LongPtr').
    """
    flags = type_id & 0xE0
    base_id = type_id & ~0xE0
    name = ''
    if base_id < len(DIM_TYPES):
        name = DIM_TYPES[base_id]
    if not flags & 0x80:
        return name
    if name == 'LongLong':
        name = 'Long'
    return name + 'Ptr'
def _disasm_type(
    indirect_table: bytes | bytearray | memoryview,
    dword: int,
) -> str:
    """
    Name the type whose id byte is stored 6 bytes into the descriptor at offset `dword`.

    Falls back to `type_XXXXXXXX` (the descriptor offset in hex) when the id has no name.
    """
    resolved = _get_type_name(indirect_table[dword + 6])
    if resolved:
        return resolved
    return F'type_{dword:08X}'
# Built-in identifier names that are accepted as type names. When an object-table entry resolves
# to any other built-in identifier (code below 0x100 after dropping the low bit), the type
# resolvers discard that name and fall back to the external_types lookup instead.
_VALID_INTERNAL_TYPE_NAMES = frozenset({
'Boolean',
'Name',
})
class DisassemblyContext:
"""
Holds shared state for the disassembly of a single VBA module, eliminating repeated parameter
threading through every helper function.
"""
def __init__(
self,
indirect_table: bytes | bytearray | memoryview,
object_table: bytes | bytearray | memoryview,
declaration_table: bytes | bytearray | memoryview,
identifiers: list[str],
endian: str,
vba_ver: int,
is_64bit: bool,
codec: str,
version: int = 0,
module_data: bytes | bytearray | memoryview | None = None,
external_types: dict[int, str] | None = None,
):
self.indirect_table = indirect_table
self.object_table = object_table
self.declaration_table = declaration_table
self.identifiers = identifiers
self.endian = endian
self.vba_ver = vba_ver
self.is_64bit = is_64bit
self.codec = codec
self.version = version
self.module_data = module_data
self.external_types: dict[int, str] = external_types or {}
self._linecont_pending = False
self._has_pa_bit = False
def disasm_name(self, word: int, mnemonic: str, op_type: int) -> str:
    """
    Render the name argument of an opcode.

    `word` is the identifier code, `op_type` selects a type suffix character (or, for value 32,
    wraps the name in square brackets). For the mnemonics 'OnError' and 'Resume' the suffix is
    dropped and certain `op_type` values replace the name with a fixed text. Trailing whitespace
    is stripped from the result.
    """
    suffixes = ('', '?', '%', '&', '!', '#', '@', '?', '$', '?', '?', '?', '?', '?')
    ident = _get_id(word, self.identifiers, self.vba_ver, self.is_64bit)
    suffix = suffixes[op_type] if op_type < len(suffixes) else ''
    if op_type == 32:
        ident = F'[{ident}]'
    if mnemonic == 'OnError':
        suffix = ''
        if op_type == 1:
            ident = '(Resume Next)'
        elif op_type == 2:
            ident = '(GoTo 0)'
    elif mnemonic == 'Resume':
        suffix = ''
        if op_type == 1:
            ident = '(Next)'
        elif op_type != 0:
            ident = ''
    rendered = ident + suffix
    return rendered.rstrip()
def disasm_imp(self, arg: str, word: int, mnemonic: str) -> str:
if mnemonic != 'Open':
if arg == 'imp_':
shift = 3 if self.is_64bit else 2
offs = (word >> shift) * 10
if offs + 8 <= len(self.object_table):
hl_name = _get_word(self.object_table, offs + 6, self.endian)
if hl_name == 0:
return self.external_types.get(offs, '')
name = _get_id(hl_name, self.identifiers, self.vba_ver, self.is_64bit)
if (hl_name >> 1) < 0x100 and name not in _VALID_INTERNAL_TYPE_NAMES:
return self.external_types.get(offs, '')
return name
return F'{arg}{word:04X}'
access_mode = ['Read', 'Write', 'Read Write']
lock_mode = ['Read Write', 'Write', 'Read']
mode = word & 0x00FF
access = (word & 0x0F00) >> 8
lock = (word & 0xF000) >> 12
imp_name = '(For '
if mode & 0x01:
imp_name += 'Input'
elif mode & 0x02:
imp_name += 'Output'
elif mode & 0x04:
imp_name += 'Random'
elif mode & 0x08:
imp_name += 'Append'
elif mode == 0x20:
imp_name += 'Binary'
if access and (access <= len(access_mode)):
imp_name += F' Access {access_mode[access - 1]}'
if lock:
if lock & 0x04:
imp_name += ' Shared'
elif lock <= len(lock_mode):
imp_name += F' Lock {lock_mode[lock - 1]}'
imp_name += ')'
return imp_name
def disasm_rec(self, dword: int) -> str:
    """
    Render the record at offset `dword` of the indirect table as '(Public) name' or
    '(Private) name', depending on bit 0 of the option word stored 18 bytes into the record.
    """
    record_name = _get_name(
        self.indirect_table, self.identifiers, dword + 2,
        self.endian, self.vba_ver, self.is_64bit)
    options = _get_word(self.indirect_table, dword + 18, self.endian)
    visibility = '(Public)' if options & 1 else '(Private)'
    return F'{visibility} {record_name}'
def _resolve_udt_name(self, type_desc: int) -> str:
    """Resolve a user-defined type (type_id=0x1D) from the object table.

    The type descriptor at `type_desc` stores the object table reference word at
    offset +8 (instead of the usual +2 for non-builtin types).

    Returns the type name, the `external_types` entry for the object-table offset when the
    entry carries no name id, or an empty string when either table is too short.
    """
    if type_desc + 10 > len(self.indirect_table):
        return ''
    word = _get_word(self.indirect_table, type_desc + 8, self.endian)
    shift = 3 if self.is_64bit else 2
    offs = (word >> shift) * 10
    # The name id is a 2-byte word at offs + 6, so the entry must extend to offs + 8 regardless
    # of bitness; a smaller bound lets the read below run past the end of the table.
    if offs + 8 > len(self.object_table):
        return ''
    hl_name = _get_word(self.object_table, offs + 6, self.endian)
    if hl_name == 0:
        return self.external_types.get(offs, '')
    return _get_id(hl_name, self.identifiers, self.vba_ver, self.is_64bit)
def disasm_object(self, offset: int) -> tuple[str, bool]:
    """
    Resolve the type referenced by the dword at `offset` of the indirect table.

    The dword is the offset of a type descriptor. Its flag word provides the array flag
    (0x0800) and tells whether the descriptor names a built-in/user-defined type directly
    (0x02) or refers to an object-table entry. 32-bit and 64-bit projects differ only in how
    the object-table reference word is shifted.

    Returns `(type name, is_array)`. The name is empty when nothing could be resolved; when a
    table is too short for the required read, `('', False)` is returned.
    """
    ind = self.indirect_table
    obj = self.object_table
    type_desc = _get_dword(ind, offset, self.endian)
    # Both the flag word and the reference word (bytes 0..3 of the descriptor) must be present.
    if type_desc + 4 > len(ind):
        return '', False
    flags = _get_word(ind, type_desc, self.endian)
    is_array = bool(flags & 0x0800)
    if flags & 0x02:
        if ind[type_desc + 6] == 0x1D:
            name = self._resolve_udt_name(type_desc)
            if name:
                return name, is_array
        return _disasm_type(ind, type_desc), is_array
    word = _get_word(ind, type_desc + 2, self.endian)
    offs = (word >> (3 if self.is_64bit else 2)) * 10
    # The name id is a 2-byte word at offs + 6, so offs + 8 bytes are required.
    if offs + 8 > len(obj):
        return '', False
    hl_name = _get_word(obj, offs + 6, self.endian)
    if hl_name == 0:
        return self.external_types.get(offs) or '', is_array
    if hl_name == 0xFFFF:
        # No name id: try the type id byte at +6, then the one at +16 if that has no name.
        type_name = _get_type_name(ind[type_desc + 6])
        if not type_name and type_desc + 17 <= len(ind):
            type_name = _get_type_name(ind[type_desc + 16])
        return type_name, is_array
    name = _get_id(hl_name, self.identifiers, self.vba_ver, self.is_64bit)
    if (hl_name >> 1) < 0x100 and name not in _VALID_INTERNAL_TYPE_NAMES:
        # A built-in identifier that is not a valid type name: prefer the external type.
        return self.external_types.get(offs) or '', is_array
    return name, is_array
def disasm_var(self, dword: int) -> VarInfo:
# Decode the variable record at offset `dword` of the indirect table into a VarInfo.
# The first two bytes of the record are flag bytes: bit 0x20 of the first marks an explicit
# As clause, bit 0x20 of the second marks New.
b_flag1 = self.indirect_table[dword]
b_flag2 = self.indirect_table[dword + 1]
has_as = (b_flag1 & 0x20) != 0
has_new = (b_flag2 & 0x20) != 0
# The identifier code of the variable name is the word at +2.
var_name = _get_name(
self.indirect_table, self.identifiers, dword + 2,
self.endian, self.vba_ver, self.is_64bit)
type_ref: TypeRef | None = None
if has_new or has_as:
type_name = ''
is_array = False
if has_as:
# The type field sits at +16 in 64-bit projects and at +12 otherwise. When the word at +2
# of that field is 0xFFFF, the first byte of the field is a type id; otherwise the field is
# resolved through disasm_object.
offs = 16 if self.is_64bit else 12
word = _get_word(self.indirect_table, dword + offs + 2, self.endian)
if word == 0xFFFF:
type_id = self.indirect_table[dword + offs]
type_name = _get_type_name(type_id)
else:
type_name, is_array = self.disasm_object(dword + offs)
if type_name:
type_ref = TypeRef(type_name, is_array)
else:
# Without As/New, a type is only recorded when it is one of the suffix types; such a
# TypeRef is marked with from_suffix=True.
offs = 16 if self.is_64bit else 12
if len(self.indirect_table) >= dword + offs + 4:
word = _get_word(self.indirect_table, dword + offs + 2, self.endian)
if word == 0xFFFF:
type_id = self.indirect_table[dword + offs]
# NOTE(review): bit 0x40 of the type id is cleared when flag bit 0x10 is set; the meaning
# of these two bits is not visible here -- confirm against the format notes.
if (type_id & 0x40) and (b_flag1 & 0x10):
type_id &= ~0x40
if type_id in _SUFFIX_TYPE_IDS:
type_name = _get_type_name(type_id)
if type_name:
type_ref = TypeRef(type_name, from_suffix=True)
else:
# Best effort: any failure while resolving the object type is treated as "no type".
try:
type_name, is_array = self.disasm_object(dword + offs)
except Exception:
type_name = ''
is_array = False
if type_name in _SUFFIX_TYPES:
type_ref = TypeRef(type_name, is_array, from_suffix=True)
elif is_array:
# Arrays without a recorded type are shown with '()' appended to the name.
var_name += '()'
# has_withevents is not set here and keeps its default.
return VarInfo(var_name, type_ref, has_new)
def disasm_arg(self, arg_offset: int) -> ArgInfo | None:
# Decode the argument record at `arg_offset` of the indirect table into an ArgInfo.
# Returns None when the name word of the record is 0xFFFE or 0xFFFF.
flags = _get_word(self.indirect_table, arg_offset, self.endian)
# In 64-bit projects, the fields after the name are shifted by 4 bytes.
offs = 4 if self.is_64bit else 0
name_word = _get_word(self.indirect_table, arg_offset + 2, self.endian)
if name_word >= 0xFFFE:
return None
arg_name = _get_name(
self.indirect_table, self.identifiers, arg_offset + 2,
self.endian, self.vba_ver, self.is_64bit)
arg_type = _get_dword(self.indirect_table, arg_offset + offs + 12, self.endian)
arg_opts = _get_word(self.indirect_table, arg_offset + offs + 24, self.endian)
# Option bits: 0x0001 ParamArray, 0x0002 ByRef, 0x0004 ByVal, 0x0200 Optional.
is_paramarray = bool(arg_opts & 0x0001)
if is_paramarray:
# Remember on the context that a ParamArray bit was seen.
self._has_pa_bit = True
is_byval = bool(arg_opts & 0x0004)
is_byref = bool(arg_opts & 0x0002)
is_optional = bool(arg_opts & 0x0200)
type_ref: TypeRef | None = None
# Flag 0x0020 selects the branch that records the type as a plain TypeRef; presumably this
# marks an explicit As clause (the same bit is named has_as for variables) -- TODO confirm.
if flags & 0x0020:
arg_type_name = ''
is_array = False
# A type dword with the upper half 0xFFFF carries a type id in its low byte.
if (arg_type & 0xFFFF0000) == 0xFFFF0000:
arg_type_id = arg_type & 0x000000FF
arg_type_name = _get_type_name(arg_type_id)
# In 64-bit projects, a small dword that indexes a named DIM_TYPES entry is used as type id.
elif self.is_64bit and arg_type < len(DIM_TYPES) and DIM_TYPES[arg_type]:
arg_type_name = _get_type_name(arg_type)
else:
# Otherwise the dword is resolved as a type descriptor reference.
arg_type_name, is_array = self.disasm_object(arg_offset + offs + 12)
if arg_type_name.startswith('type_') and self.is_64bit:
arg_type_id = arg_type & 0x000000FF
if arg_type_id < len(DIM_TYPES) and DIM_TYPES[arg_type_id]:
arg_type_name = _get_type_name(arg_type_id)
is_array = False
if arg_type_name:
type_ref = TypeRef(arg_type_name, is_array)
# The remaining branches only record a type when it is one of the suffix types and mark the
# TypeRef with from_suffix=True.
elif (arg_type & 0xFFFF0000) == 0xFFFF0000:
arg_type_id = arg_type & 0x000000FF
if arg_type_id in _SUFFIX_TYPE_IDS:
type_name = _get_type_name(arg_type_id)
if type_name:
type_ref = TypeRef(type_name, from_suffix=True)
elif self.is_64bit and arg_type < len(DIM_TYPES) and DIM_TYPES[arg_type]:
if arg_type in _SUFFIX_TYPE_IDS:
type_name = _get_type_name(arg_type)
if type_name:
type_ref = TypeRef(type_name, from_suffix=True)
else:
# Best effort: any failure while resolving the object type is treated as "no type".
try:
type_name, is_array = self.disasm_object(arg_offset + offs + 12)
except Exception:
type_name = ''
is_array = False
if type_name in _SUFFIX_TYPES:
type_ref = TypeRef(type_name, is_array, from_suffix=True)
elif (not type_name or type_name.startswith('type_')) and self.is_64bit:
arg_type_id = arg_type & 0x000000FF
if arg_type_id < len(DIM_TYPES) and DIM_TYPES[arg_type_id]:
type_name = _get_type_name(arg_type_id)
if type_name in _SUFFIX_TYPES:
type_ref = TypeRef(type_name, from_suffix=True)
elif is_array:
# Arrays without a recorded type are shown with '()' appended to the name.
arg_name += '()'
elif is_array:
arg_name += '()'
default_value: str | None = None
if is_optional:
# For optional arguments, a tag word at +28 and a value dword at +32 (both relative to the
# shifted record) describe the default value; they are only read when fully in bounds.
default_tag_off = arg_offset + offs + 28
default_val_off = arg_offset + offs + 32
ind = self.indirect_table
if default_tag_off + 2 <= len(ind) and default_val_off + 4 <= len(ind):
vt_tag = _get_word(ind, default_tag_off, self.endian)
value_dw = _get_dword(ind, default_val_off, self.endian)
default_value = self._format_default_value(vt_tag, value_dw)
return ArgInfo(
arg_name, type_ref, is_byval, is_byref,
is_optional, is_paramarray, default_value,
)
def _format_default_value(self, vt_tag: int, value_dw: int) -> str | None:
VT_I2 = 2
VT_I4 = 3
VT_R4 = 4
VT_R8 = 5
VT_CY = 6
VT_BSTR = 8
VT_BOOL = 11
VT_UI1 = 17
ind = self.indirect_table
if vt_tag == 0:
return None
elif vt_tag == VT_I2:
val = value_dw & 0xFFFF
return str(val - 0x10000 if val > 0x7FFF else val)
elif vt_tag == VT_I4:
return str(value_dw - 0x100000000 if value_dw > 0x7FFFFFFF else value_dw)
elif vt_tag == VT_R4:
val = _struct.unpack('<f', _struct.pack('<I', value_dw))[0]
return str(int(val)) if val == int(val) and abs(val) < 1e15 else str(val)
elif vt_tag == VT_R8:
if value_dw + 8 <= len(ind):
val = _struct.unpack('<d', bytes(ind[value_dw:value_dw + 8]))[0]
return str(int(val)) if val == int(val) and abs(val) < 1e15 else str(val)
elif vt_tag == VT_CY:
val = value_dw / 10000
return str(int(val)) if val == int(val) else str(val)
elif vt_tag == VT_BSTR:
if value_dw + 4 <= len(ind):
str_len = _get_dword(ind, value_dw, self.endian)
if str_len == 0:
return '""'
if 0 < str_len < 0x10000 and value_dw + 4 + str_len <= len(ind):
s = bytes(ind[value_dw + 4:value_dw + 4 + str_len]).decode(self.codec, errors='replace')
return F'"{s}"'
elif vt_tag == VT_BOOL:
return 'True' if (value_dw & 0xFFFF) != 0 else 'False'
elif vt_tag == VT_UI1:
return str(value_dw & 0xFF)
return None
def _patch_64bit_defaults(self, arg_list: list[ArgInfo], func_name: str) -> None:
md = self.module_data
if md is None:
return
raw = bytes(md)
need = sum(1 for a in arg_list if a.is_optional and a.default_value is None)
if need == 0:
return
blocks: list[tuple[int, list[str]]] = []
pos = 0
while True:
idx = raw.find(b'\xfa\x00\xb9\x00', pos)
if idx < 0:
break
defaults: list[str] = []
cur = idx + 2
while cur + 4 <= len(raw) and raw[cur:cur + 2] == b'\xb9\x00':
str_len = _get_word(raw, cur + 2, '<')
if str_len <= 0 or str_len > 0x1000 or cur + 4 + str_len > len(raw):
break
str_data = raw[cur + 4:cur + 4 + str_len]
if not all(32 <= b < 127 or b in (9, 10, 13) for b in str_data):
break
defaults.append(str_data.decode(self.codec, errors='replace'))
cur = cur + 4 + str_len
if defaults:
blocks.append((idx, defaults))
pos = idx + 4
if not blocks:
return
func_bytes = func_name.encode(self.codec, errors='replace')
func_pos = raw.find(func_bytes)
if func_pos >= 0:
best = min(blocks, key=lambda b: abs(b[0] - func_pos))
defaults = best[1]
elif len(blocks) == 1:
defaults = blocks[0][1]
else:
return
defaults = list(reversed(defaults))
di = 0
for arg in arg_list:
if not arg.is_optional or arg.default_value is not None:
continue
if di >= len(defaults):
break
arg.default_value = F'"{defaults[di]}"'
di += 1
def _declare64(self, decl_offset: int, func_name: str) -> tuple[str | None, str | None]:
    """
    Extract Lib and Alias names from a 64-bit Declare entry in the declaration table.
    The 64-bit entry structure differs significantly from 32-bit: the lib name identifier
    word is not at a fixed offset within the entry header. Instead, we extract the lib name
    from VBA source text stored later in the declaration table, falling back to the binary
    structure when source text is not available.

    `decl_offset` is the offset of the entry inside the declaration table and `func_name` is
    the declared procedure name used to locate the matching source text. The return value is
    the tuple `(lib_name, alias_name)`; `alias_name` stays None when no alias was found.
    """
    decl = self.declaration_table
    decl_bytes = bytes(decl)
    lib_name = None
    alias_name = None
    # Strategy 1: Extract from VBA source text in the declaration table.
    # The source text may contain embedded null bytes, so strip them before matching.
    text = decl_bytes.replace(b'\x00', b'').decode('ascii', errors='replace')
    match = re.search(
        rf'(?:Function|Sub)\s+{re.escape(func_name)}\b.*?Lib\s+"([^"]+)"', text)
    if match:
        lib_name = match.group(1)
        # An Alias clause is only accepted directly after the Lib clause.
        after_lib = text[match.end():]
        alias_match = re.match(r'\s*Alias\s*"([^"]+)"', after_lib)
        if alias_match:
            alias_name = alias_match.group(1)
    # Strategy 2: Binary structure fallback. The alias string offset depends on version:
    # VBA7 version 0x0097 has 4 extra bytes of padding (alias at +0x20), later versions
    # use the standard offset (+0x1C).
    _alias_off = 0x20 if self.version <= 0x97 else 0x1C
    if lib_name is None and self.version > 0x97 and decl_offset >= 2:
        # For VBA7 versions after 0x97 the lib identifier word for each entry is stored
        # in the 2 bytes immediately preceding the entry header, placed there as trailing
        # data of the previous entry. This does not apply to the very first entry
        # (decl_offset == 0) or to versions <= 0x97 where the lib word sits at header +2.
        lib_word = _get_word(decl, decl_offset - 2, self.endian)
        if lib_word != 0 and lib_word != 0xFFFF:
            lib_name = _get_id(lib_word, self.identifiers, self.vba_ver, self.is_64bit)
    if lib_name is None:
        # Next attempt: find the end of the printable alias string, round the position after
        # its terminator up to a multiple of 4 and read the lib word 2 bytes further on.
        alias_start = decl_offset + _alias_off
        if alias_start < len(decl):
            alias_bytes_raw = bytes(decl[alias_start:])
            null_pos = alias_bytes_raw.find(0)
            if null_pos > 0 and all(32 <= b < 127 for b in alias_bytes_raw[:null_pos]):
                abs_null = alias_start + null_pos
                dword_aligned = (abs_null + 1 + 3) & ~3
                lib_word_offset = dword_aligned + 2
                if lib_word_offset + 2 <= len(decl):
                    lib_word = _get_word(decl, lib_word_offset, self.endian)
                    if lib_word != 0 and lib_word != 0xFFFF:
                        lib_name = _get_id(lib_word, self.identifiers, self.vba_ver, self.is_64bit)
    if lib_name is None:
        # Last resort: the identifier word at header offset +2.
        lib_word = _get_word(decl, decl_offset + 2, self.endian)
        if lib_word != 0:
            lib_name = _get_id(lib_word, self.identifiers, self.vba_ver, self.is_64bit)
    # Read alias from binary structure if not found via source text.
    # This is skipped whenever the source text matched, even if it had no Alias clause.
    if alias_name is None and not match:
        alias_start = decl_offset + _alias_off
        if alias_start < len(decl):
            alias_bytes_raw = bytes(decl[alias_start:])
            null_pos = alias_bytes_raw.find(0)
            if null_pos > 0:
                alias_name = alias_bytes_raw[:null_pos].decode(self.codec, errors='replace')
    return lib_name, alias_name
def disasm_func(self, dword: int, op_type: int) -> FuncInfo:
    """
    Decode the procedure descriptor at offset `dword` of the indirect table into a FuncInfo.

    `op_type` is the operand type taken from the opcode word; it contributes the Public scope
    bit (0x04) and distinguishes Function from Sub (values 2 and 6). The descriptor supplies
    the flags, the name identifier, the head of the argument chain, the return type and the
    offset of the Declare entry. Reading past the end of the indirect table raises; callers
    are expected to handle that.
    """
    flags = _get_word(self.indirect_table, dword, self.endian)
    name_word = _get_word(self.indirect_table, dword + 2, self.endian)
    # Field positions shift by 4 bytes after VBA5 and by another 16 bytes in 64-bit projects.
    offs2 = 4 if self.vba_ver > 5 else 0
    if self.is_64bit:
        offs2 += 16
    self._linecont_pending = False
    sub_name = _get_id(name_word, self.identifiers, self.vba_ver, self.is_64bit)
    arg_offset = _get_dword(self.indirect_table, dword + offs2 + 36, self.endian)
    ret_type = _get_dword(self.indirect_table, dword + offs2 + 40, self.endian)
    decl_offset = _get_word(self.indirect_table, dword + offs2 + 44, self.endian)
    c_options_offset = 60 if self.is_64bit and self.version > 0x97 else 54
    c_options = self.indirect_table[dword + offs2 + c_options_offset]
    new_flags_offset = 63 if self.is_64bit and self.version > 0x97 else 57
    new_flags = self.indirect_table[dword + offs2 + new_flags_offset]
    scope = ''
    is_friend = False
    if self.vba_ver > 5:
        if (new_flags & 0x0002) == 0:
            scope = 'Private'
        elif op_type & 0x04:
            scope = 'Public'
        if new_flags & 0x0004:
            is_friend = True
    else:
        if (flags & 0x0008) == 0:
            scope = 'Private'
        elif op_type & 0x04:
            scope = 'Public'
    is_static = bool(flags & 0x0080)
    has_declare = (c_options & 0x90) == 0 and decl_offset != 0xFFFF
    is_ptrsafe = bool(self.vba_ver > 5 and new_flags & 0x20)
    has_as = (flags & 0x0020) != 0
    if flags & 0x1000:
        kind = 'Function' if op_type in (2, 6) else 'Sub'
    elif flags & 0x2000:
        kind = 'Property Get'
    elif flags & 0x4000:
        kind = 'Property Let'
    elif flags & 0x8000:
        kind = 'Property Set'
    else:
        kind = 'Sub'
    return_type: TypeRef | None = None
    if has_as:
        type_name = ''
        is_array = False
        if (ret_type & 0xFFFF0000) == 0xFFFF0000:
            # Built-in type: the low byte is the type identifier.
            type_id = ret_type & 0x000000FF
            type_name = _get_type_name(type_id)
        else:
            type_name, is_array = self.disasm_object(dword + offs2 + 40)
        if type_name:
            return_type = TypeRef(type_name, is_array)
    elif (ret_type & 0xFFFF0000) == 0xFFFF0000:
        # No explicit As clause: the type can still be implied by a name suffix.
        ret_type_id = ret_type & 0x000000FF
        if ret_type_id in _SUFFIX_TYPE_IDS:
            type_name = _get_type_name(ret_type_id)
            if type_name:
                return_type = TypeRef(type_name, from_suffix=True)
    lib_name: str | None = None
    alias_name: str | None = None
    if has_declare:
        if self.is_64bit:
            lib_name, alias_name = self._declare64(decl_offset, sub_name)
        else:
            lib_name = _get_name(
                self.declaration_table, self.identifiers, decl_offset + 2,
                self.endian, self.vba_ver, self.is_64bit)
            alias_offset = _get_word(
                self.declaration_table, decl_offset + 4, self.endian)
            if alias_offset < len(self.declaration_table):
                alias_bytes = bytes(self.declaration_table[alias_offset:])
                null_pos = alias_bytes.find(0)
                if null_pos > 0:
                    alias_name = alias_bytes[:null_pos].decode(
                        self.codec, errors='replace')
        # An alias identical to the procedure name carries no information.
        if alias_name == sub_name:
            alias_name = None
    arg_list: list[ArgInfo] = []
    next_field = 24 if self.is_64bit else 20
    # The arguments form a linked list inside the indirect table. The table comes from an
    # untrusted document, so remember every visited offset and stop as soon as the chain
    # revisits one; otherwise a crafted cycle would loop forever and exhaust memory.
    visited: set[int] = set()
    while (
        arg_offset != 0xFFFFFFFF
        and arg_offset != 0
        and arg_offset + 26 < len(self.indirect_table)
        and arg_offset not in visited
    ):
        visited.add(arg_offset)
        arg = self.disasm_arg(arg_offset)
        if arg is not None:
            arg_list.append(arg)
        arg_offset = _get_dword(
            self.indirect_table,
            arg_offset + next_field,
            self.endian,
        )
    if self.is_64bit and any(
        a.is_optional and a.default_value is None for a in arg_list
    ):
        self._patch_64bit_defaults(arg_list, sub_name)
    if (
        arg_list
        and not self._has_pa_bit
        and not any(a.is_paramarray for a in arg_list)
    ):
        # Without an explicit ParamArray bit, infer it for a trailing untyped or Variant
        # array argument that carries no ByVal, ByRef or Optional modifier.
        last = arg_list[-1]
        _pa_candidate = (
            last.type is not None
            and last.type.is_array
            and (last.type.name == 'Variant' or last.type.name == '')
        ) or (
            last.type is None
            and last.name.endswith('()')
        )
        _pa_no_modifiers = not last.is_byval and not last.is_byref and not last.is_optional
        if _pa_candidate and _pa_no_modifiers:
            last.is_paramarray = True
    if is_friend:
        scope = 'Friend' if not scope else F'{scope} Friend'
    return FuncInfo(
        scope, is_static, kind, sub_name, arg_list,
        return_type, has_declare, is_ptrsafe, lib_name, alias_name,
    )
def disasm_var_arg(
self,
module_data: bytes | bytearray | memoryview,
offset: int,
w_length: int,
mnemonic: str,
) -> list[str]:
substring = module_data[offset:offset + w_length]
length_str = F'0x{w_length:04X}'
if mnemonic in ('LitStr', 'QuoteRem', 'Rem', 'Reparse'):
quoted = F'"{codecs.decode(substring, self.codec, "replace")}"'
return [length_str, quoted]
elif mnemonic in ('OnGosub', 'OnGoto'):
offset1 = offset
names: list[str] = []
for _ in range(w_length // 2):
offset1, word = _get_var(module_data, offset1, self.endian, False)
names.append(_get_id(word, self.identifiers, self.vba_ver, self.is_64bit))
return [length_str, ', '.join(names)]
else:
hex_dump = ' '.join(F'{c:02X}' for c in substring)
return [length_str, hex_dump]
def dump_line(
    self,
    module_data: bytes | bytearray | memoryview,
    line_start: int,
    line_length: int,
) -> list[tuple[str, list[OpcodeArg]]]:
    """
    Disassemble one p-code line into a list of (mnemonic, [arg, ...]) tuples.

    Decoding starts at `line_start` in `module_data` and covers `line_length` bytes. A line
    with a non-positive length yields an empty list. Decoding of the line stops early, and
    the instructions decoded so far are returned, when an opcode is not found in OPCODES.
    """
    self._linecont_pending = False
    result: list[tuple[str, list[OpcodeArg]]] = []
    if line_length <= 0:
        return result
    offset = line_start
    end_of_line = line_start + line_length
    while offset < end_of_line:
        offset, opcode = _get_var(module_data, offset, self.endian, False)
        # The low 10 bits select the instruction, the bits above them are the operand type.
        op_type = (opcode & ~0x03FF) >> 10
        opcode &= 0x03FF
        translated = _translate_opcode(opcode, self.vba_ver, self.is_64bit)
        if translated not in OPCODES:
            return result
        instruction = OPCODES[translated]
        mnemonic = instruction.mnem
        if op_type == 8 and mnemonic in ('FnMid', 'FnMidB', 'FnCurDir', 'FnError', 'Mid', 'MidB'):
            mnemonic += '$'
        parts: list[OpcodeArg] = []
        # First, arguments that are encoded in the operand type itself.
        if mnemonic in ('Coerce', 'CoerceVar', 'DefType'):
            if op_type < len(_VAR_TYPES_LONG):
                parts.append(CoerceType(_VAR_TYPES_LONG[op_type]))
            elif op_type == 17:
                parts.append(CoerceType('Byte'))
            else:
                parts.append(CoerceType(str(op_type)))
        elif mnemonic in ('Dim', 'DimImplicit', 'Type'):
            dim_type: list[str] = []
            if op_type & 0x04:
                dim_type.append('Global')
            elif op_type & 0x08:
                dim_type.append('Public')
            elif op_type & 0x10:
                dim_type.append('Private')
            elif op_type & 0x20:
                dim_type.append('Static')
            if (op_type & 0x01) and (mnemonic != 'Type'):
                dim_type.append('Const')
            if dim_type:
                parts.append(DimScope(dim_type))
        elif mnemonic == 'LitVarSpecial':
            parts.append(_SPECIALS[op_type])
        elif mnemonic in ('ArgsCall', 'ArgsMemCall', 'ArgsMemCallWith'):
            if op_type < 16:
                parts.append('(Call)')
            else:
                # The adjusted operand type is what the name decoding below receives.
                op_type -= 16
        elif mnemonic == 'Option':
            parts.append(_OPTIONS[op_type])
        elif mnemonic in ('Redim', 'RedimAs'):
            if op_type & 16:
                parts.append('(Preserve)')
        elif mnemonic in (
            'FnDir', 'FnFormat', 'FnStringVar', 'FnStringStr',
        ):
            parts.append(F'0x{op_type:04X}')
        elif mnemonic == 'LitSmallI2':
            parts.append(str(op_type))
        # Then the fixed arguments listed for the instruction, read from the byte stream.
        for arg in instruction.args:
            if arg == 'name':
                offset, word = _get_var(module_data, offset, self.endian, False)
                the_name = self.disasm_name(word, mnemonic, op_type)
                if the_name:
                    parts.append(the_name)
            elif arg in ('0x', 'imp_'):
                offset, word = _get_var(module_data, offset, self.endian, False)
                the_imp = self.disasm_imp(arg, word, mnemonic)
                if the_imp:
                    parts.append(the_imp)
            elif arg in ('func_', 'var_', 'rec_', 'type_', 'context_'):
                # These arguments are offsets into the indirect table. Each is resolved only
                # if the table is long enough for the check below; otherwise the raw value
                # is emitted.
                offset, dword = _get_var(module_data, offset, self.endian, True)
                if (
                    arg == 'rec_'
                    and len(self.indirect_table) >= dword + 20
                ):
                    parts.append(RecordInfo(self.disasm_rec(dword)))
                elif (
                    arg == 'type_'
                    and len(self.indirect_table) >= dword + 7
                ):
                    type_id = self.indirect_table[dword + 6]
                    if type_id == 0x1D:
                        the_type = self._resolve_udt_name(dword)
                    else:
                        the_type = ''
                    if not the_type:
                        the_type = _disasm_type(self.indirect_table, dword)
                    parts.append(TypeRef(the_type))
                elif (
                    arg == 'var_'
                    and len(self.indirect_table) >= dword + 16
                ):
                    var_info = self.disasm_var(dword)
                    if op_type & 0x20:
                        var_info.has_withevents = True
                    parts.append(var_info)
                    if op_type & 0x10:
                        # An extra word follows the variable reference in this case.
                        word = _get_word(module_data, offset, self.endian)
                        offset += 2
                        parts.append(F'0x{word:04X}')
                elif (
                    arg == 'func_'
                    and len(self.indirect_table) >= dword + 61
                ):
                    parts.append(self.disasm_func(dword, op_type))
                else:
                    parts.append(F'{arg}{dword:08X}')
                if self.is_64bit and (arg == 'context_'):
                    # 64-bit modules store a second dword for context arguments.
                    offset, dword = _get_var(module_data, offset, self.endian, True)
                    parts.append(F'{dword:08X}')
        if instruction.varg:
            offset, w_length = _get_var(module_data, offset, self.endian, False)
            var_arg_parts = self.disasm_var_arg(
                module_data, offset, w_length, mnemonic)
            parts.extend(var_arg_parts)
            offset += w_length
            # Payloads of odd length are followed by one padding byte.
            if w_length & 1:
                offset += 1
        result.append((mnemonic, parts))
        if mnemonic == 'LineCont':
            self._linecont_pending = True
    return result
# MS-OVBA specification offsets for module stream parsing
# Position of the dword that, increased by _OFFSET_OBJECT_TABLE, locates the object table length.
_OFFSET_DW_LENGTH = 0x0005
# Position of the dword from which the start of the indirect table is derived.
_OFFSET_VBA6_INDIRECT_START = 0x0011
# Declaration table length and data positions for 32-bit modules.
_OFFSET_VBA6_32_DECL_LENGTH = 0x003F
_OFFSET_VBA6_32_DECL_DATA = 0x0043
# Declaration table length and data positions for 64-bit modules.
_OFFSET_VBA6_64_DECL_LENGTH = 0x0043
_OFFSET_VBA6_64_DECL_DATA = 0x0047
# NOTE(review): despite the name, _pcode_dump appears to use this for 32-bit modules too; confirm.
_OFFSET_VBA6_64_LINE_START = 0x0019
# Added to a dword read from the module to reach the object table length field.
_OFFSET_OBJECT_TABLE = 0x008A
# Added to a dword read from the module to reach the p-code line table.
_OFFSET_PCODE_LINES = 0x003C
# Word expected at the start of the p-code line table.
_PCODE_MAGIC = 0xCAFE
def _parse_external_type_table(
module_data: bytes | bytearray | memoryview,
object_table: bytes | bytearray | memoryview,
ot_start_in_module: int,
endian: str,
identifiers: list[str],
vba_ver: int,
is_64bit: bool,
) -> dict[int, str]:
"""Parse the external type table that follows the object table in module_data.
For each external OT entry (hl_name == 0 or small internal name), the table
stores a record with id_codes for the library name and type name. The structure is:
+0: FFFF (separator)
+2: 0101 (flags)
+4: DWORD size of payload
+8: payload containing one or more 8-byte type pairs:
0200 <lib_id> <type_id> 0000
A single record may contain multiple type pairs packed in its payload.
"""
result: dict[int, str] = {}
ot_len = len(object_table)
if ot_len == 0:
return result
external_ot_offsets: list[int] = []
extra_ot_offsets: list[int] = []
for ot_idx in range(ot_len // 10):
ot_offs = ot_idx * 10
hl_name = _get_word(object_table, ot_offs + 6, endian)
if hl_name == 0:
external_ot_offsets.append(ot_offs)
elif (hl_name >> 1) < 0x100:
try:
name = _get_id(hl_name, identifiers, vba_ver, is_64bit)
except Exception:
continue
if name not in _VALID_INTERNAL_TYPE_NAMES:
extra_ot_offsets.append(ot_offs)
external_ot_offsets.extend(extra_ot_offsets)
if not external_ot_offsets:
return result
pos = ot_start_in_module + ot_len
ot_iter = iter(external_ot_offsets)
try:
while pos + 8 <= len(module_data):
marker = _get_word(module_data, pos, endian)
if marker != 0xFFFF:
break
pos += 2
# currently unused:
# _flags = _get_word(module_data, pos, endian)
pos += 2
payload_size = _get_dword(module_data, pos, endian)
pos += 4
if payload_size < 6 or pos + payload_size > len(module_data):
break
payload_end = pos + payload_size
while pos + 8 <= payload_end:
# currently unused:
# _prefix = _get_word(module_data, pos, endian)
lib_id = _get_word(module_data, pos + 2, endian)
type_id = _get_word(module_data, pos + 4, endian)
pos += 8
ot_offs = next(ot_iter, None)
if ot_offs is None:
break
try:
lib_name = _get_id(lib_id, identifiers, vba_ver, is_64bit)
except Exception:
continue
try:
type_name = _get_id(type_id, identifiers, vba_ver, is_64bit)
except Exception:
continue
if lib_name and type_name:
result[ot_offs] = F'{lib_name}.{type_name}'
pos = payload_end
except Exception:
pass
return result
def _pcode_dump(
    module_data: bytes | bytearray | memoryview,
    vba_project_data: bytes | bytearray | memoryview,
    identifiers: list[str],
    is_64bit: bool,
    codec: str,
) -> list[PCodeLine]:
    """
    Disassemble p-code from a VBA module stream. Returns structured PCodeLine objects.

    `module_data` is the raw module stream and `vba_project_data` the _VBA_PROJECT stream,
    whose version word selects the layout (VBA5 versus VBA6/7). `identifiers` is the identifier
    table used to resolve names, and `codec` decodes string literals.

    The function is best-effort: any error while parsing, including a module stream that is
    too short for the initial endianness probe, is logged as a warning and the lines decoded
    up to that point are returned. An empty list is returned when the p-code magic is absent.
    """
    lines: list[PCodeLine] = []
    vba_ver = 3
    try:
        # The endianness probe reads from the module stream and therefore has to be covered
        # by the error handler as well; a truncated stream must not abort the caller.
        if _get_word(module_data, 2, '<') > 0xFF:
            endian = '>'
        else:
            endian = '<'
        version = _get_word(vba_project_data, 2, endian)
        if version >= 0x6B:
            if version >= 0x97:
                vba_ver = 7
            else:
                vba_ver = 6
            if is_64bit:
                dw_length = _get_dword(module_data, _OFFSET_VBA6_64_DECL_LENGTH, endian)
                declaration_table = module_data[
                    _OFFSET_VBA6_64_DECL_DATA:_OFFSET_VBA6_64_DECL_DATA + dw_length]
                dw_length = _get_dword(module_data, _OFFSET_VBA6_INDIRECT_START, endian)
                table_start = dw_length + 12
            else:
                dw_length = _get_dword(module_data, _OFFSET_VBA6_32_DECL_LENGTH, endian)
                declaration_table = module_data[
                    _OFFSET_VBA6_32_DECL_DATA:_OFFSET_VBA6_32_DECL_DATA + dw_length]
                dw_length = _get_dword(module_data, _OFFSET_VBA6_INDIRECT_START, endian)
                table_start = dw_length + 10
            # The indirect table is prefixed by its length.
            dw_length = _get_dword(module_data, table_start, endian)
            table_start += 4
            indirect_table = module_data[
                table_start:table_start + dw_length]
            # The object table is located the same way: length dword, then data.
            dw_length = _get_dword(module_data, _OFFSET_DW_LENGTH, endian)
            dw_length2 = dw_length + _OFFSET_OBJECT_TABLE
            dw_length = _get_dword(module_data, dw_length2, endian)
            dw_length2 += 4
            object_table = module_data[
                dw_length2:dw_length2 + dw_length]
            ot_module_start = dw_length2
            offset = _OFFSET_VBA6_64_LINE_START
        else:
            # VBA5 layout: the tables are reached by skipping variable-length structures.
            vba_ver = 5
            offset = 11
            dw_length = _get_dword(module_data, offset, endian)
            offs = offset + 4
            declaration_table = module_data[offs:offs + dw_length]
            offset = _skip_structure(module_data, offset, endian, True, 1, False)
            offset += 64
            offset = _skip_structure(module_data, offset, endian, False, 16, False)
            offset = _skip_structure(module_data, offset, endian, True, 1, False)
            offset += 6
            offset = _skip_structure(module_data, offset, endian, True, 1, False)
            offs = offset + 8
            dw_length = _get_dword(module_data, offs, endian)
            table_start = dw_length + 14
            offs = dw_length + 10
            dw_length = _get_dword(module_data, offs, endian)
            indirect_table = module_data[
                table_start:table_start + dw_length]
            dw_length = _get_dword(module_data, offset, endian)
            offs = dw_length + _OFFSET_OBJECT_TABLE
            dw_length = _get_dword(module_data, offs, endian)
            offs += 4
            object_table = module_data[offs:offs + dw_length]
            ot_module_start = offs
            offset += 77
        external_types = _parse_external_type_table(
            module_data, object_table, ot_module_start,
            endian, identifiers, vba_ver, is_64bit,
        )
        ctx = DisassemblyContext(
            indirect_table, object_table, declaration_table,
            identifiers, endian, vba_ver, is_64bit, codec, version,
            module_data=module_data, external_types=external_types)
        dw_length = _get_dword(module_data, offset, endian)
        offset = dw_length + _OFFSET_PCODE_LINES
        offset, magic = _get_var(module_data, offset, endian, False)
        if magic != _PCODE_MAGIC:
            return lines
        offset += 2
        offset, num_lines = _get_var(module_data, offset, endian, False)
        # Each line table record takes 12 bytes; the p-code starts 10 bytes after the table.
        pcode_start = offset + num_lines * 12 + 10
        for _ in range(num_lines):
            offset += 4
            offset, line_length = _get_var(module_data, offset, endian, False)
            offset += 2
            offset, line_offset = _get_var(module_data, offset, endian, True)
            opcodes = ctx.dump_line(module_data, pcode_start + line_offset, line_length)
            lines.append(PCodeLine(opcodes))
    except Exception as exc:
        logger.warning(F'p-code disassembly error: {exc}')
    return lines
def _get_identifiers(
    vba_project_data: bytes | bytearray | memoryview,
    codec: str,
) -> list[str]:
    """
    Extract identifier names from the _VBA_PROJECT stream.

    The stream has to start with the magic word 0x61CC, otherwise an empty list is returned.
    Names are decoded with `codec`, replacing undecodable bytes. Parsing is best-effort: an
    error is logged as a warning and the identifiers collected so far are returned.
    """
    identifiers: list[str] = []
    try:
        magic = _get_word(vba_project_data, 0, '<')
        if magic != 0x61CC:
            return identifiers
        version = _get_word(vba_project_data, 2, '<')
        # The string encodings used inside the stream depend on the version word.
        unicode_ref = ((version >= 0x5B)
            and (version not in (0x60, 0x62, 0x63))
            or (version == 0x4E)
        )
        unicode_name = ((version >= 0x59)
            and (version not in (0x60, 0x62, 0x63))
            or (version == 0x4E)
        )
        non_unicode_name = (((version <= 0x59) and (version != 0x4E))
            or (0x5F < version < 0x6B)
        )
        word = _get_word(vba_project_data, 5, '<')
        endian = '>' if word == 0x000E else '<'
        offset = 0x1E
        offset, num_refs = _get_var(vba_project_data, offset, endian, False)
        offset += 2
        # Step over the counted reference records.
        for _ in range(num_refs):
            offset, ref_length = _get_var(vba_project_data, offset, endian, False)
            if ref_length == 0:
                offset += 6
            elif ref_length < 3 + 2 * unicode_ref:
                # Too short to contain the type character inspected below.
                offset += ref_length
            else:
                if unicode_ref:
                    c = vba_project_data[offset + 4]
                else:
                    c = vba_project_data[offset + 2]
                offset += ref_length
                if chr(c) in ('C', 'D'):
                    offset = _skip_structure(vba_project_data, offset, endian, False, 1, False)
            offset += 10
            offset, word = _get_var(vba_project_data, offset, endian, False)
            if word:
                offset = _skip_structure(vba_project_data, offset, endian, False, 1, False)
                offset, w_length = _get_var(vba_project_data, offset, endian, False)
                if w_length:
                    offset += 2
                offset += w_length + 30
        # Skip a series of length-prefixed structures that precede the next counted table.
        offset = _skip_structure(vba_project_data, offset, endian, False, 2, False)
        offset = _skip_structure(vba_project_data, offset, endian, False, 4, False)
        offset += 2
        offset = _skip_structure(vba_project_data, offset, endian, False, 1, True)
        offset = _skip_structure(vba_project_data, offset, endian, False, 1, True)
        offset = _skip_structure(vba_project_data, offset, endian, False, 1, True)
        offset += 0x64
        # NOTE(review): upstream pcodedmp describes these records as module descriptors; confirm.
        offset, num_projects = _get_var(vba_project_data, offset, endian, False)
        for _ in range(num_projects):
            offset, w_length = _get_var(vba_project_data, offset, endian, False)
            if unicode_name:
                offset += w_length
            if non_unicode_name:
                if w_length:
                    offset, w_length = _get_var(vba_project_data, offset, endian, False)
                offset += w_length
            offset = _skip_structure(vba_project_data, offset, endian, False, 1, False)
            offset = _skip_structure(vba_project_data, offset, endian, False, 1, True)
            offset, _ = _get_var(vba_project_data, offset, endian, False)
            if version >= 0x6B:
                offset = _skip_structure(vba_project_data, offset, endian, False, 1, True)
            offset = _skip_structure(vba_project_data, offset, endian, False, 1, True)
            offset += 2
            if version != 0x51:
                offset += 4
            offset = _skip_structure(vba_project_data, offset, endian, False, 8, False)
            offset += 11
        offset += 6
        offset = _skip_structure(vba_project_data, offset, endian, True, 1, False)
        offset += 6
        offset, w0 = _get_var(vba_project_data, offset, endian, False)
        offset, num_ids = _get_var(vba_project_data, offset, endian, False)
        offset, w1 = _get_var(vba_project_data, offset, endian, False)
        offset += 4
        # The three words determine how many leading entries are skipped and how many are read.
        num_junk_ids = num_ids + w1 - w0
        num_ids = w0 - w1
        for _ in range(num_junk_ids):
            offset += 4
            id_type, id_length = _get_type_and_length(vba_project_data, offset, endian)
            offset += 2
            if id_type > 0x7F:
                offset += 6
            offset += id_length
        for _ in range(num_ids):
            is_kwd = False
            ident = ''
            id_type, id_length = _get_type_and_length(vba_project_data, offset, endian)
            offset += 2
            if (id_length == 0) and (id_type == 0):
                # A zero type and length is followed by the actual header two bytes later;
                # such entries do not carry the 4 trailing bytes skipped at the end.
                offset += 2
                id_type, id_length = _get_type_and_length(vba_project_data, offset, endian)
                offset += 2
                is_kwd = True
            if id_type & 0x80:
                offset += 6
            if id_length:
                ident = codecs.decode(
                    vba_project_data[offset:offset + id_length], codec, 'replace')
                offset += id_length
            identifiers.append(ident)
            if not is_kwd:
                offset += 4
    except Exception as exc:
        logger.warning(F'identifier extraction error: {exc}')
    return identifiers
def format_pcode_text(
    module_path: str,
    module_data_size: int,
    lines: list[PCodeLine],
) -> str:
    """
    Render structured PCodeLine data into pcodedmp-compatible text output.

    The first output line names the module and its size in bytes. Every p-code line then
    contributes a "Line #<index>:" header, followed by one tab-indented row per instruction that
    holds the mnemonic, a space, and the space-separated arguments. The text ends with a line
    break.
    """
    def _rows():
        yield F'{module_path} - {module_data_size:d} bytes'
        for index, entry in enumerate(lines):
            yield F'Line #{index:d}:'
            for mnemonic, args in entry.opcodes:
                joined = ' '.join(map(str, args))
                yield F'\t{mnemonic} {joined}'

    return '\n'.join(_rows()) + '\n'
class PCodeDisassembler:
    """
    Disassembles the VBA p-code of an Office document into structured PCodeModule objects, which
    the decompiler consumes to reconstruct VBA source code.
    """

    def __init__(self, data: bytes | bytearray | memoryview):
        self._data = data

    def iter_modules(self):
        """
        Generate one PCodeModule per VBA module found in the input.
        """
        for blob in self._get_ole_streams():
            yield from self._iter_project_modules(OleFile(blob))

    def _iter_project_modules(
        self,
        ole: OleFile,
    ):
        """
        Generate a PCodeModule for every code module of every VBA project in the OLE file.
        Modules whose stream cannot be read are skipped.
        """
        for vba_root, _, dir_path in _find_vba_projects(ole) or ():
            codec, module_names, is_64bit = self._process_dir(ole, dir_path)
            project_blob = self._process_vba_project(ole, vba_root + 'VBA/_VBA_PROJECT')
            identifiers = _get_identifiers(project_blob, codec)
            # An empty identifier table is reported to the consumer as "stripped".
            stripped = not identifiers
            for name in module_names:
                stream_path = F'{vba_root}VBA/{name}'
                try:
                    stream = ole.openstream(stream_path).read()
                except Exception:
                    continue
                disassembly = _pcode_dump(
                    stream, project_blob, identifiers, is_64bit, codec)
                yield PCodeModule(stream_path, disassembly, stripped)

    def _get_ole_streams(self) -> list[bytes | bytearray | memoryview]:
        """
        Return the OLE compound files contained in the input. ZIP input (OOXML) yields the
        contents of every entry whose name ends in vbaProject.bin; a corrupt archive yields
        whatever was extracted before the error. Any other input is returned unchanged as the
        only element.
        """
        blob = self._data
        ole_magic = b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'
        if blob[:8] == ole_magic or blob[:2] != b'PK':
            return [blob]
        import zipfile
        from refinery.lib.structures import MemoryFile
        extracted: list[bytes | bytearray | memoryview] = []
        try:
            with zipfile.ZipFile(MemoryFile(blob, bytes)) as archive:
                for member in archive.namelist():
                    if not member.lower().endswith('vbaproject.bin'):
                        continue
                    extracted.append(archive.read(member))
        except zipfile.BadZipFile:
            pass
        return extracted

    def _process_dir(
        self,
        ole: OleFile,
        dir_path: str,
    ) -> tuple[str, list[str], bool]:
        """
        Walk the records of the decompressed VBA dir stream and return the tuple (codec,
        code_modules, is_64bit). The codec defaults to latin1 and the 64-bit flag to False.
        Scanning stops at the first record that cannot be processed.
        """
        directory = decompress_stream(ole.openstream(dir_path).read())
        limit = len(directory)
        # Records with these tags use a fixed size instead of the stored length word.
        forced_sizes = {9: 6, 3: 2}
        modules: list[str] = []
        wide = False
        codec = 'latin1'
        cursor = 0
        while cursor < limit:
            try:
                tag = _get_word(directory, cursor, '<')
                size = _get_word(directory, cursor + 2, '<')
                size = forced_sizes.get(tag, size)
                cursor += 6
                if not size:
                    continue
                if tag == 3:
                    codec = _codepage_to_codec(_get_word(directory, cursor, '<'))
                elif tag == 50:
                    raw_name = directory[cursor:cursor + size]
                    modules.append(codecs.decode(raw_name, 'utf_16_le', errors='replace'))
                elif tag == 1:
                    wide = _get_dword(directory, cursor, '<') == 3
                cursor += size
            except Exception:
                break
        return codec, modules, wide

    def _process_vba_project(
        self,
        ole: OleFile,
        vba_project_path: str,
    ) -> bytes | bytearray | memoryview:
        """
        Return the raw contents of the _VBA_PROJECT stream, which is stored uncompressed.
        """
        stream = ole.openstream(vba_project_path)
        return stream.read()
Functions
def format_pcode_text(module_path, module_data_size, lines)-
Render structured PCodeLine data into pcodedmp-compatible text output.
Expand source code Browse git
def format_pcode_text( module_path: str, module_data_size: int, lines: list[PCodeLine], ) -> str: """ Render structured PCodeLine data into pcodedmp-compatible text output. """ output: list[str] = [] output.append(F'{module_path} - {module_data_size:d} bytes') for line_num, pcode_line in enumerate(lines): output.append(F'Line #{line_num:d}:') for mnemonic, args in pcode_line.opcodes: text = F'\t{mnemonic} {" ".join(str(a) for a in args)}' output.append(text) return '\n'.join(output) + '\n'
Classes
class Opcode (mnem, args=[], varg=False)-
Opcode(mnem, args, varg)
Expand source code Browse git
class Opcode(NamedTuple): mnem: str args: list[str] = [] varg: bool = FalseAncestors
- builtins.tuple
Instance variables
var mnem-
Alias for field number 0
Expand source code Browse git
class Opcode(NamedTuple): mnem: str args: list[str] = [] varg: bool = False var args-
Alias for field number 1
Expand source code Browse git
class Opcode(NamedTuple): mnem: str args: list[str] = [] varg: bool = False var varg-
Alias for field number 2
Expand source code Browse git
class Opcode(NamedTuple): mnem: str args: list[str] = [] varg: bool = False
class TypeRef (name, is_array=False, from_suffix=False)-
TypeRef(name: 'str', is_array: 'bool' = False, from_suffix: 'bool' = False)
Expand source code Browse git
@dataclass class TypeRef: name: str is_array: bool = False from_suffix: bool = FalseInstance variables
var name-
Name of the referenced type.
var is_array-
True when the reference denotes an array of that type.
var from_suffix-
True when the type was derived from a type suffix rather than from an explicit As clause.
class VarInfo (name, type=None, has_new=False, has_withevents=False)-
VarInfo(name: 'str', type: 'TypeRef | None' = None, has_new: 'bool' = False, has_withevents: 'bool' = False)
Expand source code Browse git
@dataclass class VarInfo: name: str type: TypeRef | None = None has_new: bool = False has_withevents: bool = FalseInstance variables
var name-
The type of the None singleton.
var type-
The type of the None singleton.
var has_new-
The type of the None singleton.
var has_withevents-
The type of the None singleton.
class ArgInfo (name, type=None, is_byval=False, is_byref=False, is_optional=False, is_paramarray=False, default_value=None)-
ArgInfo(name: 'str', type: 'TypeRef | None' = None, is_byval: 'bool' = False, is_byref: 'bool' = False, is_optional: 'bool' = False, is_paramarray: 'bool' = False, default_value: 'str | None' = None)
Expand source code Browse git
@dataclass class ArgInfo: name: str type: TypeRef | None = None is_byval: bool = False is_byref: bool = False is_optional: bool = False is_paramarray: bool = False default_value: str | None = NoneInstance variables
var name-
Name of the argument.
var type-
Type of the argument as a TypeRef, or None when no type is recorded.
var is_byval-
True when the argument is marked ByVal.
var is_byref-
True when the argument is marked ByRef.
var is_optional-
True when the argument is marked Optional.
var is_paramarray-
True when the argument is a ParamArray.
var default_value-
Rendered default value of an optional argument, or None when there is none.
class FuncInfo (scope, is_static, kind, name, args=<factory>, return_type=None, is_declare=False, is_ptrsafe=False, lib_name=None, alias_name=None)-
FuncInfo(scope: 'str', is_static: 'bool', kind: 'str', name: 'str', args: 'list[ArgInfo]' = <factory>, return_type: 'TypeRef | None' = None, is_declare: 'bool' = False, is_ptrsafe: 'bool' = False, lib_name: 'str | None' = None, alias_name: 'str | None' = None)
Expand source code Browse git
@dataclass class FuncInfo: scope: str is_static: bool kind: str name: str args: list[ArgInfo] = field(default_factory=list) return_type: TypeRef | None = None is_declare: bool = False is_ptrsafe: bool = False lib_name: str | None = None alias_name: str | None = NoneInstance variables
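var scope-
Scope keywords of the procedure: empty, 'Private', 'Public', 'Friend', or a combination such as 'Private Friend'.
var is_static-
True when the procedure is declared Static.
var kind-
Procedure kind: 'Sub', 'Function', 'Property Get', 'Property Let' or 'Property Set'.
var name-
Name of the procedure.
var args-
Arguments of the procedure as a list of ArgInfo.
var return_type-
Return type as a TypeRef, or None when there is none.
var is_declare-
True when the procedure is a Declare statement.
var is_ptrsafe-
True when the declaration is marked PtrSafe.
var lib_name-
Library name of a Declare statement, otherwise None.
var alias_name-
Alias of a Declare statement; None when absent or identical to the procedure name.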
class DimScope (keywords=<factory>)-
DimScope(keywords: 'list[str]' = <factory>)
Expand source code Browse git
@dataclass class DimScope: keywords: list[str] = field(default_factory=list)Instance variables
var keywords-
The type of the None singleton.
class CoerceType (type_short)-
CoerceType(type_short: 'str')
Expand source code Browse git
@dataclass class CoerceType: type_short: strInstance variables
var type_short-
The type of the None singleton.
class RecordInfo (text)-
RecordInfo(text: 'str')
Expand source code Browse git
@dataclass class RecordInfo: text: strInstance variables
var text-
The type of the None singleton.
class PCodeLine (opcodes)-
PCodeLine(opcodes,)
Expand source code Browse git
class PCodeLine(NamedTuple): opcodes: list[tuple[str, list[OpcodeArg]]]Ancestors
- builtins.tuple
Instance variables
var opcodes-
Alias for field number 0
Expand source code Browse git
class PCodeLine(NamedTuple): opcodes: list[tuple[str, list[OpcodeArg]]]
class PCodeModule (path, lines, identifiers_stripped=False)-
Structured representation of a disassembled VBA module.
Expand source code Browse git
class PCodeModule(NamedTuple): """ Structured representation of a disassembled VBA module. """ path: str lines: list[PCodeLine] identifiers_stripped: bool = FalseAncestors
- builtins.tuple
Instance variables
var path-
Alias for field number 0
Expand source code Browse git
class PCodeModule(NamedTuple): """ Structured representation of a disassembled VBA module. """ path: str lines: list[PCodeLine] identifiers_stripped: bool = False var lines-
Alias for field number 1
Expand source code Browse git
class PCodeModule(NamedTuple): """ Structured representation of a disassembled VBA module. """ path: str lines: list[PCodeLine] identifiers_stripped: bool = False var identifiers_stripped-
Alias for field number 2
Expand source code Browse git
class PCodeModule(NamedTuple): """ Structured representation of a disassembled VBA module. """ path: str lines: list[PCodeLine] identifiers_stripped: bool = False
class Op (mnem, args=[], varg=False)-
Opcode(mnem, args, varg)
Expand source code Browse git
class Opcode(NamedTuple): mnem: str args: list[str] = [] varg: bool = FalseAncestors
- builtins.tuple
Instance variables
var mnem-
Alias for field number 0
Expand source code Browse git
class Opcode(NamedTuple): mnem: str args: list[str] = [] varg: bool = False var args-
Alias for field number 1
Expand source code Browse git
class Opcode(NamedTuple): mnem: str args: list[str] = [] varg: bool = False var varg-
Alias for field number 2
Expand source code Browse git
class Opcode(NamedTuple): mnem: str args: list[str] = [] varg: bool = False
class DisassemblyContext (indirect_table, object_table, declaration_table, identifiers, endian, vba_ver, is_64bit, codec, version=0, module_data=None, external_types=None)-
Holds shared state for the disassembly of a single VBA module, eliminating repeated parameter threading through every helper function.
Expand source code Browse git
class DisassemblyContext:
    """
    Holds shared state for the disassembly of a single VBA module, eliminating repeated
    parameter threading through every helper function.

    The three tables (indirect, object, declaration) are raw byte buffers that the
    `disasm_*` helpers index into; `identifiers` is the name table used to turn
    identifier words into strings.
    """

    def __init__(
        self,
        indirect_table: bytes | bytearray | memoryview,
        object_table: bytes | bytearray | memoryview,
        declaration_table: bytes | bytearray | memoryview,
        identifiers: list[str],
        endian: str,
        vba_ver: int,
        is_64bit: bool,
        codec: str,
        version: int = 0,
        module_data: bytes | bytearray | memoryview | None = None,
        external_types: dict[int, str] | None = None,
    ):
        self.indirect_table = indirect_table
        self.object_table = object_table
        self.declaration_table = declaration_table
        self.identifiers = identifiers
        self.endian = endian
        self.vba_ver = vba_ver
        self.is_64bit = is_64bit
        self.codec = codec
        self.version = version
        self.module_data = module_data
        # Maps an object table offset to a type name; consulted when the object
        # table entry itself does not yield a usable name.
        self.external_types: dict[int, str] = external_types or {}
        # Set by dump_line after a LineCont instruction; reset at the start of
        # every line and by disasm_func.
        self._linecont_pending = False
        # Becomes True as soon as any argument carries the explicit ParamArray
        # bit; disasm_func then skips its ParamArray heuristic.
        self._has_pa_bit = False

    def disasm_name(self, word: int, mnemonic: str, op_type: int) -> str:
        """
        Resolve an identifier operand and append the type suffix selected by `op_type`.

        For `OnError` and `Resume` the operand type selects a fixed keyword form
        instead of a type suffix.
        """
        var_types = [
            '', '?', '%', '&', '!', '#', '@', '?', '$', '?', '?', '?', '?', '?',
        ]
        var_name = _get_id(word, self.identifiers, self.vba_ver, self.is_64bit)
        if op_type < len(var_types):
            str_type = var_types[op_type]
        else:
            str_type = ''
            if op_type == 32:
                var_name = F'[{var_name}]'
        if mnemonic == 'OnError':
            str_type = ''
            if op_type == 1:
                var_name = '(Resume Next)'
            elif op_type == 2:
                var_name = '(GoTo 0)'
        elif mnemonic == 'Resume':
            str_type = ''
            if op_type == 1:
                var_name = '(Next)'
            elif op_type != 0:
                var_name = ''
        return (var_name + str_type).rstrip()

    def disasm_imp(self, arg: str, word: int, mnemonic: str) -> str:
        """
        Render a 16-bit immediate operand.

        For `Open` the word is decoded into the `(For <mode> [Access ..] [Lock ..|Shared])`
        clause. For any other mnemonic an `imp_` operand is resolved through the
        object table and everything else is rendered as `<arg><hex word>`.
        """
        if mnemonic != 'Open':
            if arg == 'imp_':
                shift = 3 if self.is_64bit else 2
                offs = (word >> shift) * 10
                if offs + 8 <= len(self.object_table):
                    hl_name = _get_word(self.object_table, offs + 6, self.endian)
                    if hl_name == 0:
                        return self.external_types.get(offs, '')
                    name = _get_id(hl_name, self.identifiers, self.vba_ver, self.is_64bit)
                    if (hl_name >> 1) < 0x100 and name not in _VALID_INTERNAL_TYPE_NAMES:
                        return self.external_types.get(offs, '')
                    return name
            return F'{arg}{word:04X}'
        access_mode = ['Read', 'Write', 'Read Write']
        lock_mode = ['Read Write', 'Write', 'Read']
        mode = word & 0x00FF
        access = (word & 0x0F00) >> 8
        lock = (word & 0xF000) >> 12
        imp_name = '(For '
        if mode & 0x01:
            imp_name += 'Input'
        elif mode & 0x02:
            imp_name += 'Output'
        elif mode & 0x04:
            imp_name += 'Random'
        elif mode & 0x08:
            imp_name += 'Append'
        elif mode == 0x20:
            imp_name += 'Binary'
        if access and (access <= len(access_mode)):
            imp_name += F' Access {access_mode[access - 1]}'
        if lock:
            if lock & 0x04:
                imp_name += ' Shared'
            elif lock <= len(lock_mode):
                imp_name += F' Lock {lock_mode[lock - 1]}'
        imp_name += ')'
        return imp_name

    def disasm_rec(self, dword: int) -> str:
        """
        Return the name of the record at `dword` in the indirect table, prefixed
        with `(Public)` or `(Private)` depending on bit 0 of its options word.
        """
        object_name = _get_name(
            self.indirect_table, self.identifiers, dword + 2,
            self.endian, self.vba_ver, self.is_64bit)
        options = _get_word(self.indirect_table, dword + 18, self.endian)
        if (options & 1) == 0:
            object_name = F'(Private) {object_name}'
        else:
            object_name = F'(Public) {object_name}'
        return object_name

    def _resolve_udt_name(self, type_desc: int) -> str:
        """
        Resolve a user-defined type (type_id=0x1D) from the object table.

        The type descriptor at `type_desc` stores the object table reference word
        at offset +8 (instead of the usual +2 for non-builtin types). Returns an
        empty string when either table is too short to hold the referenced data.
        """
        if type_desc + 10 > len(self.indirect_table):
            return ''
        word = _get_word(self.indirect_table, type_desc + 8, self.endian)
        shift = 3 if self.is_64bit else 2
        offs = (word >> shift) * 10
        # The name word is read at offs + 6, so 8 bytes of the entry must be
        # present regardless of bitness.
        if offs + 8 > len(self.object_table):
            return ''
        hl_name = _get_word(self.object_table, offs + 6, self.endian)
        if hl_name == 0:
            return self.external_types.get(offs, '')
        return _get_id(hl_name, self.identifiers, self.vba_ver, self.is_64bit)

    def disasm_object(self, offset: int) -> tuple[str, bool]:
        """
        Resolve the type referenced by the descriptor pointer stored at `offset`
        in the indirect table.

        Returns `(type_name, is_array)`. `is_array` mirrors bit 0x0800 of the
        descriptor flags. `('', False)` is returned when the descriptor or the
        referenced object table entry lies outside the respective table.
        """
        ind = self.indirect_table
        obj = self.object_table
        # The only layout difference between 32-bit and 64-bit projects that
        # matters here is how the reference word is scaled to an entry index.
        shift = 3 if self.is_64bit else 2
        type_desc = _get_dword(ind, offset, self.endian)
        if type_desc + 4 > len(ind):
            return '', False
        flags = _get_word(ind, type_desc, self.endian)
        is_array = bool(flags & 0x0800)
        if flags & 0x02:
            type_id = ind[type_desc + 6]
            if type_id == 0x1D:
                name = self._resolve_udt_name(type_desc)
                if name:
                    return name, is_array
            return _disasm_type(ind, type_desc), is_array
        word = _get_word(ind, type_desc + 2, self.endian)
        offs = (word >> shift) * 10
        # The name word sits at offs + 6; require the full 8 bytes before reading.
        if offs + 8 > len(obj):
            return '', False
        hl_name = _get_word(obj, offs + 6, self.endian)
        if hl_name == 0:
            ext = self.external_types.get(offs)
            return ext or '', is_array
        if hl_name == 0xFFFF:
            type_name = _get_type_name(ind[type_desc + 6])
            if not type_name and type_desc + 17 <= len(ind):
                type_name = _get_type_name(ind[type_desc + 16])
            return type_name, is_array
        name = _get_id(hl_name, self.identifiers, self.vba_ver, self.is_64bit)
        if (hl_name >> 1) < 0x100 and name not in _VALID_INTERNAL_TYPE_NAMES:
            ext = self.external_types.get(offs)
            return ext or '', is_array
        return name, is_array

    def disasm_var(self, dword: int) -> VarInfo:
        """
        Decode the variable record at `dword` in the indirect table into a VarInfo
        (name, optional type reference, and whether it is declared with `New`).

        Without an explicit `As`/`New`, a type is only reported when it maps to a
        suffix type; an untyped array gets `()` appended to its name instead.
        """
        b_flag1 = self.indirect_table[dword]
        b_flag2 = self.indirect_table[dword + 1]
        has_as = (b_flag1 & 0x20) != 0
        has_new = (b_flag2 & 0x20) != 0
        var_name = _get_name(
            self.indirect_table, self.identifiers, dword + 2,
            self.endian, self.vba_ver, self.is_64bit)
        type_ref: TypeRef | None = None
        if has_new or has_as:
            type_name = ''
            is_array = False
            if has_as:
                offs = 16 if self.is_64bit else 12
                word = _get_word(self.indirect_table, dword + offs + 2, self.endian)
                if word == 0xFFFF:
                    type_id = self.indirect_table[dword + offs]
                    type_name = _get_type_name(type_id)
                else:
                    type_name, is_array = self.disasm_object(dword + offs)
            if type_name:
                type_ref = TypeRef(type_name, is_array)
        else:
            offs = 16 if self.is_64bit else 12
            if len(self.indirect_table) >= dword + offs + 4:
                word = _get_word(self.indirect_table, dword + offs + 2, self.endian)
                if word == 0xFFFF:
                    type_id = self.indirect_table[dword + offs]
                    if (type_id & 0x40) and (b_flag1 & 0x10):
                        type_id &= ~0x40
                    if type_id in _SUFFIX_TYPE_IDS:
                        type_name = _get_type_name(type_id)
                        if type_name:
                            type_ref = TypeRef(type_name, from_suffix=True)
                else:
                    # Best effort: a malformed descriptor must not abort the line.
                    try:
                        type_name, is_array = self.disasm_object(dword + offs)
                    except Exception:
                        type_name = ''
                        is_array = False
                    if type_name in _SUFFIX_TYPES:
                        type_ref = TypeRef(type_name, is_array, from_suffix=True)
                    elif is_array:
                        var_name += '()'
        return VarInfo(var_name, type_ref, has_new)

    def disasm_arg(self, arg_offset: int) -> ArgInfo | None:
        """
        Decode one argument record at `arg_offset` in the indirect table.

        Returns None when the name word is 0xFFFE or 0xFFFF. Sets
        `self._has_pa_bit` when the ParamArray option bit is present.
        """
        flags = _get_word(self.indirect_table, arg_offset, self.endian)
        offs = 4 if self.is_64bit else 0
        name_word = _get_word(self.indirect_table, arg_offset + 2, self.endian)
        if name_word >= 0xFFFE:
            return None
        arg_name = _get_name(
            self.indirect_table, self.identifiers, arg_offset + 2,
            self.endian, self.vba_ver, self.is_64bit)
        arg_type = _get_dword(self.indirect_table, arg_offset + offs + 12, self.endian)
        arg_opts = _get_word(self.indirect_table, arg_offset + offs + 24, self.endian)
        is_paramarray = bool(arg_opts & 0x0001)
        if is_paramarray:
            self._has_pa_bit = True
        is_byval = bool(arg_opts & 0x0004)
        is_byref = bool(arg_opts & 0x0002)
        is_optional = bool(arg_opts & 0x0200)
        type_ref: TypeRef | None = None
        if flags & 0x0020:
            arg_type_name = ''
            is_array = False
            if (arg_type & 0xFFFF0000) == 0xFFFF0000:
                arg_type_id = arg_type & 0x000000FF
                arg_type_name = _get_type_name(arg_type_id)
            elif self.is_64bit and arg_type < len(DIM_TYPES) and DIM_TYPES[arg_type]:
                arg_type_name = _get_type_name(arg_type)
            else:
                arg_type_name, is_array = self.disasm_object(arg_offset + offs + 12)
                if arg_type_name.startswith('type_') and self.is_64bit:
                    arg_type_id = arg_type & 0x000000FF
                    if arg_type_id < len(DIM_TYPES) and DIM_TYPES[arg_type_id]:
                        arg_type_name = _get_type_name(arg_type_id)
                        is_array = False
            if arg_type_name:
                type_ref = TypeRef(arg_type_name, is_array)
        elif (arg_type & 0xFFFF0000) == 0xFFFF0000:
            arg_type_id = arg_type & 0x000000FF
            if arg_type_id in _SUFFIX_TYPE_IDS:
                type_name = _get_type_name(arg_type_id)
                if type_name:
                    type_ref = TypeRef(type_name, from_suffix=True)
        elif self.is_64bit and arg_type < len(DIM_TYPES) and DIM_TYPES[arg_type]:
            if arg_type in _SUFFIX_TYPE_IDS:
                type_name = _get_type_name(arg_type)
                if type_name:
                    type_ref = TypeRef(type_name, from_suffix=True)
        else:
            # Best effort: a malformed descriptor must not abort the declaration.
            try:
                type_name, is_array = self.disasm_object(arg_offset + offs + 12)
            except Exception:
                type_name = ''
                is_array = False
            if type_name in _SUFFIX_TYPES:
                type_ref = TypeRef(type_name, is_array, from_suffix=True)
            elif (not type_name or type_name.startswith('type_')) and self.is_64bit:
                arg_type_id = arg_type & 0x000000FF
                if arg_type_id < len(DIM_TYPES) and DIM_TYPES[arg_type_id]:
                    type_name = _get_type_name(arg_type_id)
                    if type_name in _SUFFIX_TYPES:
                        type_ref = TypeRef(type_name, from_suffix=True)
                elif is_array:
                    arg_name += '()'
            elif is_array:
                arg_name += '()'
        default_value: str | None = None
        if is_optional:
            default_tag_off = arg_offset + offs + 28
            default_val_off = arg_offset + offs + 32
            ind = self.indirect_table
            if default_tag_off + 2 <= len(ind) and default_val_off + 4 <= len(ind):
                vt_tag = _get_word(ind, default_tag_off, self.endian)
                value_dw = _get_dword(ind, default_val_off, self.endian)
                default_value = self._format_default_value(vt_tag, value_dw)
        return ArgInfo(
            arg_name,
            type_ref,
            is_byval,
            is_byref,
            is_optional,
            is_paramarray,
            default_value,
        )

    def _format_default_value(self, vt_tag: int, value_dw: int) -> str | None:
        """
        Format the default value of an optional argument as source text.

        `vt_tag` selects the interpretation of `value_dw`: for the 8-byte float
        and string tags it is an offset into the indirect table, for the other
        handled tags it is the value itself. Returns None for tag 0, for
        unhandled tags, and when referenced data lies outside the table.
        """
        VT_I2 = 2
        VT_I4 = 3
        VT_R4 = 4
        VT_R8 = 5
        VT_CY = 6
        VT_BSTR = 8
        VT_BOOL = 11
        VT_UI1 = 17
        ind = self.indirect_table
        if vt_tag == 0:
            return None
        elif vt_tag == VT_I2:
            # Reinterpret the low 16 bits as a signed integer.
            val = value_dw & 0xFFFF
            return str(val - 0x10000 if val > 0x7FFF else val)
        elif vt_tag == VT_I4:
            return str(value_dw - 0x100000000 if value_dw > 0x7FFFFFFF else value_dw)
        elif vt_tag == VT_R4:
            # Reinterpret the 32-bit pattern as a float; whole numbers print without '.0'.
            val = _struct.unpack('<f', _struct.pack('<I', value_dw))[0]
            return str(int(val)) if val == int(val) and abs(val) < 1e15 else str(val)
        elif vt_tag == VT_R8:
            if value_dw + 8 <= len(ind):
                val = _struct.unpack('<d', bytes(ind[value_dw:value_dw + 8]))[0]
                return str(int(val)) if val == int(val) and abs(val) < 1e15 else str(val)
        elif vt_tag == VT_CY:
            val = value_dw / 10000
            return str(int(val)) if val == int(val) else str(val)
        elif vt_tag == VT_BSTR:
            if value_dw + 4 <= len(ind):
                str_len = _get_dword(ind, value_dw, self.endian)
                if str_len == 0:
                    return '""'
                if 0 < str_len < 0x10000 and value_dw + 4 + str_len <= len(ind):
                    s = bytes(ind[value_dw + 4:value_dw + 4 + str_len]).decode(self.codec, errors='replace')
                    return F'"{s}"'
        elif vt_tag == VT_BOOL:
            return 'True' if (value_dw & 0xFFFF) != 0 else 'False'
        elif vt_tag == VT_UI1:
            return str(value_dw & 0xFF)
        return None

    def _patch_64bit_defaults(self, arg_list: list[ArgInfo], func_name: str) -> None:
        """
        Fill in missing string defaults of optional arguments by scanning the raw
        module data.

        The scan looks for the byte pattern FA 00 B9 00 and collects the run of
        B9 00-tagged, length-prefixed printable strings that follows it. The run
        closest to the first occurrence of `func_name` in the module data is
        used (or the only run, if the name is not found); its strings are
        assigned in reverse order to the optional arguments that still lack a
        default. Arguments are modified in place.
        """
        md = self.module_data
        if md is None:
            return
        raw = bytes(md)
        need = sum(1 for a in arg_list if a.is_optional and a.default_value is None)
        if need == 0:
            return
        blocks: list[tuple[int, list[str]]] = []
        pos = 0
        while True:
            idx = raw.find(b'\xfa\x00\xb9\x00', pos)
            if idx < 0:
                break
            defaults: list[str] = []
            cur = idx + 2
            while cur + 4 <= len(raw) and raw[cur:cur + 2] == b'\xb9\x00':
                str_len = _get_word(raw, cur + 2, '<')
                if str_len <= 0 or str_len > 0x1000 or cur + 4 + str_len > len(raw):
                    break
                str_data = raw[cur + 4:cur + 4 + str_len]
                # Only accept printable ASCII plus tab / LF / CR.
                if not all(32 <= b < 127 or b in (9, 10, 13) for b in str_data):
                    break
                defaults.append(str_data.decode(self.codec, errors='replace'))
                cur = cur + 4 + str_len
            if defaults:
                blocks.append((idx, defaults))
            pos = idx + 4
        if not blocks:
            return
        func_bytes = func_name.encode(self.codec, errors='replace')
        func_pos = raw.find(func_bytes)
        if func_pos >= 0:
            best = min(blocks, key=lambda b: abs(b[0] - func_pos))
            defaults = best[1]
        elif len(blocks) == 1:
            defaults = blocks[0][1]
        else:
            return
        defaults = list(reversed(defaults))
        di = 0
        for arg in arg_list:
            if not arg.is_optional or arg.default_value is not None:
                continue
            if di >= len(defaults):
                break
            arg.default_value = F'"{defaults[di]}"'
            di += 1

    def _declare64(self, decl_offset: int, func_name: str) -> tuple[str | None, str | None]:
        """
        Extract Lib and Alias names from a 64-bit Declare entry in the declaration table.

        The 64-bit entry structure differs significantly from 32-bit: the lib name identifier
        word is not at a fixed offset within the entry header. Instead, we extract the lib
        name from VBA source text stored later in the declaration table, falling back to the
        binary structure when source text is not available.
        """
        decl = self.declaration_table
        decl_bytes = bytes(decl)
        lib_name = None
        alias_name = None
        # Strategy 1: Extract from VBA source text in the declaration table.
        # The source text may contain embedded null bytes, so strip them before matching.
        text = decl_bytes.replace(b'\x00', b'').decode('ascii', errors='replace')
        match = re.search(
            rf'(?:Function|Sub)\s+{re.escape(func_name)}\b.*?Lib\s+"([^"]+)"', text)
        if match:
            lib_name = match.group(1)
            after_lib = text[match.end():]
            alias_match = re.match(r'\s*Alias\s*"([^"]+)"', after_lib)
            if alias_match:
                alias_name = alias_match.group(1)
        # Strategy 2: Binary structure fallback. The alias string offset depends on version:
        # VBA7 version 0x0097 has 4 extra bytes of padding (alias at +0x20), later versions
        # use the standard offset (+0x1C).
        _alias_off = 0x20 if self.version <= 0x97 else 0x1C
        if lib_name is None and self.version > 0x97 and decl_offset >= 2:
            # For VBA7 versions after 0x97 the lib identifier word for each entry is stored
            # in the 2 bytes immediately preceding the entry header, placed there as trailing
            # data of the previous entry. This does not apply to the very first entry
            # (decl_offset == 0) or to versions <= 0x97 where the lib word sits at header +2.
            lib_word = _get_word(decl, decl_offset - 2, self.endian)
            if lib_word != 0 and lib_word != 0xFFFF:
                lib_name = _get_id(lib_word, self.identifiers, self.vba_ver, self.is_64bit)
        if lib_name is None:
            alias_start = decl_offset + _alias_off
            if alias_start < len(decl):
                alias_bytes_raw = bytes(decl[alias_start:])
                null_pos = alias_bytes_raw.find(0)
                if null_pos > 0 and all(32 <= b < 127 for b in alias_bytes_raw[:null_pos]):
                    # The lib word is looked up 2 bytes past the next dword
                    # boundary after the alias terminator.
                    abs_null = alias_start + null_pos
                    dword_aligned = (abs_null + 1 + 3) & ~3
                    lib_word_offset = dword_aligned + 2
                    if lib_word_offset + 2 <= len(decl):
                        lib_word = _get_word(decl, lib_word_offset, self.endian)
                        if lib_word != 0 and lib_word != 0xFFFF:
                            lib_name = _get_id(lib_word, self.identifiers, self.vba_ver, self.is_64bit)
        if lib_name is None:
            lib_word = _get_word(decl, decl_offset + 2, self.endian)
            if lib_word != 0:
                lib_name = _get_id(lib_word, self.identifiers, self.vba_ver, self.is_64bit)
        # Read alias from binary structure if not found via source text.
        if alias_name is None and not match:
            alias_start = decl_offset + _alias_off
            if alias_start < len(decl):
                alias_bytes_raw = bytes(decl[alias_start:])
                null_pos = alias_bytes_raw.find(0)
                if null_pos > 0:
                    alias_name = alias_bytes_raw[:null_pos].decode(self.codec, errors='replace')
        return lib_name, alias_name

    def disasm_func(self, dword: int, op_type: int) -> FuncInfo:
        """
        Decode the procedure record at `dword` in the indirect table into a FuncInfo:
        scope, Static flag, kind, name, argument list, return type and, for
        Declare statements, the PtrSafe flag plus Lib and Alias names.

        Resets `self._linecont_pending` as a side effect.
        """
        flags = _get_word(self.indirect_table, dword, self.endian)
        name_word = _get_word(self.indirect_table, dword + 2, self.endian)
        # Field offsets shift by 4 bytes for VBA versions above 5 and by a
        # further 16 bytes in 64-bit projects.
        offs2 = 4 if self.vba_ver > 5 else 0
        if self.is_64bit:
            offs2 += 16
        self._linecont_pending = False
        sub_name = _get_id(name_word, self.identifiers, self.vba_ver, self.is_64bit)
        arg_offset = _get_dword(self.indirect_table, dword + offs2 + 36, self.endian)
        ret_type = _get_dword(self.indirect_table, dword + offs2 + 40, self.endian)
        decl_offset = _get_word(self.indirect_table, dword + offs2 + 44, self.endian)
        c_options_offset = 60 if self.is_64bit and self.version > 0x97 else 54
        c_options = self.indirect_table[dword + offs2 + c_options_offset]
        new_flags_offset = 63 if self.is_64bit and self.version > 0x97 else 57
        new_flags = self.indirect_table[dword + offs2 + new_flags_offset]
        scope = ''
        is_friend = False
        if self.vba_ver > 5:
            if (new_flags & 0x0002) == 0:
                scope = 'Private'
            elif op_type & 0x04:
                scope = 'Public'
            if new_flags & 0x0004:
                is_friend = True
        else:
            if (flags & 0x0008) == 0:
                scope = 'Private'
            elif op_type & 0x04:
                scope = 'Public'
        is_static = bool(flags & 0x0080)
        has_declare = (c_options & 0x90) == 0 and decl_offset != 0xFFFF
        is_ptrsafe = bool(self.vba_ver > 5 and new_flags & 0x20)
        has_as = (flags & 0x0020) != 0
        if flags & 0x1000:
            kind = 'Function' if op_type in (2, 6) else 'Sub'
        elif flags & 0x2000:
            kind = 'Property Get'
        elif flags & 0x4000:
            kind = 'Property Let'
        elif flags & 0x8000:
            kind = 'Property Set'
        else:
            kind = 'Sub'
        return_type: TypeRef | None = None
        if has_as:
            type_name = ''
            is_array = False
            if (ret_type & 0xFFFF0000) == 0xFFFF0000:
                type_id = ret_type & 0x000000FF
                type_name = _get_type_name(type_id)
            else:
                type_name, is_array = self.disasm_object(dword + offs2 + 40)
            if type_name:
                return_type = TypeRef(type_name, is_array)
        elif (ret_type & 0xFFFF0000) == 0xFFFF0000:
            ret_type_id = ret_type & 0x000000FF
            if ret_type_id in _SUFFIX_TYPE_IDS:
                type_name = _get_type_name(ret_type_id)
                if type_name:
                    return_type = TypeRef(type_name, from_suffix=True)
        lib_name: str | None = None
        alias_name: str | None = None
        if has_declare:
            if self.is_64bit:
                lib_name, alias_name = self._declare64(decl_offset, sub_name)
            else:
                lib_name = _get_name(
                    self.declaration_table, self.identifiers, decl_offset + 2,
                    self.endian, self.vba_ver, self.is_64bit)
                alias_offset = _get_word(
                    self.declaration_table, decl_offset + 4, self.endian)
                if alias_offset < len(self.declaration_table):
                    alias_bytes = bytes(self.declaration_table[alias_offset:])
                    null_pos = alias_bytes.find(0)
                    if null_pos > 0:
                        alias_name = alias_bytes[:null_pos].decode(
                            self.codec, errors='replace')
            # An alias identical to the procedure name carries no information.
            if alias_name == sub_name:
                alias_name = None
        arg_list: list[ArgInfo] = []
        while (
            arg_offset != 0xFFFFFFFF
            and arg_offset != 0
            and arg_offset + 26 < len(self.indirect_table)
        ):
            arg = self.disasm_arg(arg_offset)
            if arg is not None:
                arg_list.append(arg)
            # Each argument record stores the offset of the next one.
            arg_offset = _get_dword(
                self.indirect_table,
                arg_offset + (24 if self.is_64bit else 20),
                self.endian,
            )
        if self.is_64bit and any(
            a.is_optional and a.default_value is None for a in arg_list
        ):
            self._patch_64bit_defaults(arg_list, sub_name)
        # Heuristic: when no argument anywhere carried the explicit ParamArray
        # bit, treat a trailing unmodified Variant/untyped array as ParamArray.
        if (
            arg_list
            and not self._has_pa_bit
            and not any(a.is_paramarray for a in arg_list)
        ):
            last = arg_list[-1]
            _pa_candidate = (
                last.type is not None
                and last.type.is_array
                and (last.type.name == 'Variant' or last.type.name == '')
            ) or (
                last.type is None
                and last.name.endswith('()')
            )
            _pa_no_modifiers = not last.is_byval and not last.is_byref and not last.is_optional
            if _pa_candidate and _pa_no_modifiers:
                last.is_paramarray = True
        if is_friend:
            scope = 'Friend' if not scope else F'{scope} Friend'
        return FuncInfo(
            scope,
            is_static,
            kind,
            sub_name,
            arg_list,
            return_type,
            has_declare,
            is_ptrsafe,
            lib_name,
            alias_name,
        )

    def disasm_var_arg(
        self,
        module_data: bytes | bytearray | memoryview,
        offset: int,
        w_length: int,
        mnemonic: str,
    ) -> list[str]:
        """
        Render the variable-length argument of `w_length` bytes at `offset`.

        Returns `[length, payload]` where length is formatted as `0xNNNN` and the
        payload is a quoted decoded string (string/comment mnemonics), a
        comma-separated identifier list (`OnGosub`/`OnGoto`), or a hex dump.
        """
        substring = module_data[offset:offset + w_length]
        length_str = F'0x{w_length:04X}'
        if mnemonic in ('LitStr', 'QuoteRem', 'Rem', 'Reparse'):
            quoted = F'"{codecs.decode(substring, self.codec, "replace")}"'
            return [length_str, quoted]
        elif mnemonic in ('OnGosub', 'OnGoto'):
            offset1 = offset
            names: list[str] = []
            for _ in range(w_length // 2):
                offset1, word = _get_var(module_data, offset1, self.endian, False)
                names.append(_get_id(word, self.identifiers, self.vba_ver, self.is_64bit))
            return [length_str, ', '.join(names)]
        else:
            hex_dump = ' '.join(F'{c:02X}' for c in substring)
            return [length_str, hex_dump]

    def dump_line(
        self,
        module_data: bytes | bytearray | memoryview,
        line_start: int,
        line_length: int,
    ) -> list[tuple[str, list[OpcodeArg]]]:
        """
        Disassemble one p-code line into a list of (mnemonic, [arg, ...]) tuples.

        Decoding stops early, returning what has been decoded so far, when an
        opcode is not present in the OPCODES table.
        """
        self._linecont_pending = False
        result: list[tuple[str, list[OpcodeArg]]] = []
        if line_length <= 0:
            return result
        offset = line_start
        end_of_line = line_start + line_length
        while offset < end_of_line:
            offset, opcode = _get_var(module_data, offset, self.endian, False)
            # The upper 6 bits carry an opcode-specific operand type, the lower
            # 10 bits the opcode number.
            op_type = (opcode & ~0x03FF) >> 10
            opcode &= 0x03FF
            translated = _translate_opcode(opcode, self.vba_ver, self.is_64bit)
            if translated not in OPCODES:
                return result
            instruction = OPCODES[translated]
            mnemonic = instruction.mnem
            if op_type == 8 and mnemonic in ('FnMid', 'FnMidB', 'FnCurDir', 'FnError', 'Mid', 'MidB'):
                mnemonic += '$'
            parts: list[OpcodeArg] = []
            if mnemonic in ('Coerce', 'CoerceVar', 'DefType'):
                if op_type < len(_VAR_TYPES_LONG):
                    parts.append(CoerceType(_VAR_TYPES_LONG[op_type]))
                elif op_type == 17:
                    parts.append(CoerceType('Byte'))
                else:
                    parts.append(CoerceType(str(op_type)))
            elif mnemonic in ('Dim', 'DimImplicit', 'Type'):
                dim_type: list[str] = []
                if op_type & 0x04:
                    dim_type.append('Global')
                elif op_type & 0x08:
                    dim_type.append('Public')
                elif op_type & 0x10:
                    dim_type.append('Private')
                elif op_type & 0x20:
                    dim_type.append('Static')
                if (op_type & 0x01) and (mnemonic != 'Type'):
                    dim_type.append('Const')
                if dim_type:
                    parts.append(DimScope(dim_type))
            elif mnemonic == 'LitVarSpecial':
                parts.append(_SPECIALS[op_type])
            elif mnemonic in ('ArgsCall', 'ArgsMemCall', 'ArgsMemCallWith'):
                if op_type < 16:
                    parts.append('(Call)')
                else:
                    op_type -= 16
            elif mnemonic == 'Option':
                parts.append(_OPTIONS[op_type])
            elif mnemonic in ('Redim', 'RedimAs'):
                if op_type & 16:
                    parts.append('(Preserve)')
            elif mnemonic in (
                'FnDir',
                'FnFormat',
                'FnStringVar',
                'FnStringStr',
            ):
                parts.append(F'0x{op_type:04X}')
            elif mnemonic == 'LitSmallI2':
                parts.append(str(op_type))
            for arg in instruction.args:
                if arg == 'name':
                    offset, word = _get_var(module_data, offset, self.endian, False)
                    the_name = self.disasm_name(word, mnemonic, op_type)
                    if the_name:
                        parts.append(the_name)
                elif arg in ('0x', 'imp_'):
                    offset, word = _get_var(module_data, offset, self.endian, False)
                    the_imp = self.disasm_imp(arg, word, mnemonic)
                    if the_imp:
                        parts.append(the_imp)
                elif arg in ('func_', 'var_', 'rec_', 'type_', 'context_'):
                    offset, dword = _get_var(module_data, offset, self.endian, True)
                    if (
                        arg == 'rec_'
                        and len(self.indirect_table) >= dword + 20
                    ):
                        parts.append(RecordInfo(self.disasm_rec(dword)))
                    elif (
                        arg == 'type_'
                        and len(self.indirect_table) >= dword + 7
                    ):
                        type_id = self.indirect_table[dword + 6]
                        if type_id == 0x1D:
                            the_type = self._resolve_udt_name(dword)
                        else:
                            the_type = ''
                        if not the_type:
                            the_type = _disasm_type(self.indirect_table, dword)
                        parts.append(TypeRef(the_type))
                    elif (
                        arg == 'var_'
                        and len(self.indirect_table) >= dword + 16
                    ):
                        var_info = self.disasm_var(dword)
                        if op_type & 0x20:
                            var_info.has_withevents = True
                        parts.append(var_info)
                        if op_type & 0x10:
                            word = _get_word(module_data, offset, self.endian)
                            offset += 2
                            parts.append(F'0x{word:04X}')
                    elif (
                        arg == 'func_'
                        # NOTE(review): disasm_func indexes the table at up to
                        # dword + offs2 + 63 (offs2 is at most 20), which this
                        # 61-byte guard does not cover - confirm whether a
                        # truncated table can reach this point.
                        and len(self.indirect_table) >= dword + 61
                    ):
                        parts.append(self.disasm_func(dword, op_type))
                    else:
                        # Fall back to the raw operand when the record cannot be decoded.
                        parts.append(F'{arg}{dword:08X}')
                    if self.is_64bit and (arg == 'context_'):
                        offset, dword = _get_var(module_data, offset, self.endian, True)
                        parts.append(F'{dword:08X}')
            if instruction.varg:
                offset, w_length = _get_var(module_data, offset, self.endian, False)
                var_arg_parts = self.disasm_var_arg(
                    module_data, offset, w_length, mnemonic)
                parts.extend(var_arg_parts)
                offset += w_length
                # Variable arguments are padded to an even length.
                if w_length & 1:
                    offset += 1
            result.append((mnemonic, parts))
            if mnemonic == 'LineCont':
                self._linecont_pending = True
        return result
def disasm_name(self, word, mnemonic, op_type)-
Expand source code Browse git
def disasm_name(self, word: int, mnemonic: str, op_type: int) -> str:
    """
    Turn an identifier operand into display text.

    The identifier word is resolved through the identifier table, then a type
    suffix chosen by `op_type` is appended. An operand type of 32 wraps the name
    in square brackets. `OnError` and `Resume` never get a suffix; for them the
    operand type selects a fixed keyword form instead. Trailing whitespace is
    stripped from the result.
    """
    suffixes = ('', '?', '%', '&', '!', '#', '@', '?', '$', '?', '?', '?', '?', '?')
    name = _get_id(word, self.identifiers, self.vba_ver, self.is_64bit)
    suffix = suffixes[op_type] if op_type < len(suffixes) else ''
    if op_type == 32:
        name = F'[{name}]'

    if mnemonic == 'OnError':
        suffix = ''
        if op_type == 1:
            name = '(Resume Next)'
        elif op_type == 2:
            name = '(GoTo 0)'
    elif mnemonic == 'Resume':
        suffix = ''
        if op_type == 1:
            name = '(Next)'
        elif op_type != 0:
            name = ''

    return (name + suffix).rstrip()
Expand source code Browse git
def disasm_imp(self, arg: str, word: int, mnemonic: str) -> str:
    """
    Render a 16-bit immediate operand.

    For the `Open` mnemonic the word is unpacked into file mode (low byte),
    access (bits 8-11) and lock (bits 12-15) and rendered as
    `(For <mode>[ Access <access>][ Lock <lock>| Shared])`.

    For every other mnemonic an `imp_` operand is looked up in the object table;
    when that is not possible, or for any other operand tag, the result is the
    tag followed by the word as four hex digits.
    """
    if mnemonic == 'Open':
        mode = word & 0x00FF
        access = (word >> 8) & 0x0F
        lock = (word >> 12) & 0x0F
        text = '(For '
        # The first matching mode bit wins; Binary is only recognised as an exact value.
        for bit, label in ((0x01, 'Input'), (0x02, 'Output'), (0x04, 'Random'), (0x08, 'Append')):
            if mode & bit:
                text += label
                break
        else:
            if mode == 0x20:
                text += 'Binary'
        access_names = ('Read', 'Write', 'Read Write')
        lock_names = ('Read Write', 'Write', 'Read')
        if 1 <= access <= len(access_names):
            text += F' Access {access_names[access - 1]}'
        if lock & 0x04:
            text += ' Shared'
        elif 1 <= lock <= len(lock_names):
            text += F' Lock {lock_names[lock - 1]}'
        return text + ')'

    if arg == 'imp_':
        entry = (word >> (3 if self.is_64bit else 2)) * 10
        if entry + 8 <= len(self.object_table):
            name_word = _get_word(self.object_table, entry + 6, self.endian)
            if name_word == 0:
                return self.external_types.get(entry, '')
            resolved = _get_id(name_word, self.identifiers, self.vba_ver, self.is_64bit)
            if (name_word >> 1) < 0x100 and resolved not in _VALID_INTERNAL_TYPE_NAMES:
                return self.external_types.get(entry, '')
            return resolved
    return F'{arg}{word:04X}'
Expand source code Browse git
def disasm_rec(self, dword: int) -> str:
    """
    Describe the record located at `dword` in the indirect table.

    The record name is read from offset +2 and the options word from offset
    +18; bit 0 of the options decides between the `(Public)` and `(Private)`
    prefix of the returned text.
    """
    table = self.indirect_table
    name = _get_name(
        table, self.identifiers, dword + 2,
        self.endian, self.vba_ver, self.is_64bit)
    options = _get_word(table, dword + 18, self.endian)
    visibility = '(Public)' if options & 1 else '(Private)'
    return F'{visibility} {name}'
Expand source code Browse git
def disasm_object(self, offset: int) -> tuple[str, bool]:
    """
    Resolve the type referenced by the descriptor pointer stored at `offset`
    in the indirect table.

    Returns `(type_name, is_array)`. `is_array` mirrors bit 0x0800 of the
    descriptor flags. `('', False)` is returned when the descriptor or the
    referenced object table entry lies outside the respective table; this now
    holds for 32-bit projects as well, where a short table previously led to
    an out-of-range read.
    """
    ind = self.indirect_table
    obj = self.object_table
    # The only layout difference between 32-bit and 64-bit projects that
    # matters here is how the reference word is scaled to an entry index.
    shift = 3 if self.is_64bit else 2
    type_desc = _get_dword(ind, offset, self.endian)
    if type_desc + 4 > len(ind):
        return '', False
    flags = _get_word(ind, type_desc, self.endian)
    is_array = bool(flags & 0x0800)
    if flags & 0x02:
        type_id = ind[type_desc + 6]
        if type_id == 0x1D:
            # User-defined type: prefer its resolved name when available.
            name = self._resolve_udt_name(type_desc)
            if name:
                return name, is_array
        return _disasm_type(ind, type_desc), is_array
    word = _get_word(ind, type_desc + 2, self.endian)
    offs = (word >> shift) * 10
    # The name word sits at offs + 6; require the full 8 bytes before reading.
    if offs + 8 > len(obj):
        return '', False
    hl_name = _get_word(obj, offs + 6, self.endian)
    if hl_name == 0:
        ext = self.external_types.get(offs)
        return ext or '', is_array
    if hl_name == 0xFFFF:
        type_name = _get_type_name(ind[type_desc + 6])
        if not type_name and type_desc + 17 <= len(ind):
            type_name = _get_type_name(ind[type_desc + 16])
        return type_name, is_array
    name = _get_id(hl_name, self.identifiers, self.vba_ver, self.is_64bit)
    if (hl_name >> 1) < 0x100 and name not in _VALID_INTERNAL_TYPE_NAMES:
        ext = self.external_types.get(offs)
        return ext or '', is_array
    return name, is_array
Expand source code Browse git
def disasm_var(self, dword: int) -> VarInfo:
    """
    Decode the variable record at `dword` in the indirect table.

    The result carries the variable name, an optional type reference and the
    `New` flag. With an explicit `As` clause the declared type is reported.
    With neither `As` nor `New`, a type is only reported when it corresponds to
    a suffix type; an array without such a type gets `()` appended to its name.
    """
    table = self.indirect_table
    flag_lo = table[dword]
    flag_hi = table[dword + 1]
    has_as = bool(flag_lo & 0x20)
    has_new = bool(flag_hi & 0x20)
    var_name = _get_name(
        table, self.identifiers, dword + 2,
        self.endian, self.vba_ver, self.is_64bit)
    # Location of the type field inside the record.
    type_at = dword + (16 if self.is_64bit else 12)
    type_ref: TypeRef | None = None

    if has_as:
        is_array = False
        if _get_word(table, type_at + 2, self.endian) == 0xFFFF:
            type_name = _get_type_name(table[type_at])
        else:
            type_name, is_array = self.disasm_object(type_at)
        if type_name:
            type_ref = TypeRef(type_name, is_array)
    elif not has_new and len(table) >= type_at + 4:
        if _get_word(table, type_at + 2, self.endian) == 0xFFFF:
            type_id = table[type_at]
            if (type_id & 0x40) and (flag_lo & 0x10):
                type_id &= ~0x40
            if type_id in _SUFFIX_TYPE_IDS:
                type_name = _get_type_name(type_id)
                if type_name:
                    type_ref = TypeRef(type_name, from_suffix=True)
        else:
            # Best effort: a malformed descriptor must not abort the line.
            try:
                type_name, is_array = self.disasm_object(type_at)
            except Exception:
                type_name, is_array = '', False
            if type_name in _SUFFIX_TYPES:
                type_ref = TypeRef(type_name, is_array, from_suffix=True)
            elif is_array:
                var_name += '()'

    return VarInfo(var_name, type_ref, has_new)
Expand source code Browse git
def disasm_arg(self, arg_offset: int) -> ArgInfo | None:
    """
    Decode one argument record at `arg_offset` in the indirect table.

    Returns `None` when the name word of the record is 0xFFFE or 0xFFFF. Otherwise an
    `ArgInfo` is returned that carries the argument name, an optional type reference, the
    ByVal / ByRef / Optional / ParamArray bits and, for optional arguments, a formatted
    default value if the record is long enough to contain one.

    Side effect: `self._has_pa_bit` is set to True when the ParamArray bit is present.

    NOTE(review): the block nesting below was reconstructed from a flattened listing. The
    indentation of the two 64-bit fallback branches (marked inline) should be confirmed
    against the upstream file.
    """
    flags = _get_word(self.indirect_table, arg_offset, self.endian)
    # 64-bit records carry four additional bytes before the type and option fields.
    offs = 4 if self.is_64bit else 0
    name_word = _get_word(self.indirect_table, arg_offset + 2, self.endian)
    if name_word >= 0xFFFE:
        return None
    arg_name = _get_name(
        self.indirect_table, self.identifiers, arg_offset + 2,
        self.endian, self.vba_ver, self.is_64bit)
    arg_type = _get_dword(self.indirect_table, arg_offset + offs + 12, self.endian)
    arg_opts = _get_word(self.indirect_table, arg_offset + offs + 24, self.endian)
    is_paramarray = bool(arg_opts & 0x0001)
    if is_paramarray:
        # Remembered on the instance; disasm_func consults it before guessing ParamArray.
        self._has_pa_bit = True
    is_byval = bool(arg_opts & 0x0004)
    is_byref = bool(arg_opts & 0x0002)
    is_optional = bool(arg_opts & 0x0200)
    type_ref: TypeRef | None = None
    if flags & 0x0020:
        # Flag 0x0020 set: the type is emitted as a regular (non-suffix) type reference.
        arg_type_name = ''
        is_array = False
        if (arg_type & 0xFFFF0000) == 0xFFFF0000:
            # High word all ones: the low byte is a built-in type id.
            arg_type_id = arg_type & 0x000000FF
            arg_type_name = _get_type_name(arg_type_id)
        elif self.is_64bit and arg_type < len(DIM_TYPES) and DIM_TYPES[arg_type]:
            arg_type_name = _get_type_name(arg_type)
        else:
            arg_type_name, is_array = self.disasm_object(arg_offset + offs + 12)
            if arg_type_name.startswith('type_') and self.is_64bit:
                # NOTE(review): nesting of this fallback is reconstructed — confirm.
                arg_type_id = arg_type & 0x000000FF
                if arg_type_id < len(DIM_TYPES) and DIM_TYPES[arg_type_id]:
                    arg_type_name = _get_type_name(arg_type_id)
                    is_array = False
        if arg_type_name:
            type_ref = TypeRef(arg_type_name, is_array)
    elif (arg_type & 0xFFFF0000) == 0xFFFF0000:
        # Flag 0x0020 clear: only suffix types are reported, marked as from_suffix.
        arg_type_id = arg_type & 0x000000FF
        if arg_type_id in _SUFFIX_TYPE_IDS:
            type_name = _get_type_name(arg_type_id)
            if type_name:
                type_ref = TypeRef(type_name, from_suffix=True)
    elif self.is_64bit and arg_type < len(DIM_TYPES) and DIM_TYPES[arg_type]:
        if arg_type in _SUFFIX_TYPE_IDS:
            type_name = _get_type_name(arg_type)
            if type_name:
                type_ref = TypeRef(type_name, from_suffix=True)
    else:
        # Any failure while resolving the descriptor is treated as "no type".
        try:
            type_name, is_array = self.disasm_object(arg_offset + offs + 12)
        except Exception:
            type_name = ''
            is_array = False
        if type_name in _SUFFIX_TYPES:
            type_ref = TypeRef(type_name, is_array, from_suffix=True)
        elif (not type_name or type_name.startswith('type_')) and self.is_64bit:
            # NOTE(review): nesting of the inner if / elif is reconstructed — confirm.
            arg_type_id = arg_type & 0x000000FF
            if arg_type_id < len(DIM_TYPES) and DIM_TYPES[arg_type_id]:
                type_name = _get_type_name(arg_type_id)
                if type_name in _SUFFIX_TYPES:
                    type_ref = TypeRef(type_name, from_suffix=True)
            elif is_array:
                arg_name += '()'
        elif is_array:
            # Array without a reportable type: mark it in the name instead.
            arg_name += '()'
    default_value: str | None = None
    if is_optional:
        default_tag_off = arg_offset + offs + 28
        default_val_off = arg_offset + offs + 32
        ind = self.indirect_table
        # Only read the default when both the tag word and the value dword are in range.
        if default_tag_off + 2 <= len(ind) and default_val_off + 4 <= len(ind):
            vt_tag = _get_word(ind, default_tag_off, self.endian)
            value_dw = _get_dword(ind, default_val_off, self.endian)
            default_value = self._format_default_value(vt_tag, value_dw)
    return ArgInfo(
        arg_name, type_ref, is_byval, is_byref, is_optional,
        is_paramarray, default_value,
    )
Expand source code Browse git
def disasm_func(self, dword: int, op_type: int) -> FuncInfo:
    """
    Decode the procedure record at `dword` in the indirect table into a `FuncInfo`.

    `op_type` is the operand type of the opcode that referenced the record; it takes part
    in the decision between `Public` scope and between `Function` and `Sub`.

    The record yields scope, Static / PtrSafe markers, the procedure kind, the return type,
    the Declare library and alias, and the argument list. Arguments are stored as a linked
    list of records; the walk stops at a terminator (0 or 0xFFFFFFFF), at an offset that
    leaves fewer than 27 bytes in the table, or when an offset repeats. The last condition
    protects against malformed or hostile input whose next pointers form a cycle.

    Side effect: `self._linecont_pending` is reset to False.
    """
    flags = _get_word(self.indirect_table, dword, self.endian)
    name_word = _get_word(self.indirect_table, dword + 2, self.endian)
    # Later fields are shifted by 4 bytes for VBA versions above 5 and by a further 16
    # bytes in 64-bit projects.
    offs2 = 4 if self.vba_ver > 5 else 0
    if self.is_64bit:
        offs2 += 16
    self._linecont_pending = False
    sub_name = _get_id(name_word, self.identifiers, self.vba_ver, self.is_64bit)
    arg_offset = _get_dword(self.indirect_table, dword + offs2 + 36, self.endian)
    ret_type = _get_dword(self.indirect_table, dword + offs2 + 40, self.endian)
    decl_offset = _get_word(self.indirect_table, dword + offs2 + 44, self.endian)
    c_options_offset = 60 if self.is_64bit and self.version > 0x97 else 54
    c_options = self.indirect_table[dword + offs2 + c_options_offset]
    new_flags_offset = 63 if self.is_64bit and self.version > 0x97 else 57
    new_flags = self.indirect_table[dword + offs2 + new_flags_offset]

    scope = ''
    is_friend = False
    if self.vba_ver > 5:
        if (new_flags & 0x0002) == 0:
            scope = 'Private'
        elif op_type & 0x04:
            scope = 'Public'
        if new_flags & 0x0004:
            is_friend = True
    else:
        if (flags & 0x0008) == 0:
            scope = 'Private'
        elif op_type & 0x04:
            scope = 'Public'

    is_static = bool(flags & 0x0080)
    has_declare = (c_options & 0x90) == 0 and decl_offset != 0xFFFF
    is_ptrsafe = bool(self.vba_ver > 5 and new_flags & 0x20)
    has_as = (flags & 0x0020) != 0

    if flags & 0x1000:
        kind = 'Function' if op_type in (2, 6) else 'Sub'
    elif flags & 0x2000:
        kind = 'Property Get'
    elif flags & 0x4000:
        kind = 'Property Let'
    elif flags & 0x8000:
        kind = 'Property Set'
    else:
        kind = 'Sub'

    return_type: TypeRef | None = None
    if has_as:
        type_name = ''
        is_array = False
        if (ret_type & 0xFFFF0000) == 0xFFFF0000:
            # High word all ones: the low byte is a built-in type id.
            type_id = ret_type & 0x000000FF
            type_name = _get_type_name(type_id)
        else:
            type_name, is_array = self.disasm_object(dword + offs2 + 40)
        if type_name:
            return_type = TypeRef(type_name, is_array)
    elif (ret_type & 0xFFFF0000) == 0xFFFF0000:
        # Without an explicit As clause only suffix types are reported.
        ret_type_id = ret_type & 0x000000FF
        if ret_type_id in _SUFFIX_TYPE_IDS:
            type_name = _get_type_name(ret_type_id)
            if type_name:
                return_type = TypeRef(type_name, from_suffix=True)

    lib_name: str | None = None
    alias_name: str | None = None
    if has_declare:
        if self.is_64bit:
            lib_name, alias_name = self._declare64(decl_offset, sub_name)
        else:
            lib_name = _get_name(
                self.declaration_table, self.identifiers, decl_offset + 2,
                self.endian, self.vba_ver, self.is_64bit)
            alias_offset = _get_word(
                self.declaration_table, decl_offset + 4, self.endian)
            if alias_offset < len(self.declaration_table):
                # The alias is a NUL-terminated string inside the declaration table.
                alias_bytes = bytes(self.declaration_table[alias_offset:])
                null_pos = alias_bytes.find(0)
                if null_pos > 0:
                    alias_name = alias_bytes[:null_pos].decode(
                        self.codec, errors='replace')
                    # An alias equal to the procedure name carries no information.
                    if alias_name == sub_name:
                        alias_name = None

    arg_list: list[ArgInfo] = []
    # Offsets already processed; a repeated offset means the next pointers loop.
    visited: set[int] = set()
    while (
        arg_offset != 0xFFFFFFFF
        and arg_offset != 0
        and arg_offset + 26 < len(self.indirect_table)
    ):
        if arg_offset in visited:
            break
        visited.add(arg_offset)
        arg = self.disasm_arg(arg_offset)
        if arg is not None:
            arg_list.append(arg)
        arg_offset = _get_dword(
            self.indirect_table,
            arg_offset + (24 if self.is_64bit else 20),
            self.endian,
        )

    if self.is_64bit and any(
        a.is_optional and a.default_value is None for a in arg_list
    ):
        self._patch_64bit_defaults(arg_list, sub_name)

    if (
        arg_list
        and not self._has_pa_bit
        and not any(a.is_paramarray for a in arg_list)
    ):
        # No ParamArray bit was ever seen: treat a trailing untyped or Variant array
        # without ByVal / ByRef / Optional as a ParamArray.
        last = arg_list[-1]
        _pa_candidate = (
            last.type is not None
            and last.type.is_array
            and (last.type.name == 'Variant' or last.type.name == '')
        ) or (
            last.type is None
            and last.name.endswith('()')
        )
        _pa_no_modifiers = not last.is_byval and not last.is_byref and not last.is_optional
        if _pa_candidate and _pa_no_modifiers:
            last.is_paramarray = True

    if is_friend:
        scope = 'Friend' if not scope else F'{scope} Friend'

    return FuncInfo(
        scope, is_static, kind, sub_name, arg_list, return_type,
        has_declare, is_ptrsafe, lib_name, alias_name,
    )
Expand source code Browse git
def disasm_var_arg(
    self,
    module_data: bytes | bytearray | memoryview,
    offset: int,
    w_length: int,
    mnemonic: str,
) -> list[str]:
    """
    Render the variable-length operand of an instruction.

    The operand is the `w_length` bytes of `module_data` that start at `offset`. The
    returned list always has two entries: the length as a four-digit hexadecimal string
    and a rendering that depends on `mnemonic`:

    - string-like instructions: the bytes decoded with the project codec, in double quotes
    - `OnGosub` / `OnGoto`: the comma-separated identifier names of the stored words
    - everything else: an uppercase hex dump with single spaces between bytes
    """
    payload = module_data[offset:offset + w_length]
    rendered_length = F'0x{w_length:04X}'

    if mnemonic in ('OnGosub', 'OnGoto'):
        cursor = offset
        labels: list[str] = []
        for _ in range(w_length // 2):
            cursor, ident = _get_var(module_data, cursor, self.endian, False)
            labels.append(_get_id(ident, self.identifiers, self.vba_ver, self.is_64bit))
        rendered = ', '.join(labels)
    elif mnemonic in ('LitStr', 'QuoteRem', 'Rem', 'Reparse'):
        text = codecs.decode(payload, self.codec, 'replace')
        rendered = F'"{text}"'
    else:
        rendered = ' '.join(map('{:02X}'.format, payload))

    return [rendered_length, rendered]
Disassemble one p-code line into a list of (mnemonic, [arg, …]) tuples.
Expand source code Browse git
def dump_line(
    self,
    module_data: bytes | bytearray | memoryview,
    line_start: int,
    line_length: int,
) -> list[tuple[str, list[OpcodeArg]]]:
    """
    Disassemble one p-code line into a list of (mnemonic, [arg, ...]) tuples.

    Decoding starts at `line_start` and continues until `line_length` bytes have been
    consumed. A non-positive length yields an empty list. When an opcode cannot be
    translated to a known instruction, decoding stops and the instructions collected so
    far are returned.

    Side effect: `self._linecont_pending` is cleared on entry and set again after a
    `LineCont` instruction has been decoded.
    """
    self._linecont_pending = False
    result: list[tuple[str, list[OpcodeArg]]] = []
    if line_length <= 0:
        return result

    # disasm_func indexes the indirect table up to `dword + offs2 + new_flags_offset`,
    # and both offsets depend on the VBA version and on bitness. A fixed requirement of
    # 61 bytes is one byte short for 32-bit VBA6+ and far too small for 64-bit projects,
    # so the extent is computed here. It never drops below the historic 61 so that no
    # record is decoded that was previously emitted as a raw reference.
    func_header = (4 if self.vba_ver > 5 else 0) + (16 if self.is_64bit else 0)
    func_flags = 63 if self.is_64bit and self.version > 0x97 else 57
    func_extent = max(61, func_header + func_flags + 1)

    offset = line_start
    end_of_line = line_start + line_length
    while offset < end_of_line:
        offset, opcode = _get_var(module_data, offset, self.endian, False)
        # The upper six bits of the word are the operand type, the lower ten the opcode.
        op_type = (opcode & ~0x03FF) >> 10
        opcode &= 0x03FF
        translated = _translate_opcode(opcode, self.vba_ver, self.is_64bit)
        if translated not in OPCODES:
            return result
        instruction = OPCODES[translated]
        mnemonic = instruction.mnem
        if op_type == 8 and mnemonic in ('FnMid', 'FnMidB', 'FnCurDir', 'FnError', 'Mid', 'MidB'):
            mnemonic += '$'

        parts: list[OpcodeArg] = []
        if mnemonic in ('Coerce', 'CoerceVar', 'DefType'):
            if op_type < len(_VAR_TYPES_LONG):
                parts.append(CoerceType(_VAR_TYPES_LONG[op_type]))
            elif op_type == 17:
                parts.append(CoerceType('Byte'))
            else:
                parts.append(CoerceType(str(op_type)))
        elif mnemonic in ('Dim', 'DimImplicit', 'Type'):
            dim_type: list[str] = []
            if op_type & 0x04:
                dim_type.append('Global')
            elif op_type & 0x08:
                dim_type.append('Public')
            elif op_type & 0x10:
                dim_type.append('Private')
            elif op_type & 0x20:
                dim_type.append('Static')
            if (op_type & 0x01) and (mnemonic != 'Type'):
                dim_type.append('Const')
            if dim_type:
                parts.append(DimScope(dim_type))
        elif mnemonic == 'LitVarSpecial':
            parts.append(_SPECIALS[op_type])
        elif mnemonic in ('ArgsCall', 'ArgsMemCall', 'ArgsMemCallWith'):
            if op_type < 16:
                parts.append('(Call)')
            else:
                op_type -= 16
        elif mnemonic == 'Option':
            parts.append(_OPTIONS[op_type])
        elif mnemonic in ('Redim', 'RedimAs'):
            if op_type & 16:
                parts.append('(Preserve)')
        elif mnemonic in (
            'FnDir', 'FnFormat', 'FnStringVar', 'FnStringStr',
        ):
            parts.append(F'0x{op_type:04X}')
        elif mnemonic == 'LitSmallI2':
            parts.append(str(op_type))

        for arg in instruction.args:
            if arg == 'name':
                offset, word = _get_var(module_data, offset, self.endian, False)
                the_name = self.disasm_name(word, mnemonic, op_type)
                if the_name:
                    parts.append(the_name)
            elif arg in ('0x', 'imp_'):
                offset, word = _get_var(module_data, offset, self.endian, False)
                the_imp = self.disasm_imp(arg, word, mnemonic)
                if the_imp:
                    parts.append(the_imp)
            elif arg in ('func_', 'var_', 'rec_', 'type_', 'context_'):
                # These operands are DWORD offsets into the indirect table. Each kind is
                # only decoded when the table is long enough; otherwise the raw offset
                # is emitted with its prefix.
                offset, dword = _get_var(module_data, offset, self.endian, True)
                if (
                    arg == 'rec_'
                    and len(self.indirect_table) >= dword + 20
                ):
                    parts.append(RecordInfo(self.disasm_rec(dword)))
                elif (
                    arg == 'type_'
                    and len(self.indirect_table) >= dword + 7
                ):
                    type_id = self.indirect_table[dword + 6]
                    if type_id == 0x1D:
                        the_type = self._resolve_udt_name(dword)
                    else:
                        the_type = ''
                    if not the_type:
                        the_type = _disasm_type(self.indirect_table, dword)
                    parts.append(TypeRef(the_type))
                elif (
                    arg == 'var_'
                    and len(self.indirect_table) >= dword + 16
                ):
                    var_info = self.disasm_var(dword)
                    if op_type & 0x20:
                        var_info.has_withevents = True
                    parts.append(var_info)
                    if op_type & 0x10:
                        # An additional word follows the variable reference.
                        word = _get_word(module_data, offset, self.endian)
                        offset += 2
                        parts.append(F'0x{word:04X}')
                elif (
                    arg == 'func_'
                    and len(self.indirect_table) >= dword + func_extent
                ):
                    parts.append(self.disasm_func(dword, op_type))
                else:
                    parts.append(F'{arg}{dword:08X}')
                if self.is_64bit and (arg == 'context_'):
                    # 64-bit context operands are followed by a second DWORD.
                    offset, dword = _get_var(module_data, offset, self.endian, True)
                    parts.append(F'{dword:08X}')

        if instruction.varg:
            offset, w_length = _get_var(module_data, offset, self.endian, False)
            var_arg_parts = self.disasm_var_arg(
                module_data, offset, w_length, mnemonic)
            parts.extend(var_arg_parts)
            offset += w_length
            # Variable-length operands are padded to an even number of bytes.
            if w_length & 1:
                offset += 1

        result.append((mnemonic, parts))
        if mnemonic == 'LineCont':
            self._linecont_pending = True
    return result
class PCodeDisassembler (data)-
VBA p-code disassembler that produces structured PCodeModule output. The output is suitable for consumption by the decompiler for reconstruction to VBA source code.
Expand source code Browse git
class PCodeDisassembler:
    """
    Disassembles the VBA p-code contained in an Office document into PCodeModule objects.
    These structured results are what the decompiler consumes when it reconstructs VBA
    source code.
    """

    def __init__(self, data: bytes | bytearray | memoryview):
        self._data = data

    def iter_modules(self):
        """
        Generate one PCodeModule for every VBA module found in the input.
        """
        for blob in self._get_ole_streams():
            yield from self._iter_project_modules(OleFile(blob))

    def _iter_project_modules(
        self,
        ole: OleFile,
    ):
        """
        Walk all VBA projects of one OLE file and generate a PCodeModule for each module
        whose stream can be read. Modules with unreadable streams are skipped.
        """
        for vba_root, _, dir_path in _find_vba_projects(ole) or ():
            codec, module_names, is_64bit = self._process_dir(ole, dir_path)
            project_data = self._process_vba_project(ole, vba_root + 'VBA/_VBA_PROJECT')
            identifiers = _get_identifiers(project_data, codec)
            stripped = not identifiers
            for module_name in module_names:
                stream_path = F'{vba_root}VBA/{module_name}'
                try:
                    stream_data = ole.openstream(stream_path).read()
                except Exception:
                    continue
                yield PCodeModule(
                    stream_path,
                    _pcode_dump(stream_data, project_data, identifiers, is_64bit, codec),
                    stripped,
                )

    def _get_ole_streams(self) -> list[bytes | bytearray | memoryview]:
        """
        Return the OLE containers to inspect. Input that starts with the OLE compound file
        signature is returned as the only element. Input that starts with `PK` is opened
        as a ZIP archive and the contents of every entry whose name ends in
        `vbaproject.bin` (case-insensitive) are returned; an invalid archive ends the
        collection without an error. Any other input is returned unchanged as the only
        element.
        """
        data = self._data
        if data[:8] == b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1':
            return [data]
        if data[:2] != b'PK':
            return [data]

        import zipfile

        from refinery.lib.structures import MemoryFile

        extracted: list[bytes | bytearray | memoryview] = []
        try:
            with zipfile.ZipFile(MemoryFile(data, bytes)) as archive:
                for entry in archive.namelist():
                    if not entry.lower().endswith('vbaproject.bin'):
                        continue
                    extracted.append(archive.read(entry))
        except zipfile.BadZipFile:
            pass
        return extracted

    def _process_dir(
        self,
        ole: OleFile,
        dir_path: str,
    ) -> tuple[str, list[str], bool]:
        """
        Decompress and scan the VBA dir stream. The return value is the tuple
        (codec, code_modules, is_64bit). Scanning stops silently at the first record
        that cannot be read; whatever was collected up to that point is returned.
        """
        records = decompress_stream(ole.openstream(dir_path).read())
        total = len(records)
        # Records 9 and 3 are always treated as having these sizes, whatever is stored.
        forced_sizes = {9: 6, 3: 2}
        codec = 'latin1'
        is_64bit = False
        code_modules: list[str] = []
        cursor = 0
        while cursor < total:
            try:
                record_id = _get_word(records, cursor, '<')
                size = forced_sizes.get(record_id, _get_word(records, cursor + 2, '<'))
                body = cursor + 6
                if size:
                    if record_id == 3:
                        codec = _codepage_to_codec(_get_word(records, body, '<'))
                    elif record_id == 50:
                        code_modules.append(codecs.decode(
                            records[body:body + size], 'utf_16_le', errors='replace'))
                    elif record_id == 1:
                        is_64bit = _get_dword(records, body, '<') == 3
                cursor = body + size
            except Exception:
                break
        return codec, code_modules, is_64bit

    def _process_vba_project(
        self,
        ole: OleFile,
        vba_project_path: str,
    ) -> bytes | bytearray | memoryview:
        """
        Return the raw contents of the _VBA_PROJECT stream; no decompression is applied.
        """
        stream = ole.openstream(vba_project_path)
        return stream.read()
def iter_modules(self)-
Yield PCodeModule objects for each VBA module.
Expand source code Browse git
def iter_modules(self):
    """
    Generate one PCodeModule for every VBA module found in the input.
    """
    for blob in self._get_ole_streams():
        yield from self._iter_project_modules(OleFile(blob))