Module refinery.lib.ole.pcode
VBA p-code disassembler for Microsoft Office documents.
This module is a port of pcodedmp by Vesselin Bontchev, adapted for the Binary Refinery project. Since then, many bugs have been fixed and improvements made.
The original work is copyright (c) Vesselin Bontchev and licensed under GPL v3. The source code has been modified to fit the code requirements of this project.
Regardless of the license used for the binary refinery, this code file is also subject to the terms and conditions of the GNU General Public License version 3.
References
[pcodedmp] https://github.com/bontchev/pcodedmp [MS-OVBA] https://docs.microsoft.com/en-us/openspecs/
Expand source code Browse git
"""
VBA p-code disassembler for Microsoft Office documents.
This module is a port of pcodedmp by Vesselin Bontchev, adapted for the Binary Refinery project.
Since then, many bugs have been fixed and improvements made.
The original work is copyright (c) Vesselin Bontchev and licensed under GPL v3. The source code
has been modified to fit the code requirements of this project.
Regardless of the license used for the binary refinery, this code file is also subject to the
terms and conditions of the GNU General Public License version 3.
References:
[pcodedmp] https://github.com/bontchev/pcodedmp
[MS-OVBA] https://docs.microsoft.com/en-us/openspecs/
"""
from __future__ import annotations
import codecs
import logging
import re
import struct as _struct
from typing import NamedTuple
from refinery.lib.ole.file import OleFile
from refinery.lib.ole.vba import _codepage_to_codec, _find_vba_projects, decompress_stream
logger = logging.getLogger(__name__)
_STRUCT_WORD: dict[str, _struct.Struct] = {
'<': _struct.Struct('<H'),
'>': _struct.Struct('>H'),
}
_STRUCT_DWORD: dict[str, _struct.Struct] = {
'<': _struct.Struct('<L'),
'>': _struct.Struct('>L'),
}
_VAR_TYPES_LONG: tuple[str, ...] = (
'Var', '?', 'Int', 'Lng', 'Sng', 'Dbl', 'Cur', 'Date',
'Str', 'Obj', 'Err', 'Bool', 'Var',
)
_SPECIALS: tuple[str, ...] = ('False', 'True', 'Null', 'Empty')
_OPTIONS: tuple[str, ...] = (
'Base 0', 'Base 1', 'Compare Text', 'Compare Binary',
'Explicit', 'Private Module',
)
class Opcode(NamedTuple):
mnem: str
args: list[str] = []
varg: bool = False
class PCodeLine(NamedTuple):
"""
Structured representation of one line of disassembled p-code.
Each line contains a list of (mnemonic, [arg1, arg2, ...]) tuples.
"""
opcodes: list[tuple[str, list[str]]]
class PCodeModule(NamedTuple):
"""
Structured representation of a disassembled VBA module.
"""
path: str
lines: list[PCodeLine]
identifiers_stripped: bool = False
# VBA7 opcodes; VBA3, VBA5 and VBA6 will be upconverted to these.
Op = Opcode
OPCODES: dict[int, Opcode] = {
0x000: Op('Imp'),
0x001: Op('Eqv'),
0x002: Op('Xor'),
0x003: Op('Or'),
0x004: Op('And'),
0x005: Op('Eq'),
0x006: Op('Ne'),
0x007: Op('Le'),
0x008: Op('Ge'),
0x009: Op('Lt'),
0x00A: Op('Gt'),
0x00B: Op('Add'),
0x00C: Op('Sub'),
0x00D: Op('Mod'),
0x00E: Op('IDiv'),
0x00F: Op('Mul'),
0x010: Op('Div'),
0x011: Op('Concat'),
0x012: Op('Like'),
0x013: Op('Pwr'),
0x014: Op('Is'),
0x015: Op('Not'),
0x016: Op('UMi'),
0x017: Op('FnAbs'),
0x018: Op('FnFix'),
0x019: Op('FnInt'),
0x01A: Op('FnSgn'),
0x01B: Op('FnLen'),
0x01C: Op('FnLenB'),
0x01D: Op('Paren'),
0x01E: Op('Sharp'),
0x01F: Op('LdLHS', ['name']),
0x020: Op('Ld', ['name']),
0x021: Op('MemLd', ['name']),
0x022: Op('DictLd', ['name']),
0x023: Op('IndexLd', ['0x']),
0x024: Op('ArgsLd', ['name', '0x']),
0x025: Op('ArgsMemLd', ['name', '0x']),
0x026: Op('ArgsDictLd', ['name', '0x']),
0x027: Op('St', ['name']),
0x028: Op('MemSt', ['name']),
0x029: Op('DictSt', ['name']),
0x02A: Op('IndexSt', ['0x']),
0x02B: Op('ArgsSt', ['name', '0x']),
0x02C: Op('ArgsMemSt', ['name', '0x']),
0x02D: Op('ArgsDictSt', ['name', '0x']),
0x02E: Op('Set', ['name']),
0x02F: Op('Memset', ['name']),
0x030: Op('Dictset', ['name']),
0x031: Op('Indexset', ['0x']),
0x032: Op('ArgsSet', ['name', '0x']),
0x033: Op('ArgsMemSet', ['name', '0x']),
0x034: Op('ArgsDictSet', ['name', '0x']),
0x035: Op('MemLdWith', ['name']),
0x036: Op('DictLdWith', ['name']),
0x037: Op('ArgsMemLdWith', ['name', '0x']),
0x038: Op('ArgsDictLdWith', ['name', '0x']),
0x039: Op('MemStWith', ['name']),
0x03A: Op('DictStWith', ['name']),
0x03B: Op('ArgsMemStWith', ['name', '0x']),
0x03C: Op('ArgsDictStWith', ['name', '0x']),
0x03D: Op('MemSetWith', ['name']),
0x03E: Op('DictSetWith', ['name']),
0x03F: Op('ArgsMemSetWith', ['name', '0x']),
0x040: Op('ArgsDictSetWith', ['name', '0x']),
0x041: Op('ArgsCall', ['name', '0x']),
0x042: Op('ArgsMemCall', ['name', '0x']),
0x043: Op('ArgsMemCallWith', ['name', '0x']),
0x044: Op('ArgsArray', ['name', '0x']),
0x045: Op('Assert'),
0x046: Op('BoS', ['0x']),
0x047: Op('BoSImplicit'),
0x048: Op('BoL'),
0x049: Op('LdAddressOf', ['name']),
0x04A: Op('MemAddressOf', ['name']),
0x04B: Op('Case'),
0x04C: Op('CaseTo'),
0x04D: Op('CaseGt'),
0x04E: Op('CaseLt'),
0x04F: Op('CaseGe'),
0x050: Op('CaseLe'),
0x051: Op('CaseNe'),
0x052: Op('CaseEq'),
0x053: Op('CaseElse'),
0x054: Op('CaseDone'),
0x055: Op('Circle', ['0x']),
0x056: Op('Close', ['0x']),
0x057: Op('CloseAll'),
0x058: Op('Coerce'),
0x059: Op('CoerceVar'),
0x05A: Op('Context', ['context_']),
0x05B: Op('Debug'),
0x05C: Op('DefType', ['0x', '0x']),
0x05D: Op('Dim'),
0x05E: Op('DimImplicit'),
0x05F: Op('Do'),
0x060: Op('DoEvents'),
0x061: Op('DoUnitil'),
0x062: Op('DoWhile'),
0x063: Op('Else'),
0x064: Op('ElseBlock'),
0x065: Op('ElseIfBlock'),
0x066: Op('ElseIfTypeBlock', ['imp_']),
0x067: Op('End'),
0x068: Op('EndContext'),
0x069: Op('EndFunc'),
0x06A: Op('EndIf'),
0x06B: Op('EndIfBlock'),
0x06C: Op('EndImmediate'),
0x06D: Op('EndProp'),
0x06E: Op('EndSelect'),
0x06F: Op('EndSub'),
0x070: Op('EndType'),
0x071: Op('EndWith'),
0x072: Op('Erase', ['0x']),
0x073: Op('Error'),
0x074: Op('EventDecl', ['func_']),
0x075: Op('RaiseEvent', ['name', '0x']),
0x076: Op('ArgsMemRaiseEvent', ['name', '0x']),
0x077: Op('ArgsMemRaiseEventWith', ['name', '0x']),
0x078: Op('ExitDo'),
0x079: Op('ExitFor'),
0x07A: Op('ExitFunc'),
0x07B: Op('ExitProp'),
0x07C: Op('ExitSub'),
0x07D: Op('FnCurDir'),
0x07E: Op('FnDir'),
0x07F: Op('Empty0'),
0x080: Op('Empty1'),
0x081: Op('FnError'),
0x082: Op('FnFormat'),
0x083: Op('FnFreeFile'),
0x084: Op('FnInStr'),
0x085: Op('FnInStr3'),
0x086: Op('FnInStr4'),
0x087: Op('FnInStrB'),
0x088: Op('FnInStrB3'),
0x089: Op('FnInStrB4'),
0x08A: Op('FnLBound', ['0x']),
0x08B: Op('FnMid'),
0x08C: Op('FnMidB'),
0x08D: Op('FnStrComp'),
0x08E: Op('FnStrComp3'),
0x08F: Op('FnStringVar'),
0x090: Op('FnStringStr'),
0x091: Op('FnUBound', ['0x']),
0x092: Op('For'),
0x093: Op('ForEach'),
0x094: Op('ForEachAs', ['imp_']),
0x095: Op('ForStep'),
0x096: Op('FuncDefn', ['func_']),
0x097: Op('FuncDefnSave', ['func_']),
0x098: Op('GetRec'),
0x099: Op('GoSub', ['name']),
0x09A: Op('GoTo', ['name']),
0x09B: Op('If'),
0x09C: Op('IfBlock'),
0x09D: Op('TypeOf', ['imp_']),
0x09E: Op('IfTypeBlock', ['imp_']),
0x09F: Op('Implements', ['0x', '0x', '0x', '0x']),
0x0A0: Op('Input'),
0x0A1: Op('InputDone'),
0x0A2: Op('InputItem'),
0x0A3: Op('Label', ['name']),
0x0A4: Op('Let'),
0x0A5: Op('Line', ['0x']),
0x0A6: Op('LineCont', [], True),
0x0A7: Op('LineInput'),
0x0A8: Op('LineNum', ['name']),
0x0A9: Op('LitCy', ['0x', '0x', '0x', '0x']),
0x0AA: Op('LitDate', ['0x', '0x', '0x', '0x']),
0x0AB: Op('LitDefault'),
0x0AC: Op('LitDI2', ['0x']),
0x0AD: Op('LitDI4', ['0x', '0x']),
0x0AE: Op('LitDI8', ['0x', '0x', '0x', '0x']),
0x0AF: Op('LitHI2', ['0x']),
0x0B0: Op('LitHI4', ['0x', '0x']),
0x0B1: Op('LitHI8', ['0x', '0x', '0x', '0x']),
0x0B2: Op('LitNothing'),
0x0B3: Op('LitOI2', ['0x']),
0x0B4: Op('LitOI4', ['0x', '0x']),
0x0B5: Op('LitOI8', ['0x', '0x', '0x', '0x']),
0x0B6: Op('LitR4', ['0x', '0x']),
0x0B7: Op('LitR8', ['0x', '0x', '0x', '0x']),
0x0B8: Op('LitSmallI2'),
0x0B9: Op('LitStr', [], True),
0x0BA: Op('LitVarSpecial'),
0x0BB: Op('Lock'),
0x0BC: Op('Loop'),
0x0BD: Op('LoopUntil'),
0x0BE: Op('LoopWhile'),
0x0BF: Op('LSet'),
0x0C0: Op('Me'),
0x0C1: Op('MeImplicit'),
0x0C2: Op('MemRedim', ['name', '0x', 'type_']),
0x0C3: Op('MemRedimWith', ['name', '0x', 'type_']),
0x0C4: Op('MemRedimAs', ['name', '0x', 'type_']),
0x0C5: Op('MemRedimAsWith', ['name', '0x', 'type_']),
0x0C6: Op('Mid'),
0x0C7: Op('MidB'),
0x0C8: Op('Name'),
0x0C9: Op('New', ['imp_']),
0x0CA: Op('Next'),
0x0CB: Op('NextVar'),
0x0CC: Op('OnError', ['name']),
0x0CD: Op('OnGosub', [], True),
0x0CE: Op('OnGoto', [], True),
0x0CF: Op('Open', ['0x']),
0x0D0: Op('Option'),
0x0D1: Op('OptionBase'),
0x0D2: Op('ParamByVal'),
0x0D3: Op('ParamOmitted'),
0x0D4: Op('ParamNamed', ['name']),
0x0D5: Op('PrintChan'),
0x0D6: Op('PrintComma'),
0x0D7: Op('PrintEoS'),
0x0D8: Op('PrintItemComma'),
0x0D9: Op('PrintItemNL'),
0x0DA: Op('PrintItemSemi'),
0x0DB: Op('PrintNL'),
0x0DC: Op('PrintObj'),
0x0DD: Op('PrintSemi'),
0x0DE: Op('PrintSpc'),
0x0DF: Op('PrintTab'),
0x0E0: Op('PrintTabComma'),
0x0E1: Op('PSet', ['0x']),
0x0E2: Op('PutRec'),
0x0E3: Op('QuoteRem', ['0x'], True),
0x0E4: Op('Redim', ['name', '0x', 'type_']),
0x0E5: Op('RedimAs', ['name', '0x', 'type_']),
0x0E6: Op('Reparse', [], True),
0x0E7: Op('Rem', [], True),
0x0E8: Op('Resume', ['name']),
0x0E9: Op('Return'),
0x0EA: Op('RSet'),
0x0EB: Op('Scale', ['0x']),
0x0EC: Op('Seek'),
0x0ED: Op('SelectCase'),
0x0EE: Op('SelectIs', ['imp_']),
0x0EF: Op('SelectType'),
0x0F0: Op('SetStmt'),
0x0F1: Op('Stack', ['0x', '0x']),
0x0F2: Op('Stop'),
0x0F3: Op('Type', ['rec_']),
0x0F4: Op('Unlock'),
0x0F5: Op('VarDefn', ['var_']),
0x0F6: Op('Wend'),
0x0F7: Op('While'),
0x0F8: Op('With'),
0x0F9: Op('WriteChan'),
0x0FA: Op('ConstFuncExpr'),
0x0FB: Op('LbConst', ['name']),
0x0FC: Op('LbIf'),
0x0FD: Op('LbElse'),
0x0FE: Op('LbElseIf'),
0x0FF: Op('LbEndIf'),
0x100: Op('LbMark'),
0x101: Op('EndForVariable'),
0x102: Op('StartForVariable'),
0x103: Op('NewRedim'),
0x104: Op('StartWithExpr'),
0x105: Op('SetOrSt', ['name']),
0x106: Op('EndEnum'),
0x107: Op('Illegal'),
}
INTERNAL_NAMES: list[str] = [
'<crash>',
'0',
'Abs',
'Access',
'AddressOf',
'Alias',
'And',
'Any',
'Append',
'Array',
'As',
'Assert',
'B',
'Base',
'BF',
'Binary',
'Boolean',
'ByRef',
'Byte',
'ByVal',
'Call',
'Case',
'CBool',
'CByte',
'CCur',
'CDate',
'CDec',
'CDbl',
'CDecl',
'ChDir',
'CInt',
'Circle',
'CLng',
'Close',
'Compare',
'Const',
'CSng',
'CStr',
'CurDir',
'CurDir$',
'CVar',
'CVDate',
'CVErr',
'Currency',
'Database',
'Date',
'Date$',
'Debug',
'Decimal',
'Declare',
'DefBool',
'DefByte',
'DefCur',
'DefDate',
'DefDec',
'DefDbl',
'DefInt',
'DefLng',
'DefObj',
'DefSng',
'DefStr',
'DefVar',
'Dim',
'Dir',
'Dir$',
'Do',
'DoEvents',
'Double',
'Each',
'Else',
'ElseIf',
'Empty',
'End',
'EndIf',
'Enum',
'Eqv',
'Erase',
'Error',
'Error$',
'Event',
'WithEvents',
'Explicit',
'F',
'False',
'Fix',
'For',
'Format',
'Format$',
'FreeFile',
'Friend',
'Function',
'Get',
'Global',
'Go',
'GoSub',
'Goto',
'If',
'Imp',
'Implements',
'In',
'Input',
'Input$',
'InputB',
'InputB',
'InStr',
'InputB$',
'Int',
'InStrB',
'Is',
'Integer',
'Left',
'LBound',
'LenB',
'Len',
'Lib',
'Let',
'Line',
'Like',
'Load',
'Local',
'Lock',
'Long',
'Loop',
'LSet',
'Me',
'Mid',
'Mid$',
'MidB',
'MidB$',
'Mod',
'Module',
'Name',
'New',
'Next',
'Not',
'Nothing',
'Null',
'Object',
'On',
'Open',
'Option',
'Optional',
'Or',
'Output',
'ParamArray',
'Preserve',
'Print',
'Private',
'Property',
'PSet',
'Public',
'Put',
'RaiseEvent',
'Random',
'Randomize',
'Read',
'ReDim',
'Rem',
'Resume',
'Return',
'RGB',
'RSet',
'Scale',
'Seek',
'Select',
'Set',
'Sgn',
'Shared',
'Single',
'Spc',
'Static',
'Step',
'Stop',
'StrComp',
'String',
'String$',
'Sub',
'Tab',
'Text',
'Then',
'To',
'True',
'Type',
'TypeOf',
'UBound',
'Unload',
'Unlock',
'Unknown',
'Until',
'Variant',
'WEnd',
'While',
'Width',
'With',
'Write',
'Xor',
'#Const',
'#Else',
'#ElseIf',
'#End',
'#If',
'Attribute',
'VB_Base',
'VB_Control',
'VB_Creatable',
'VB_Customizable',
'VB_Description',
'VB_Exposed',
'VB_Ext_Key',
'VB_HelpID',
'VB_Invoke_Func',
'VB_Invoke_Property',
'VB_Invoke_PropertyPut',
'VB_Invoke_PropertyPutRef',
'VB_MemberFlags',
'VB_Name',
'VB_PredecraredID',
'VB_ProcData',
'VB_TemplateDerived',
'VB_VarDescription',
'VB_VarHelpID',
'VB_VarMemberFlags',
'VB_VarProcData',
'VB_UserMemID',
'VB_VarUserMemID',
'VB_GlobalNameSpace',
',',
'.',
'"',
'_',
'!',
'#',
'&',
"'",
'(',
')',
'*',
'+',
'-',
' /',
':',
';',
'<',
'<=',
'<>',
'=',
'=<',
'=>',
'>',
'><',
'>=',
'?',
'\\',
'^',
':=',
]
DIM_TYPES: list[str] = [
'', 'Null', 'Integer', 'Long', 'Single', 'Double', 'Currency',
'Date', 'String', 'Object', 'Error', 'Boolean', 'Variant', '',
'Decimal', '', '', 'Byte', '', '', 'LongLong', '', '', '',
'Any',
]
def _get_word(buffer: bytes | bytearray | memoryview, offset: int, endian: str) -> int:
return _STRUCT_WORD[endian].unpack_from(buffer, offset)[0]
def _get_dword(buffer: bytes | bytearray | memoryview, offset: int, endian: str) -> int:
return _STRUCT_DWORD[endian].unpack_from(buffer, offset)[0]
def _skip_structure(
buffer: bytes | bytearray | memoryview,
offset: int,
endian: str,
is_length_dw: bool,
element_size: int,
check_minus_one: bool,
) -> int:
if is_length_dw:
length = _get_dword(buffer, offset, endian)
offset += 4
skip = check_minus_one and (length == 0xFFFFFFFF)
else:
length = _get_word(buffer, offset, endian)
offset += 2
skip = check_minus_one and (length == 0xFFFF)
if not skip:
offset += length * element_size
return offset
def _get_var(
buffer: bytes | bytearray | memoryview,
offset: int,
endian: str,
is_dword: bool,
) -> tuple[int, int]:
if is_dword:
value = _get_dword(buffer, offset, endian)
offset += 4
else:
value = _get_word(buffer, offset, endian)
offset += 2
return offset, value
def _get_type_and_length(
buffer: bytes | bytearray | memoryview,
offset: int,
endian: str,
) -> tuple[int, int]:
if endian == '>':
return buffer[offset], buffer[offset + 1]
else:
return buffer[offset + 1], buffer[offset]
def _translate_opcode(opcode: int, vba_ver: int, is_64bit: bool) -> int:
if vba_ver == 3:
if 0 <= opcode <= 67:
return opcode
elif 68 <= opcode <= 70:
return opcode + 2
elif 71 <= opcode <= 111:
return opcode + 4
elif 112 <= opcode <= 150:
return opcode + 8
elif 151 <= opcode <= 164:
return opcode + 9
elif 165 <= opcode <= 166:
return opcode + 10
elif 167 <= opcode <= 169:
return opcode + 11
elif 170 <= opcode <= 238:
return opcode + 12
else:
return opcode + 24
elif vba_ver == 5:
if 0 <= opcode <= 68:
return opcode
elif 69 <= opcode <= 71:
return opcode + 1
elif 72 <= opcode <= 112:
return opcode + 3
elif 113 <= opcode <= 151:
return opcode + 7
elif 152 <= opcode <= 165:
return opcode + 8
elif 166 <= opcode <= 167:
return opcode + 9
elif 168 <= opcode <= 170:
return opcode + 10
else:
return opcode + 11
elif not is_64bit:
if 0 <= opcode <= 173:
return opcode
elif 174 <= opcode <= 175:
return opcode + 1
elif 176 <= opcode <= 178:
return opcode + 2
else:
return opcode + 3
else:
return opcode
def _get_id(
id_code: int,
identifiers: list[str],
vba_ver: int,
is_64bit: bool,
) -> str:
orig_code = id_code
id_code >>= 1
try:
if id_code >= 0x100:
id_code -= 0x100
if vba_ver >= 7:
id_code -= 4
if is_64bit:
id_code -= 3
return identifiers[id_code]
else:
if vba_ver >= 7:
if id_code == 0xE9:
return 'PtrSafe'
if id_code > 0xE9:
id_code -= 1
if vba_ver >= 6 and id_code >= 0xC3:
id_code -= 1
return INTERNAL_NAMES[id_code]
except (IndexError, KeyError):
return F'id_{orig_code:04X}'
def _get_name(
buffer: bytes | bytearray | memoryview,
identifiers: list[str],
offset: int,
endian: str,
vba_ver: int,
is_64bit: bool,
) -> str:
object_id = _get_word(buffer, offset, endian)
return _get_id(object_id, identifiers, vba_ver, is_64bit)
def _get_type_name(type_id: int) -> str:
type_flags = type_id & 0xE0
type_id &= ~0xE0
type_name = DIM_TYPES[type_id] if type_id < len(DIM_TYPES) else ''
if type_flags & 0x80:
if type_name == 'LongLong':
type_name = 'Long'
type_name += 'Ptr'
return type_name
def _disasm_type(
indirect_table: bytes | bytearray | memoryview,
dword: int,
) -> str:
type_id = indirect_table[dword + 6]
if type_id < len(DIM_TYPES):
return DIM_TYPES[type_id]
else:
return F'type_{dword:08X}'
class DisassemblyContext:
"""
Holds shared state for the disassembly of a single VBA module, eliminating repeated parameter
threading through every helper function.
"""
def __init__(
self,
indirect_table: bytes | bytearray | memoryview,
object_table: bytes | bytearray | memoryview,
declaration_table: bytes | bytearray | memoryview,
identifiers: list[str],
endian: str,
vba_ver: int,
is_64bit: bool,
codec: str,
version: int = 0,
):
self.indirect_table = indirect_table
self.object_table = object_table
self.declaration_table = declaration_table
self.identifiers = identifiers
self.endian = endian
self.vba_ver = vba_ver
self.is_64bit = is_64bit
self.codec = codec
self.version = version
self._linecont_pending = False
self._has_pa_bit = False
def disasm_name(self, word: int, mnemonic: str, op_type: int) -> str:
var_types = [
'', '?', '%', '&', '!', '#', '@', '?', '$', '?', '?', '?', '?', '?',
]
var_name = _get_id(word, self.identifiers, self.vba_ver, self.is_64bit)
if op_type < len(var_types):
str_type = var_types[op_type]
else:
str_type = ''
if op_type == 32:
var_name = F'[{var_name}]'
if mnemonic == 'OnError':
str_type = ''
if op_type == 1:
var_name = '(Resume Next)'
elif op_type == 2:
var_name = '(GoTo 0)'
elif mnemonic == 'Resume':
str_type = ''
if op_type == 1:
var_name = '(Next)'
elif op_type != 0:
var_name = ''
return (var_name + str_type).rstrip()
def disasm_imp(self, arg: str, word: int, mnemonic: str) -> str:
if mnemonic != 'Open':
if arg == 'imp_':
shift = 3 if self.is_64bit else 2
offs = (word >> shift) * 10
if offs + 8 <= len(self.object_table):
hl_name = _get_word(self.object_table, offs + 6, self.endian)
if hl_name != 0:
return _get_id(hl_name, self.identifiers, self.vba_ver, self.is_64bit)
return ''
return F'{arg}{word:04X}'
access_mode = ['Read', 'Write', 'Read Write']
lock_mode = ['Read Write', 'Write', 'Read']
mode = word & 0x00FF
access = (word & 0x0F00) >> 8
lock = (word & 0xF000) >> 12
imp_name = '(For '
if mode & 0x01:
imp_name += 'Input'
elif mode & 0x02:
imp_name += 'Output'
elif mode & 0x04:
imp_name += 'Random'
elif mode & 0x08:
imp_name += 'Append'
elif mode == 0x20:
imp_name += 'Binary'
if access and (access <= len(access_mode)):
imp_name += F' Access {access_mode[access - 1]}'
if lock:
if lock & 0x04:
imp_name += ' Shared'
elif lock <= len(lock_mode):
imp_name += F' Lock {lock_mode[lock - 1]}'
imp_name += ')'
return imp_name
def disasm_rec(self, dword: int) -> str:
object_name = _get_name(
self.indirect_table, self.identifiers, dword + 2,
self.endian, self.vba_ver, self.is_64bit)
options = _get_word(self.indirect_table, dword + 18, self.endian)
if (options & 1) == 0:
object_name = F'(Private) {object_name}'
return object_name
def disasm_object(self, offset: int) -> tuple[str, bool]:
if self.is_64bit:
type_desc = _get_dword(self.indirect_table, offset, self.endian)
if type_desc + 4 > len(self.indirect_table):
return '', False
flags = _get_word(self.indirect_table, type_desc, self.endian)
is_array = bool(flags & 0x0800)
if flags & 0x02:
return _disasm_type(self.indirect_table, type_desc), is_array
word = _get_word(self.indirect_table, type_desc + 2, self.endian)
offs = (word >> 3) * 10
if offs + 8 > len(self.object_table):
return '', False
hl_name = _get_word(self.object_table, offs + 6, self.endian)
if hl_name == 0 or (hl_name >> 1) < 0x100:
return '', is_array
return _get_id(hl_name, self.identifiers, self.vba_ver, self.is_64bit), is_array
type_desc = _get_dword(self.indirect_table, offset, self.endian)
flags = _get_word(self.indirect_table, type_desc, self.endian)
is_array = bool(flags & 0x0800)
if flags & 0x02:
return _disasm_type(self.indirect_table, type_desc), is_array
word = _get_word(self.indirect_table, type_desc + 2, self.endian)
offs = (word >> 2) * 10
if offs + 4 > len(self.object_table):
return '', False
hl_name = _get_word(self.object_table, offs + 6, self.endian)
if hl_name == 0 or (hl_name >> 1) < 0x100:
return '', is_array
return _get_id(hl_name, self.identifiers, self.vba_ver, self.is_64bit), is_array
def disasm_var(self, dword: int) -> str:
b_flag1 = self.indirect_table[dword]
b_flag2 = self.indirect_table[dword + 1]
has_as = (b_flag1 & 0x20) != 0
has_new = (b_flag2 & 0x20) != 0
var_name = _get_name(
self.indirect_table, self.identifiers, dword + 2,
self.endian, self.vba_ver, self.is_64bit)
is_array = False
if has_new or has_as:
type_name = ''
if has_as:
offs = 16 if self.is_64bit else 12
word = _get_word(self.indirect_table, dword + offs + 2, self.endian)
if word == 0xFFFF:
type_id = self.indirect_table[dword + offs]
type_name = _get_type_name(type_id)
else:
type_name, is_array = self.disasm_object(dword + offs)
var_type = ''
if has_as and len(type_name) > 0:
var_type += 'As '
if has_new and (not has_as or len(type_name) > 0):
var_type += 'New '
if has_as and len(type_name) > 0:
var_type += type_name
if is_array:
var_name += '()'
if len(var_type) > 0:
var_name += F' ({var_type.rstrip()})'
else:
offs = 16 if self.is_64bit else 12
if len(self.indirect_table) >= dword + offs + 4:
word = _get_word(self.indirect_table, dword + offs + 2, self.endian)
if word == 0xFFFF:
_TYPE_SUFFIXES = {2: '%', 3: '&', 4: '!', 5: '#', 6: '@', 8: '$'}
type_id = self.indirect_table[dword + offs]
suffix = _TYPE_SUFFIXES.get(type_id)
if suffix is not None:
var_name += suffix
else:
try:
_, is_array = self.disasm_object(dword + offs)
except Exception:
is_array = False
if is_array:
var_name += '()'
return var_name
def disasm_arg(self, arg_offset: int) -> str | None:
flags = _get_word(self.indirect_table, arg_offset, self.endian)
offs = 4 if self.is_64bit else 0
name_word = _get_word(self.indirect_table, arg_offset + 2, self.endian)
if name_word == 0xFFFE:
return None
arg_name = _get_name(
self.indirect_table, self.identifiers, arg_offset + 2,
self.endian, self.vba_ver, self.is_64bit)
arg_type = _get_dword(self.indirect_table, arg_offset + offs + 12, self.endian)
arg_opts = _get_word(self.indirect_table, arg_offset + offs + 24, self.endian)
is_paramarray = bool(arg_opts & 0x0001)
if is_paramarray:
self._has_pa_bit = True
if arg_opts & 0x0004:
arg_name = F'ByVal {arg_name}'
if arg_opts & 0x0002:
arg_name = F'ByRef {arg_name}'
if arg_opts & 0x0200:
arg_name = F'Optional {arg_name}'
if flags & 0x0020:
arg_type_name = ''
is_array = False
if (arg_type & 0xFFFF0000) == 0xFFFF0000:
arg_type_id = arg_type & 0x000000FF
arg_type_name = _get_type_name(arg_type_id)
else:
arg_type_name, is_array = self.disasm_object(arg_offset + offs + 12)
if is_array:
arg_name += '()'
if arg_type_name:
arg_name += ' As '
arg_name += arg_type_name
elif (arg_type & 0xFFFF0000) == 0xFFFF0000:
_TYPE_SUFFIXES = {2: '%', 3: '&', 4: '!', 5: '#', 6: '@', 8: '$'}
arg_type_id = arg_type & 0x000000FF
suffix = _TYPE_SUFFIXES.get(arg_type_id)
if suffix is not None:
arg_name += suffix
else:
try:
_type_name, is_array = self.disasm_object(arg_offset + offs + 12) # noqa: F841
except Exception:
is_array = False
if is_array:
arg_name += '()'
if arg_opts & 0x0200:
default_tag_off = arg_offset + offs + 28
default_val_off = arg_offset + offs + 32
ind = self.indirect_table
if default_tag_off + 2 <= len(ind) and default_val_off + 4 <= len(ind):
vt_tag = _get_word(ind, default_tag_off, self.endian)
value_dw = _get_dword(ind, default_val_off, self.endian)
default_str = self._format_default_value(vt_tag, value_dw)
if default_str is not None:
arg_name += F' = {default_str}'
if is_paramarray:
arg_name = F'ParamArray {arg_name}'
return arg_name
def _format_default_value(self, vt_tag: int, value_dw: int) -> str | None:
VT_I2 = 2
VT_I4 = 3
VT_R4 = 4
VT_R8 = 5
VT_CY = 6
VT_BSTR = 8
VT_BOOL = 11
VT_UI1 = 17
ind = self.indirect_table
if vt_tag == 0:
return None
elif vt_tag == VT_I2:
val = value_dw & 0xFFFF
return str(val - 0x10000 if val > 0x7FFF else val)
elif vt_tag == VT_I4:
return str(value_dw - 0x100000000 if value_dw > 0x7FFFFFFF else value_dw)
elif vt_tag == VT_R4:
val = _struct.unpack('<f', _struct.pack('<I', value_dw))[0]
return str(int(val)) if val == int(val) and abs(val) < 1e15 else str(val)
elif vt_tag == VT_R8:
if value_dw + 8 <= len(ind):
val = _struct.unpack('<d', bytes(ind[value_dw:value_dw + 8]))[0]
return str(int(val)) if val == int(val) and abs(val) < 1e15 else str(val)
elif vt_tag == VT_CY:
val = value_dw / 10000
return str(int(val)) if val == int(val) else str(val)
elif vt_tag == VT_BSTR:
if value_dw + 4 <= len(ind):
str_len = _get_dword(ind, value_dw, self.endian)
if str_len == 0:
return '""'
if 0 < str_len < 0x10000 and value_dw + 4 + str_len <= len(ind):
s = bytes(ind[value_dw + 4:value_dw + 4 + str_len]).decode(self.codec, errors='replace')
return F'"{s}"'
elif vt_tag == VT_BOOL:
return 'True' if (value_dw & 0xFFFF) != 0 else 'False'
elif vt_tag == VT_UI1:
return str(value_dw & 0xFF)
return None
def _declare64(self, decl_offset: int, func_name: str) -> tuple[str, str | None]:
"""
Extract Lib and Alias names from a 64-bit Declare entry in the declaration table.
The 64-bit entry structure differs significantly from 32-bit: the lib name identifier
word is not at a fixed offset within the entry header. Instead, we extract the lib name
from VBA source text stored later in the declaration table, falling back to the binary
structure when source text is not available.
"""
decl = self.declaration_table
decl_bytes = bytes(decl)
lib_name = None
alias_name = None
# Strategy 1: Extract from VBA source text in the declaration table.
# The source text may contain embedded null bytes, so strip them before matching.
text = decl_bytes.replace(b'\x00', b'').decode('ascii', errors='replace')
match = re.search(
rf'(?:Function|Sub)\s+{re.escape(func_name)}\b.*?Lib\s+"([^"]+)"', text)
if match:
lib_name = match.group(1)
after_lib = text[match.end():]
alias_match = re.match(r'\s*Alias\s*"([^"]+)"', after_lib)
if alias_match:
alias_name = alias_match.group(1)
# Strategy 2: Binary structure fallback. The alias string offset depends on version:
# VBA7 version 0x0097 has 4 extra bytes of padding (alias at +0x20), later versions
# use the standard offset (+0x1C).
_alias_off = 0x20 if self.version <= 0x97 else 0x1C
if lib_name is None and self.version > 0x97 and decl_offset >= 2:
# For VBA7 versions after 0x97 the lib identifier word for each entry is stored
# in the 2 bytes immediately preceding the entry header, placed there as trailing
# data of the previous entry. This does not apply to the very first entry
# (decl_offset == 0) or to versions <= 0x97 where the lib word sits at header +2.
lib_word = _get_word(decl, decl_offset - 2, self.endian)
if lib_word != 0 and lib_word != 0xFFFF:
lib_name = _get_id(lib_word, self.identifiers, self.vba_ver, self.is_64bit)
if lib_name is None:
alias_start = decl_offset + _alias_off
if alias_start < len(decl):
alias_bytes_raw = bytes(decl[alias_start:])
null_pos = alias_bytes_raw.find(0)
if null_pos > 0 and all(32 <= b < 127 for b in alias_bytes_raw[:null_pos]):
abs_null = alias_start + null_pos
dword_aligned = (abs_null + 1 + 3) & ~3
lib_word_offset = dword_aligned + 2
if lib_word_offset + 2 <= len(decl):
lib_word = _get_word(decl, lib_word_offset, self.endian)
if lib_word != 0 and lib_word != 0xFFFF:
lib_name = _get_id(lib_word, self.identifiers, self.vba_ver, self.is_64bit)
if lib_name is None:
lib_word = _get_word(decl, decl_offset + 2, self.endian)
if lib_word != 0:
lib_name = _get_id(lib_word, self.identifiers, self.vba_ver, self.is_64bit)
# Read alias from binary structure if not found via source text.
if alias_name is None:
alias_start = decl_offset + _alias_off
if alias_start < len(decl):
alias_bytes_raw = bytes(decl[alias_start:])
null_pos = alias_bytes_raw.find(0)
if null_pos > 0:
alias_name = alias_bytes_raw[:null_pos].decode(self.codec, errors='replace')
return lib_name, alias_name
def disasm_func(self, dword: int, op_type: int) -> str:
func_decl = '('
flags = _get_word(self.indirect_table, dword, self.endian)
name_word = _get_word(self.indirect_table, dword + 2, self.endian)
offs2 = 4 if self.vba_ver > 5 else 0
if self.is_64bit:
offs2 += 16
if (
self._linecont_pending
and offs2 >= 4
and self.indirect_table[dword + 4:dword + 8] == b'\xFF\xFF\xFF\xFF'
and (name_word >> 1) >= 0x100
):
name_word += 2
self._linecont_pending = False
sub_name = _get_id(name_word, self.identifiers, self.vba_ver, self.is_64bit)
arg_offset = _get_dword(self.indirect_table, dword + offs2 + 36, self.endian)
ret_type = _get_dword(self.indirect_table, dword + offs2 + 40, self.endian)
decl_offset = _get_word(self.indirect_table, dword + offs2 + 44, self.endian)
c_options_offset = 60 if self.is_64bit and self.version > 0x97 else 54
c_options = self.indirect_table[dword + offs2 + c_options_offset]
new_flags_offset = 63 if self.is_64bit and self.version > 0x97 else 57
new_flags = self.indirect_table[dword + offs2 + new_flags_offset]
has_declare = False
if self.vba_ver > 5:
if (new_flags & 0x0002) == 0:
func_decl += 'Private '
elif op_type & 0x04:
func_decl += 'Public '
if new_flags & 0x0004:
func_decl += 'Friend '
else:
if (flags & 0x0008) == 0:
func_decl += 'Private '
elif op_type & 0x04:
func_decl += 'Public '
if flags & 0x0080:
func_decl += 'Static '
if (
(c_options & 0x90) == 0
and (decl_offset != 0xFFFF)
):
has_declare = True
func_decl += 'Declare '
if self.vba_ver > 5:
if new_flags & 0x20:
func_decl += 'PtrSafe '
has_as = (flags & 0x0020) != 0
if flags & 0x1000:
if op_type in (2, 6):
func_decl += 'Function '
else:
func_decl += 'Sub '
elif flags & 0x2000:
func_decl += 'Property Get '
elif flags & 0x4000:
func_decl += 'Property Let '
elif flags & 0x8000:
func_decl += 'Property Set '
func_decl += sub_name
if not has_as and (ret_type & 0xFFFF0000) == 0xFFFF0000:
_TYPE_SUFFIXES = {2: '%', 3: '&', 4: '!', 5: '#', 6: '@', 8: '$'}
ret_type_id = ret_type & 0x000000FF
suffix = _TYPE_SUFFIXES.get(ret_type_id)
if suffix is not None:
func_decl += suffix
if has_declare:
if self.is_64bit:
lib_name, alias_name = self._declare64(decl_offset, sub_name)
else:
lib_name = _get_name(
self.declaration_table, self.identifiers, decl_offset + 2,
self.endian, self.vba_ver, self.is_64bit)
alias_name = None
alias_offset = _get_word(self.declaration_table, decl_offset + 4, self.endian)
if alias_offset < len(self.declaration_table):
alias_bytes = bytes(self.declaration_table[alias_offset:])
null_pos = alias_bytes.find(0)
if null_pos > 0:
alias_name = alias_bytes[:null_pos].decode(self.codec, errors='replace')
func_decl += F' Lib "{lib_name}"'
if alias_name and alias_name != sub_name:
func_decl += F' Alias "{alias_name}"'
func_decl += ' '
arg_list: list[str] = []
while (
arg_offset != 0xFFFFFFFF
and arg_offset != 0
and arg_offset + 26 < len(self.indirect_table)
):
arg_name = self.disasm_arg(arg_offset)
if arg_name is not None:
arg_list.append(arg_name)
arg_offset = _get_dword(self.indirect_table, arg_offset + (24 if self.is_64bit else 20), self.endian)
if arg_list and not self._has_pa_bit and not any(a.startswith('ParamArray ') for a in arg_list):
last = arg_list[-1]
_pa_candidate = (
last.endswith('() As Variant')
or (last.endswith('()') and ' As ' not in last)
)
_pa_no_modifiers = not any(
last.startswith(p) for p in ('ByVal ', 'ByRef ', 'Optional '))
if _pa_candidate and _pa_no_modifiers:
arg_list[-1] = F'ParamArray {last}'
func_decl += F'({", ".join(arg_list)})'
if has_as:
func_decl += ' As '
type_name = ''
is_array = False
if (ret_type & 0xFFFF0000) == 0xFFFF0000:
type_id = ret_type & 0x000000FF
type_name = _get_type_name(type_id)
else:
type_name, is_array = self.disasm_object(dword + offs2 + 40)
func_decl += type_name
if is_array:
func_decl += '()'
func_decl += ')'
return func_decl
def disasm_var_arg(
self,
module_data: bytes | bytearray | memoryview,
offset: int,
w_length: int,
mnemonic: str,
) -> list[str]:
substring = module_data[offset:offset + w_length]
length_str = F'0x{w_length:04X}'
if mnemonic in ('LitStr', 'QuoteRem', 'Rem', 'Reparse'):
quoted = F'"{codecs.decode(substring, self.codec, "replace")}"'
return [length_str, quoted]
elif mnemonic in ('OnGosub', 'OnGoto'):
offset1 = offset
names: list[str] = []
for _ in range(w_length // 2):
offset1, word = _get_var(module_data, offset1, self.endian, False)
names.append(_get_id(word, self.identifiers, self.vba_ver, self.is_64bit))
return [length_str, ', '.join(names)]
else:
hex_dump = ' '.join(F'{c:02X}' for c in substring)
return [length_str, hex_dump]
def dump_line(
self,
module_data: bytes | bytearray | memoryview,
line_start: int,
line_length: int,
) -> list[tuple[str, list[str]]]:
"""
Disassemble one p-code line into a list of (mnemonic, [arg, ...]) tuples.
"""
self._linecont_pending = False
result: list[tuple[str, list[str]]] = []
if line_length <= 0:
return result
offset = line_start
end_of_line = line_start + line_length
while offset < end_of_line:
offset, opcode = _get_var(module_data, offset, self.endian, False)
op_type = (opcode & ~0x03FF) >> 10
opcode &= 0x03FF
translated = _translate_opcode(opcode, self.vba_ver, self.is_64bit)
if translated not in OPCODES:
return result
instruction = OPCODES[translated]
mnemonic = instruction.mnem
parts: list[str] = []
if mnemonic in ('Coerce', 'CoerceVar', 'DefType'):
if op_type < len(_VAR_TYPES_LONG):
parts.append(F'({_VAR_TYPES_LONG[op_type]})')
elif op_type == 17:
parts.append('(Byte)')
else:
parts.append(F'({op_type:d})')
elif mnemonic in ('Dim', 'DimImplicit', 'Type'):
dim_type: list[str] = []
if op_type & 0x04:
dim_type.append('Global')
elif op_type & 0x08:
dim_type.append('Public')
elif op_type & 0x10:
dim_type.append('Private')
elif op_type & 0x20:
dim_type.append('Static')
if (op_type & 0x01) and (mnemonic != 'Type'):
dim_type.append('Const')
if dim_type:
parts.append(F'({" ".join(dim_type)})')
elif mnemonic == 'LitVarSpecial':
parts.append(F'({_SPECIALS[op_type]})')
elif mnemonic in ('ArgsCall', 'ArgsMemCall', 'ArgsMemCallWith'):
if op_type < 16:
parts.append('(Call)')
else:
op_type -= 16
elif mnemonic == 'Option':
parts.append(F'({_OPTIONS[op_type]})')
elif mnemonic in ('Redim', 'RedimAs'):
if op_type & 16:
parts.append('(Preserve)')
elif mnemonic in (
'FnDir', 'FnFormat', 'FnStringVar', 'FnStringStr',
):
parts.append(F'0x{op_type:04X}')
elif mnemonic == 'LitSmallI2':
parts.append(str(op_type))
for arg in instruction.args:
if arg == 'name':
offset, word = _get_var(module_data, offset, self.endian, False)
the_name = self.disasm_name(word, mnemonic, op_type)
parts.append(the_name)
elif arg in ('0x', 'imp_'):
offset, word = _get_var(module_data, offset, self.endian, False)
the_imp = self.disasm_imp(arg, word, mnemonic)
parts.append(the_imp)
elif arg in ('func_', 'var_', 'rec_', 'type_', 'context_'):
offset, dword = _get_var(module_data, offset, self.endian, True)
if (
arg == 'rec_'
and len(self.indirect_table) >= dword + 20
):
parts.append(self.disasm_rec(dword))
elif (
arg == 'type_'
and len(self.indirect_table) >= dword + 7
):
the_type = _disasm_type(self.indirect_table, dword)
parts.append(F'(As {the_type})')
elif (
arg == 'var_'
and len(self.indirect_table) >= dword + 16
):
if op_type & 0x20:
parts.append('(WithEvents)')
parts.append(self.disasm_var(dword))
if op_type & 0x10:
word = _get_word(module_data, offset, self.endian)
offset += 2
parts.append(F'0x{word:04X}')
elif (
arg == 'func_'
and len(self.indirect_table) >= dword + 61
):
parts.append(self.disasm_func(dword, op_type))
else:
parts.append(F'{arg}{dword:08X}')
if self.is_64bit and (arg == 'context_'):
offset, dword = _get_var(module_data, offset, self.endian, True)
parts.append(F'{dword:08X}')
if instruction.varg:
offset, w_length = _get_var(module_data, offset, self.endian, False)
var_arg_parts = self.disasm_var_arg(
module_data, offset, w_length, mnemonic)
parts.extend(var_arg_parts)
offset += w_length
if w_length & 1:
offset += 1
result.append((mnemonic, parts))
if mnemonic == 'LineCont':
self._linecont_pending = True
return result
# MS-OVBA specification offsets for module stream parsing
_OFFSET_DW_LENGTH = 0x0005
_OFFSET_VBA6_INDIRECT_START = 0x0011
_OFFSET_VBA6_32_DECL_LENGTH = 0x003F
_OFFSET_VBA6_32_DECL_DATA = 0x0043
_OFFSET_VBA6_64_DECL_LENGTH = 0x0043
_OFFSET_VBA6_64_DECL_DATA = 0x0047
_OFFSET_VBA6_64_LINE_START = 0x0019
_OFFSET_OBJECT_TABLE = 0x008A
_OFFSET_PCODE_LINES = 0x003C
_PCODE_MAGIC = 0xCAFE
def _pcode_dump(
module_data: bytes | bytearray | memoryview,
vba_project_data: bytes | bytearray | memoryview,
identifiers: list[str],
is_64bit: bool,
codec: str,
) -> list[PCodeLine]:
"""
Disassemble p-code from a VBA module stream. Returns structured PCodeLine objects.
"""
lines: list[PCodeLine] = []
if _get_word(module_data, 2, '<') > 0xFF:
endian = '>'
else:
endian = '<'
vba_ver = 3
try:
version = _get_word(vba_project_data, 2, endian)
if version >= 0x6B:
if version >= 0x97:
vba_ver = 7
else:
vba_ver = 6
if is_64bit:
dw_length = _get_dword(module_data, _OFFSET_VBA6_64_DECL_LENGTH, endian)
declaration_table = module_data[
_OFFSET_VBA6_64_DECL_DATA:_OFFSET_VBA6_64_DECL_DATA + dw_length]
dw_length = _get_dword(module_data, _OFFSET_VBA6_INDIRECT_START, endian)
table_start = dw_length + 12
else:
dw_length = _get_dword(module_data, _OFFSET_VBA6_32_DECL_LENGTH, endian)
declaration_table = module_data[
_OFFSET_VBA6_32_DECL_DATA:_OFFSET_VBA6_32_DECL_DATA + dw_length]
dw_length = _get_dword(module_data, _OFFSET_VBA6_INDIRECT_START, endian)
table_start = dw_length + 10
dw_length = _get_dword(module_data, table_start, endian)
table_start += 4
indirect_table = module_data[
table_start:table_start + dw_length]
dw_length = _get_dword(module_data, _OFFSET_DW_LENGTH, endian)
dw_length2 = dw_length + _OFFSET_OBJECT_TABLE
dw_length = _get_dword(module_data, dw_length2, endian)
dw_length2 += 4
object_table = module_data[
dw_length2:dw_length2 + dw_length]
offset = _OFFSET_VBA6_64_LINE_START
else:
vba_ver = 5
offset = 11
dw_length = _get_dword(module_data, offset, endian)
offs = offset + 4
declaration_table = module_data[offs:offs + dw_length]
offset = _skip_structure(module_data, offset, endian, True, 1, False)
offset += 64
offset = _skip_structure(module_data, offset, endian, False, 16, False)
offset = _skip_structure(module_data, offset, endian, True, 1, False)
offset += 6
offset = _skip_structure(module_data, offset, endian, True, 1, False)
offs = offset + 8
dw_length = _get_dword(module_data, offs, endian)
table_start = dw_length + 14
offs = dw_length + 10
dw_length = _get_dword(module_data, offs, endian)
indirect_table = module_data[
table_start:table_start + dw_length]
dw_length = _get_dword(module_data, offset, endian)
offs = dw_length + _OFFSET_OBJECT_TABLE
dw_length = _get_dword(module_data, offs, endian)
offs += 4
object_table = module_data[offs:offs + dw_length]
offset += 77
ctx = DisassemblyContext(
indirect_table, object_table, declaration_table,
identifiers, endian, vba_ver, is_64bit, codec, version)
dw_length = _get_dword(module_data, offset, endian)
offset = dw_length + _OFFSET_PCODE_LINES
offset, magic = _get_var(module_data, offset, endian, False)
if magic != _PCODE_MAGIC:
return lines
offset += 2
offset, num_lines = _get_var(module_data, offset, endian, False)
pcode_start = offset + num_lines * 12 + 10
for _ in range(num_lines):
offset += 4
offset, line_length = _get_var(module_data, offset, endian, False)
offset += 2
offset, line_offset = _get_var(module_data, offset, endian, True)
opcodes = ctx.dump_line(module_data, pcode_start + line_offset, line_length)
lines.append(PCodeLine(opcodes))
except Exception as exc:
logger.warning(F'p-code disassembly error: {exc}')
return lines
def _get_identifiers(
vba_project_data: bytes | bytearray | memoryview,
codec: str,
) -> list[str]:
"""
Extract identifier names from the _VBA_PROJECT stream.
"""
identifiers: list[str] = []
try:
magic = _get_word(vba_project_data, 0, '<')
if magic != 0x61CC:
return identifiers
version = _get_word(vba_project_data, 2, '<')
unicode_ref = ((version >= 0x5B)
and (version not in (0x60, 0x62, 0x63))
or (version == 0x4E)
)
unicode_name = ((version >= 0x59)
and (version not in (0x60, 0x62, 0x63))
or (version == 0x4E)
)
non_unicode_name = (((version <= 0x59) and (version != 0x4E))
or (0x5F < version < 0x6B)
)
word = _get_word(vba_project_data, 5, '<')
endian = '>' if word == 0x000E else '<'
offset = 0x1E
offset, num_refs = _get_var(vba_project_data, offset, endian, False)
offset += 2
for _ in range(num_refs):
offset, ref_length = _get_var(vba_project_data, offset, endian, False)
if ref_length == 0:
offset += 6
elif ref_length < 3 + 2 * unicode_ref:
offset += ref_length
else:
if unicode_ref:
c = vba_project_data[offset + 4]
else:
c = vba_project_data[offset + 2]
offset += ref_length
if chr(c) in ('C', 'D'):
offset = _skip_structure(vba_project_data, offset, endian, False, 1, False)
offset += 10
offset, word = _get_var(vba_project_data, offset, endian, False)
if word:
offset = _skip_structure(vba_project_data, offset, endian, False, 1, False)
offset, w_length = _get_var(vba_project_data, offset, endian, False)
if w_length:
offset += 2
offset += w_length + 30
offset = _skip_structure(vba_project_data, offset, endian, False, 2, False)
offset = _skip_structure(vba_project_data, offset, endian, False, 4, False)
offset += 2
offset = _skip_structure(vba_project_data, offset, endian, False, 1, True)
offset = _skip_structure(vba_project_data, offset, endian, False, 1, True)
offset = _skip_structure(vba_project_data, offset, endian, False, 1, True)
offset += 0x64
offset, num_projects = _get_var(vba_project_data, offset, endian, False)
for _ in range(num_projects):
offset, w_length = _get_var(vba_project_data, offset, endian, False)
if unicode_name:
offset += w_length
if non_unicode_name:
if w_length:
offset, w_length = _get_var(vba_project_data, offset, endian, False)
offset += w_length
offset = _skip_structure(vba_project_data, offset, endian, False, 1, False)
offset = _skip_structure(vba_project_data, offset, endian, False, 1, True)
offset, _ = _get_var(vba_project_data, offset, endian, False)
if version >= 0x6B:
offset = _skip_structure(vba_project_data, offset, endian, False, 1, True)
offset = _skip_structure(vba_project_data, offset, endian, False, 1, True)
offset += 2
if version != 0x51:
offset += 4
offset = _skip_structure(vba_project_data, offset, endian, False, 8, False)
offset += 11
offset += 6
offset = _skip_structure(vba_project_data, offset, endian, True, 1, False)
offset += 6
offset, w0 = _get_var(vba_project_data, offset, endian, False)
offset, num_ids = _get_var(vba_project_data, offset, endian, False)
offset, w1 = _get_var(vba_project_data, offset, endian, False)
offset += 4
num_junk_ids = num_ids + w1 - w0
num_ids = w0 - w1
for _ in range(num_junk_ids):
offset += 4
id_type, id_length = _get_type_and_length(vba_project_data, offset, endian)
offset += 2
if id_type > 0x7F:
offset += 6
offset += id_length
for _ in range(num_ids):
is_kwd = False
ident = ''
id_type, id_length = _get_type_and_length(vba_project_data, offset, endian)
offset += 2
if (id_length == 0) and (id_type == 0):
offset += 2
id_type, id_length = _get_type_and_length(vba_project_data, offset, endian)
offset += 2
is_kwd = True
if id_type & 0x80:
offset += 6
if id_length:
ident = codecs.decode(
vba_project_data[offset:offset + id_length], codec, 'replace')
identifiers.append(ident)
offset += id_length
if not is_kwd:
offset += 4
except Exception as exc:
logger.warning(F'identifier extraction error: {exc}')
return identifiers
def format_pcode_text(
module_path: str,
module_data_size: int,
lines: list[PCodeLine],
) -> str:
"""
Render structured PCodeLine data into pcodedmp-compatible text output.
"""
output: list[str] = []
output.append(F'{module_path} - {module_data_size:d} bytes')
for line_num, pcode_line in enumerate(lines):
output.append(F'Line #{line_num:d}:')
for mnemonic, args in pcode_line.opcodes:
text = F'\t{mnemonic} {" ".join(args)}'
output.append(text)
return '\n'.join(output) + '\n'
class PCodeDisassembler:
"""
VBA p-code disassembler that produces structured PCodeModule output. The output is suitable for
consumption by the decompiler for reconstruction to VBA source code.
"""
def __init__(self, data: bytes | bytearray | memoryview):
self._data = data
def iter_modules(self):
"""
Yield PCodeModule objects for each VBA module.
"""
for ole_data in self._get_ole_streams():
ole = OleFile(ole_data)
yield from self._iter_project_modules(ole)
def _iter_project_modules(
self,
ole: OleFile,
):
"""
Iterate over VBA modules in an OLE file, yielding PCodeModule per module.
"""
vba_projects = _find_vba_projects(ole)
if not vba_projects:
return
for vba_root, _, dir_path in vba_projects:
codec, code_modules, is_64bit = self._process_dir(ole, dir_path)
vba_project_path = vba_root + 'VBA/_VBA_PROJECT'
vba_project_data = self._process_vba_project(ole, vba_project_path)
identifiers = _get_identifiers(vba_project_data, codec)
identifiers_stripped = not identifiers
for module in code_modules:
module_path = F'{vba_root}VBA/{module}'
try:
module_data = ole.openstream(module_path).read()
except Exception:
continue
lines = _pcode_dump(
module_data, vba_project_data, identifiers, is_64bit, codec)
yield PCodeModule(module_path, lines, identifiers_stripped)
def _get_ole_streams(self) -> list[bytes | bytearray | memoryview]:
"""
Extract OLE data from the input. If the input is already an OLE compound file, returns it
directly. If it's a ZIP (OOXML), extracts all vbaProject.bin entries.
"""
if self._data[:8] == b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1':
return [self._data]
if self._data[:2] == b'PK':
import zipfile
from refinery.lib.structures import MemoryFile
results: list[bytes | bytearray] = []
try:
with zipfile.ZipFile(MemoryFile(self._data, bytes)) as zf:
for name in zf.namelist():
if name.lower().endswith('vbaproject.bin'):
results.append(zf.read(name))
except zipfile.BadZipFile:
pass
return results
return [self._data]
def _process_dir(
self,
ole: OleFile,
dir_path: str,
) -> tuple[str, list[str], bool]:
"""
Parse the VBA dir stream to find module names and codepage. Returns (codec, code_modules,
is_64bit).
"""
dir_data_compressed = ole.openstream(dir_path).read()
dir_data = decompress_stream(dir_data_compressed)
stream_size = len(dir_data)
code_modules: list[str] = []
is_64bit = False
codec = 'latin1'
offset = 0
while offset < stream_size:
try:
tag = _get_word(dir_data, offset, '<')
w_length = _get_word(dir_data, offset + 2, '<')
if tag == 9:
w_length = 6
elif tag == 3:
w_length = 2
offset += 6
if w_length:
if tag == 3:
codepage = _get_word(dir_data, offset, '<')
codec = _codepage_to_codec(codepage)
elif tag == 50:
stream_name = codecs.decode(
dir_data[offset:offset + w_length], 'utf_16_le', errors='replace')
code_modules.append(stream_name)
elif tag == 1:
sys_kind = _get_dword(dir_data, offset, '<')
is_64bit = sys_kind == 3
offset += w_length
except Exception:
break
return codec, code_modules, is_64bit
def _process_vba_project(
self,
ole: OleFile,
vba_project_path: str,
) -> bytes | bytearray | memoryview:
"""
Read the _VBA_PROJECT stream (raw, not compressed).
"""
return ole.openstream(vba_project_path).read()
Functions
def format_pcode_text(module_path, module_data_size, lines)-
Render structured PCodeLine data into pcodedmp-compatible text output.
Expand source code Browse git
def format_pcode_text( module_path: str, module_data_size: int, lines: list[PCodeLine], ) -> str: """ Render structured PCodeLine data into pcodedmp-compatible text output. """ output: list[str] = [] output.append(F'{module_path} - {module_data_size:d} bytes') for line_num, pcode_line in enumerate(lines): output.append(F'Line #{line_num:d}:') for mnemonic, args in pcode_line.opcodes: text = F'\t{mnemonic} {" ".join(args)}' output.append(text) return '\n'.join(output) + '\n'
Classes
class Opcode (mnem, args=[], varg=False)-
Opcode(mnem, args, varg)
Expand source code Browse git
class Opcode(NamedTuple): mnem: str args: list[str] = [] varg: bool = FalseAncestors
- builtins.tuple
Instance variables
var mnem-
Alias for field number 0
Expand source code Browse git
class Opcode(NamedTuple): mnem: str args: list[str] = [] varg: bool = False var args-
Alias for field number 1
Expand source code Browse git
class Opcode(NamedTuple): mnem: str args: list[str] = [] varg: bool = False var varg-
Alias for field number 2
Expand source code Browse git
class Opcode(NamedTuple): mnem: str args: list[str] = [] varg: bool = False
class PCodeLine (opcodes)-
Structured representation of one line of disassembled p-code. Each line contains a list of (mnemonic, [arg1, arg2, …]) tuples.
Expand source code Browse git
class PCodeLine(NamedTuple): """ Structured representation of one line of disassembled p-code. Each line contains a list of (mnemonic, [arg1, arg2, ...]) tuples. """ opcodes: list[tuple[str, list[str]]]Ancestors
- builtins.tuple
Instance variables
var opcodes-
Alias for field number 0
Expand source code Browse git
class PCodeLine(NamedTuple): """ Structured representation of one line of disassembled p-code. Each line contains a list of (mnemonic, [arg1, arg2, ...]) tuples. """ opcodes: list[tuple[str, list[str]]]
class PCodeModule (path, lines, identifiers_stripped=False)-
Structured representation of a disassembled VBA module.
Expand source code Browse git
class PCodeModule(NamedTuple): """ Structured representation of a disassembled VBA module. """ path: str lines: list[PCodeLine] identifiers_stripped: bool = FalseAncestors
- builtins.tuple
Instance variables
var path-
Alias for field number 0
Expand source code Browse git
class PCodeModule(NamedTuple): """ Structured representation of a disassembled VBA module. """ path: str lines: list[PCodeLine] identifiers_stripped: bool = False var lines-
Alias for field number 1
Expand source code Browse git
class PCodeModule(NamedTuple): """ Structured representation of a disassembled VBA module. """ path: str lines: list[PCodeLine] identifiers_stripped: bool = False var identifiers_stripped-
Alias for field number 2
Expand source code Browse git
class PCodeModule(NamedTuple): """ Structured representation of a disassembled VBA module. """ path: str lines: list[PCodeLine] identifiers_stripped: bool = False
class Op (mnem, args=[], varg=False)-
Opcode(mnem, args, varg)
Expand source code Browse git
class Opcode(NamedTuple): mnem: str args: list[str] = [] varg: bool = FalseAncestors
- builtins.tuple
Instance variables
var mnem-
Alias for field number 0
Expand source code Browse git
class Opcode(NamedTuple): mnem: str args: list[str] = [] varg: bool = False var args-
Alias for field number 1
Expand source code Browse git
class Opcode(NamedTuple): mnem: str args: list[str] = [] varg: bool = False var varg-
Alias for field number 2
Expand source code Browse git
class Opcode(NamedTuple): mnem: str args: list[str] = [] varg: bool = False
class DisassemblyContext (indirect_table, object_table, declaration_table, identifiers, endian, vba_ver, is_64bit, codec, version=0)-
Holds shared state for the disassembly of a single VBA module, eliminating repeated parameter threading through every helper function.
Expand source code Browse git
class DisassemblyContext: """ Holds shared state for the disassembly of a single VBA module, eliminating repeated parameter threading through every helper function. """ def __init__( self, indirect_table: bytes | bytearray | memoryview, object_table: bytes | bytearray | memoryview, declaration_table: bytes | bytearray | memoryview, identifiers: list[str], endian: str, vba_ver: int, is_64bit: bool, codec: str, version: int = 0, ): self.indirect_table = indirect_table self.object_table = object_table self.declaration_table = declaration_table self.identifiers = identifiers self.endian = endian self.vba_ver = vba_ver self.is_64bit = is_64bit self.codec = codec self.version = version self._linecont_pending = False self._has_pa_bit = False def disasm_name(self, word: int, mnemonic: str, op_type: int) -> str: var_types = [ '', '?', '%', '&', '!', '#', '@', '?', '$', '?', '?', '?', '?', '?', ] var_name = _get_id(word, self.identifiers, self.vba_ver, self.is_64bit) if op_type < len(var_types): str_type = var_types[op_type] else: str_type = '' if op_type == 32: var_name = F'[{var_name}]' if mnemonic == 'OnError': str_type = '' if op_type == 1: var_name = '(Resume Next)' elif op_type == 2: var_name = '(GoTo 0)' elif mnemonic == 'Resume': str_type = '' if op_type == 1: var_name = '(Next)' elif op_type != 0: var_name = '' return (var_name + str_type).rstrip() def disasm_imp(self, arg: str, word: int, mnemonic: str) -> str: if mnemonic != 'Open': if arg == 'imp_': shift = 3 if self.is_64bit else 2 offs = (word >> shift) * 10 if offs + 8 <= len(self.object_table): hl_name = _get_word(self.object_table, offs + 6, self.endian) if hl_name != 0: return _get_id(hl_name, self.identifiers, self.vba_ver, self.is_64bit) return '' return F'{arg}{word:04X}' access_mode = ['Read', 'Write', 'Read Write'] lock_mode = ['Read Write', 'Write', 'Read'] mode = word & 0x00FF access = (word & 0x0F00) >> 8 lock = (word & 0xF000) >> 12 imp_name = '(For ' if mode & 0x01: imp_name += 'Input' elif mode & 0x02: imp_name += 'Output' elif mode & 0x04: imp_name += 'Random' elif mode & 0x08: imp_name += 'Append' elif mode == 0x20: imp_name += 'Binary' if access and (access <= len(access_mode)): imp_name += F' Access {access_mode[access - 1]}' if lock: if lock & 0x04: imp_name += ' Shared' elif lock <= len(lock_mode): imp_name += F' Lock {lock_mode[lock - 1]}' imp_name += ')' return imp_name def disasm_rec(self, dword: int) -> str: object_name = _get_name( self.indirect_table, self.identifiers, dword + 2, self.endian, self.vba_ver, self.is_64bit) options = _get_word(self.indirect_table, dword + 18, self.endian) if (options & 1) == 0: object_name = F'(Private) {object_name}' return object_name def disasm_object(self, offset: int) -> tuple[str, bool]: if self.is_64bit: type_desc = _get_dword(self.indirect_table, offset, self.endian) if type_desc + 4 > len(self.indirect_table): return '', False flags = _get_word(self.indirect_table, type_desc, self.endian) is_array = bool(flags & 0x0800) if flags & 0x02: return _disasm_type(self.indirect_table, type_desc), is_array word = _get_word(self.indirect_table, type_desc + 2, self.endian) offs = (word >> 3) * 10 if offs + 8 > len(self.object_table): return '', False hl_name = _get_word(self.object_table, offs + 6, self.endian) if hl_name == 0 or (hl_name >> 1) < 0x100: return '', is_array return _get_id(hl_name, self.identifiers, self.vba_ver, self.is_64bit), is_array type_desc = _get_dword(self.indirect_table, offset, self.endian) flags = _get_word(self.indirect_table, type_desc, self.endian) is_array = bool(flags & 0x0800) if flags & 0x02: return _disasm_type(self.indirect_table, type_desc), is_array word = _get_word(self.indirect_table, type_desc + 2, self.endian) offs = (word >> 2) * 10 if offs + 4 > len(self.object_table): return '', False hl_name = _get_word(self.object_table, offs + 6, self.endian) if hl_name == 0 or (hl_name >> 1) < 0x100: return '', is_array return _get_id(hl_name, self.identifiers, self.vba_ver, self.is_64bit), is_array def disasm_var(self, dword: int) -> str: b_flag1 = self.indirect_table[dword] b_flag2 = self.indirect_table[dword + 1] has_as = (b_flag1 & 0x20) != 0 has_new = (b_flag2 & 0x20) != 0 var_name = _get_name( self.indirect_table, self.identifiers, dword + 2, self.endian, self.vba_ver, self.is_64bit) is_array = False if has_new or has_as: type_name = '' if has_as: offs = 16 if self.is_64bit else 12 word = _get_word(self.indirect_table, dword + offs + 2, self.endian) if word == 0xFFFF: type_id = self.indirect_table[dword + offs] type_name = _get_type_name(type_id) else: type_name, is_array = self.disasm_object(dword + offs) var_type = '' if has_as and len(type_name) > 0: var_type += 'As ' if has_new and (not has_as or len(type_name) > 0): var_type += 'New ' if has_as and len(type_name) > 0: var_type += type_name if is_array: var_name += '()' if len(var_type) > 0: var_name += F' ({var_type.rstrip()})' else: offs = 16 if self.is_64bit else 12 if len(self.indirect_table) >= dword + offs + 4: word = _get_word(self.indirect_table, dword + offs + 2, self.endian) if word == 0xFFFF: _TYPE_SUFFIXES = {2: '%', 3: '&', 4: '!', 5: '#', 6: '@', 8: '$'} type_id = self.indirect_table[dword + offs] suffix = _TYPE_SUFFIXES.get(type_id) if suffix is not None: var_name += suffix else: try: _, is_array = self.disasm_object(dword + offs) except Exception: is_array = False if is_array: var_name += '()' return var_name def disasm_arg(self, arg_offset: int) -> str | None: flags = _get_word(self.indirect_table, arg_offset, self.endian) offs = 4 if self.is_64bit else 0 name_word = _get_word(self.indirect_table, arg_offset + 2, self.endian) if name_word == 0xFFFE: return None arg_name = _get_name( self.indirect_table, self.identifiers, arg_offset + 2, self.endian, self.vba_ver, self.is_64bit) arg_type = _get_dword(self.indirect_table, arg_offset + offs + 12, self.endian) arg_opts = _get_word(self.indirect_table, arg_offset + offs + 24, self.endian) is_paramarray = bool(arg_opts & 0x0001) if is_paramarray: self._has_pa_bit = True if arg_opts & 0x0004: arg_name = F'ByVal {arg_name}' if arg_opts & 0x0002: arg_name = F'ByRef {arg_name}' if arg_opts & 0x0200: arg_name = F'Optional {arg_name}' if flags & 0x0020: arg_type_name = '' is_array = False if (arg_type & 0xFFFF0000) == 0xFFFF0000: arg_type_id = arg_type & 0x000000FF arg_type_name = _get_type_name(arg_type_id) else: arg_type_name, is_array = self.disasm_object(arg_offset + offs + 12) if is_array: arg_name += '()' if arg_type_name: arg_name += ' As ' arg_name += arg_type_name elif (arg_type & 0xFFFF0000) == 0xFFFF0000: _TYPE_SUFFIXES = {2: '%', 3: '&', 4: '!', 5: '#', 6: '@', 8: '$'} arg_type_id = arg_type & 0x000000FF suffix = _TYPE_SUFFIXES.get(arg_type_id) if suffix is not None: arg_name += suffix else: try: _type_name, is_array = self.disasm_object(arg_offset + offs + 12) # noqa: F841 except Exception: is_array = False if is_array: arg_name += '()' if arg_opts & 0x0200: default_tag_off = arg_offset + offs + 28 default_val_off = arg_offset + offs + 32 ind = self.indirect_table if default_tag_off + 2 <= len(ind) and default_val_off + 4 <= len(ind): vt_tag = _get_word(ind, default_tag_off, self.endian) value_dw = _get_dword(ind, default_val_off, self.endian) default_str = self._format_default_value(vt_tag, value_dw) if default_str is not None: arg_name += F' = {default_str}' if is_paramarray: arg_name = F'ParamArray {arg_name}' return arg_name def _format_default_value(self, vt_tag: int, value_dw: int) -> str | None: VT_I2 = 2 VT_I4 = 3 VT_R4 = 4 VT_R8 = 5 VT_CY = 6 VT_BSTR = 8 VT_BOOL = 11 VT_UI1 = 17 ind = self.indirect_table if vt_tag == 0: return None elif vt_tag == VT_I2: val = value_dw & 0xFFFF return str(val - 0x10000 if val > 0x7FFF else val) elif vt_tag == VT_I4: return str(value_dw - 0x100000000 if value_dw > 0x7FFFFFFF else value_dw) elif vt_tag == VT_R4: val = _struct.unpack('<f', _struct.pack('<I', value_dw))[0] return str(int(val)) if val == int(val) and abs(val) < 1e15 else str(val) elif vt_tag == VT_R8: if value_dw + 8 <= len(ind): val = _struct.unpack('<d', bytes(ind[value_dw:value_dw + 8]))[0] return str(int(val)) if val == int(val) and abs(val) < 1e15 else str(val) elif vt_tag == VT_CY: val = value_dw / 10000 return str(int(val)) if val == int(val) else str(val) elif vt_tag == VT_BSTR: if value_dw + 4 <= len(ind): str_len = _get_dword(ind, value_dw, self.endian) if str_len == 0: return '""' if 0 < str_len < 0x10000 and value_dw + 4 + str_len <= len(ind): s = bytes(ind[value_dw + 4:value_dw + 4 + str_len]).decode(self.codec, errors='replace') return F'"{s}"' elif vt_tag == VT_BOOL: return 'True' if (value_dw & 0xFFFF) != 0 else 'False' elif vt_tag == VT_UI1: return str(value_dw & 0xFF) return None def _declare64(self, decl_offset: int, func_name: str) -> tuple[str, str | None]: """ Extract Lib and Alias names from a 64-bit Declare entry in the declaration table. The 64-bit entry structure differs significantly from 32-bit: the lib name identifier word is not at a fixed offset within the entry header. Instead, we extract the lib name from VBA source text stored later in the declaration table, falling back to the binary structure when source text is not available. """ decl = self.declaration_table decl_bytes = bytes(decl) lib_name = None alias_name = None # Strategy 1: Extract from VBA source text in the declaration table. # The source text may contain embedded null bytes, so strip them before matching. text = decl_bytes.replace(b'\x00', b'').decode('ascii', errors='replace') match = re.search( rf'(?:Function|Sub)\s+{re.escape(func_name)}\b.*?Lib\s+"([^"]+)"', text) if match: lib_name = match.group(1) after_lib = text[match.end():] alias_match = re.match(r'\s*Alias\s*"([^"]+)"', after_lib) if alias_match: alias_name = alias_match.group(1) # Strategy 2: Binary structure fallback. The alias string offset depends on version: # VBA7 version 0x0097 has 4 extra bytes of padding (alias at +0x20), later versions # use the standard offset (+0x1C). _alias_off = 0x20 if self.version <= 0x97 else 0x1C if lib_name is None and self.version > 0x97 and decl_offset >= 2: # For VBA7 versions after 0x97 the lib identifier word for each entry is stored # in the 2 bytes immediately preceding the entry header, placed there as trailing # data of the previous entry. This does not apply to the very first entry # (decl_offset == 0) or to versions <= 0x97 where the lib word sits at header +2. lib_word = _get_word(decl, decl_offset - 2, self.endian) if lib_word != 0 and lib_word != 0xFFFF: lib_name = _get_id(lib_word, self.identifiers, self.vba_ver, self.is_64bit) if lib_name is None: alias_start = decl_offset + _alias_off if alias_start < len(decl): alias_bytes_raw = bytes(decl[alias_start:]) null_pos = alias_bytes_raw.find(0) if null_pos > 0 and all(32 <= b < 127 for b in alias_bytes_raw[:null_pos]): abs_null = alias_start + null_pos dword_aligned = (abs_null + 1 + 3) & ~3 lib_word_offset = dword_aligned + 2 if lib_word_offset + 2 <= len(decl): lib_word = _get_word(decl, lib_word_offset, self.endian) if lib_word != 0 and lib_word != 0xFFFF: lib_name = _get_id(lib_word, self.identifiers, self.vba_ver, self.is_64bit) if lib_name is None: lib_word = _get_word(decl, decl_offset + 2, self.endian) if lib_word != 0: lib_name = _get_id(lib_word, self.identifiers, self.vba_ver, self.is_64bit) # Read alias from binary structure if not found via source text. if alias_name is None: alias_start = decl_offset + _alias_off if alias_start < len(decl): alias_bytes_raw = bytes(decl[alias_start:]) null_pos = alias_bytes_raw.find(0) if null_pos > 0: alias_name = alias_bytes_raw[:null_pos].decode(self.codec, errors='replace') return lib_name, alias_name def disasm_func(self, dword: int, op_type: int) -> str: func_decl = '(' flags = _get_word(self.indirect_table, dword, self.endian) name_word = _get_word(self.indirect_table, dword + 2, self.endian) offs2 = 4 if self.vba_ver > 5 else 0 if self.is_64bit: offs2 += 16 if ( self._linecont_pending and offs2 >= 4 and self.indirect_table[dword + 4:dword + 8] == b'\xFF\xFF\xFF\xFF' and (name_word >> 1) >= 0x100 ): name_word += 2 self._linecont_pending = False sub_name = _get_id(name_word, self.identifiers, self.vba_ver, self.is_64bit) arg_offset = _get_dword(self.indirect_table, dword + offs2 + 36, self.endian) ret_type = _get_dword(self.indirect_table, dword + offs2 + 40, self.endian) decl_offset = _get_word(self.indirect_table, dword + offs2 + 44, self.endian) c_options_offset = 60 if self.is_64bit and self.version > 0x97 else 54 c_options = self.indirect_table[dword + offs2 + c_options_offset] new_flags_offset = 63 if self.is_64bit and self.version > 0x97 else 57 new_flags = self.indirect_table[dword + offs2 + new_flags_offset] has_declare = False if self.vba_ver > 5: if (new_flags & 0x0002) == 0: func_decl += 'Private ' elif op_type & 0x04: func_decl += 'Public ' if new_flags & 0x0004: func_decl += 'Friend ' else: if (flags & 0x0008) == 0: func_decl += 'Private ' elif op_type & 0x04: func_decl += 'Public ' if flags & 0x0080: func_decl += 'Static ' if ( (c_options & 0x90) == 0 and (decl_offset != 0xFFFF) ): has_declare = True func_decl += 'Declare ' if self.vba_ver > 5: if new_flags & 0x20: func_decl += 'PtrSafe ' has_as = (flags & 0x0020) != 0 if flags & 0x1000: if op_type in (2, 6): func_decl += 'Function ' else: func_decl += 'Sub ' elif flags & 0x2000: func_decl += 'Property Get ' elif flags & 0x4000: func_decl += 'Property Let ' elif flags & 0x8000: func_decl += 'Property Set ' func_decl += sub_name if not has_as and (ret_type & 0xFFFF0000) == 0xFFFF0000: _TYPE_SUFFIXES = {2: '%', 3: '&', 4: '!', 5: '#', 6: '@', 8: '$'} ret_type_id = ret_type & 0x000000FF suffix = _TYPE_SUFFIXES.get(ret_type_id) if suffix is not None: func_decl += suffix if has_declare: if self.is_64bit: lib_name, alias_name = self._declare64(decl_offset, sub_name) else: lib_name = _get_name( self.declaration_table, self.identifiers, decl_offset + 2, self.endian, self.vba_ver, self.is_64bit) alias_name = None alias_offset = _get_word(self.declaration_table, decl_offset + 4, self.endian) if alias_offset < len(self.declaration_table): alias_bytes = bytes(self.declaration_table[alias_offset:]) null_pos = alias_bytes.find(0) if null_pos > 0: alias_name = alias_bytes[:null_pos].decode(self.codec, errors='replace') func_decl += F' Lib "{lib_name}"' if alias_name and alias_name != sub_name: func_decl += F' Alias "{alias_name}"' func_decl += ' ' arg_list: list[str] = [] while ( arg_offset != 0xFFFFFFFF and arg_offset != 0 and arg_offset + 26 < len(self.indirect_table) ): arg_name = self.disasm_arg(arg_offset) if arg_name is not None: arg_list.append(arg_name) arg_offset = _get_dword(self.indirect_table, arg_offset + (24 if self.is_64bit else 20), self.endian) if arg_list and not self._has_pa_bit and not any(a.startswith('ParamArray ') for a in arg_list): last = arg_list[-1] _pa_candidate = ( last.endswith('() As Variant') or (last.endswith('()') and ' As ' not in last) ) _pa_no_modifiers = not any( last.startswith(p) for p in ('ByVal ', 'ByRef ', 'Optional ')) if _pa_candidate and _pa_no_modifiers: arg_list[-1] = F'ParamArray {last}' func_decl += F'({", ".join(arg_list)})' if has_as: func_decl += ' As ' type_name = '' is_array = False if (ret_type & 0xFFFF0000) == 0xFFFF0000: type_id = ret_type & 0x000000FF type_name = _get_type_name(type_id) else: type_name, is_array = self.disasm_object(dword + offs2 + 40) func_decl += type_name if is_array: func_decl += '()' func_decl += ')' return func_decl def disasm_var_arg( self, module_data: bytes | bytearray | memoryview, offset: int, w_length: int, mnemonic: str, ) -> list[str]: substring = module_data[offset:offset + w_length] length_str = F'0x{w_length:04X}' if mnemonic in ('LitStr', 'QuoteRem', 'Rem', 'Reparse'): quoted = F'"{codecs.decode(substring, self.codec, "replace")}"' return [length_str, quoted] elif mnemonic in ('OnGosub', 'OnGoto'): offset1 = offset names: list[str] = [] for _ in range(w_length // 2): offset1, word = _get_var(module_data, offset1, self.endian, False) names.append(_get_id(word, self.identifiers, self.vba_ver, self.is_64bit)) return [length_str, ', '.join(names)] else: hex_dump = ' '.join(F'{c:02X}' for c in substring) return [length_str, hex_dump] def dump_line( self, module_data: bytes | bytearray | memoryview, line_start: int, line_length: int, ) -> list[tuple[str, list[str]]]: """ Disassemble one p-code line into a list of (mnemonic, [arg, ...]) tuples. """ self._linecont_pending = False result: list[tuple[str, list[str]]] = [] if line_length <= 0: return result offset = line_start end_of_line = line_start + line_length while offset < end_of_line: offset, opcode = _get_var(module_data, offset, self.endian, False) op_type = (opcode & ~0x03FF) >> 10 opcode &= 0x03FF translated = _translate_opcode(opcode, self.vba_ver, self.is_64bit) if translated not in OPCODES: return result instruction = OPCODES[translated] mnemonic = instruction.mnem parts: list[str] = [] if mnemonic in ('Coerce', 'CoerceVar', 'DefType'): if op_type < len(_VAR_TYPES_LONG): parts.append(F'({_VAR_TYPES_LONG[op_type]})') elif op_type == 17: parts.append('(Byte)') else: parts.append(F'({op_type:d})') elif mnemonic in ('Dim', 'DimImplicit', 'Type'): dim_type: list[str] = [] if op_type & 0x04: dim_type.append('Global') elif op_type & 0x08: dim_type.append('Public') elif op_type & 0x10: dim_type.append('Private') elif op_type & 0x20: dim_type.append('Static') if (op_type & 0x01) and (mnemonic != 'Type'): dim_type.append('Const') if dim_type: parts.append(F'({" ".join(dim_type)})') elif mnemonic == 'LitVarSpecial': parts.append(F'({_SPECIALS[op_type]})') elif mnemonic in ('ArgsCall', 'ArgsMemCall', 'ArgsMemCallWith'): if op_type < 16: parts.append('(Call)') else: op_type -= 16 elif mnemonic == 'Option': parts.append(F'({_OPTIONS[op_type]})') elif mnemonic in ('Redim', 'RedimAs'): if op_type & 16: parts.append('(Preserve)') elif mnemonic in ( 'FnDir', 'FnFormat', 'FnStringVar', 'FnStringStr', ): parts.append(F'0x{op_type:04X}') elif mnemonic == 'LitSmallI2': parts.append(str(op_type)) for arg in instruction.args: if arg == 'name': offset, word = _get_var(module_data, offset, self.endian, False) the_name = self.disasm_name(word, mnemonic, op_type) parts.append(the_name) elif arg in ('0x', 'imp_'): offset, word = _get_var(module_data, offset, self.endian, False) the_imp = self.disasm_imp(arg, word, mnemonic) parts.append(the_imp) elif arg in ('func_', 'var_', 'rec_', 'type_', 'context_'): offset, dword = _get_var(module_data, offset, self.endian, True) if ( arg == 'rec_' and len(self.indirect_table) >= dword + 20 ): parts.append(self.disasm_rec(dword)) elif ( arg == 'type_' and len(self.indirect_table) >= dword + 7 ): the_type = _disasm_type(self.indirect_table, dword) parts.append(F'(As {the_type})') elif ( arg == 'var_' and len(self.indirect_table) >= dword + 16 ): if op_type & 0x20: parts.append('(WithEvents)') parts.append(self.disasm_var(dword)) if op_type & 0x10: word = _get_word(module_data, offset, self.endian) offset += 2 parts.append(F'0x{word:04X}') elif ( arg == 'func_' and len(self.indirect_table) >= dword + 61 ): parts.append(self.disasm_func(dword, op_type)) else: parts.append(F'{arg}{dword:08X}') if self.is_64bit and (arg == 'context_'): offset, dword = _get_var(module_data, offset, self.endian, True) parts.append(F'{dword:08X}') if instruction.varg: offset, w_length = _get_var(module_data, offset, self.endian, False) var_arg_parts = self.disasm_var_arg( module_data, offset, w_length, mnemonic) parts.extend(var_arg_parts) offset += w_length if w_length & 1: offset += 1 result.append((mnemonic, parts)) if mnemonic == 'LineCont': self._linecont_pending = True return resultMethods
def disasm_name(self, word, mnemonic, op_type)-
Expand source code Browse git
def disasm_name(self, word: int, mnemonic: str, op_type: int) -> str: var_types = [ '', '?', '%', '&', '!', '#', '@', '?', '$', '?', '?', '?', '?', '?', ] var_name = _get_id(word, self.identifiers, self.vba_ver, self.is_64bit) if op_type < len(var_types): str_type = var_types[op_type] else: str_type = '' if op_type == 32: var_name = F'[{var_name}]' if mnemonic == 'OnError': str_type = '' if op_type == 1: var_name = '(Resume Next)' elif op_type == 2: var_name = '(GoTo 0)' elif mnemonic == 'Resume': str_type = '' if op_type == 1: var_name = '(Next)' elif op_type != 0: var_name = '' return (var_name + str_type).rstrip() def disasm_imp(self, arg, word, mnemonic)-
Expand source code Browse git
def disasm_imp(self, arg: str, word: int, mnemonic: str) -> str: if mnemonic != 'Open': if arg == 'imp_': shift = 3 if self.is_64bit else 2 offs = (word >> shift) * 10 if offs + 8 <= len(self.object_table): hl_name = _get_word(self.object_table, offs + 6, self.endian) if hl_name != 0: return _get_id(hl_name, self.identifiers, self.vba_ver, self.is_64bit) return '' return F'{arg}{word:04X}' access_mode = ['Read', 'Write', 'Read Write'] lock_mode = ['Read Write', 'Write', 'Read'] mode = word & 0x00FF access = (word & 0x0F00) >> 8 lock = (word & 0xF000) >> 12 imp_name = '(For ' if mode & 0x01: imp_name += 'Input' elif mode & 0x02: imp_name += 'Output' elif mode & 0x04: imp_name += 'Random' elif mode & 0x08: imp_name += 'Append' elif mode == 0x20: imp_name += 'Binary' if access and (access <= len(access_mode)): imp_name += F' Access {access_mode[access - 1]}' if lock: if lock & 0x04: imp_name += ' Shared' elif lock <= len(lock_mode): imp_name += F' Lock {lock_mode[lock - 1]}' imp_name += ')' return imp_name def disasm_rec(self, dword)-
Expand source code Browse git
def disasm_rec(self, dword: int) -> str: object_name = _get_name( self.indirect_table, self.identifiers, dword + 2, self.endian, self.vba_ver, self.is_64bit) options = _get_word(self.indirect_table, dword + 18, self.endian) if (options & 1) == 0: object_name = F'(Private) {object_name}' return object_name def disasm_object(self, offset)-
Expand source code Browse git
def disasm_object(self, offset: int) -> tuple[str, bool]: if self.is_64bit: type_desc = _get_dword(self.indirect_table, offset, self.endian) if type_desc + 4 > len(self.indirect_table): return '', False flags = _get_word(self.indirect_table, type_desc, self.endian) is_array = bool(flags & 0x0800) if flags & 0x02: return _disasm_type(self.indirect_table, type_desc), is_array word = _get_word(self.indirect_table, type_desc + 2, self.endian) offs = (word >> 3) * 10 if offs + 8 > len(self.object_table): return '', False hl_name = _get_word(self.object_table, offs + 6, self.endian) if hl_name == 0 or (hl_name >> 1) < 0x100: return '', is_array return _get_id(hl_name, self.identifiers, self.vba_ver, self.is_64bit), is_array type_desc = _get_dword(self.indirect_table, offset, self.endian) flags = _get_word(self.indirect_table, type_desc, self.endian) is_array = bool(flags & 0x0800) if flags & 0x02: return _disasm_type(self.indirect_table, type_desc), is_array word = _get_word(self.indirect_table, type_desc + 2, self.endian) offs = (word >> 2) * 10 if offs + 4 > len(self.object_table): return '', False hl_name = _get_word(self.object_table, offs + 6, self.endian) if hl_name == 0 or (hl_name >> 1) < 0x100: return '', is_array return _get_id(hl_name, self.identifiers, self.vba_ver, self.is_64bit), is_array def disasm_var(self, dword)-
Expand source code Browse git
def disasm_var(self, dword: int) -> str: b_flag1 = self.indirect_table[dword] b_flag2 = self.indirect_table[dword + 1] has_as = (b_flag1 & 0x20) != 0 has_new = (b_flag2 & 0x20) != 0 var_name = _get_name( self.indirect_table, self.identifiers, dword + 2, self.endian, self.vba_ver, self.is_64bit) is_array = False if has_new or has_as: type_name = '' if has_as: offs = 16 if self.is_64bit else 12 word = _get_word(self.indirect_table, dword + offs + 2, self.endian) if word == 0xFFFF: type_id = self.indirect_table[dword + offs] type_name = _get_type_name(type_id) else: type_name, is_array = self.disasm_object(dword + offs) var_type = '' if has_as and len(type_name) > 0: var_type += 'As ' if has_new and (not has_as or len(type_name) > 0): var_type += 'New ' if has_as and len(type_name) > 0: var_type += type_name if is_array: var_name += '()' if len(var_type) > 0: var_name += F' ({var_type.rstrip()})' else: offs = 16 if self.is_64bit else 12 if len(self.indirect_table) >= dword + offs + 4: word = _get_word(self.indirect_table, dword + offs + 2, self.endian) if word == 0xFFFF: _TYPE_SUFFIXES = {2: '%', 3: '&', 4: '!', 5: '#', 6: '@', 8: '$'} type_id = self.indirect_table[dword + offs] suffix = _TYPE_SUFFIXES.get(type_id) if suffix is not None: var_name += suffix else: try: _, is_array = self.disasm_object(dword + offs) except Exception: is_array = False if is_array: var_name += '()' return var_name def disasm_arg(self, arg_offset)-
Expand source code Browse git
def disasm_arg(self, arg_offset: int) -> str | None: flags = _get_word(self.indirect_table, arg_offset, self.endian) offs = 4 if self.is_64bit else 0 name_word = _get_word(self.indirect_table, arg_offset + 2, self.endian) if name_word == 0xFFFE: return None arg_name = _get_name( self.indirect_table, self.identifiers, arg_offset + 2, self.endian, self.vba_ver, self.is_64bit) arg_type = _get_dword(self.indirect_table, arg_offset + offs + 12, self.endian) arg_opts = _get_word(self.indirect_table, arg_offset + offs + 24, self.endian) is_paramarray = bool(arg_opts & 0x0001) if is_paramarray: self._has_pa_bit = True if arg_opts & 0x0004: arg_name = F'ByVal {arg_name}' if arg_opts & 0x0002: arg_name = F'ByRef {arg_name}' if arg_opts & 0x0200: arg_name = F'Optional {arg_name}' if flags & 0x0020: arg_type_name = '' is_array = False if (arg_type & 0xFFFF0000) == 0xFFFF0000: arg_type_id = arg_type & 0x000000FF arg_type_name = _get_type_name(arg_type_id) else: arg_type_name, is_array = self.disasm_object(arg_offset + offs + 12) if is_array: arg_name += '()' if arg_type_name: arg_name += ' As ' arg_name += arg_type_name elif (arg_type & 0xFFFF0000) == 0xFFFF0000: _TYPE_SUFFIXES = {2: '%', 3: '&', 4: '!', 5: '#', 6: '@', 8: '$'} arg_type_id = arg_type & 0x000000FF suffix = _TYPE_SUFFIXES.get(arg_type_id) if suffix is not None: arg_name += suffix else: try: _type_name, is_array = self.disasm_object(arg_offset + offs + 12) # noqa: F841 except Exception: is_array = False if is_array: arg_name += '()' if arg_opts & 0x0200: default_tag_off = arg_offset + offs + 28 default_val_off = arg_offset + offs + 32 ind = self.indirect_table if default_tag_off + 2 <= len(ind) and default_val_off + 4 <= len(ind): vt_tag = _get_word(ind, default_tag_off, self.endian) value_dw = _get_dword(ind, default_val_off, self.endian) default_str = self._format_default_value(vt_tag, value_dw) if default_str is not None: arg_name += F' = {default_str}' if is_paramarray: arg_name = F'ParamArray {arg_name}' return arg_name def disasm_func(self, dword, op_type)-
Expand source code Browse git
def disasm_func(self, dword: int, op_type: int) -> str: func_decl = '(' flags = _get_word(self.indirect_table, dword, self.endian) name_word = _get_word(self.indirect_table, dword + 2, self.endian) offs2 = 4 if self.vba_ver > 5 else 0 if self.is_64bit: offs2 += 16 if ( self._linecont_pending and offs2 >= 4 and self.indirect_table[dword + 4:dword + 8] == b'\xFF\xFF\xFF\xFF' and (name_word >> 1) >= 0x100 ): name_word += 2 self._linecont_pending = False sub_name = _get_id(name_word, self.identifiers, self.vba_ver, self.is_64bit) arg_offset = _get_dword(self.indirect_table, dword + offs2 + 36, self.endian) ret_type = _get_dword(self.indirect_table, dword + offs2 + 40, self.endian) decl_offset = _get_word(self.indirect_table, dword + offs2 + 44, self.endian) c_options_offset = 60 if self.is_64bit and self.version > 0x97 else 54 c_options = self.indirect_table[dword + offs2 + c_options_offset] new_flags_offset = 63 if self.is_64bit and self.version > 0x97 else 57 new_flags = self.indirect_table[dword + offs2 + new_flags_offset] has_declare = False if self.vba_ver > 5: if (new_flags & 0x0002) == 0: func_decl += 'Private ' elif op_type & 0x04: func_decl += 'Public ' if new_flags & 0x0004: func_decl += 'Friend ' else: if (flags & 0x0008) == 0: func_decl += 'Private ' elif op_type & 0x04: func_decl += 'Public ' if flags & 0x0080: func_decl += 'Static ' if ( (c_options & 0x90) == 0 and (decl_offset != 0xFFFF) ): has_declare = True func_decl += 'Declare ' if self.vba_ver > 5: if new_flags & 0x20: func_decl += 'PtrSafe ' has_as = (flags & 0x0020) != 0 if flags & 0x1000: if op_type in (2, 6): func_decl += 'Function ' else: func_decl += 'Sub ' elif flags & 0x2000: func_decl += 'Property Get ' elif flags & 0x4000: func_decl += 'Property Let ' elif flags & 0x8000: func_decl += 'Property Set ' func_decl += sub_name if not has_as and (ret_type & 0xFFFF0000) == 0xFFFF0000: _TYPE_SUFFIXES = {2: '%', 3: '&', 4: '!', 5: '#', 6: '@', 8: '$'} ret_type_id = ret_type & 0x000000FF suffix = _TYPE_SUFFIXES.get(ret_type_id) if suffix is not None: func_decl += suffix if has_declare: if self.is_64bit: lib_name, alias_name = self._declare64(decl_offset, sub_name) else: lib_name = _get_name( self.declaration_table, self.identifiers, decl_offset + 2, self.endian, self.vba_ver, self.is_64bit) alias_name = None alias_offset = _get_word(self.declaration_table, decl_offset + 4, self.endian) if alias_offset < len(self.declaration_table): alias_bytes = bytes(self.declaration_table[alias_offset:]) null_pos = alias_bytes.find(0) if null_pos > 0: alias_name = alias_bytes[:null_pos].decode(self.codec, errors='replace') func_decl += F' Lib "{lib_name}"' if alias_name and alias_name != sub_name: func_decl += F' Alias "{alias_name}"' func_decl += ' ' arg_list: list[str] = [] while ( arg_offset != 0xFFFFFFFF and arg_offset != 0 and arg_offset + 26 < len(self.indirect_table) ): arg_name = self.disasm_arg(arg_offset) if arg_name is not None: arg_list.append(arg_name) arg_offset = _get_dword(self.indirect_table, arg_offset + (24 if self.is_64bit else 20), self.endian) if arg_list and not self._has_pa_bit and not any(a.startswith('ParamArray ') for a in arg_list): last = arg_list[-1] _pa_candidate = ( last.endswith('() As Variant') or (last.endswith('()') and ' As ' not in last) ) _pa_no_modifiers = not any( last.startswith(p) for p in ('ByVal ', 'ByRef ', 'Optional ')) if _pa_candidate and _pa_no_modifiers: arg_list[-1] = F'ParamArray {last}' func_decl += F'({", ".join(arg_list)})' if has_as: func_decl += ' As ' type_name = '' is_array = False if (ret_type & 0xFFFF0000) == 0xFFFF0000: type_id = ret_type & 0x000000FF type_name = _get_type_name(type_id) else: type_name, is_array = self.disasm_object(dword + offs2 + 40) func_decl += type_name if is_array: func_decl += '()' func_decl += ')' return func_decl def disasm_var_arg(self, module_data, offset, w_length, mnemonic)-
Expand source code Browse git
def disasm_var_arg( self, module_data: bytes | bytearray | memoryview, offset: int, w_length: int, mnemonic: str, ) -> list[str]: substring = module_data[offset:offset + w_length] length_str = F'0x{w_length:04X}' if mnemonic in ('LitStr', 'QuoteRem', 'Rem', 'Reparse'): quoted = F'"{codecs.decode(substring, self.codec, "replace")}"' return [length_str, quoted] elif mnemonic in ('OnGosub', 'OnGoto'): offset1 = offset names: list[str] = [] for _ in range(w_length // 2): offset1, word = _get_var(module_data, offset1, self.endian, False) names.append(_get_id(word, self.identifiers, self.vba_ver, self.is_64bit)) return [length_str, ', '.join(names)] else: hex_dump = ' '.join(F'{c:02X}' for c in substring) return [length_str, hex_dump] def dump_line(self, module_data, line_start, line_length)-
Disassemble one p-code line into a list of (mnemonic, [arg, …]) tuples.
Expand source code Browse git
def dump_line( self, module_data: bytes | bytearray | memoryview, line_start: int, line_length: int, ) -> list[tuple[str, list[str]]]: """ Disassemble one p-code line into a list of (mnemonic, [arg, ...]) tuples. """ self._linecont_pending = False result: list[tuple[str, list[str]]] = [] if line_length <= 0: return result offset = line_start end_of_line = line_start + line_length while offset < end_of_line: offset, opcode = _get_var(module_data, offset, self.endian, False) op_type = (opcode & ~0x03FF) >> 10 opcode &= 0x03FF translated = _translate_opcode(opcode, self.vba_ver, self.is_64bit) if translated not in OPCODES: return result instruction = OPCODES[translated] mnemonic = instruction.mnem parts: list[str] = [] if mnemonic in ('Coerce', 'CoerceVar', 'DefType'): if op_type < len(_VAR_TYPES_LONG): parts.append(F'({_VAR_TYPES_LONG[op_type]})') elif op_type == 17: parts.append('(Byte)') else: parts.append(F'({op_type:d})') elif mnemonic in ('Dim', 'DimImplicit', 'Type'): dim_type: list[str] = [] if op_type & 0x04: dim_type.append('Global') elif op_type & 0x08: dim_type.append('Public') elif op_type & 0x10: dim_type.append('Private') elif op_type & 0x20: dim_type.append('Static') if (op_type & 0x01) and (mnemonic != 'Type'): dim_type.append('Const') if dim_type: parts.append(F'({" ".join(dim_type)})') elif mnemonic == 'LitVarSpecial': parts.append(F'({_SPECIALS[op_type]})') elif mnemonic in ('ArgsCall', 'ArgsMemCall', 'ArgsMemCallWith'): if op_type < 16: parts.append('(Call)') else: op_type -= 16 elif mnemonic == 'Option': parts.append(F'({_OPTIONS[op_type]})') elif mnemonic in ('Redim', 'RedimAs'): if op_type & 16: parts.append('(Preserve)') elif mnemonic in ( 'FnDir', 'FnFormat', 'FnStringVar', 'FnStringStr', ): parts.append(F'0x{op_type:04X}') elif mnemonic == 'LitSmallI2': parts.append(str(op_type)) for arg in instruction.args: if arg == 'name': offset, word = _get_var(module_data, offset, self.endian, False) the_name = self.disasm_name(word, mnemonic, op_type) parts.append(the_name) elif arg in ('0x', 'imp_'): offset, word = _get_var(module_data, offset, self.endian, False) the_imp = self.disasm_imp(arg, word, mnemonic) parts.append(the_imp) elif arg in ('func_', 'var_', 'rec_', 'type_', 'context_'): offset, dword = _get_var(module_data, offset, self.endian, True) if ( arg == 'rec_' and len(self.indirect_table) >= dword + 20 ): parts.append(self.disasm_rec(dword)) elif ( arg == 'type_' and len(self.indirect_table) >= dword + 7 ): the_type = _disasm_type(self.indirect_table, dword) parts.append(F'(As {the_type})') elif ( arg == 'var_' and len(self.indirect_table) >= dword + 16 ): if op_type & 0x20: parts.append('(WithEvents)') parts.append(self.disasm_var(dword)) if op_type & 0x10: word = _get_word(module_data, offset, self.endian) offset += 2 parts.append(F'0x{word:04X}') elif ( arg == 'func_' and len(self.indirect_table) >= dword + 61 ): parts.append(self.disasm_func(dword, op_type)) else: parts.append(F'{arg}{dword:08X}') if self.is_64bit and (arg == 'context_'): offset, dword = _get_var(module_data, offset, self.endian, True) parts.append(F'{dword:08X}') if instruction.varg: offset, w_length = _get_var(module_data, offset, self.endian, False) var_arg_parts = self.disasm_var_arg( module_data, offset, w_length, mnemonic) parts.extend(var_arg_parts) offset += w_length if w_length & 1: offset += 1 result.append((mnemonic, parts)) if mnemonic == 'LineCont': self._linecont_pending = True return result
class PCodeDisassembler (data)-
VBA p-code disassembler that produces structured PCodeModule output. The output is suitable for consumption by the decompiler for reconstruction to VBA source code.
Expand source code Browse git
class PCodeDisassembler: """ VBA p-code disassembler that produces structured PCodeModule output. The output is suitable for consumption by the decompiler for reconstruction to VBA source code. """ def __init__(self, data: bytes | bytearray | memoryview): self._data = data def iter_modules(self): """ Yield PCodeModule objects for each VBA module. """ for ole_data in self._get_ole_streams(): ole = OleFile(ole_data) yield from self._iter_project_modules(ole) def _iter_project_modules( self, ole: OleFile, ): """ Iterate over VBA modules in an OLE file, yielding PCodeModule per module. """ vba_projects = _find_vba_projects(ole) if not vba_projects: return for vba_root, _, dir_path in vba_projects: codec, code_modules, is_64bit = self._process_dir(ole, dir_path) vba_project_path = vba_root + 'VBA/_VBA_PROJECT' vba_project_data = self._process_vba_project(ole, vba_project_path) identifiers = _get_identifiers(vba_project_data, codec) identifiers_stripped = not identifiers for module in code_modules: module_path = F'{vba_root}VBA/{module}' try: module_data = ole.openstream(module_path).read() except Exception: continue lines = _pcode_dump( module_data, vba_project_data, identifiers, is_64bit, codec) yield PCodeModule(module_path, lines, identifiers_stripped) def _get_ole_streams(self) -> list[bytes | bytearray | memoryview]: """ Extract OLE data from the input. If the input is already an OLE compound file, returns it directly. If it's a ZIP (OOXML), extracts all vbaProject.bin entries. """ if self._data[:8] == b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1': return [self._data] if self._data[:2] == b'PK': import zipfile from refinery.lib.structures import MemoryFile results: list[bytes | bytearray] = [] try: with zipfile.ZipFile(MemoryFile(self._data, bytes)) as zf: for name in zf.namelist(): if name.lower().endswith('vbaproject.bin'): results.append(zf.read(name)) except zipfile.BadZipFile: pass return results return [self._data] def _process_dir( self, ole: OleFile, dir_path: str, ) -> tuple[str, list[str], bool]: """ Parse the VBA dir stream to find module names and codepage. Returns (codec, code_modules, is_64bit). """ dir_data_compressed = ole.openstream(dir_path).read() dir_data = decompress_stream(dir_data_compressed) stream_size = len(dir_data) code_modules: list[str] = [] is_64bit = False codec = 'latin1' offset = 0 while offset < stream_size: try: tag = _get_word(dir_data, offset, '<') w_length = _get_word(dir_data, offset + 2, '<') if tag == 9: w_length = 6 elif tag == 3: w_length = 2 offset += 6 if w_length: if tag == 3: codepage = _get_word(dir_data, offset, '<') codec = _codepage_to_codec(codepage) elif tag == 50: stream_name = codecs.decode( dir_data[offset:offset + w_length], 'utf_16_le', errors='replace') code_modules.append(stream_name) elif tag == 1: sys_kind = _get_dword(dir_data, offset, '<') is_64bit = sys_kind == 3 offset += w_length except Exception: break return codec, code_modules, is_64bit def _process_vba_project( self, ole: OleFile, vba_project_path: str, ) -> bytes | bytearray | memoryview: """ Read the _VBA_PROJECT stream (raw, not compressed). """ return ole.openstream(vba_project_path).read()Methods
def iter_modules(self)-
Yield PCodeModule objects for each VBA module.
Expand source code Browse git
def iter_modules(self): """ Yield PCodeModule objects for each VBA module. """ for ole_data in self._get_ole_streams(): ole = OleFile(ole_data) yield from self._iter_project_modules(ole)