Cvdump parser and comparing library functions (#383)

* Cvdump wrapper and parser. Matching library functions

* Remove 'Self' type hint (3.11+)

* Add temp reference for entrypoints

* ISLE using multithreaded libc

* 🙄
This commit is contained in:
MS 2023-12-28 16:10:57 -05:00 committed by GitHub
parent ff4845a6ea
commit 9a6d555508
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 395 additions and 117 deletions

View File

@ -294,7 +294,7 @@ if (MSVC)
# game was originally built with) and tweaked slightly to produce more debugging info for reccmp. # game was originally built with) and tweaked slightly to produce more debugging info for reccmp.
# They ensure a recompilation that can be byte/instruction accurate to the original binaries. # They ensure a recompilation that can be byte/instruction accurate to the original binaries.
if (ISLE_BUILD_APP) if (ISLE_BUILD_APP)
target_compile_options(isle PRIVATE "/ML$<$<CONFIG:Debug>:d>") target_compile_options(isle PRIVATE "/MT$<$<CONFIG:Debug>:d>")
endif() endif()
target_compile_options(lego1 PRIVATE "/MT$<$<CONFIG:Debug>:d>") target_compile_options(lego1 PRIVATE "/MT$<$<CONFIG:Debug>:d>")

47
LEGO1/library_msvc.h Normal file
View File

@ -0,0 +1,47 @@
// Address annotations for C runtime library functions found in the ISLE and
// LEGO1 binaries. Nothing here is meant to be compiled: the whole body is
// preprocessed out and only the marker comments carry information.
// NOTE(review): presumably these markers are read by the reccmp tooling; confirm.
//
// `#if 0` is used rather than `#ifdef 0` because `#ifdef` requires a macro
// name and a numeric literal is not a valid identifier.
#if 0

// LIBRARY: ISLE 0x402f80
// LIBRARY: LEGO1 0x10086240
// _malloc

// LIBRARY: ISLE 0x402fa0
// LIBRARY: LEGO1 0x10086260
// _free

// LIBRARY: ISLE 0x408220
// LIBRARY: LEGO1 0x1008b400
// _atol

// LIBRARY: ISLE 0x4082d0
// LIBRARY: LEGO1 0x1008b4b0
// _atoi

// LIBRARY: LEGO1 0x1008b4c0
// _strtok

// LIBRARY: ISLE 0x4085c0
// LIBRARY: LEGO1 0x1008b5a0
// _sprintf

// LIBRARY: ISLE 0x4081e0
// _srand

// LIBRARY: ISLE 0x4081f0
// LIBRARY: LEGO1 0x1008b640
// _rand

// entry
// LIBRARY: ISLE 0x4082e0
// _WinMainCRTStartup

// entry
// LIBRARY: LEGO1 0x1008c860
// __DllMainCRTStartup@12

// LIBRARY: ISLE 0x409110
// __mtinit

// LIBRARY: ISLE 0x409190
// __getptd

#endif

View File

@ -1,4 +1,6 @@
import struct import struct
from typing import List, Optional
from dataclasses import dataclass
from collections import namedtuple from collections import namedtuple
@ -33,44 +35,56 @@ PEHeader = namedtuple(
], ],
) )
ImageSectionHeader = namedtuple(
"ImageSectionHeader",
[
"Name",
"Misc",
"VirtualAddress",
"SizeOfRawData",
"PointerToRawData",
"PointerToRelocations",
"PointerToLineNumbers",
"NumberOfRelocations",
"NumberOfLineNumbers",
"Characteristics",
],
)
@dataclass
class ImageSectionHeader:
    # pylint: disable=too-many-instance-attributes
    """One entry of the PE section table.

    The fields follow the order of the packed on-disk record so an instance
    can be built directly from the unpacked tuple. Most of them are never
    read; they exist to mirror the struct layout.
    """

    # Section name as stored in the file: 8 bytes, NUL padded.
    name: bytes
    # Size of the section once loaded into memory.
    virtual_size: int
    # Start address of the section. Bin adds the image base to this value
    # after parsing, so it is compared against absolute virtual addresses.
    virtual_address: int
    # Number of bytes of the section that are actually present in the file.
    size_of_raw_data: int
    # File offset of the section's first byte.
    pointer_to_raw_data: int
    pointer_to_relocations: int
    pointer_to_line_numbers: int
    number_of_relocations: int
    number_of_line_numbers: int
    characteristics: int

    def match_name(self, name: str) -> bool:
        """Return True if this section is called `name` (e.g. ".text")."""
        # Pad (or truncate) to 8 bytes so it compares equal to the raw field.
        padded = struct.pack("8s", name.encode("ascii"))
        return padded == self.name

    def contains_vaddr(self, vaddr: int) -> bool:
        """Return True if `vaddr` lies inside this section.

        The extent is the larger of the raw size and the virtual size so that
        both file padding and the uninitialized tail count as "inside".
        """
        extent = max(self.size_of_raw_data, self.virtual_size)
        return self.virtual_address <= vaddr < self.virtual_address + extent

    def addr_is_uninitialized(self, vaddr: int) -> bool:
        """Return True if `vaddr` is in this section but has no bytes in the file.

        We cannot rely on the IMAGE_SCN_CNT_UNINITIALIZED_DATA flag (0x80) in
        the characteristics field so instead we determine it this way.
        """
        if not self.contains_vaddr(vaddr):
            return False

        # No uninitialized tail unless the section is bigger in memory than
        # on disk. This also covers size_of_raw_data == 0, meaning the entire
        # section is uninitialized.
        if self.virtual_size <= self.size_of_raw_data:
            return False

        return vaddr >= self.virtual_address + self.size_of_raw_data
class Bin: class Bin:
"""Parses a PE format EXE and allows reading data from a virtual address. """Parses a PE format EXE and allows reading data from a virtual address.
Reference: https://learn.microsoft.com/en-us/windows/win32/debug/pe-format""" Reference: https://learn.microsoft.com/en-us/windows/win32/debug/pe-format"""
def __init__(self, filename, logger=None): # pylint: disable=too-many-instance-attributes
def __init__(self, filename: str, logger=None) -> None:
self.logger = logger self.logger = logger
self._debuglog(f'Parsing headers of "{filename}"... ') self._debuglog(f'Parsing headers of "{filename}"... ')
self.filename = filename self.filename = filename
self.file = None self.file = None
self.imagebase = None self.imagebase = None
self.sections = [] self.entry = None
self.sections: List[ImageSectionHeader] = []
self.last_section = None self.last_section = None
self._relocated_addrs = set() self._relocated_addrs = set()
@ -95,12 +109,18 @@ class Bin:
optional_hdr = self.file.read(pe_hdr.SizeOfOptionalHeader) optional_hdr = self.file.read(pe_hdr.SizeOfOptionalHeader)
(self.imagebase,) = struct.unpack("<i", optional_hdr[0x1C:0x20]) (self.imagebase,) = struct.unpack("<i", optional_hdr[0x1C:0x20])
(entry,) = struct.unpack("<i", optional_hdr[0x10:0x14])
self.entry = entry + self.imagebase
self.sections = [ self.sections = [
ImageSectionHeader(*struct.unpack("<8s6I2HI", self.file.read(0x28))) ImageSectionHeader(*struct.unpack("<8s6I2HI", self.file.read(0x28)))
for i in range(pe_hdr.NumberOfSections) for i in range(pe_hdr.NumberOfSections)
] ]
# Add the imagebase here because we almost never need the base vaddr without it
for sect in self.sections:
sect.virtual_address += self.imagebase
self._populate_relocations() self._populate_relocations()
text_section = self._get_section_by_name(".text") text_section = self._get_section_by_name(".text")
@ -119,7 +139,7 @@ class Bin:
if self.logger is not None: if self.logger is not None:
self.logger.debug(msg) self.logger.debug(msg)
def get_relocated_addresses(self): def get_relocated_addresses(self) -> List[int]:
return sorted(self._relocated_addrs) return sorted(self._relocated_addrs)
def is_relocated_addr(self, vaddr) -> bool: def is_relocated_addr(self, vaddr) -> bool:
@ -165,27 +185,25 @@ class Bin:
(relocated_addr,) = struct.unpack("<I", self.read(addr, 4)) (relocated_addr,) = struct.unpack("<I", self.read(addr, 4))
self._relocated_addrs.add(relocated_addr) self._relocated_addrs.add(relocated_addr)
def _set_section_for_vaddr(self, vaddr): def _set_section_for_vaddr(self, vaddr: int):
if self.last_section is not None and section_contains_vaddr( if self.last_section is not None and self.last_section.contains_vaddr(vaddr):
self.last_section, self.imagebase, vaddr
):
return return
# TODO: assumes no potential for section overlap. reasonable? # TODO: assumes no potential for section overlap. reasonable?
self.last_section = next( self.last_section = next(
filter( filter(
lambda section: section_contains_vaddr(section, self.imagebase, vaddr), lambda section: section.contains_vaddr(vaddr),
self.sections, self.sections,
), ),
None, None,
) )
if self.last_section is None: if self.last_section is None:
raise InvalidVirtualAddressError raise InvalidVirtualAddressError(f"0x{vaddr:08x}")
def _get_section_by_name(self, name): def _get_section_by_name(self, name: str):
section = next( section = next(
filter(lambda section: section_name_match(section, name), self.sections), filter(lambda section: section.match_name(name), self.sections),
None, None,
) )
@ -194,7 +212,7 @@ class Bin:
return section return section
def get_section_offset_by_index(self, index) -> int: def get_section_offset_by_index(self, index: int) -> int:
"""The symbols output from cvdump gives addresses in this format: AAAA.BBBBBBBB """The symbols output from cvdump gives addresses in this format: AAAA.BBBBBBBB
where A is the index (1-based) into the section table and B is the local offset. where A is the index (1-based) into the section table and B is the local offset.
This will return the virtual address for the start of the section at the given index This will return the virtual address for the start of the section at the given index
@ -202,29 +220,33 @@ class Bin:
""" """
section = self.sections[index - 1] section = self.sections[index - 1]
return self.imagebase + section.VirtualAddress return section.virtual_address
def get_section_offset_by_name(self, name) -> int: def get_section_offset_by_name(self, name: str) -> int:
"""Same as above, but use the section name as the lookup""" """Same as above, but use the section name as the lookup"""
section = self._get_section_by_name(name) section = self._get_section_by_name(name)
return self.imagebase + section.VirtualAddress return section.virtual_address
def get_raw_addr(self, vaddr) -> int: def get_abs_addr(self, section: int, offset: int) -> int:
"""Convenience function for converting section:offset pairs from cvdump
into an absolute vaddr."""
return self.get_section_offset_by_index(section) + offset
def get_raw_addr(self, vaddr: int) -> int:
"""Returns the raw offset in the PE binary for the given virtual address.""" """Returns the raw offset in the PE binary for the given virtual address."""
self._set_section_for_vaddr(vaddr) self._set_section_for_vaddr(vaddr)
return ( return (
vaddr vaddr
- self.imagebase - self.last_section.virtual_address
- self.last_section.VirtualAddress + self.last_section.pointer_to_raw_data
+ self.last_section.PointerToRawData
) )
def is_valid_vaddr(self, vaddr) -> bool: def is_valid_vaddr(self, vaddr: int) -> bool:
"""Does this virtual address point to anything in the exe?""" """Does this virtual address point to anything in the exe?"""
section = next( section = next(
filter( filter(
lambda section: section_contains_vaddr(section, self.imagebase, vaddr), lambda section: section.contains_vaddr(vaddr),
self.sections, self.sections,
), ),
None, None,
@ -232,9 +254,14 @@ class Bin:
return section is not None return section is not None
def read(self, offset, size): def read(self, offset: int, size: int) -> Optional[bytes]:
"""Read (at most) the given number of bytes at the given virtual address.
If we return None, the given address points to uninitialized data."""
self._set_section_for_vaddr(offset) self._set_section_for_vaddr(offset)
if self.last_section.addr_is_uninitialized(offset):
return None
raw_addr = self.get_raw_addr(offset) raw_addr = self.get_raw_addr(offset)
self.file.seek(raw_addr) self.file.seek(raw_addr)
@ -242,8 +269,8 @@ class Bin:
# Reading off the end will most likely misrepresent the virtual addressing. # Reading off the end will most likely misrepresent the virtual addressing.
_size = min( _size = min(
size, size,
self.last_section.PointerToRawData self.last_section.pointer_to_raw_data
+ self.last_section.SizeOfRawData + self.last_section.size_of_raw_data
- raw_addr, - raw_addr,
) )
return self.file.read(_size) return self.file.read(_size)

View File

@ -0,0 +1,2 @@
from .parser import CvdumpParser
from .runner import Cvdump

View File

@ -0,0 +1,163 @@
import re
from typing import Iterable
from collections import namedtuple
# Banner that opens each part of the cvdump report.
# e.g. `*** PUBLICS`
_section_change_regex = re.compile(r"^\*\*\* (?P<section>[A-Z/ ]+)")

# LINES rows carry several (line number, address) pairs; meant for findall().
# e.g. ` 27 00034EC0 28 00034EE2 29 00034EE7 30 00034EF4`
_line_addr_pairs_findall = re.compile(r"\s+(?P<line_no>\d+) (?P<addr>[A-F0-9]{8})")

# Header that precedes a run of LINES rows and names the source file.
# We assume no spaces in the file name
# e.g. ` Z:\lego-island\isle\LEGO1\viewmanager\viewroi.cpp (None), 0001:00034E90-00034E97, line/addr pairs = 2`
_lines_subsection_header = re.compile(
    r"^\s*(?P<filename>\S+).*?, "
    r"(?P<section>[A-F0-9]{4}):(?P<start>[A-F0-9]{8})-(?P<end>[A-F0-9]{8}), "
    r"line/addr pairs = (?P<len>\d+)"
)

# One public symbol per row in PUBLICS.
# e.g. `S_PUB32: [0001:0003FF60], Flags: 00000000, __read`
_publics_line_regex = re.compile(
    r"^(?P<type>\w+): "
    r"\[(?P<section>\w{4}):(?P<offset>\w{8})], "
    r"Flags: (?P<flags>\w{8}), "
    r"(?P<name>\S+)"
)

# One symbol record per row in SYMBOLS; the name may contain spaces.
# e.g. `(00008C) S_GPROC32: [0001:00034E90], Cb: 00000007, Type: 0x1024, ViewROI::IntrinsicImportance`
_symbol_line_regex = re.compile(
    r"\(\w+\) (?P<type>\S+): "
    r"\[(?P<section>\w{4}):(?P<offset>\w{8})\], "
    r"Cb: (?P<size>\w+), "
    r"Type:\s+\S+, "
    r"(?P<name>.+)"
)

# Debug start/end offsets printed for a procedure symbol.
# e.g. ` Debug start: 00000008, Debug end: 0000016E`
_gproc_debug_regex = re.compile(
    r"\s*Debug start: (?P<start>\w{8}), Debug end: (?P<end>\w{8})"
)

# One row of SECTION CONTRIBUTIONS: module, section:offset, size, flags.
# e.g. ` 00DA 0001:00000000 00000073 60501020`
_section_contrib_regex = re.compile(
    r"\s*(?P<module>\w{4}) "
    r"(?P<section>\w{4}):(?P<offset>\w{8}) "
    r"(?P<size>\w{8}) "
    r"(?P<flags>\w{8})"
)

# Global data symbol from GLOBALS.
# e.g. `S_GDATA32: [0003:000004A4], Type: T_32PRCHAR(0470), g_set`
_gdata32_regex = re.compile(
    r"S_GDATA32: "
    r"\[(?P<section>\w{4}):(?P<offset>\w{8})\], "
    r"Type:\s*(?P<type>\S+), "
    r"(?P<name>\S+)"
)


# Record types produced by the parser, one per report section.
LinesEntry = namedtuple("LinesEntry", ["filename", "line_no", "addr"])
PublicsEntry = namedtuple(
    "PublicsEntry", ["type", "section", "offset", "flags", "name"]
)
SymbolsEntry = namedtuple(
    "SymbolsEntry", ["type", "section", "offset", "size", "name"]
)
SizeRefEntry = namedtuple("SizeRefEntry", ["section", "offset", "size"])
GdataEntry = namedtuple("GdataEntry", ["section", "offset", "type", "name"])
class CvdumpParser:
    """Line-by-line parser for the text report printed by cvdump.

    Feed the report through read_line() or read_lines(). Each recognized row
    is appended to the list matching the report section it appeared in:
    `lines`, `publics`, `symbols`, `sizerefs` or `globals`.
    """

    # Report section title -> name of the method that consumes its rows.
    # Rows of any other section are ignored.
    _SECTION_HANDLERS = {
        "LINES": "_lines_section",
        "PUBLICS": "_publics_section",
        "SYMBOLS": "_symbols_section",
        "SECTION CONTRIBUTIONS": "_section_contributions",
        "GLOBALS": "_globals_section",
    }

    def __init__(self) -> None:
        # Title of the report section currently being read.
        self._section: str = ""
        # Source file named by the most recent LINES subsection header.
        self._lines_filename: str = ""

        self.lines = []
        self.publics = []
        self.symbols = []
        self.sizerefs = []
        self.globals = []

    def _lines_section(self, line: str):
        """Handle one row of the LINES section.

        A subsection header only updates the current source file name. Any
        other row contributes one LinesEntry per (line number, address) pair
        found on it."""
        header = _lines_subsection_header.match(line)
        if header is not None:
            # Subheader indicates a new function and possibly a new code filename.
            self._lines_filename = header.group("filename")
            return

        self.lines.extend(
            LinesEntry(
                filename=self._lines_filename,
                line_no=int(number),
                addr=int(address, 16),
            )
            for number, address in _line_addr_pairs_findall.findall(line)
        )

    def _publics_section(self, line: str):
        """Handle one row of the PUBLICS section.

        These are MSVC mangled symbol names. String constants and vtable
        addresses can only be found here."""
        found = _publics_line_regex.match(line)
        if found is None:
            return

        fields = found.groupdict()
        self.publics.append(
            PublicsEntry(
                type=fields["type"],
                section=int(fields["section"], 16),
                offset=int(fields["offset"], 16),
                flags=int(fields["flags"], 16),
                name=fields["name"],
            )
        )

    def _globals_section(self, line: str):
        """Handle one row of the GLOBALS section.

        S_PROCREF may be useful later. Right now only S_GDATA32 symbols are
        kept because they are the simplest way to access global variables."""
        found = _gdata32_regex.match(line)
        if found is None:
            return

        fields = found.groupdict()
        self.globals.append(
            GdataEntry(
                section=int(fields["section"], 16),
                offset=int(fields["offset"], 16),
                type=fields["type"],
                name=fields["name"],
            )
        )

    def _symbols_section(self, line: str):
        """Handle one row of the SYMBOLS section, keeping S_GPROC32 only."""
        found = _symbol_line_regex.match(line)
        if found is None or found.group("type") != "S_GPROC32":
            return

        fields = found.groupdict()
        self.symbols.append(
            SymbolsEntry(
                type=fields["type"],
                section=int(fields["section"], 16),
                offset=int(fields["offset"], 16),
                size=int(fields["size"], 16),
                name=fields["name"],
            )
        )

    def _section_contributions(self, line: str):
        """Handle one row of the SECTION CONTRIBUTIONS section.

        Gives the size of elements across all sections of the binary.
        This is the easiest way to get the data size for .data and .rdata
        members that do not have a primitive data type."""
        found = _section_contrib_regex.match(line)
        if found is None:
            return

        fields = found.groupdict()
        self.sizerefs.append(
            SizeRefEntry(
                section=int(fields["section"], 16),
                offset=int(fields["offset"], 16),
                size=int(fields["size"], 16),
            )
        )

    def read_line(self, line: str):
        """Consume a single line of cvdump output."""
        # Blank lines are there to help the reader; they have no context significance
        if not line.strip():
            return

        # A banner switches the active section and carries no data itself.
        banner = _section_change_regex.match(line)
        if banner is not None:
            self._section = banner.group("section")
            return

        handler_name = self._SECTION_HANDLERS.get(self._section)
        if handler_name is not None:
            getattr(self, handler_name)(line)

    def read_lines(self, lines: Iterable[str]):
        """Consume every line of `lines` in order."""
        for text in lines:
            self.read_line(text)

View File

@ -0,0 +1,66 @@
from os import name as os_name
from enum import Enum
from typing import List
import subprocess
from isledecomp.lib import lib_path_join
from isledecomp.dir import winepath_unix_to_win
from .parser import CvdumpParser
class DumpOpt(Enum):
    """Parts of the cvdump report that can be requested."""

    LINES = 0
    SYMBOLS = 1
    GLOBALS = 2
    PUBLICS = 3
    SECTION_CONTRIB = 4


# Command-line switch handed to cvdump for each requested report part.
cvdump_opt_map = {
    DumpOpt.LINES: "-l",
    DumpOpt.SYMBOLS: "-s",
    DumpOpt.GLOBALS: "-g",
    DumpOpt.PUBLICS: "-p",
    DumpOpt.SECTION_CONTRIB: "-seccontrib",
}
class Cvdump:
    """Builds and runs a cvdump.exe command for one PDB file.

    Pick the report parts with the chainable option methods (each returns
    this same object), then call run() to execute cvdump and obtain a
    CvdumpParser that has consumed its output. Example:

        Cvdump(pdb).lines().symbols().run()
    """

    def __init__(self, pdb: str) -> None:
        # Path to the PDB file, as given by the caller.
        self._pdb: str = pdb
        # DumpOpt members selected so far.
        self._options = set()

    def lines(self) -> "Cvdump":
        """Request the LINES part of the report."""
        self._options.add(DumpOpt.LINES)
        return self

    def symbols(self) -> "Cvdump":
        """Request the SYMBOLS part of the report."""
        self._options.add(DumpOpt.SYMBOLS)
        return self

    def globals(self) -> "Cvdump":
        """Request the GLOBALS part of the report."""
        self._options.add(DumpOpt.GLOBALS)
        return self

    def publics(self) -> "Cvdump":
        """Request the PUBLICS part of the report."""
        self._options.add(DumpOpt.PUBLICS)
        return self

    def section_contributions(self) -> "Cvdump":
        """Request the SECTION CONTRIBUTIONS part of the report."""
        self._options.add(DumpOpt.SECTION_CONTRIB)
        return self

    def cmd_line(self) -> List[str]:
        """Return the argument list used to launch cvdump.

        On Windows (os.name == "nt") cvdump.exe is started directly. On any
        other platform it is started through wine and the PDB path goes
        through winepath_unix_to_win() first.
        """
        cvdump_exe = lib_path_join("cvdump.exe")

        # Sets have no stable order; sort by enum value so the same selection
        # always yields the same command line.
        selected = sorted(self._options, key=lambda opt: opt.value)
        flags = [cvdump_opt_map[opt] for opt in selected]

        if os_name == "nt":
            return [cvdump_exe, *flags, self._pdb]

        # The program must directly follow "wine"; the switches come after it
        # so they are passed on to cvdump instead of being read by wine.
        return ["wine", cvdump_exe, *flags, winepath_unix_to_win(self._pdb)]

    def run(self) -> CvdumpParser:
        """Run cvdump and return a parser populated from its output.

        Raises subprocess.CalledProcessError if the command exits with a
        non-zero status.
        """
        parser = CvdumpParser()
        call = self.cmd_line()
        # cvdump's output is split on CRLF line endings.
        output = subprocess.check_output(call).decode("utf-8")
        parser.read_lines(output.split("\r\n"))
        return parser

View File

@ -1,7 +1,6 @@
import os import os
import subprocess from isledecomp.dir import PathResolver
from isledecomp.lib import lib_path_join from isledecomp.cvdump import Cvdump
from isledecomp.dir import PathResolver, winepath_unix_to_win
class RecompiledInfo: class RecompiledInfo:
@ -20,80 +19,54 @@ class SymInfo:
def __init__(self, pdb, sym_recompfile, sym_logger, base_dir): def __init__(self, pdb, sym_recompfile, sym_logger, base_dir):
self.logger = sym_logger self.logger = sym_logger
path_resolver = PathResolver(base_dir) path_resolver = PathResolver(base_dir)
call = [lib_path_join("cvdump.exe"), "-l", "-s"]
if os.name != "nt":
# Run cvdump through wine and convert path to Windows-friendly wine path
call.insert(0, "wine")
call.append(winepath_unix_to_win(pdb))
else:
call.append(pdb)
self.logger.info("Parsing %s ...", pdb) self.logger.info("Parsing %s ...", pdb)
self.logger.debug("Command = %s", call)
line_dump = subprocess.check_output(call).decode("utf-8").split("\r\n")
current_section = None
self.logger.debug("Parsing output of cvdump.exe ...") self.logger.debug("Parsing output of cvdump.exe ...")
for i, line in enumerate(line_dump): cv = Cvdump(pdb).lines().symbols().publics().section_contributions().run()
if line.startswith("***"):
current_section = line[4:]
if current_section == "SYMBOLS" and "S_GPROC32" in line: self.logger.debug("... Parsing output of cvdump.exe finished")
sym_section = int(line[21:25], 16)
sym_addr = int(line[26:34], 16) contrib_dict = {(s.section, s.offset): s.size for s in cv.sizerefs}
for pub in cv.publics:
if (
pub.type == "S_PUB32"
and pub.name.startswith("_")
and (pub.section, pub.offset) in contrib_dict
):
size = contrib_dict[(pub.section, pub.offset)]
info = RecompiledInfo() info = RecompiledInfo()
info.addr = sym_addr + sym_recompfile.get_section_offset_by_index( info.addr = sym_recompfile.get_abs_addr(pub.section, pub.offset)
sym_section
)
use_dbg_offs = False
if use_dbg_offs:
debug_offs = line_dump[i + 2]
debug_start = int(debug_offs[22:30], 16)
debug_end = int(debug_offs[43:], 16)
info.start = debug_start
info.size = debug_end - debug_start
else:
info.start = 0 info.start = 0
info.size = int(line[41:49], 16) info.size = size
info.name = pub.name
self.names[pub.name] = info
self.funcs[pub.offset] = info
info.name = line[77:] for proc in cv.symbols:
if proc.type != "S_GPROC32":
continue
self.names[info.name] = info info = RecompiledInfo()
self.funcs[sym_addr] = info info.addr = sym_recompfile.get_abs_addr(proc.section, proc.offset)
elif (
current_section == "LINES" info.start = 0
and line.startswith(" ") info.size = proc.size
and not line.startswith(" ") info.name = proc.name
):
sourcepath = line.split()[0] self.names[proc.name] = info
self.funcs[proc.offset] = info
for sourcepath, line_no, offset in cv.lines:
sourcepath = path_resolver.resolve_cvdump(sourcepath) sourcepath = path_resolver.resolve_cvdump(sourcepath)
if sourcepath not in self.lines: if sourcepath not in self.lines:
self.lines[sourcepath] = {} self.lines[sourcepath] = {}
j = i + 2 if line_no not in self.lines[sourcepath]:
while True: self.lines[sourcepath][line_no] = offset
ll = line_dump[j].split()
if len(ll) == 0:
break
k = 0
while k < len(ll):
linenum = int(ll[k + 0])
address = int(ll[k + 1], 16)
if linenum not in self.lines[sourcepath]:
self.lines[sourcepath][linenum] = address
k += 2
j += 1
self.logger.debug("... Parsing output of cvdump.exe finished")
def get_recompiled_address(self, filename, line): def get_recompiled_address(self, filename, line):
recompiled_addr = None recompiled_addr = None