Cvdump parser and comparing library functions (#383)

* Cvdump wrapper and parser. Matching library functions * Remove 'Self' type hint (3.11+) * Add temp reference for entrypoints * ISLE using multithreaded libc * 🙄
2025-09-28 07:03:06 -04:00 · 2023-12-28 16:10:57 -05:00 · 2023-12-28 16:10:57 -05:00 · 9a6d555508
commit 9a6d555508
parent ff4845a6ea
7 changed files with 395 additions and 117 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -294,7 +294,7 @@ if (MSVC)
  # game was originally built with) and tweaked slightly to produce more debugging info for reccmp.
  # They ensure a recompilation that can be byte/instruction accurate to the original binaries.
  if (ISLE_BUILD_APP)
-    target_compile_options(isle PRIVATE "/ML$<$<CONFIG:Debug>:d>")
+    target_compile_options(isle PRIVATE "/MT$<$<CONFIG:Debug>:d>")
  endif()
  target_compile_options(lego1 PRIVATE "/MT$<$<CONFIG:Debug>:d>")
--- a/LEGO1/library_msvc.h
+++ b/LEGO1/library_msvc.h
@ -0,0 +1,47 @@
 // Annotation-only header: nothing in here is meant to be compiled.
 // Each group lists one or more `LIBRARY: <module> <address>` markers followed
 // by the name of the runtime library symbol found at those addresses.
 // `#if 0` is used to disable the region; `#ifdef` needs an identifier, so
 // `#ifdef 0` is not a valid directive.
 #if 0
 // LIBRARY: ISLE 0x402f80
 // LIBRARY: LEGO1 0x10086240
 // _malloc
 // LIBRARY: ISLE 0x402fa0
 // LIBRARY: LEGO1 0x10086260
 // _free
 // LIBRARY: ISLE 0x408220
 // LIBRARY: LEGO1 0x1008b400
 // _atol
 // LIBRARY: ISLE 0x4082d0
 // LIBRARY: LEGO1 0x1008b4b0
 // _atoi
 // LIBRARY: LEGO1 0x1008b4c0
 // _strtok
 // LIBRARY: ISLE 0x4085c0
 // LIBRARY: LEGO1 0x1008b5a0
 // _sprintf
 // LIBRARY: ISLE 0x4081e0
 // _srand
 // LIBRARY: ISLE 0x4081f0
 // LIBRARY: LEGO1 0x1008b640
 // _rand
 // entry
 // LIBRARY: ISLE 0x4082e0
 // _WinMainCRTStartup
 // entry
 // LIBRARY: LEGO1 0x1008c860
 // __DllMainCRTStartup@12
 // LIBRARY: ISLE 0x409110
 // __mtinit
 // LIBRARY: ISLE 0x409190
 // __getptd
 #endif
--- a/tools/isledecomp/isledecomp/bin.py
+++ b/tools/isledecomp/isledecomp/bin.py
@ -1,4 +1,6 @@
 import struct
 from typing import List, Optional
 from dataclasses import dataclass
 from collections import namedtuple
@ -33,44 +35,56 @@ PEHeader = namedtuple(
    ],
 )
 ImageSectionHeader = namedtuple(
    "ImageSectionHeader",
    [
        "Name",
        "Misc",
        "VirtualAddress",
        "SizeOfRawData",
        "PointerToRawData",
        "PointerToRelocations",
        "PointerToLineNumbers",
        "NumberOfRelocations",
        "NumberOfLineNumbers",
        "Characteristics",
    ],
 )
@dataclass
 class ImageSectionHeader:
    # pylint: disable=too-many-instance-attributes
    # Most attributes are unused, but this is the struct format
    name: bytes
    virtual_size: int
    virtual_address: int
    size_of_raw_data: int
    pointer_to_raw_data: int
    pointer_to_relocations: int
    pointer_to_line_numbers: int
    number_of_relocations: int
    number_of_line_numbers: int
    characteristics: int
-def section_name_match(section, name):
+    def match_name(self, name: str) -> bool:
-    return section.Name == struct.pack("8s", name.encode("ascii"))
+        return self.name == struct.pack("8s", name.encode("ascii"))
    def contains_vaddr(self, vaddr: int) -> bool:
        """Report whether vaddr falls inside this section.

        The section is taken to start at virtual_address and to extend for
        the larger of size_of_raw_data and virtual_size bytes."""
        first = self.virtual_address
        extent = max(self.virtual_size, self.size_of_raw_data)
        return first <= vaddr < first + extent
-def section_contains_vaddr(section, imagebase, vaddr) -> bool:
+    def addr_is_uninitialized(self, vaddr: int) -> bool:
-    debased = vaddr - imagebase
+        """We cannot rely on the IMAGE_SCN_CNT_UNINITIALIZED_DATA flag (0x80) in
-    ofs = debased - section.VirtualAddress
+        the characteristics field so instead we determine it this way."""
-    return 0 <= ofs < section.SizeOfRawData
+        if not self.contains_vaddr(vaddr):
            return False
        # Should include the case where size_of_raw_data == 0,
        # meaning the entire section is uninitialized
        return (self.virtual_size > self.size_of_raw_data) and (
            vaddr - self.virtual_address >= self.size_of_raw_data
        )
 class Bin:
    """Parses a PE format EXE and allows reading data from a virtual address.
    Reference: https://learn.microsoft.com/en-us/windows/win32/debug/pe-format"""
-    def __init__(self, filename, logger=None):
+    # pylint: disable=too-many-instance-attributes
    def __init__(self, filename: str, logger=None) -> None:
        self.logger = logger
        self._debuglog(f'Parsing headers of "{filename}"... ')
        self.filename = filename
        self.file = None
        self.imagebase = None
-        self.sections = []
+        self.entry = None
        self.sections: List[ImageSectionHeader] = []
        self.last_section = None
        self._relocated_addrs = set()
@ -95,12 +109,18 @@ class Bin:
        optional_hdr = self.file.read(pe_hdr.SizeOfOptionalHeader)
        (self.imagebase,) = struct.unpack("<i", optional_hdr[0x1C:0x20])
        (entry,) = struct.unpack("<i", optional_hdr[0x10:0x14])
        self.entry = entry + self.imagebase
        self.sections = [
            ImageSectionHeader(*struct.unpack("<8s6I2HI", self.file.read(0x28)))
            for i in range(pe_hdr.NumberOfSections)
        ]
        # Add the imagebase here because we almost never need the base vaddr without it
        for sect in self.sections:
            sect.virtual_address += self.imagebase
        self._populate_relocations()
        text_section = self._get_section_by_name(".text")
@ -119,7 +139,7 @@ class Bin:
        if self.logger is not None:
            self.logger.debug(msg)
-    def get_relocated_addresses(self):
+    def get_relocated_addresses(self) -> List[int]:
        return sorted(self._relocated_addrs)
    def is_relocated_addr(self, vaddr) -> bool:
@ -165,27 +185,25 @@ class Bin:
            (relocated_addr,) = struct.unpack("<I", self.read(addr, 4))
            self._relocated_addrs.add(relocated_addr)
-    def _set_section_for_vaddr(self, vaddr):
+    def _set_section_for_vaddr(self, vaddr: int):
-        if self.last_section is not None and section_contains_vaddr(
+        if self.last_section is not None and self.last_section.contains_vaddr(vaddr):
            self.last_section, self.imagebase, vaddr
        ):
            return
        # TODO: assumes no potential for section overlap. reasonable?
        self.last_section = next(
            filter(
-                lambda section: section_contains_vaddr(section, self.imagebase, vaddr),
+                lambda section: section.contains_vaddr(vaddr),
                self.sections,
            ),
            None,
        )
        if self.last_section is None:
-            raise InvalidVirtualAddressError
+            raise InvalidVirtualAddressError(f"0x{vaddr:08x}")
-    def _get_section_by_name(self, name):
+    def _get_section_by_name(self, name: str):
        section = next(
-            filter(lambda section: section_name_match(section, name), self.sections),
+            filter(lambda section: section.match_name(name), self.sections),
            None,
        )
@ -194,7 +212,7 @@ class Bin:
        return section
-    def get_section_offset_by_index(self, index) -> int:
+    def get_section_offset_by_index(self, index: int) -> int:
        """The symbols output from cvdump gives addresses in this format: AAAA.BBBBBBBB
        where A is the index (1-based) into the section table and B is the local offset.
        This will return the virtual address for the start of the section at the given index
@ -202,29 +220,33 @@ class Bin:
        """
        section = self.sections[index - 1]
-        return self.imagebase + section.VirtualAddress
+        return section.virtual_address
-    def get_section_offset_by_name(self, name) -> int:
+    def get_section_offset_by_name(self, name: str) -> int:
        """Same as above, but use the section name as the lookup"""
        section = self._get_section_by_name(name)
-        return self.imagebase + section.VirtualAddress
+        return section.virtual_address
-    def get_raw_addr(self, vaddr) -> int:
+    def get_abs_addr(self, section: int, offset: int) -> int:
        """Convenience function for converting section:offset pairs from cvdump
        into an absolute vaddr."""
        return self.get_section_offset_by_index(section) + offset
    def get_raw_addr(self, vaddr: int) -> int:
        """Returns the raw offset in the PE binary for the given virtual address."""
        self._set_section_for_vaddr(vaddr)
        return (
            vaddr
-            - self.imagebase
+            - self.last_section.virtual_address
-            - self.last_section.VirtualAddress
+            + self.last_section.pointer_to_raw_data
            + self.last_section.PointerToRawData
        )
-    def is_valid_vaddr(self, vaddr) -> bool:
+    def is_valid_vaddr(self, vaddr: int) -> bool:
        """Does this virtual address point to anything in the exe?"""
        section = next(
            filter(
-                lambda section: section_contains_vaddr(section, self.imagebase, vaddr),
+                lambda section: section.contains_vaddr(vaddr),
                self.sections,
            ),
            None,
@ -232,9 +254,14 @@ class Bin:
        return section is not None
-    def read(self, offset, size):
+    def read(self, offset: int, size: int) -> Optional[bytes]:
        """Read (at most) the given number of bytes at the given virtual address.
        If we return None, the given address points to uninitialized data."""
        self._set_section_for_vaddr(offset)
        if self.last_section.addr_is_uninitialized(offset):
            return None
        raw_addr = self.get_raw_addr(offset)
        self.file.seek(raw_addr)
@ -242,8 +269,8 @@ class Bin:
        # Reading off the end will most likely misrepresent the virtual addressing.
        _size = min(
            size,
-            self.last_section.PointerToRawData
+            self.last_section.pointer_to_raw_data
-            + self.last_section.SizeOfRawData
+            + self.last_section.size_of_raw_data
            - raw_addr,
        )
        return self.file.read(_size)
--- a/tools/isledecomp/isledecomp/cvdump/init.py
+++ b/tools/isledecomp/isledecomp/cvdump/init.py
@ -0,0 +1,2 @@
 from .parser import CvdumpParser
 from .runner import Cvdump
--- a/tools/isledecomp/isledecomp/cvdump/parser.py
+++ b/tools/isledecomp/isledecomp/cvdump/parser.py
@ -0,0 +1,163 @@
 import re
 from typing import Iterable
 from collections import namedtuple
 # e.g. `*** PUBLICS`
 _section_change_regex = re.compile(r"^\*\*\* (?P<section>[A-Z/ ]+)")
 # e.g. `     27 00034EC0     28 00034EE2     29 00034EE7     30 00034EF4`
 _line_addr_pairs_findall = re.compile(r"\s+(?P<line_no>\d+) (?P<addr>[A-F0-9]{8})")
 # We assume no spaces in the file name
 # e.g. `  Z:\lego-island\isle\LEGO1\viewmanager\viewroi.cpp (None), 0001:00034E90-00034E97, line/addr pairs = 2`
 _lines_subsection_header = re.compile(
    r"^\s*(?P<filename>\S+).*?, (?P<section>[A-F0-9]{4}):(?P<start>[A-F0-9]{8})-(?P<end>[A-F0-9]{8}), line/addr pairs = (?P<len>\d+)"
 )
 # e.g. `S_PUB32: [0001:0003FF60], Flags: 00000000, __read`
 _publics_line_regex = re.compile(
    r"^(?P<type>\w+): \[(?P<section>\w{4}):(?P<offset>\w{8})], Flags: (?P<flags>\w{8}), (?P<name>\S+)"
 )
 # e.g. `(00008C) S_GPROC32: [0001:00034E90], Cb: 00000007, Type:             0x1024, ViewROI::IntrinsicImportance`
 _symbol_line_regex = re.compile(
    r"\(\w+\) (?P<type>\S+): \[(?P<section>\w{4}):(?P<offset>\w{8})\], Cb: (?P<size>\w+), Type:\s+\S+, (?P<name>.+)"
 )
 # e.g. `         Debug start: 00000008, Debug end: 0000016E`
 _gproc_debug_regex = re.compile(
    r"\s*Debug start: (?P<start>\w{8}), Debug end: (?P<end>\w{8})"
 )
 # e.g. `  00DA  0001:00000000  00000073  60501020`
 _section_contrib_regex = re.compile(
    r"\s*(?P<module>\w{4})  (?P<section>\w{4}):(?P<offset>\w{8})  (?P<size>\w{8})  (?P<flags>\w{8})"
 )
 # e.g. `S_GDATA32: [0003:000004A4], Type:   T_32PRCHAR(0470), g_set`
 _gdata32_regex = re.compile(
    r"S_GDATA32: \[(?P<section>\w{4}):(?P<offset>\w{8})\], Type:\s*(?P<type>\S+), (?P<name>\S+)"
 )
 LinesEntry = namedtuple("LinesEntry", "filename line_no addr")
 PublicsEntry = namedtuple("PublicsEntry", "type section offset flags name")
 SymbolsEntry = namedtuple("SymbolsEntry", "type section offset size name")
 SizeRefEntry = namedtuple("SizeRefEntry", "section offset size")
 GdataEntry = namedtuple("GdataEntry", "section offset type name")
 class CvdumpParser:
    def __init__(self) -> None:
        self._section: str = ""
        self._lines_filename: str = ""
        self.lines = []
        self.publics = []
        self.symbols = []
        self.sizerefs = []
        self.globals = []
    def _lines_section(self, line: str):
        """Parsing entries from the LINES section. We only care about the pairs of
        line_number and address and the subsection header to indicate which code file
        we are in."""
        # Subheader indicates a new function and possibly a new code filename.
        if (match := _lines_subsection_header.match(line)) is not None:
            self._lines_filename = match.group(1)
            return
        if (matches := _line_addr_pairs_findall.findall(line)) is not None:
            for line_no, addr in matches:
                self.lines.append(
                    LinesEntry(
                        filename=self._lines_filename,
                        line_no=int(line_no),
                        addr=int(addr, 16),
                    )
                )
    def _publics_section(self, line: str):
        """Match each line from PUBLICS and pull out the symbol information.
        These are MSVC mangled symbol names. String constants and vtable
        addresses can only be found here."""
        if (match := _publics_line_regex.match(line)) is not None:
            self.publics.append(
                PublicsEntry(
                    type=match.group("type"),
                    section=int(match.group("section"), 16),
                    offset=int(match.group("offset"), 16),
                    flags=int(match.group("flags"), 16),
                    name=match.group("name"),
                )
            )
    def _globals_section(self, line: str):
        """S_PROCREF may be useful later.
        Right now we just want S_GDATA32 symbols because it is the simplest
        way to access global variables."""
        if (match := _gdata32_regex.match(line)) is not None:
            self.globals.append(
                GdataEntry(
                    section=int(match.group("section"), 16),
                    offset=int(match.group("offset"), 16),
                    type=match.group("type"),
                    name=match.group("name"),
                )
            )
    def _symbols_section(self, line: str):
        """We are interested in S_GPROC32 symbols only."""
        if (match := _symbol_line_regex.match(line)) is not None:
            if match.group("type") == "S_GPROC32":
                self.symbols.append(
                    SymbolsEntry(
                        type=match.group("type"),
                        section=int(match.group("section"), 16),
                        offset=int(match.group("offset"), 16),
                        size=int(match.group("size"), 16),
                        name=match.group("name"),
                    )
                )
    def _section_contributions(self, line: str):
        """Gives the size of elements across all sections of the binary.
        This is the easiest way to get the data size for .data and .rdata
        members that do not have a primitive data type."""
        if (match := _section_contrib_regex.match(line)) is not None:
            self.sizerefs.append(
                SizeRefEntry(
                    section=int(match.group("section"), 16),
                    offset=int(match.group("offset"), 16),
                    size=int(match.group("size"), 16),
                )
            )
    def read_line(self, line: str):
        # Blank lines are there to help the reader; they have no context significance
        if line.strip() == "":
            return
        if (match := _section_change_regex.match(line)) is not None:
            self._section = match.group(1)
            return
        if self._section == "LINES":
            self._lines_section(line)
        elif self._section == "PUBLICS":
            self._publics_section(line)
        elif self._section == "SYMBOLS":
            self._symbols_section(line)
        elif self._section == "SECTION CONTRIBUTIONS":
            self._section_contributions(line)
        elif self._section == "GLOBALS":
            self._globals_section(line)
    def read_lines(self, lines: Iterable[str]):
        for line in lines:
            self.read_line(line)
--- a/tools/isledecomp/isledecomp/cvdump/runner.py
+++ b/tools/isledecomp/isledecomp/cvdump/runner.py
@ -0,0 +1,66 @@
 from os import name as os_name
 from enum import Enum
 from typing import List
 import subprocess
 from isledecomp.lib import lib_path_join
 from isledecomp.dir import winepath_unix_to_win
 from .parser import CvdumpParser
 class DumpOpt(Enum):
    """Kinds of information that can be requested from cvdump."""

    LINES = 0
    SYMBOLS = 1
    GLOBALS = 2
    PUBLICS = 3
    SECTION_CONTRIB = 4


 # Command line switch that requests each kind of information.
 cvdump_opt_map = {
    DumpOpt.LINES: "-l",
    DumpOpt.SYMBOLS: "-s",
    DumpOpt.GLOBALS: "-g",
    DumpOpt.PUBLICS: "-p",
    DumpOpt.SECTION_CONTRIB: "-seccontrib",
 }


 class Cvdump:
    """Fluent wrapper that runs cvdump.exe on a PDB file.

    Select what to dump by chaining lines(), symbols(), globals(), publics()
    and section_contributions(); then call run() to execute cvdump and parse
    its output."""

    def __init__(self, pdb: str) -> None:
        self._pdb: str = pdb
        # A set, so requesting the same option twice has no extra effect.
        self._options = set()

    def lines(self):
        """Request the line number dump (`-l`). Returns self for chaining."""
        self._options.add(DumpOpt.LINES)
        return self

    def symbols(self):
        """Request the symbols dump (`-s`). Returns self for chaining."""
        self._options.add(DumpOpt.SYMBOLS)
        return self

    def globals(self):
        """Request the globals dump (`-g`). Returns self for chaining."""
        self._options.add(DumpOpt.GLOBALS)
        return self

    def publics(self):
        """Request the publics dump (`-p`). Returns self for chaining."""
        self._options.add(DumpOpt.PUBLICS)
        return self

    def section_contributions(self):
        """Request the section contributions dump (`-seccontrib`).
        Returns self for chaining."""
        self._options.add(DumpOpt.SECTION_CONTRIB)
        return self

    def cmd_line(self) -> List[str]:
        """Build the argument list used to launch cvdump for this PDB.

        When os.name is "nt" cvdump.exe is invoked directly with the PDB path
        as given. Otherwise it is launched through wine and the PDB path is
        converted with winepath_unix_to_win."""
        cvdump_exe = lib_path_join("cvdump.exe")

        # Set iteration order is arbitrary; ordering by enum value keeps the
        # generated command line the same from run to run.
        selected = sorted(self._options, key=lambda opt: opt.value)
        flags = [cvdump_opt_map[opt] for opt in selected]

        if os_name == "nt":
            return [cvdump_exe, *flags, self._pdb]

        # wine takes the program to run as its first argument, so the
        # executable has to come before any of the cvdump switches.
        return ["wine", cvdump_exe, *flags, winepath_unix_to_win(self._pdb)]

    def run(self) -> "CvdumpParser":
        """Execute cvdump and return a parser loaded with its output.

        Raises subprocess.CalledProcessError if the command exits with a
        non-zero status."""
        p = CvdumpParser()
        call = self.cmd_line()
        # The output is split on CRLF before being handed to the parser.
        lines = subprocess.check_output(call).decode("utf-8").split("\r\n")
        p.read_lines(lines)
        return p
--- a/tools/isledecomp/isledecomp/syminfo.py
+++ b/tools/isledecomp/isledecomp/syminfo.py
@ -1,7 +1,6 @@
 import os
-import subprocess
+from isledecomp.dir import PathResolver
-from isledecomp.lib import lib_path_join
+from isledecomp.cvdump import Cvdump
 from isledecomp.dir import PathResolver, winepath_unix_to_win
 class RecompiledInfo:
@ -20,80 +19,54 @@ class SymInfo:
    def __init__(self, pdb, sym_recompfile, sym_logger, base_dir):
        self.logger = sym_logger
        path_resolver = PathResolver(base_dir)
        call = [lib_path_join("cvdump.exe"), "-l", "-s"]
        if os.name != "nt":
            # Run cvdump through wine and convert path to Windows-friendly wine path
            call.insert(0, "wine")
            call.append(winepath_unix_to_win(pdb))
        else:
            call.append(pdb)
        self.logger.info("Parsing %s ...", pdb)
        self.logger.debug("Command = %s", call)
        line_dump = subprocess.check_output(call).decode("utf-8").split("\r\n")
        current_section = None
        self.logger.debug("Parsing output of cvdump.exe ...")
-        for i, line in enumerate(line_dump):
+        cv = Cvdump(pdb).lines().symbols().publics().section_contributions().run()
            if line.startswith("***"):
                current_section = line[4:]
-            if current_section == "SYMBOLS" and "S_GPROC32" in line:
+        self.logger.debug("... Parsing output of cvdump.exe finished")
-                sym_section = int(line[21:25], 16)
+
-                sym_addr = int(line[26:34], 16)
+        contrib_dict = {(s.section, s.offset): s.size for s in cv.sizerefs}
        for pub in cv.publics:
            if (
                pub.type == "S_PUB32"
                and pub.name.startswith("_")
                and (pub.section, pub.offset) in contrib_dict
            ):
                size = contrib_dict[(pub.section, pub.offset)]
                info = RecompiledInfo()
-                info.addr = sym_addr + sym_recompfile.get_section_offset_by_index(
+                info.addr = sym_recompfile.get_abs_addr(pub.section, pub.offset)
                    sym_section
                )
                use_dbg_offs = False
                if use_dbg_offs:
                    debug_offs = line_dump[i + 2]
                    debug_start = int(debug_offs[22:30], 16)
                    debug_end = int(debug_offs[43:], 16)
                    info.start = debug_start
                    info.size = debug_end - debug_start
                else:
                info.start = 0
-                    info.size = int(line[41:49], 16)
+                info.size = size
                info.name = pub.name
                self.names[pub.name] = info
                self.funcs[pub.offset] = info
-                info.name = line[77:]
+        for proc in cv.symbols:
            if proc.type != "S_GPROC32":
                continue
-                self.names[info.name] = info
+            info = RecompiledInfo()
-                self.funcs[sym_addr] = info
+            info.addr = sym_recompfile.get_abs_addr(proc.section, proc.offset)
-            elif (
+
-                current_section == "LINES"
+            info.start = 0
-                and line.startswith("  ")
+            info.size = proc.size
-                and not line.startswith("   ")
+            info.name = proc.name
-            ):
+
-                sourcepath = line.split()[0]
+            self.names[proc.name] = info
            self.funcs[proc.offset] = info
        for sourcepath, line_no, offset in cv.lines:
            sourcepath = path_resolver.resolve_cvdump(sourcepath)
            if sourcepath not in self.lines:
                self.lines[sourcepath] = {}
-                j = i + 2
+            if line_no not in self.lines[sourcepath]:
-                while True:
+                self.lines[sourcepath][line_no] = offset
                    ll = line_dump[j].split()
                    if len(ll) == 0:
                        break
                    k = 0
                    while k < len(ll):
                        linenum = int(ll[k + 0])
                        address = int(ll[k + 1], 16)
                        if linenum not in self.lines[sourcepath]:
                            self.lines[sourcepath][linenum] = address
                        k += 2
                    j += 1
        self.logger.debug("... Parsing output of cvdump.exe finished")
    def get_recompiled_address(self, filename, line):
        recompiled_addr = None
		`@ -0,0 +1,2 @@`
							`from .parser import CvdumpParser`
							`from .runner import Cvdump`