diff --git a/CMakeLists.txt b/CMakeLists.txt index cdf37ff1..b4676767 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -294,7 +294,7 @@ if (MSVC) # game was originally built with) and tweaked slightly to produce more debugging info for reccmp. # They ensure a recompilation that can be byte/instruction accurate to the original binaries. if (ISLE_BUILD_APP) - target_compile_options(isle PRIVATE "/ML$<$:d>") + target_compile_options(isle PRIVATE "/MT$<$:d>") endif() target_compile_options(lego1 PRIVATE "/MT$<$:d>") diff --git a/LEGO1/library_msvc.h b/LEGO1/library_msvc.h new file mode 100644 index 00000000..d07ecef2 --- /dev/null +++ b/LEGO1/library_msvc.h @@ -0,0 +1,47 @@ +#ifdef 0 + +// LIBRARY: ISLE 0x402f80 +// LIBRARY: LEGO1 0x10086240 +// _malloc + +// LIBRARY: ISLE 0x402fa0 +// LIBRARY: LEGO1 0x10086260 +// _free + +// LIBRARY: ISLE 0x408220 +// LIBRARY: LEGO1 0x1008b400 +// _atol + +// LIBRARY: ISLE 0x4082d0 +// LIBRARY: LEGO1 0x1008b4b0 +// _atoi + +// LIBRARY: LEGO1 0x1008b4c0 +// _strtok + +// LIBRARY: ISLE 0x4085c0 +// LIBRARY: LEGO1 0x1008b5a0 +// _sprintf + +// LIBRARY: ISLE 0x4081e0 +// _srand + +// LIBRARY: ISLE 0x4081f0 +// LIBRARY: LEGO1 0x1008b640 +// _rand + +// entry +// LIBRARY: ISLE 0x4082e0 +// _WinMainCRTStartup + +// entry +// LIBRARY: LEGO1 0x1008c860 +// __DllMainCRTStartup@12 + +// LIBRARY: ISLE 0x409110 +// __mtinit + +// LIBRARY: ISLE 0x409190 +// __getptd + +#endif diff --git a/tools/isledecomp/isledecomp/bin.py b/tools/isledecomp/isledecomp/bin.py index 1aec9330..16f70f7a 100644 --- a/tools/isledecomp/isledecomp/bin.py +++ b/tools/isledecomp/isledecomp/bin.py @@ -1,4 +1,6 @@ import struct +from typing import List, Optional +from dataclasses import dataclass from collections import namedtuple @@ -33,44 +35,56 @@ PEHeader = namedtuple( ], ) -ImageSectionHeader = namedtuple( - "ImageSectionHeader", - [ - "Name", - "Misc", - "VirtualAddress", - "SizeOfRawData", - "PointerToRawData", - "PointerToRelocations", - "PointerToLineNumbers", - "NumberOfRelocations", - "NumberOfLineNumbers", - "Characteristics", - ], -) +@dataclass +class ImageSectionHeader: + # pylint: disable=too-many-instance-attributes + # Most attributes are unused, but this is the struct format + name: bytes + virtual_size: int + virtual_address: int + size_of_raw_data: int + pointer_to_raw_data: int + pointer_to_relocations: int + pointer_to_line_numbers: int + number_of_relocations: int + number_of_line_numbers: int + characteristics: int -def section_name_match(section, name): - return section.Name == struct.pack("8s", name.encode("ascii")) + def match_name(self, name: str) -> bool: + return self.name == struct.pack("8s", name.encode("ascii")) + def contains_vaddr(self, vaddr: int) -> bool: + ofs = vaddr - self.virtual_address + return 0 <= ofs < max(self.size_of_raw_data, self.virtual_size) -def section_contains_vaddr(section, imagebase, vaddr) -> bool: - debased = vaddr - imagebase - ofs = debased - section.VirtualAddress - return 0 <= ofs < section.SizeOfRawData + def addr_is_uninitialized(self, vaddr: int) -> bool: + """We cannot rely on the IMAGE_SCN_CNT_UNINITIALIZED_DATA flag (0x80) in + the characteristics field so instead we determine it this way.""" + if not self.contains_vaddr(vaddr): + return False + + # Should include the case where size_of_raw_data == 0, + # meaning the entire section is uninitialized + return (self.virtual_size > self.size_of_raw_data) and ( + vaddr - self.virtual_address >= self.size_of_raw_data + ) class Bin: """Parses a PE format EXE and allows reading data from a virtual address. Reference: https://learn.microsoft.com/en-us/windows/win32/debug/pe-format""" - def __init__(self, filename, logger=None): + # pylint: disable=too-many-instance-attributes + + def __init__(self, filename: str, logger=None) -> None: self.logger = logger self._debuglog(f'Parsing headers of "{filename}"... ') self.filename = filename self.file = None self.imagebase = None - self.sections = [] + self.entry = None + self.sections: List[ImageSectionHeader] = [] self.last_section = None self._relocated_addrs = set() @@ -95,12 +109,18 @@ class Bin: optional_hdr = self.file.read(pe_hdr.SizeOfOptionalHeader) (self.imagebase,) = struct.unpack(" List[int]: return sorted(self._relocated_addrs) def is_relocated_addr(self, vaddr) -> bool: @@ -165,27 +185,25 @@ class Bin: (relocated_addr,) = struct.unpack(" int: + def get_section_offset_by_index(self, index: int) -> int: """The symbols output from cvdump gives addresses in this format: AAAA.BBBBBBBB where A is the index (1-based) into the section table and B is the local offset. This will return the virtual address for the start of the section at the given index @@ -202,29 +220,33 @@ class Bin: """ section = self.sections[index - 1] - return self.imagebase + section.VirtualAddress + return section.virtual_address - def get_section_offset_by_name(self, name) -> int: + def get_section_offset_by_name(self, name: str) -> int: """Same as above, but use the section name as the lookup""" section = self._get_section_by_name(name) - return self.imagebase + section.VirtualAddress + return section.virtual_address - def get_raw_addr(self, vaddr) -> int: + def get_abs_addr(self, section: int, offset: int) -> int: + """Convenience function for converting section:offset pairs from cvdump + into an absolute vaddr.""" + return self.get_section_offset_by_index(section) + offset + + def get_raw_addr(self, vaddr: int) -> int: """Returns the raw offset in the PE binary for the given virtual address.""" self._set_section_for_vaddr(vaddr) return ( vaddr - - self.imagebase - - self.last_section.VirtualAddress - + self.last_section.PointerToRawData + - self.last_section.virtual_address + + self.last_section.pointer_to_raw_data ) - def is_valid_vaddr(self, vaddr) -> bool: + def is_valid_vaddr(self, vaddr: int) -> bool: """Does this virtual address point to anything in the exe?""" section = next( filter( - lambda section: section_contains_vaddr(section, self.imagebase, vaddr), + lambda section: section.contains_vaddr(vaddr), self.sections, ), None, @@ -232,9 +254,14 @@ class Bin: return section is not None - def read(self, offset, size): + def read(self, offset: int, size: int) -> Optional[bytes]: + """Read (at most) the given number of bytes at the given virtual address. + If we return None, the given address points to uninitialized data.""" self._set_section_for_vaddr(offset) + if self.last_section.addr_is_uninitialized(offset): + return None + raw_addr = self.get_raw_addr(offset) self.file.seek(raw_addr) @@ -242,8 +269,8 @@ class Bin: # Reading off the end will most likely misrepresent the virtual addressing. _size = min( size, - self.last_section.PointerToRawData - + self.last_section.SizeOfRawData + self.last_section.pointer_to_raw_data + + self.last_section.size_of_raw_data - raw_addr, ) return self.file.read(_size) diff --git a/tools/isledecomp/isledecomp/cvdump/__init__.py b/tools/isledecomp/isledecomp/cvdump/__init__.py new file mode 100644 index 00000000..e9d66298 --- /dev/null +++ b/tools/isledecomp/isledecomp/cvdump/__init__.py @@ -0,0 +1,2 @@ +from .parser import CvdumpParser +from .runner import Cvdump diff --git a/tools/isledecomp/isledecomp/cvdump/parser.py b/tools/isledecomp/isledecomp/cvdump/parser.py new file mode 100644 index 00000000..ddc2e5f7 --- /dev/null +++ b/tools/isledecomp/isledecomp/cvdump/parser.py @@ -0,0 +1,163 @@ +import re +from typing import Iterable +from collections import namedtuple + +# e.g. `*** PUBLICS` +_section_change_regex = re.compile(r"^\*\*\* (?P
[A-Z/ ]+)") + +# e.g. ` 27 00034EC0 28 00034EE2 29 00034EE7 30 00034EF4` +_line_addr_pairs_findall = re.compile(r"\s+(?P\d+) (?P[A-F0-9]{8})") + +# We assume no spaces in the file name +# e.g. ` Z:\lego-island\isle\LEGO1\viewmanager\viewroi.cpp (None), 0001:00034E90-00034E97, line/addr pairs = 2` +_lines_subsection_header = re.compile( + r"^\s*(?P\S+).*?, (?P
[A-F0-9]{4}):(?P[A-F0-9]{8})-(?P[A-F0-9]{8}), line/addr pairs = (?P\d+)" +) + +# e.g. `S_PUB32: [0001:0003FF60], Flags: 00000000, __read` +_publics_line_regex = re.compile( + r"^(?P\w+): \[(?P
\w{4}):(?P\w{8})], Flags: (?P\w{8}), (?P\S+)" +) + +# e.g. `(00008C) S_GPROC32: [0001:00034E90], Cb: 00000007, Type: 0x1024, ViewROI::IntrinsicImportance` +_symbol_line_regex = re.compile( + r"\(\w+\) (?P\S+): \[(?P
\w{4}):(?P\w{8})\], Cb: (?P\w+), Type:\s+\S+, (?P.+)" +) + +# e.g. ` Debug start: 00000008, Debug end: 0000016E` +_gproc_debug_regex = re.compile( + r"\s*Debug start: (?P\w{8}), Debug end: (?P\w{8})" +) + +# e.g. ` 00DA 0001:00000000 00000073 60501020` +_section_contrib_regex = re.compile( + r"\s*(?P\w{4}) (?P
\w{4}):(?P\w{8}) (?P\w{8}) (?P\w{8})" +) + +# e.g. `S_GDATA32: [0003:000004A4], Type: T_32PRCHAR(0470), g_set` +_gdata32_regex = re.compile( + r"S_GDATA32: \[(?P
\w{4}):(?P\w{8})\], Type:\s*(?P\S+), (?P\S+)" +) + + +LinesEntry = namedtuple("LinesEntry", "filename line_no addr") +PublicsEntry = namedtuple("PublicsEntry", "type section offset flags name") +SymbolsEntry = namedtuple("SymbolsEntry", "type section offset size name") +SizeRefEntry = namedtuple("SizeRefEntry", "section offset size") +GdataEntry = namedtuple("GdataEntry", "section offset type name") + + +class CvdumpParser: + def __init__(self) -> None: + self._section: str = "" + self._lines_filename: str = "" + + self.lines = [] + self.publics = [] + self.symbols = [] + self.sizerefs = [] + self.globals = [] + + def _lines_section(self, line: str): + """Parsing entries from the LINES section. We only care about the pairs of + line_number and address and the subsection header to indicate which code file + we are in.""" + + # Subheader indicates a new function and possibly a new code filename. + if (match := _lines_subsection_header.match(line)) is not None: + self._lines_filename = match.group(1) + return + + if (matches := _line_addr_pairs_findall.findall(line)) is not None: + for line_no, addr in matches: + self.lines.append( + LinesEntry( + filename=self._lines_filename, + line_no=int(line_no), + addr=int(addr, 16), + ) + ) + + def _publics_section(self, line: str): + """Match each line from PUBLICS and pull out the symbol information. + These are MSVC mangled symbol names. String constants and vtable + addresses can only be found here.""" + if (match := _publics_line_regex.match(line)) is not None: + self.publics.append( + PublicsEntry( + type=match.group("type"), + section=int(match.group("section"), 16), + offset=int(match.group("offset"), 16), + flags=int(match.group("flags"), 16), + name=match.group("name"), + ) + ) + + def _globals_section(self, line: str): + """S_PROCREF may be useful later. + Right now we just want S_GDATA32 symbols because it is the simplest + way to access global variables.""" + if (match := _gdata32_regex.match(line)) is not None: + self.globals.append( + GdataEntry( + section=int(match.group("section"), 16), + offset=int(match.group("offset"), 16), + type=match.group("type"), + name=match.group("name"), + ) + ) + + def _symbols_section(self, line: str): + """We are interested in S_GPROC32 symbols only.""" + if (match := _symbol_line_regex.match(line)) is not None: + if match.group("type") == "S_GPROC32": + self.symbols.append( + SymbolsEntry( + type=match.group("type"), + section=int(match.group("section"), 16), + offset=int(match.group("offset"), 16), + size=int(match.group("size"), 16), + name=match.group("name"), + ) + ) + + def _section_contributions(self, line: str): + """Gives the size of elements across all sections of the binary. + This is the easiest way to get the data size for .data and .rdata + members that do not have a primitive data type.""" + if (match := _section_contrib_regex.match(line)) is not None: + self.sizerefs.append( + SizeRefEntry( + section=int(match.group("section"), 16), + offset=int(match.group("offset"), 16), + size=int(match.group("size"), 16), + ) + ) + + def read_line(self, line: str): + # Blank lines are there to help the reader; they have no context significance + if line.strip() == "": + return + + if (match := _section_change_regex.match(line)) is not None: + self._section = match.group(1) + return + + if self._section == "LINES": + self._lines_section(line) + + elif self._section == "PUBLICS": + self._publics_section(line) + + elif self._section == "SYMBOLS": + self._symbols_section(line) + + elif self._section == "SECTION CONTRIBUTIONS": + self._section_contributions(line) + + elif self._section == "GLOBALS": + self._globals_section(line) + + def read_lines(self, lines: Iterable[str]): + for line in lines: + self.read_line(line) diff --git a/tools/isledecomp/isledecomp/cvdump/runner.py b/tools/isledecomp/isledecomp/cvdump/runner.py new file mode 100644 index 00000000..02083c12 --- /dev/null +++ b/tools/isledecomp/isledecomp/cvdump/runner.py @@ -0,0 +1,66 @@ +from os import name as os_name +from enum import Enum +from typing import List +import subprocess +from isledecomp.lib import lib_path_join +from isledecomp.dir import winepath_unix_to_win +from .parser import CvdumpParser + + +class DumpOpt(Enum): + LINES = 0 + SYMBOLS = 1 + GLOBALS = 2 + PUBLICS = 3 + SECTION_CONTRIB = 4 + + +cvdump_opt_map = { + DumpOpt.LINES: "-l", + DumpOpt.SYMBOLS: "-s", + DumpOpt.GLOBALS: "-g", + DumpOpt.PUBLICS: "-p", + DumpOpt.SECTION_CONTRIB: "-seccontrib", +} + + +class Cvdump: + def __init__(self, pdb: str) -> None: + self._pdb: str = pdb + self._options = set() + + def lines(self): + self._options.add(DumpOpt.LINES) + return self + + def symbols(self): + self._options.add(DumpOpt.SYMBOLS) + return self + + def globals(self): + self._options.add(DumpOpt.GLOBALS) + return self + + def publics(self): + self._options.add(DumpOpt.PUBLICS) + return self + + def section_contributions(self): + self._options.add(DumpOpt.SECTION_CONTRIB) + return self + + def cmd_line(self) -> List[str]: + cvdump_exe = lib_path_join("cvdump.exe") + flags = [cvdump_opt_map[opt] for opt in self._options] + + if os_name == "nt": + return [cvdump_exe, *flags, self._pdb] + + return ["wine", *flags, cvdump_exe, winepath_unix_to_win(self._pdb)] + + def run(self) -> CvdumpParser: + p = CvdumpParser() + call = self.cmd_line() + lines = subprocess.check_output(call).decode("utf-8").split("\r\n") + p.read_lines(lines) + return p diff --git a/tools/isledecomp/isledecomp/syminfo.py b/tools/isledecomp/isledecomp/syminfo.py index e7ab0df4..1ecf0010 100644 --- a/tools/isledecomp/isledecomp/syminfo.py +++ b/tools/isledecomp/isledecomp/syminfo.py @@ -1,7 +1,6 @@ import os -import subprocess -from isledecomp.lib import lib_path_join -from isledecomp.dir import PathResolver, winepath_unix_to_win +from isledecomp.dir import PathResolver +from isledecomp.cvdump import Cvdump class RecompiledInfo: @@ -20,81 +19,55 @@ class SymInfo: def __init__(self, pdb, sym_recompfile, sym_logger, base_dir): self.logger = sym_logger path_resolver = PathResolver(base_dir) - call = [lib_path_join("cvdump.exe"), "-l", "-s"] - - if os.name != "nt": - # Run cvdump through wine and convert path to Windows-friendly wine path - call.insert(0, "wine") - call.append(winepath_unix_to_win(pdb)) - else: - call.append(pdb) self.logger.info("Parsing %s ...", pdb) - self.logger.debug("Command = %s", call) - line_dump = subprocess.check_output(call).decode("utf-8").split("\r\n") - - current_section = None - self.logger.debug("Parsing output of cvdump.exe ...") - for i, line in enumerate(line_dump): - if line.startswith("***"): - current_section = line[4:] - - if current_section == "SYMBOLS" and "S_GPROC32" in line: - sym_section = int(line[21:25], 16) - sym_addr = int(line[26:34], 16) - - info = RecompiledInfo() - info.addr = sym_addr + sym_recompfile.get_section_offset_by_index( - sym_section - ) - - use_dbg_offs = False - if use_dbg_offs: - debug_offs = line_dump[i + 2] - debug_start = int(debug_offs[22:30], 16) - debug_end = int(debug_offs[43:], 16) - - info.start = debug_start - info.size = debug_end - debug_start - else: - info.start = 0 - info.size = int(line[41:49], 16) - - info.name = line[77:] - - self.names[info.name] = info - self.funcs[sym_addr] = info - elif ( - current_section == "LINES" - and line.startswith(" ") - and not line.startswith(" ") - ): - sourcepath = line.split()[0] - sourcepath = path_resolver.resolve_cvdump(sourcepath) - - if sourcepath not in self.lines: - self.lines[sourcepath] = {} - - j = i + 2 - while True: - ll = line_dump[j].split() - if len(ll) == 0: - break - - k = 0 - while k < len(ll): - linenum = int(ll[k + 0]) - address = int(ll[k + 1], 16) - if linenum not in self.lines[sourcepath]: - self.lines[sourcepath][linenum] = address - k += 2 - - j += 1 + cv = Cvdump(pdb).lines().symbols().publics().section_contributions().run() self.logger.debug("... Parsing output of cvdump.exe finished") + contrib_dict = {(s.section, s.offset): s.size for s in cv.sizerefs} + for pub in cv.publics: + if ( + pub.type == "S_PUB32" + and pub.name.startswith("_") + and (pub.section, pub.offset) in contrib_dict + ): + size = contrib_dict[(pub.section, pub.offset)] + + info = RecompiledInfo() + info.addr = sym_recompfile.get_abs_addr(pub.section, pub.offset) + + info.start = 0 + info.size = size + info.name = pub.name + self.names[pub.name] = info + self.funcs[pub.offset] = info + + for proc in cv.symbols: + if proc.type != "S_GPROC32": + continue + + info = RecompiledInfo() + info.addr = sym_recompfile.get_abs_addr(proc.section, proc.offset) + + info.start = 0 + info.size = proc.size + info.name = proc.name + + self.names[proc.name] = info + self.funcs[proc.offset] = info + + for sourcepath, line_no, offset in cv.lines: + sourcepath = path_resolver.resolve_cvdump(sourcepath) + + if sourcepath not in self.lines: + self.lines[sourcepath] = {} + + if line_no not in self.lines[sourcepath]: + self.lines[sourcepath][line_no] = offset + def get_recompiled_address(self, filename, line): recompiled_addr = None