From 3963967ebc4bf0b61b1d5fea26680cbc56fe29b1 Mon Sep 17 00:00:00 2001
From: Yuto Takano <yuto.takano@arm.com>
Date: Thu, 5 Aug 2021 19:47:48 +0100
Subject: [PATCH] Restructure check-names.py with more verbose error messages

Signed-off-by: Yuto Takano <yuto.takano@arm.com>
---
 tests/scripts/check-names.py | 596 +++++++++++++++++++++++++----------
 1 file changed, 429 insertions(+), 167 deletions(-)

diff --git a/tests/scripts/check-names.py b/tests/scripts/check-names.py
index 8a8e2dbbe..431bcbb5c 100755
--- a/tests/scripts/check-names.py
+++ b/tests/scripts/check-names.py
@@ -1,14 +1,27 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python3
+#
+# Copyright The Mbed TLS Contributors
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """
-This file is part of Mbed TLS (https://tls.mbed.org)
-
-Copyright (c) 2018, Arm Limited, All Rights Reserved
-
-Purpose
-
-This script confirms that the naming of all symbols and identifiers in mbed
-TLS are consistent with the house style and are also self-consistent.
+This script confirms that the naming of all symbols and identifiers in Mbed TLS
+are consistent with the house style and are also self-consistent.
 """
+
+import argparse
+import textwrap
 import os
 import sys
 import traceback
@@ -17,47 +30,89 @@ import shutil
 import subprocess
 import logging
 
+# Naming patterns to check against
+MACRO_PATTERN = r"^MBEDTLS_[0-9A-Z_]*[0-9A-Z]$|^YOTTA_[0-9A-Z_]*[0-9A-Z]$"
+IDENTIFIER_PATTERN = r"^mbedtls_[0-9a-z_]*[0-9a-z]$"
+
+class Match(object):
+    def __init__(self, filename, line, pos, name):
+        self.filename = filename
+        self.line = line
+        self.pos = pos
+        self.name = name
+    
+    def __str__(self):
+        return self.name
+
+class Problem(object):
+    def __init__(self):
+        self.textwrapper = textwrap.TextWrapper()
+        self.textwrapper.initial_indent = "  * "
+        self.textwrapper.subsequent_indent = "    "
+
+class SymbolNotInHeader(Problem):
+    def __init__(self, symbol_name):
+        self.symbol_name = symbol_name
+        Problem.__init__(self)
+
+    def __str__(self):
+        return self.textwrapper.fill(
+            "'{0}' was found as an available symbol in the output of nm, "
+            "however it was not declared in any header files."
+            .format(self.symbol_name))
+
+class PatternMismatch(Problem):
+    def __init__(self, pattern, match):
+        self.pattern = pattern
+        self.match = match
+        Problem.__init__(self)
+    
+    def __str__(self):
+        return self.textwrapper.fill(
+            "{0}: '{1}' does not match the required pattern '{2}'."
+            .format(self.match.filename, self.match.name, self.pattern))
+
+class Typo(Problem):
+    def __init__(self, match):
+        self.match = match
+        Problem.__init__(self)
+    
+    def __str__(self):
+        return self.textwrapper.fill(
+            "{0}: '{1}' looks like a typo. It was not found in any macros or "
+            "any enums. If this is not a typo, put //no-check-names after it."
+            .format(self.match.filename, self.match.name))
 
 class NameCheck(object):
     def __init__(self):
         self.log = None
-        self.setup_logger()
         self.check_repo_path()
         self.return_code = 0
         self.excluded_files = ["compat-1.3.h"]
-        self.header_files = self.get_files(os.path.join("include", "mbedtls"))
-        self.library_files = self.get_files("library")
-        self.macros = []
-        self.MBED_names = []
-        self.enum_consts = []
-        self.identifiers = []
-        self.actual_macros = []
-        self.symbols = []
-        self.macro_pattern = r"#define (?P<macro>\w+)"
-        self.MBED_pattern = r"\bMBED.+?_[A-Z0-9_]*"
-        self.symbol_pattern = r"^\S+( [0-9A-Fa-f]+)* . _*(?P<symbol>\w+)"
-        self.identifier_check_pattern = r"^mbedtls_[0-9a-z_]*[0-9a-z]$"
-        self.decls_pattern = (
-            r"^(extern \"C\"|(typedef )?(struct|enum)( {)?$|};?$|$)"
-        )
-        self.macro_const_check_pattern = (
-            r"^MBEDTLS_[0-9A-Z_]*[0-9A-Z]$|^YOTTA_[0-9A-Z_]*[0-9A-Z]$"
-        )
         self.typo_check_pattern = r"XXX|__|_$|^MBEDTLS_.*CONFIG_FILE$"
-        self.non_macros = (
-            "asm", "inline", "EMIT", "_CRT_SECURE_NO_DEPRECATE", "MULADDC_"
-        )
 
     def set_return_code(self, return_code):
         if return_code > self.return_code:
             self.return_code = return_code
 
-    def setup_logger(self):
+    def setup_logger(self, verbose=False):
+        """
+        Set up a logger and set the change the default logging level from
+        WARNING to INFO. Loggers are better than print statements since their 
+        verbosity can be controlled.
+        """
         self.log = logging.getLogger()
-        self.log.setLevel(logging.INFO)
+        if verbose:
+            self.log.setLevel(logging.DEBUG)
+        else:
+            self.log.setLevel(logging.INFO)
         self.log.addHandler(logging.StreamHandler())
 
     def check_repo_path(self):
+        """
+        Check that the current working directory is the project root, and throw
+        an exception if not.
+        """
         current_dir = os.path.realpath('.')
         root_dir = os.path.dirname(os.path.dirname(
             os.path.dirname(os.path.realpath(__file__))))
@@ -73,32 +128,81 @@ class NameCheck(object):
                     filenames.append(os.path.join(root, filename))
         return filenames
 
-    def get_macros(self):
-        for header_file in self.header_files:
+    def parse_macros(self, header_files):
+        """
+        Parse all macros defined by #define preprocessor directives.
+
+        Args:
+            header_files: A list of filepaths to look through.
+        
+        Returns:
+            A list of Match objects for the macros.
+        """
+        MACRO_REGEX = r"#define (?P<macro>\w+)"
+        NON_MACROS = (
+            "asm", "inline", "EMIT", "_CRT_SECURE_NO_DEPRECATE", "MULADDC_"
+        )
+
+        macros = []
+
+        for header_file in header_files:
             with open(header_file, "r") as header:
-                for line in iter(header.readline, ""):
-                    macro = re.search(self.macro_pattern, line)
-                    if (macro and not
-                            macro.group("macro").startswith(self.non_macros)):
-                        self.macros.append((macro.group("macro"), header_file))
-        self.macros = list(set(self.macros))
+                for line in header:
+                    macro = re.search(MACRO_REGEX, line)
+                    if (macro and
+                        not macro.group("macro").startswith(NON_MACROS)):
+                        macros.append(Match(
+                            header_file,
+                            line,
+                            (macro.start(), macro.end()),
+                            macro.group("macro")))
 
-    def get_MBED_names(self):
-        for file_group in [self.header_files, self.library_files]:
-            for filename in file_group:
-                with open(filename, "r") as f:
-                    for line in iter(f.readline, ""):
-                        mbed_names = re.findall(self.MBED_pattern, line)
-                        if mbed_names:
-                            for name in mbed_names:
-                                self.MBED_names.append((name, filename))
-        self.MBED_names = list(set(self.MBED_names))
+        return macros
 
-    def get_enum_consts(self):
-        for header_file in self.header_files:
+    def parse_MBED_names(self, header_files, library_files):
+        """
+        Parse all words in the file that begin with MBED. Includes macros.
+
+        Args:
+            header_files: A list of filepaths to look through.
+            library_files: A list of filepaths to look through.
+        
+        Returns:
+            A list of Match objects for words beginning with MBED.
+        """
+        MBED_names = []
+        
+        for filename in header_files + library_files:
+            with open(filename, "r") as fp:
+                for line in fp:
+                    for name in re.finditer(r"\bMBED.+?_[A-Z0-9_]*", line):
+                        MBED_names.append(Match(
+                            filename,
+                            line,
+                            (name.start(), name.end()),
+                            name.group(0)
+                            ))
+
+        return MBED_names
+
+    def parse_enum_consts(self, header_files):
+        """
+        Parse all enum value constants that are declared.
+
+        Args:
+            header_files: A list of filepaths to look through.
+
+        Returns:
+            A list of (enum constants, containing filename).
+        """
+
+        enum_consts = []
+
+        for header_file in header_files:
+            # Emulate a finite state machine to parse enum declarations.
             state = 0
             with open(header_file, "r") as header:
-                for line in iter(header.readline, ""):
+                for line in header:
                     if state is 0 and re.match(r"^(typedef )?enum {", line):
                         state = 1
                     elif state is 0 and re.match(r"^(typedef )?enum", line):
@@ -110,156 +214,314 @@ class NameCheck(object):
                     elif state is 1:
                         enum_const = re.match(r"^\s*(?P<enum_const>\w+)", line)
                         if enum_const:
-                            self.enum_consts.append(
-                                (enum_const.group("enum_const"), header_file)
-                            )
+                            enum_consts.append(Match(
+                                header_file,
+                                line,
+                                (enum_const.start(), enum_const.end()),
+                                enum_const.group("enum_const")))
+        
+        return enum_consts
 
-    def line_contains_declaration(self, line):
-        return (re.match(r"^[^ /#{]", line)
-                and not re.match(self.decls_pattern, line))
+    def parse_identifiers(self, header_files):
+        """
+        Parse all lines of a header where a function identifier is declared,
+        based on some huersitics. Assumes every line that is not a comment or a
+        preprocessor directive contains some identifier.
 
-    def get_identifier_from_declaration(self, declaration):
-        identifier = re.search(
-            r"([a-zA-Z_][a-zA-Z0-9_]*)\(|"
-            r"\(\*(.+)\)\(|"
-            r"(\w+)\W*$",
-            declaration
+        Args:
+            header_files: A list of filepaths to look through.
+        
+        Returns:
+            A list of (identifier, containing filename)
+        """
+        EXCLUDED_DECLARATIONS = (
+            r"^(extern \"C\"|(typedef )?(struct|enum)( {)?$|};?$|$)"
         )
-        if identifier:
-            for group in identifier.groups():
-                if group:
-                    return group
-        self.log.error(declaration)
-        raise Exception("No identifier found")
 
-    def get_identifiers(self):
-        for header_file in self.header_files:
+        identifiers = []
+
+        for header_file in header_files:
             with open(header_file, "r") as header:
-                for line in iter(header.readline, ""):
-                    if self.line_contains_declaration(line):
-                        self.identifiers.append(
-                            (self.get_identifier_from_declaration(line),
-                             header_file)
-                        )
+                in_block_comment = False
 
-    def get_symbols(self):
+                for line in header:
+                    # Skip parsing this line if it begins or ends a block
+                    # comment, and set the state machine's state.
+                    if re.search(r"/\*", line):
+                        in_block_comment = True
+                        continue
+                    elif re.search(r"\*/", line) and in_block_comment:
+                        in_block_comment = False
+                        continue
+                    
+                    # Skip parsing this line if it's a line comment, or if it
+                    # begins with a preprocessor directive
+                    if in_block_comment or re.match(r"(//|#)", line):
+                        continue
+
+                    if re.match(EXCLUDED_DECLARATIONS, line):
+                        continue
+                
+                    identifier = re.search(
+                        # Matches: "mbedtls_aes_init("
+                        r"([a-zA-Z_][a-zA-Z0-9_]*)\(|"
+                        # Matches: "(*f_rng)("
+                        r"\(\*(.+)\)\(|"
+                        # TODO: unknown purpose
+                        r"(\w+)\W*$",
+                        line
+                    )
+
+                    if identifier:
+                        for group in identifier.groups():
+                            if group:
+                                identifiers.append(Match(
+                                    header_file,
+                                    line,
+                                    (identifier.start(), identifier.end()),
+                                    identifier.group(0)))
+
+        return identifiers
+
+    def parse_symbols(self):
+        """
+        Compile the Mbed TLS libraries, and parse the TLS, Crypto, and x509
+        object files using nm to retrieve the list of referenced symbols.
+        
+        Returns:
+            A list of unique symbols defined and used in the libraries.
+        """
+
+        symbols = []
+
+        # Back up the config and atomically compile with the full configratuion.
+        shutil.copy("include/mbedtls/mbedtls_config.h",
+                    "include/mbedtls/mbedtls_config.h.bak")        
         try:
-            shutil.copy("include/mbedtls/config.h",
-                        "include/mbedtls/config.h.bak")
-            subprocess.check_output(
+            subprocess.run(
                 ["perl", "scripts/config.pl", "full"],
-                universal_newlines=True,
+                encoding=sys.stdout.encoding,
+                check=True
             )
             my_environment = os.environ.copy()
             my_environment["CFLAGS"] = "-fno-asynchronous-unwind-tables"
-            subprocess.check_output(
+            subprocess.run(
                 ["make", "clean", "lib"],
                 env=my_environment,
-                universal_newlines=True,
+                encoding=sys.stdout.encoding,
+                stdout=subprocess.PIPE,
                 stderr=subprocess.STDOUT,
+                check=True
             )
-            shutil.move("include/mbedtls/config.h.bak",
-                        "include/mbedtls/config.h")
-            nm_output = ""
-            for lib in ["library/libmbedcrypto.a",
-                        "library/libmbedtls.a",
-                        "library/libmbedx509.a"]:
-                nm_output += subprocess.check_output(
-                    ["nm", "-og", lib],
-                    universal_newlines=True,
-                    stderr=subprocess.STDOUT,
-                )
-            for line in nm_output.splitlines():
-                if not re.match(r"^\S+: +U |^$|^\S+:$", line):
-                    symbol = re.match(self.symbol_pattern, line)
-                    if symbol:
-                        self.symbols.append(symbol.group('symbol'))
-                    else:
-                        self.log.error(line)
-            self.symbols.sort()
-            subprocess.check_output(
+
+            # Perform object file analysis using nm
+            symbols = self.parse_symbols_from_nm(
+                ["library/libmbedcrypto.a",
+                "library/libmbedtls.a",
+                "library/libmbedx509.a"])
+
+            symbols.sort()
+
+            subprocess.run(
                 ["make", "clean"],
-                universal_newlines=True,
+                encoding=sys.stdout.encoding,
+                check=True
             )
         except subprocess.CalledProcessError as error:
             self.log.error(error)
             self.set_return_code(2)
+        finally:
+            shutil.move("include/mbedtls/mbedtls_config.h.bak",
+                        "include/mbedtls/mbedtls_config.h")
+
+        return symbols
+
+    def parse_symbols_from_nm(self, object_files):
+        """
+        Run nm to retrieve the list of referenced symbols in each object file.
+        Does not return the position data since it is of no use.
+
+        Returns:
+            A list of unique symbols defined and used in any of the object files.
+        """
+        UNDEFINED_SYMBOL = r"^\S+: +U |^$|^\S+:$"
+        VALID_SYMBOL = r"^\S+( [0-9A-Fa-f]+)* . _*(?P<symbol>\w+)"
+
+        symbols = []
+
+        nm_output = ""
+        for lib in object_files:
+            nm_output += subprocess.run(
+                ["nm", "-og", lib],
+                encoding=sys.stdout.encoding,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                check=True
+            ).stdout
+        for line in nm_output.splitlines():
+            if not re.match(UNDEFINED_SYMBOL, line):
+                symbol = re.match(VALID_SYMBOL, line)
+                if symbol:
+                    symbols.append(symbol.group('symbol'))
+                else:
+                    self.log.error(line)
+        
+        return symbols
+
+    def parse_names_in_source(self):
+        """
+        Calls each parsing function to retrieve various elements of the code,
+        together with their source location. Puts the parsed values in the
+        internal variable self.parse_result.
+        """
+        self.log.info("Parsing source code...")
+
+        m_headers = self.get_files(os.path.join("include", "mbedtls"))
+        libraries = self.get_files("library")
+        
+        all_macros = self.parse_macros(m_headers)
+        enum_consts = self.parse_enum_consts(m_headers)
+        identifiers = self.parse_identifiers(m_headers)
+        symbols = self.parse_symbols()
+        mbed_names = self.parse_MBED_names(m_headers, libraries)
+        
+        # Remove identifier macros like mbedtls_printf or mbedtls_calloc
+        macros = list(set(all_macros) - set(identifiers))
+
+        self.log.info("Found:")
+        self.log.info("  {} Macros".format(len(all_macros)))
+        self.log.info("  {} Enum Constants".format(len(enum_consts)))
+        self.log.info("  {} Identifiers".format(len(identifiers)))
+        self.log.info("  {} Exported Symbols".format(len(symbols)))
+        self.log.info("Analysing...")
+
+        self.parse_result = {
+            "macros": macros,
+            "enum_consts": enum_consts,
+            "identifiers": identifiers,
+            "symbols": symbols,
+            "mbed_names": mbed_names
+        }
+
+    def perform_checks(self):
+        """
+        Perform each check in order, output its PASS/FAIL status. Maintain an
+        overall test status, and output that at the end.
+        """
+        problems = 0
+
+        problems += self.check_symbols_declared_in_header()
+
+        pattern_checks = [
+            ("macros", MACRO_PATTERN),
+            ("enum_consts", MACRO_PATTERN),
+            ("identifiers", IDENTIFIER_PATTERN)]
+        for group, check_pattern in pattern_checks:
+            problems += self.check_match_pattern(group, check_pattern)
+
+        problems += self.check_for_typos()
+
+        self.log.info("=============")
+        if problems > 0:
+            self.log.info("FAIL: {0} problem(s) to fix".format(str(problems)))
+        else:
+            self.log.info("PASS")
 
     def check_symbols_declared_in_header(self):
-        identifiers = [x[0] for x in self.identifiers]
-        bad_names = []
-        for symbol in self.symbols:
-            if symbol not in identifiers:
-                bad_names.append(symbol)
-        if bad_names:
-            self.set_return_code(1)
-            self.log.info("Names of identifiers: FAIL")
-            for name in bad_names:
-                self.log.info(name)
-        else:
-            self.log.info("Names of identifiers: PASS")
+        """
+        Perform a check that all detected symbols in the library object files
+        are properly declared in headers.
+        
+        Outputs to the logger the PASS/FAIL status, followed by the location of
+        problems.
 
-    def check_group(self, group_to_check, check_pattern, name):
-        bad_names = []
-        for item in group_to_check:
-            if not re.match(check_pattern, item[0]):
-                bad_names.append("{} - {}".format(item[0], item[1]))
-        if bad_names:
+        Returns the number of problems that needs fixing.
+        """
+        problems = []
+        for symbol in self.parse_result["symbols"]:
+            found_symbol_declared = False
+            for identifier_match in self.parse_result["identifiers"]:
+                if symbol == identifier_match.name:
+                    found_symbol_declared = True
+                    break
+            
+            if not found_symbol_declared:
+                problems.append(SymbolNotInHeader(symbol))
+
+        if problems:
             self.set_return_code(1)
-            self.log.info("Names of {}: FAIL".format(name))
-            for name in bad_names:
-                self.log.info(name)
+            self.log.info("All symbols in header: FAIL")
+            for problem in problems:
+                self.log.info(str(problem) + "\n")
         else:
-            self.log.info("Names of {}: PASS".format(name))
+            self.log.info("All symbols in header: PASS")
+        
+        return len(problems)
+
+    def check_match_pattern(self, group_to_check, check_pattern):
+        problems = []
+        for item_match in self.parse_result[group_to_check]:
+            if not re.match(check_pattern, item_match.name):
+                problems.append(PatternMismatch(check_pattern, item_match))
+        
+        if problems:
+            self.set_return_code(1)
+            self.log.info("Naming patterns of {}: FAIL".format(group_to_check))
+            for problem in problems:
+                self.log.info(str(problem) + "\n")
+        else:
+            self.log.info("Naming patterns of {}: PASS".format(group_to_check))
+        
+        return len(problems)
 
     def check_for_typos(self):
-        bad_names = []
-        all_caps_names = list(set(
-            [x[0] for x in self.actual_macros + self.enum_consts]
+        problems = []
+        all_caps_names = list(set([
+            match.name for match
+            in self.parse_result["macros"] + self.parse_result["enum_consts"]]
         ))
-        for name in self.MBED_names:
-            if name[0] not in all_caps_names:
-                if not re.search(self.typo_check_pattern, name[0]):
-                    bad_names.append("{} - {}".format(name[0], name[1]))
-        if bad_names:
+
+        TYPO_EXCLUSION = r"XXX|__|_$|^MBEDTLS_.*CONFIG_FILE$"
+
+        for name_match in self.parse_result["mbed_names"]:
+            if name_match.name not in all_caps_names:
+                if not re.search(TYPO_EXCLUSION, name_match.name):
+                    problems.append(Typo(name_match))
+
+        if problems:
             self.set_return_code(1)
             self.log.info("Likely typos: FAIL")
-            for name in bad_names:
-                self.log.info(name)
+            for problem in problems:
+                self.log.info(str(problem) + "\n")
         else:
             self.log.info("Likely typos: PASS")
+        
+        return len(problems)
 
-    def get_names_from_source_code(self):
-        self.log.info("Analysing source code...")
-        self.get_macros()
-        self.get_enum_consts()
-        self.get_identifiers()
-        self.get_symbols()
-        self.get_MBED_names()
-        self.actual_macros = list(set(self.macros) - set(self.identifiers))
-        self.log.info("{} macros".format(len(self.macros)))
-        self.log.info("{} enum-consts".format(len(self.enum_consts)))
-        self.log.info("{} identifiers".format(len(self.identifiers)))
-        self.log.info("{} exported-symbols".format(len(self.symbols)))
+def main():
+    """
+    Main function, parses command-line arguments.
+    """
 
-    def check_names(self):
-        self.check_symbols_declared_in_header()
-        for group, check_pattern, name in [
-                (self.actual_macros, self.macro_const_check_pattern,
-                 "actual-macros"),
-                (self.enum_consts, self.macro_const_check_pattern,
-                 "enum-consts"),
-                (self.identifiers, self.identifier_check_pattern,
-                 "identifiers")]:
-            self.check_group(group, check_pattern, name)
-        self.check_for_typos()
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=(
+            "This script confirms that the naming of all symbols and identifiers "
+            "in Mbed TLS are consistent with the house style and are also "
+            "self-consistent.\n\n"
+            "Expected to be run from the MbedTLS root directory."))
 
+    parser.add_argument("-v", "--verbose",
+                        action="store_true",
+                        help="enable script debug outputs")
+    
+    args = parser.parse_args()
 
-def run_main():
     try:
         name_check = NameCheck()
-        name_check.get_names_from_source_code()
-        name_check.check_names()
+        name_check.setup_logger(verbose=args.verbose)
+        name_check.parse_names_in_source()
+        name_check.perform_checks()
         sys.exit(name_check.return_code)
     except Exception:
         traceback.print_exc()
@@ -267,4 +529,4 @@ def run_main():
 
 
 if __name__ == "__main__":
-    run_main()
+    main()