From 8622a6da3b500c45d666506f841ce10592d1a0e6 Mon Sep 17 00:00:00 2001 From: Kindi Date: Thu, 10 Aug 2023 05:28:19 +0800 Subject: [PATCH 01/10] luautf8lib --- components/CMakeLists.txt | 2 +- components/lua/luastate.cpp | 5 ++++- components/lua/utf8.cpp | 21 +++++++++++++++++++++ components/lua/utf8.hpp | 9 +++++++++ 4 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 components/lua/utf8.cpp create mode 100644 components/lua/utf8.hpp diff --git a/components/CMakeLists.txt b/components/CMakeLists.txt index 32482ec331..dd40d41ab0 100644 --- a/components/CMakeLists.txt +++ b/components/CMakeLists.txt @@ -34,7 +34,7 @@ endif (GIT_CHECKOUT) # source files add_component_dir (lua - luastate scriptscontainer asyncpackage utilpackage serialization configuration l10n storage + luastate scriptscontainer asyncpackage utilpackage serialization configuration l10n storage utf8 shapes/box ) diff --git a/components/lua/luastate.cpp b/components/lua/luastate.cpp index 2a5769e6dd..8a836e4a68 100644 --- a/components/lua/luastate.cpp +++ b/components/lua/luastate.cpp @@ -12,6 +12,7 @@ #include #include "scriptscontainer.hpp" +#include "utf8.hpp" namespace LuaUtil { @@ -51,7 +52,7 @@ namespace LuaUtil static const std::string safeFunctions[] = { "assert", "error", "ipairs", "next", "pairs", "pcall", "select", "tonumber", "tostring", "type", "unpack", "xpcall", "rawequal", "rawget", "rawset", "setmetatable" }; - static const std::string safePackages[] = { "coroutine", "math", "string", "table" }; + static const std::string safePackages[] = { "coroutine", "math", "string", "table", "utf8" }; static constexpr int64_t countHookStep = 1000; @@ -181,6 +182,8 @@ namespace LuaUtil mSol["math"]["randomseed"](static_cast(std::time(nullptr))); mSol["math"]["randomseed"] = [] {}; + mSol["utf8"] = LuaUtf8::initUtf8Package(mSol); + mSol["writeToLog"] = [](std::string_view s) { Log(Debug::Level::Info) << s; }; mSol["setEnvironment"] diff --git a/components/lua/utf8.cpp 
b/components/lua/utf8.cpp new file mode 100644 index 0000000000..c8be068353 --- /dev/null +++ b/components/lua/utf8.cpp @@ -0,0 +1,21 @@ +#include "utf8.hpp" +#include "luastate.hpp" + +namespace +{ + static constexpr std::string_view UTF8PATT = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"; // %z is deprecated in Lua5.2 + static constexpr uint32_t MAXUTF = 0x7FFFFFFFu; + static constexpr uint32_t MAXUNICODE = 0x10FFFFu; +} + +namespace LuaUtf8 +{ + sol::table initUtf8Package(sol::state_view& lua) + { + sol::table utf8(lua, sol::create); + + utf8["charpattern"] = UTF8PATT; + + return utf8; + } +} diff --git a/components/lua/utf8.hpp b/components/lua/utf8.hpp new file mode 100644 index 0000000000..cb8666ea33 --- /dev/null +++ b/components/lua/utf8.hpp @@ -0,0 +1,9 @@ +#ifndef COMPONENTS_LUA_UTF8_H +#define COMPONENTS_LUA_UTF8_H + +namespace LuaUtf8 +{ + sol::table initUtf8Package(sol::state_view&); +} + +#endif From d9c102e14d6aa604c0a442f7ddb3e1f77a05e792 Mon Sep 17 00:00:00 2001 From: Kindi Date: Sun, 13 Aug 2023 22:46:24 +0800 Subject: [PATCH 02/10] utf8.char --- components/lua/utf8.cpp | 31 ++++++++++++++++++++++++++++++- components/lua/utf8.hpp | 2 ++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/components/lua/utf8.cpp b/components/lua/utf8.cpp index c8be068353..9b7fc97de8 100644 --- a/components/lua/utf8.cpp +++ b/components/lua/utf8.cpp @@ -1,11 +1,27 @@ +#include + #include "utf8.hpp" -#include "luastate.hpp" namespace { static constexpr std::string_view UTF8PATT = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"; // %z is deprecated in Lua5.2 static constexpr uint32_t MAXUTF = 0x7FFFFFFFu; static constexpr uint32_t MAXUNICODE = 0x10FFFFu; + + inline static double getInteger(const sol::stack_proxy arg, const size_t& n, const std::string_view& name) + { + double integer; + if (!arg.is()) + throw std::runtime_error(std::format("bad argument #{} to '{}' (number expected, got {})", n, name, + sol::type_name(arg.lua_state(), arg.get_type()))); + + if (std::modf(arg, 
&integer) != 0) + throw std::runtime_error( + std::format("bad argument #{} to '{}' (number has no integer representation)", n, name)); + + return integer; + } + } namespace LuaUtf8 @@ -16,6 +32,19 @@ namespace LuaUtf8 utf8["charpattern"] = UTF8PATT; + utf8["char"] = [](const sol::variadic_args args) -> std::string { + std::string result{}; + std::wstring_convert> converter; + for (size_t i = 0; i < args.size(); ++i) + { + int64_t codepoint = getInteger(args[i], (i + 1), "char"); + if (codepoint < 0 || codepoint > MAXUTF) + throw std::runtime_error(std::format("bad argument #{} to 'char' (value out of range)", (i + 1))); + + result += converter.to_bytes(codepoint); + } + return result; + }; return utf8; } } diff --git a/components/lua/utf8.hpp b/components/lua/utf8.hpp index cb8666ea33..dd936b3b5e 100644 --- a/components/lua/utf8.hpp +++ b/components/lua/utf8.hpp @@ -1,6 +1,8 @@ #ifndef COMPONENTS_LUA_UTF8_H #define COMPONENTS_LUA_UTF8_H +#include + namespace LuaUtf8 { sol::table initUtf8Package(sol::state_view&); From 6d02c317208c6aeaed3b0eb61ea212aea0016fd5 Mon Sep 17 00:00:00 2001 From: Kindi Date: Thu, 17 Aug 2023 23:58:50 +0800 Subject: [PATCH 03/10] utf8.codes --- components/lua/utf8.cpp | 61 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/components/lua/utf8.cpp b/components/lua/utf8.cpp index 9b7fc97de8..926e43e84b 100644 --- a/components/lua/utf8.cpp +++ b/components/lua/utf8.cpp @@ -22,6 +22,53 @@ namespace return integer; } + // returns: first - character pos in bytes, second - character codepoint + static std::pair poscodes(const std::string_view& s, std::vector& pos_byte) + { + const int64_t pos = pos_byte.back() - 1; + const unsigned char ch = static_cast(s[pos]); + int64_t codepoint = -1; + size_t byteSize = 0; + + if ((ch & 0b10000000) == 0) + { + codepoint = ch; + byteSize = 1; + } + else if ((ch & 0b11100000) == 0b11000000) + { + codepoint = ch & 0b00011111; + byteSize = 2; + } + else if ((ch & 0b11110000) 
== 0b11100000) + { + codepoint = ch & 0b00001111; + byteSize = 3; + } + else if ((ch & 0b11111000) == 0b11110000) + { + codepoint = ch & 0b00000111; + byteSize = 4; + } + + // construct codepoint for non-ascii + for (size_t i = 1; i < byteSize; ++i) + { + // if not a continuation byte + if ((pos + i) >= s.size() || (static_cast(s[pos + i]) & 0b11000000) != 0b10000000) + { + return std::make_pair(0, -1); + } + codepoint = (codepoint << 6) | (static_cast(s[pos + i]) & 0b00111111); + } + + std::pair res = std::make_pair(pos_byte.back(), codepoint); + + pos_byte.push_back(pos_byte.back() + byteSize); /* the next character (if exists) starts at this byte */ + + return res; + } + } namespace LuaUtf8 @@ -45,6 +92,20 @@ namespace LuaUtf8 } return result; }; + + utf8["codes"] = [pos_byte = std::vector{ 1 }](const std::string_view& s) { + return sol::as_function([s, pos_byte]() mutable -> sol::optional> { + if (pos_byte.back() <= static_cast(s.size())) + { + const auto pair = poscodes(s, pos_byte); + if (pair.second == -1) + throw std::runtime_error("Invalid UTF-8 code at position " + std::to_string(pos_byte.size())); + + return pair; + } + return sol::nullopt; + }); + }; return utf8; } } From 92842cedf55995009106c726fe130b88b1cce735 Mon Sep 17 00:00:00 2001 From: Kindi Date: Sun, 27 Aug 2023 16:12:12 +0800 Subject: [PATCH 04/10] len,codepoint,offset --- components/lua/utf8.cpp | 119 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 114 insertions(+), 5 deletions(-) diff --git a/components/lua/utf8.cpp b/components/lua/utf8.cpp index 926e43e84b..6a80505411 100644 --- a/components/lua/utf8.cpp +++ b/components/lua/utf8.cpp @@ -4,11 +4,16 @@ namespace { - static constexpr std::string_view UTF8PATT = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"; // %z is deprecated in Lua5.2 - static constexpr uint32_t MAXUTF = 0x7FFFFFFFu; - static constexpr uint32_t MAXUNICODE = 0x10FFFFu; + constexpr std::string_view UTF8PATT = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"; // %z is deprecated in 
Lua5.2 + constexpr uint32_t MAXUTF = 0x7FFFFFFFu; + constexpr uint32_t MAXUNICODE = 0x10FFFFu; - inline static double getInteger(const sol::stack_proxy arg, const size_t& n, const std::string_view& name) + inline bool isNilOrNone(const sol::stack_proxy arg) + { + return (arg.get_type() == sol::type::lua_nil || arg.get_type() == sol::type::none); + } + + inline double getInteger(const sol::stack_proxy arg, const size_t& n, const std::string_view& name) { double integer; if (!arg.is()) @@ -22,8 +27,18 @@ namespace return integer; } + inline void posrelat(int64_t& pos, const size_t& len) + { + if (pos >= 0) + /* no change */; + else if (0u - pos > static_cast(len)) + pos = 0; + else + pos = len + pos + 1; + } + // returns: first - character pos in bytes, second - character codepoint - static std::pair poscodes(const std::string_view& s, std::vector& pos_byte) + std::pair poscodes(const std::string_view& s, std::vector& pos_byte) { const int64_t pos = pos_byte.back() - 1; const unsigned char ch = static_cast(s[pos]); @@ -106,6 +121,100 @@ namespace LuaUtf8 return sol::nullopt; }); }; + + utf8["len"] = [](const std::string_view& s, + const sol::variadic_args args) -> std::variant> { + size_t len = s.size(); + int64_t iv = isNilOrNone(args[0]) ? 1 : getInteger(args[0], 2, "len"); + int64_t fv = isNilOrNone(args[1]) ? 
-1 : getInteger(args[1], 3, "len"); + + posrelat(iv, len); + posrelat(fv, len); + + if (iv <= 0) + throw std::runtime_error("bad argument #2 to 'len' (initial position out of bounds)"); + if (fv > static_cast(len)) + throw std::runtime_error("bad argument #3 to 'len' (final position out of bounds)"); + + if (len == 0) + return len; + + std::vector pos_byte = { iv }; + + while (pos_byte.back() <= fv) + { + if (poscodes(s, pos_byte).second == -1) + return std::pair(sol::lua_nil, pos_byte.back()); + } + return pos_byte.size() - 1; + }; + + utf8["codepoint"] + = [](const std::string_view& s, const sol::variadic_args args) -> sol::as_returns_t> { + size_t len = s.size(); + int64_t iv = isNilOrNone(args[0]) ? 1 : getInteger(args[0], 2, "codepoint"); + int64_t fv = isNilOrNone(args[1]) ? iv : getInteger(args[1], 3, "codepoint"); + + posrelat(iv, len); + posrelat(fv, len); + + if (iv <= 0) + throw std::runtime_error("bad argument #2 to 'codepoint' (initial position out of bounds)"); + if (fv > static_cast(len)) + throw std::runtime_error("bad argument #3 to 'codepoint' (final position out of bounds)"); + + if (iv > fv) + return sol::as_returns(std::vector{}); /* empty interval; return nothing */ + + std::vector pos_byte = { iv }; + std::vector codepoints; + + while (pos_byte.back() <= fv) + { + codepoints.push_back(poscodes(s, pos_byte).second); + if (codepoints.back() == -1) + throw std::runtime_error("Invalid UTF-8 code at position " + std::to_string(pos_byte.size())); + } + + return sol::as_returns(std::move(codepoints)); + }; + + utf8["offset"] + = [](const std::string_view& s, const int64_t n, const sol::variadic_args args) -> sol::optional { + size_t len = s.size(); + int64_t iv = isNilOrNone(args[0]) ? ((n >= 0) ? 
1 : s.size() + 1) : getInteger(args[0], 3, "offset"); + std::vector pos_byte = { 1 }; + + posrelat(iv, len); + + if (iv > static_cast(len) + 1) + throw std::runtime_error("bad argument #3 to 'offset' (position out of bounds)"); + + while (pos_byte.back() <= static_cast(len)) + poscodes(s, pos_byte); + + for (auto it = pos_byte.begin(); it != pos_byte.end(); ++it) + if (*it == iv) + { + if (n <= 0) + if ((it + n) >= pos_byte.begin()) + return *(it + n); + if (n > 0) + if ((it + n - 1) < pos_byte.end()) + return *(it + n - 1); + break; + } + else if (*it > iv) /* a continuation byte */ + { + if (n == 0) + return *(it - 1); /* special case */ + else + throw std::runtime_error("initial position is a continuation byte"); + } + + return sol::nullopt; + }; + return utf8; } } From 532230254b244fb7a0c2ce5abd95381dadd3e406 Mon Sep 17 00:00:00 2001 From: Kindi Date: Sun, 27 Aug 2023 16:48:00 +0800 Subject: [PATCH 05/10] add documentation --- components/lua/utf8.cpp | 3 +- files/lua_api/utf8.doclua | 77 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 files/lua_api/utf8.doclua diff --git a/components/lua/utf8.cpp b/components/lua/utf8.cpp index 6a80505411..69160fdd53 100644 --- a/components/lua/utf8.cpp +++ b/components/lua/utf8.cpp @@ -108,7 +108,8 @@ namespace LuaUtf8 return result; }; - utf8["codes"] = [pos_byte = std::vector{ 1 }](const std::string_view& s) { + utf8["codes"] = [](const std::string_view& s) { + std::vector pos_byte{ 1 }; return sol::as_function([s, pos_byte]() mutable -> sol::optional> { if (pos_byte.back() <= static_cast(s.size())) { diff --git a/files/lua_api/utf8.doclua b/files/lua_api/utf8.doclua new file mode 100644 index 0000000000..6c054f4401 --- /dev/null +++ b/files/lua_api/utf8.doclua @@ -0,0 +1,77 @@ +------------------------------------------------------------------------------- +-- UTF-8 Support. +-- This library provides basic support for UTF-8 encoding. 
+-- It provides all its functions inside the table utf8. +-- This library does not provide any support for Unicode other than the handling of the encoding. +-- Any operation that needs the meaning of a character, such as character classification, is outside its scope. +-- +-- Unless stated otherwise, all functions that expect a byte position as a parameter assume that +-- the given position is either the start of a byte sequence or one plus the length of the subject string. +-- As in the string library, negative indices count from the end of the string. +-- @module utf8 + +------------------------------------------------------------------------------- +-- Receives zero or more integers, converts each one to its +-- corresponding UTF-8 byte sequence, and returns a string with the concatenation +-- of all these sequences. +-- @function [parent=#utf8] char +-- @param ... zero or more integers. +-- @return #string + +------------------------------------------------------------------------------- +-- The pattern which matches exactly one UTF-8 byte sequence, assuming that +-- the subject is a valid UTF-8 string. +-- @function [parent=#utf8] charpattern +-- @return #string + +------------------------------------------------------------------------------- +-- Returns values so that the construction +-- +-- for p, c in utf8.codes(s) do body end +-- +-- will iterate over all characters in string s, with p being the position (in bytes) +-- and c the code point of each character. +-- It raises an error if it meets any invalid byte sequence. +-- @function [parent=#utf8] codes +-- @param #string s string to handle. + +------------------------------------------------------------------------------- +-- Returns the codepoints (as integers) from all characters in s that start +-- between byte position i and j (both included). The default for i is 1 and for j is i. +-- It raises an error if it meets any invalid byte sequence. 
+-- @function [parent=#utf8] codepoint +-- @param #string s string to handle +-- @param #number i the initial position (default value is 1) +-- @param #number j the final position (default value is i) +-- @return #number the codepoints of each character in s + +------------------------------------------------------------------------------- +-- Returns the number of UTF-8 characters in string s that start +-- between positions i and j (both inclusive). +-- The default for i is 1 and for j is -1. +-- If it finds any invalid byte sequence, +-- returns a false value plus the position of the first invalid byte. +-- @function [parent=#utf8] len +-- @param #string s string to handle +-- @param #number i the initial position (default value is 1) +-- @param #number j the final position (default value is -1) +-- @return #number the number of utf8 characters in s + +------------------------------------------------------------------------------- +-- Returns the position (in bytes) where the encoding of the n-th character of s +-- (counting from position i) starts. A negative n gets characters before position i. +-- The default for i is 1 when n is non-negative and #s + 1 otherwise, +-- so that utf8.offset(s, -n) gets the offset of the n-th character from the end of the string. +-- If the specified character is neither in the subject nor right after its end, the function returns nil. +-- +-- As a special case, when n is 0 the function returns the +-- start of the encoding of the character that contains the i-th byte of s. +-- +-- This function assumes that s is a valid UTF-8 string. 
+-- @function [parent=#utf8] offset +-- @param #string s string to handle +-- @param #number n the n-th character +-- @param #number i the initial position (default value is 1 if n is non-negative and #s + 1 otherwise) +-- @return #number + +return nil From af58b531da743a54f7a480444affab1eeb919dc4 Mon Sep 17 00:00:00 2001 From: Kindi Date: Tue, 29 Aug 2023 08:27:40 +0800 Subject: [PATCH 06/10] change function names and add documentation in overview.rst --- components/lua/utf8.cpp | 24 +++++++++---------- .../reference/lua-scripting/overview.rst | 6 ++++- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/components/lua/utf8.cpp b/components/lua/utf8.cpp index 69160fdd53..eac3954230 100644 --- a/components/lua/utf8.cpp +++ b/components/lua/utf8.cpp @@ -27,10 +27,10 @@ namespace return integer; } - inline void posrelat(int64_t& pos, const size_t& len) + inline void relativePosition(int64_t& pos, const size_t& len) { if (pos >= 0) - /* no change */; + return; else if (0u - pos > static_cast(len)) pos = 0; else @@ -38,7 +38,7 @@ namespace } // returns: first - character pos in bytes, second - character codepoint - std::pair poscodes(const std::string_view& s, std::vector& pos_byte) + std::pair decodeNextUTF8Character(const std::string_view& s, std::vector& pos_byte) { const int64_t pos = pos_byte.back() - 1; const unsigned char ch = static_cast(s[pos]); @@ -113,7 +113,7 @@ namespace LuaUtf8 return sol::as_function([s, pos_byte]() mutable -> sol::optional> { if (pos_byte.back() <= static_cast(s.size())) { - const auto pair = poscodes(s, pos_byte); + const auto pair = decodeNextUTF8Character(s, pos_byte); if (pair.second == -1) throw std::runtime_error("Invalid UTF-8 code at position " + std::to_string(pos_byte.size())); @@ -129,8 +129,8 @@ namespace LuaUtf8 int64_t iv = isNilOrNone(args[0]) ? 1 : getInteger(args[0], 2, "len"); int64_t fv = isNilOrNone(args[1]) ? 
-1 : getInteger(args[1], 3, "len"); - posrelat(iv, len); - posrelat(fv, len); + relativePosition(iv, len); + relativePosition(fv, len); if (iv <= 0) throw std::runtime_error("bad argument #2 to 'len' (initial position out of bounds)"); @@ -144,7 +144,7 @@ namespace LuaUtf8 while (pos_byte.back() <= fv) { - if (poscodes(s, pos_byte).second == -1) + if (decodeNextUTF8Character(s, pos_byte).second == -1) return std::pair(sol::lua_nil, pos_byte.back()); } return pos_byte.size() - 1; @@ -156,8 +156,8 @@ namespace LuaUtf8 int64_t iv = isNilOrNone(args[0]) ? 1 : getInteger(args[0], 2, "codepoint"); int64_t fv = isNilOrNone(args[1]) ? iv : getInteger(args[1], 3, "codepoint"); - posrelat(iv, len); - posrelat(fv, len); + relativePosition(iv, len); + relativePosition(fv, len); if (iv <= 0) throw std::runtime_error("bad argument #2 to 'codepoint' (initial position out of bounds)"); @@ -172,7 +172,7 @@ namespace LuaUtf8 while (pos_byte.back() <= fv) { - codepoints.push_back(poscodes(s, pos_byte).second); + codepoints.push_back(decodeNextUTF8Character(s, pos_byte).second); if (codepoints.back() == -1) throw std::runtime_error("Invalid UTF-8 code at position " + std::to_string(pos_byte.size())); } @@ -186,13 +186,13 @@ namespace LuaUtf8 int64_t iv = isNilOrNone(args[0]) ? ((n >= 0) ? 
1 : s.size() + 1) : getInteger(args[0], 3, "offset"); std::vector pos_byte = { 1 }; - posrelat(iv, len); + relativePosition(iv, len); if (iv > static_cast(len) + 1) throw std::runtime_error("bad argument #3 to 'offset' (position out of bounds)"); while (pos_byte.back() <= static_cast(len)) - poscodes(s, pos_byte); + decodeNextUTF8Character(s, pos_byte); for (auto it = pos_byte.begin(); it != pos_byte.end(); ++it) if (*it == iv) diff --git a/docs/source/reference/lua-scripting/overview.rst b/docs/source/reference/lua-scripting/overview.rst index 283664b2c4..6e25f321f3 100644 --- a/docs/source/reference/lua-scripting/overview.rst +++ b/docs/source/reference/lua-scripting/overview.rst @@ -4,7 +4,7 @@ Overview of Lua scripting Language and sandboxing ======================= -OpenMW supports scripts written in Lua 5.1 with some extensions (see below) from Lua 5.2. +OpenMW supports scripts written in Lua 5.1 with some extensions (see below) from Lua 5.2 and Lua 5.3. There are no plans to switch to any newer version of the language, because newer versions are not supported by LuaJIT. .. note:: @@ -38,6 +38,10 @@ Supported Lua 5.2 features: - ``__pairs`` and ``__ipairs`` metamethods; - Function ``table.unpack`` (alias to Lua 5.1 ``unpack``). +Supported Lua 5.3 features: + +- All functions in the `UTF-8 Library <https://www.lua.org/manual/5.3/manual.html#6.5>`__ + Loading libraries with ``require('library_name')`` is allowed, but limited. It works this way: 1. If `library_name` is one of the standard libraries, then return the library. 
From dd61caa96d282004d12d189d6ad83d62003eeb27 Mon Sep 17 00:00:00 2001 From: Kindi Date: Fri, 1 Sep 2023 19:26:18 +0800 Subject: [PATCH 07/10] using misc::stringutils::format and simplify relativeposition function --- components/lua/utf8.cpp | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/components/lua/utf8.cpp b/components/lua/utf8.cpp index eac3954230..125b33fbe1 100644 --- a/components/lua/utf8.cpp +++ b/components/lua/utf8.cpp @@ -1,4 +1,5 @@ #include +#include #include "utf8.hpp" @@ -17,24 +18,20 @@ namespace { double integer; if (!arg.is()) - throw std::runtime_error(std::format("bad argument #{} to '{}' (number expected, got {})", n, name, - sol::type_name(arg.lua_state(), arg.get_type()))); + throw std::runtime_error(Misc::StringUtils::format("bad argument #%i to '%s' (number expected, got %s)", n, + name, sol::type_name(arg.lua_state(), arg.get_type()))); if (std::modf(arg, &integer) != 0) throw std::runtime_error( - std::format("bad argument #{} to '{}' (number has no integer representation)", n, name)); + Misc::StringUtils::format("bad argument #{} to '{}' (number has no integer representation)", n, name)); return integer; } inline void relativePosition(int64_t& pos, const size_t& len) { - if (pos >= 0) - return; - else if (0u - pos > static_cast(len)) - pos = 0; - else - pos = len + pos + 1; + if (pos < 0) + pos = std::max(0, pos + len + 1); } // returns: first - character pos in bytes, second - character codepoint @@ -101,7 +98,8 @@ namespace LuaUtf8 { int64_t codepoint = getInteger(args[i], (i + 1), "char"); if (codepoint < 0 || codepoint > MAXUTF) - throw std::runtime_error(std::format("bad argument #{} to 'char' (value out of range)", (i + 1))); + throw std::runtime_error( + Misc::StringUtils::format("bad argument #{} to 'char' (value out of range)", (i + 1))); result += converter.to_bytes(codepoint); } From db287b2bc64da2a57bd09c4146837c31ebe248b9 Mon Sep 17 00:00:00 2001 From: Kindi Date: Fri, 1 Sep 2023 
19:33:25 +0800 Subject: [PATCH 08/10] dont use pass by const reference for small types in func arguments --- components/lua/utf8.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/components/lua/utf8.cpp b/components/lua/utf8.cpp index 125b33fbe1..d3927212b4 100644 --- a/components/lua/utf8.cpp +++ b/components/lua/utf8.cpp @@ -7,14 +7,14 @@ namespace { constexpr std::string_view UTF8PATT = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"; // %z is deprecated in Lua5.2 constexpr uint32_t MAXUTF = 0x7FFFFFFFu; - constexpr uint32_t MAXUNICODE = 0x10FFFFu; + //constexpr uint32_t MAXUNICODE = 0x10FFFFu; inline bool isNilOrNone(const sol::stack_proxy arg) { return (arg.get_type() == sol::type::lua_nil || arg.get_type() == sol::type::none); } - inline double getInteger(const sol::stack_proxy arg, const size_t& n, const std::string_view& name) + inline double getInteger(const sol::stack_proxy arg, const size_t n, const std::string_view name) { double integer; if (!arg.is()) @@ -28,14 +28,14 @@ namespace return integer; } - inline void relativePosition(int64_t& pos, const size_t& len) + inline void relativePosition(int64_t& pos, const size_t len) { if (pos < 0) pos = std::max(0, pos + len + 1); } // returns: first - character pos in bytes, second - character codepoint - std::pair decodeNextUTF8Character(const std::string_view& s, std::vector& pos_byte) + std::pair decodeNextUTF8Character(const std::string_view s, std::vector& pos_byte) { const int64_t pos = pos_byte.back() - 1; const unsigned char ch = static_cast(s[pos]); @@ -123,7 +123,7 @@ namespace LuaUtf8 utf8["len"] = [](const std::string_view& s, const sol::variadic_args args) -> std::variant> { - size_t len = s.size(); + const size_t len = s.size(); int64_t iv = isNilOrNone(args[0]) ? 1 : getInteger(args[0], 2, "len"); int64_t fv = isNilOrNone(args[1]) ? 
-1 : getInteger(args[1], 3, "len"); From 8798217b51909df88e0103e4fd6305005314270a Mon Sep 17 00:00:00 2001 From: Kindi Date: Fri, 1 Sep 2023 20:03:19 +0800 Subject: [PATCH 09/10] remove const keyword from all string_view --- components/lua/utf8.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/components/lua/utf8.cpp b/components/lua/utf8.cpp index d3927212b4..75e7343c55 100644 --- a/components/lua/utf8.cpp +++ b/components/lua/utf8.cpp @@ -7,14 +7,14 @@ namespace { constexpr std::string_view UTF8PATT = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"; // %z is deprecated in Lua5.2 constexpr uint32_t MAXUTF = 0x7FFFFFFFu; - //constexpr uint32_t MAXUNICODE = 0x10FFFFu; + // constexpr uint32_t MAXUNICODE = 0x10FFFFu; inline bool isNilOrNone(const sol::stack_proxy arg) { return (arg.get_type() == sol::type::lua_nil || arg.get_type() == sol::type::none); } - inline double getInteger(const sol::stack_proxy arg, const size_t n, const std::string_view name) + inline double getInteger(const sol::stack_proxy arg, const size_t n, std::string_view name) { double integer; if (!arg.is()) @@ -35,7 +35,7 @@ namespace } // returns: first - character pos in bytes, second - character codepoint - std::pair decodeNextUTF8Character(const std::string_view s, std::vector& pos_byte) + std::pair decodeNextUTF8Character(std::string_view s, std::vector& pos_byte) { const int64_t pos = pos_byte.back() - 1; const unsigned char ch = static_cast(s[pos]); @@ -106,7 +106,7 @@ namespace LuaUtf8 return result; }; - utf8["codes"] = [](const std::string_view& s) { + utf8["codes"] = [](std::string_view s) { std::vector pos_byte{ 1 }; return sol::as_function([s, pos_byte]() mutable -> sol::optional> { if (pos_byte.back() <= static_cast(s.size())) @@ -121,7 +121,7 @@ namespace LuaUtf8 }); }; - utf8["len"] = [](const std::string_view& s, + utf8["len"] = [](std::string_view s, const sol::variadic_args args) -> std::variant> { const size_t len = s.size(); int64_t iv = 
isNilOrNone(args[0]) ? 1 : getInteger(args[0], 2, "len"); @@ -149,7 +149,7 @@ namespace LuaUtf8 }; utf8["codepoint"] - = [](const std::string_view& s, const sol::variadic_args args) -> sol::as_returns_t> { + = [](std::string_view s, const sol::variadic_args args) -> sol::as_returns_t> { size_t len = s.size(); int64_t iv = isNilOrNone(args[0]) ? 1 : getInteger(args[0], 2, "codepoint"); int64_t fv = isNilOrNone(args[1]) ? iv : getInteger(args[1], 3, "codepoint"); @@ -179,7 +179,7 @@ namespace LuaUtf8 }; utf8["offset"] - = [](const std::string_view& s, const int64_t n, const sol::variadic_args args) -> sol::optional { + = [](std::string_view s, const int64_t n, const sol::variadic_args args) -> sol::optional { size_t len = s.size(); int64_t iv = isNilOrNone(args[0]) ? ((n >= 0) ? 1 : s.size() + 1) : getInteger(args[0], 3, "offset"); std::vector pos_byte = { 1 }; From 7eb456a169bcb1900ba542def3e7fda8f4ffce44 Mon Sep 17 00:00:00 2001 From: Kindi Date: Sun, 3 Sep 2023 19:20:59 +0800 Subject: [PATCH 10/10] refactoring for readability --- components/lua/utf8.cpp | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/components/lua/utf8.cpp b/components/lua/utf8.cpp index 75e7343c55..83228afa65 100644 --- a/components/lua/utf8.cpp +++ b/components/lua/utf8.cpp @@ -28,6 +28,9 @@ namespace return integer; } + // If the input 'pos' is negative, it is treated as counting from the end of the string, + // where -1 represents the last character position, -2 represents the second-to-last position, + // and so on. If 'pos' is non-negative, it is used as-is. inline void relativePosition(int64_t& pos, const size_t len) { if (pos < 0) @@ -181,7 +184,18 @@ namespace LuaUtf8 utf8["offset"] = [](std::string_view s, const int64_t n, const sol::variadic_args args) -> sol::optional { size_t len = s.size(); - int64_t iv = isNilOrNone(args[0]) ? ((n >= 0) ? 
1 : s.size() + 1) : getInteger(args[0], 3, "offset"); + int64_t iv; + + if (isNilOrNone(args[0])) + { + if (n >= 0) + iv = 1; + else + iv = s.size() + 1; + } + else + iv = getInteger(args[0], 3, "offset"); + std::vector pos_byte = { 1 }; relativePosition(iv, len); @@ -193,14 +207,13 @@ namespace LuaUtf8 decodeNextUTF8Character(s, pos_byte); for (auto it = pos_byte.begin(); it != pos_byte.end(); ++it) + { if (*it == iv) { - if (n <= 0) - if ((it + n) >= pos_byte.begin()) - return *(it + n); - if (n > 0) - if ((it + n - 1) < pos_byte.end()) - return *(it + n - 1); + if (n <= 0 && it + n >= pos_byte.begin()) + return *(it + n); + if (n > 0 && it + n - 1 < pos_byte.end()) + return *(it + n - 1); break; } else if (*it > iv) /* a continuation byte */ @@ -210,6 +223,7 @@ namespace LuaUtf8 else throw std::runtime_error("initial position is a continuation byte"); } + } return sol::nullopt; };