FIX keyword substring highlighting regression

This commit is contained in:
Charles Horn 2025-07-02 06:13:25 +00:00 committed by Evil Eye
parent 42ca8e31eb
commit c948171d5f
3 changed files with 69 additions and 24 deletions

View File

@ -53,6 +53,7 @@ Programmers
Carl Maxwell Carl Maxwell
cc9cii cc9cii
Cédric Mocquillon Cédric Mocquillon
Charles Horn
Chris Boyce (slothlife) Chris Boyce (slothlife)
Chris Robinson (KittyCat) Chris Robinson (KittyCat)
Chris Vigil Chris Vigil

View File

@ -3,6 +3,7 @@
#include <algorithm> // std::reverse #include <algorithm> // std::reverse
#include <cctype> #include <cctype>
#include <cstring>
#include <map> #include <map>
#include <stdexcept> #include <stdexcept>
#include <vector> #include <vector>
@ -66,6 +67,8 @@ namespace MWDialogue
return false; return false;
} }
static bool isWordSeparator(char c) { return std::strchr("\n\r \t'\"", c) != nullptr; }
static bool sortMatches(const Match& left, const Match& right) { return left.mBeg < right.mBeg; } static bool sortMatches(const Match& left, const Match& right) { return left.mBeg < right.mBeg; }
void highlightKeywords(Point beg, Point end, std::vector<Match>& out) const void highlightKeywords(Point beg, Point end, std::vector<Match>& out) const
@ -73,6 +76,14 @@ namespace MWDialogue
std::vector<Match> matches; std::vector<Match> matches;
for (Point i = beg; i != end; ++i) for (Point i = beg; i != end; ++i)
{ {
if (i != beg)
{
Point prev = i;
--prev;
if (!isWordSeparator(*prev))
continue;
}
// check first character // check first character
typename Entry::childen_t::const_iterator candidate typename Entry::childen_t::const_iterator candidate
= mRoot.mChildren.find(Misc::StringUtils::toLower(*i)); = mRoot.mChildren.find(Misc::StringUtils::toLower(*i));
@ -86,7 +97,6 @@ namespace MWDialogue
// some keywords might be longer variations of other keywords, so we definitely need a list of // some keywords might be longer variations of other keywords, so we definitely need a list of
// candidates the first element in the pair is length of the match, i.e. depth from the first character // candidates the first element in the pair is length of the match, i.e. depth from the first character
// on
std::vector<typename std::pair<std::ptrdiff_t, typename Entry::childen_t::const_iterator>> candidates; std::vector<typename std::pair<std::ptrdiff_t, typename Entry::childen_t::const_iterator>> candidates;
while ((j + 1) != end) while ((j + 1) != end)

View File

@ -46,7 +46,7 @@ TEST_F(KeywordSearchTest, keyword_test_conflict_resolution2)
TEST_F(KeywordSearchTest, keyword_test_conflict_resolution3) TEST_F(KeywordSearchTest, keyword_test_conflict_resolution3)
{ {
// testing that the longest keyword is chosen, rather than maximizing the // Test that the longest keyword is chosen, rather than maximizing the
// amount of highlighted characters by highlighting the first and last keyword // amount of highlighted characters by highlighting the first and last keyword
MWDialogue::KeywordSearch<int> search; MWDialogue::KeywordSearch<int> search;
search.seed("foo bar", 0); search.seed("foo bar", 0);
@ -64,12 +64,12 @@ TEST_F(KeywordSearchTest, keyword_test_conflict_resolution3)
TEST_F(KeywordSearchTest, keyword_test_utf8_word_begin) TEST_F(KeywordSearchTest, keyword_test_utf8_word_begin)
{ {
// We make sure that the search works well even if the character is not ASCII // Make sure that the search works well on UTF-8 strings containing some non-ASCII (French)
MWDialogue::KeywordSearch<int> search; MWDialogue::KeywordSearch<int> search;
search.seed("états", 0); search.seed("états", 0);
search.seed("ïrradiés", 0); search.seed("ïrradiés", 0);
search.seed("ça nous déçois", 0); search.seed("ça nous déçois", 0);
search.seed("ois", 0); search.seed("nous", 0);
std::string text std::string text
= "les nations unis ont réunis le monde entier, états units inclus pour parler du problème des gens ïrradiés " = "les nations unis ont réunis le monde entier, états units inclus pour parler du problème des gens ïrradiés "
@ -86,38 +86,22 @@ TEST_F(KeywordSearchTest, keyword_test_utf8_word_begin)
TEST_F(KeywordSearchTest, keyword_test_non_alpha_non_whitespace_word_begin) TEST_F(KeywordSearchTest, keyword_test_non_alpha_non_whitespace_word_begin)
{ {
// We make sure that the search works well even if the separator is not a whitespace // Make sure that the search works well even if the separator is not whitespace
MWDialogue::KeywordSearch<int> search; MWDialogue::KeywordSearch<int> search;
search.seed("Report to caius cosades", 0); search.seed("Report to caius cosades", 0);
std::string text = "I was told to \"Report to caius cosades\""; std::string text = "I was told to \"Report to Caius Cosades\"";
std::vector<MWDialogue::KeywordSearch<int>::Match> matches; std::vector<MWDialogue::KeywordSearch<int>::Match> matches;
search.highlightKeywords(text.begin(), text.end(), matches); search.highlightKeywords(text.begin(), text.end(), matches);
EXPECT_EQ(matches.size(), 1); EXPECT_EQ(matches.size(), 1);
EXPECT_EQ(std::string(matches[0].mBeg, matches[0].mEnd), "Report to caius cosades"); EXPECT_EQ(std::string(matches[0].mBeg, matches[0].mEnd), "Report to Caius Cosades");
}
TEST_F(KeywordSearchTest, keyword_test_russian_non_ascii_before)
{
// We make sure that the search works well even if the separator is not a whitespace with russian chars
MWDialogue::KeywordSearch<int> search;
search.seed("Доложить Каю Косадесу", 0);
std::string text
= "Что? Да. Я Кай Косадес. То есть как это, вам велели «Доложить Каю Косадесу»? О чем вы говорите?";
std::vector<MWDialogue::KeywordSearch<int>::Match> matches;
search.highlightKeywords(text.begin(), text.end(), matches);
EXPECT_EQ(matches.size(), 1);
EXPECT_EQ(std::string(matches[0].mBeg, matches[0].mEnd), "Доложить Каю Косадесу");
} }
TEST_F(KeywordSearchTest, keyword_test_russian_ascii_before) TEST_F(KeywordSearchTest, keyword_test_russian_ascii_before)
{ {
// We make sure that the search works well even if the separator is not a whitespace with russian chars // Make sure that the search works well even if the separator is not whitespace with Russian chars
MWDialogue::KeywordSearch<int> search; MWDialogue::KeywordSearch<int> search;
search.seed("Доложить Каю Косадесу", 0); search.seed("Доложить Каю Косадесу", 0);
@ -130,3 +114,53 @@ TEST_F(KeywordSearchTest, keyword_test_russian_ascii_before)
EXPECT_EQ(matches.size(), 1); EXPECT_EQ(matches.size(), 1);
EXPECT_EQ(std::string(matches[0].mBeg, matches[0].mEnd), "Доложить Каю Косадесу"); EXPECT_EQ(std::string(matches[0].mBeg, matches[0].mEnd), "Доложить Каю Косадесу");
} }
TEST_F(KeywordSearchTest, keyword_test_substrings_without_word_separators)
{
// Make sure that the search does not highlight substrings within words
// i.e. "Force" does not contain "orc"
// and "bring" does not contain "ring"
MWDialogue::KeywordSearch<int> search;
search.seed("orc", 0);
search.seed("ring", 0);
std::string text = "Bring the Force, Lucan!";
std::vector<MWDialogue::KeywordSearch<int>::Match> matches;
search.highlightKeywords(text.begin(), text.end(), matches);
EXPECT_EQ(matches.size(), 0);
}
TEST_F(KeywordSearchTest, keyword_test_initial_substrings_match)
{
// Make sure that the search highlights prefix substrings
// "Orcs" should match "orc"
// "ring" is not matched because "-" is not a word separator
MWDialogue::KeywordSearch<int> search;
search.seed("orc", 0);
search.seed("ring", 0);
std::string text = "Bring the Orcs some gold-rings.";
std::vector<MWDialogue::KeywordSearch<int>::Match> matches;
search.highlightKeywords(text.begin(), text.end(), matches);
EXPECT_EQ(matches.size(), 1);
EXPECT_EQ(std::string(matches[0].mBeg, matches[0].mEnd), "Orc");
}
TEST_F(KeywordSearchTest, keyword_test_french_substrings)
{
// Substrings within words should not match
MWDialogue::KeywordSearch<int> search;
search.seed("ages", 0);
search.seed("orc", 0);
std::string text = "traçages et forces";
std::vector<MWDialogue::KeywordSearch<int>::Match> matches;
search.highlightKeywords(text.begin(), text.end(), matches);
EXPECT_EQ(matches.size(), 0);
}