From cacd5dc91a7212e6c49159f453f386c6cc5d37f6 Mon Sep 17 00:00:00 2001 From: dahaihu <33454450+dahaihu@users.noreply.github.com> Date: Sun, 12 Feb 2023 21:04:35 +0800 Subject: [PATCH] fix Replacer suffix match, and add test case (#2867) * fix: replace shoud replace the longest match * feat: revert bytes.Buffer to strings.Builder * fix: loop reset nextStart * feat: add node longest match test * feat: add replacer suffix match test case * feat: multiple match * fix: partial match ends * fix: replace look back upon error * feat: rm unnecessary branch --------- Co-authored-by: hudahai Co-authored-by: hushichang --- core/stringx/node.go | 94 +++++++++----- core/stringx/node_test.go | 231 +++++++++++++++++++++++++++++++++- core/stringx/replacer.go | 33 +++-- core/stringx/replacer_test.go | 40 ++++++ 4 files changed, 349 insertions(+), 49 deletions(-) diff --git a/core/stringx/node.go b/core/stringx/node.go index 388e8f7b..c11139ba 100644 --- a/core/stringx/node.go +++ b/core/stringx/node.go @@ -14,7 +14,6 @@ func (n *node) add(word string) { } nd := n - var depth int for i, char := range chars { if nd.children == nil { child := new(node) @@ -23,7 +22,6 @@ func (n *node) add(word string) { nd = child } else if child, ok := nd.children[char]; ok { nd = child - depth++ } else { child := new(node) child.depth = i + 1 @@ -99,51 +97,91 @@ func (n *node) find(chars []rune) []scope { return scopes } -func (n *node) longestMatch(chars []rune, start int) (used int, jump *node, matched bool) { +func (n *node) longestMatch(chars []rune, paths []*node) (uselessLen, matchLen int, nextPaths []*node) { cur := n - var matchedNode *node + var longestMatched *node + findMatch := func(path []*node) (*node, int) { + var ( + result *node + start int + ) + for i := len(path) - 1; i >= 0; i-- { + icur := path[i] + var cur *node + for icur.fail != nil { + if icur.fail.end { + cur = icur.fail + break + } + icur = icur.fail + } + if cur != nil { + if result == nil { + result = cur + start = i - result.depth + 1 + } else if curStart := i - cur.depth + 1; curStart < start { + result = cur + start = curStart + } + } + } + return result, start + } - for i := start; i < len(chars); i++ { - child, ok := cur.children[chars[i]] + for i := len(paths); i < len(chars); i++ { + char := chars[i] + child, ok := cur.children[char] if ok { cur = child if cur.end { - matchedNode = cur + longestMatched = cur } + paths = append(paths, cur) } else { - if matchedNode != nil { - return matchedNode.depth, nil, true + if longestMatched != nil { + return 0, longestMatched.depth, nil } - if n.end { - return start, nil, true + return 0, n.depth, nil } - + // old path pre longest preMatch + preMatch, preStart := findMatch(paths) + // new path match var jump *node - for cur.fail != nil { - jump, ok = cur.fail.children[chars[i]] + icur := cur + for icur.fail != nil { + jump, ok = icur.fail.children[char] if ok { break } - cur = cur.fail + icur = icur.fail } - if jump != nil { - return i + 1 - jump.depth, jump, false + switch { + case preMatch != nil && jump != nil: + if jumpStart := i - jump.depth + 1; preStart < jumpStart { + return preStart, preMatch.depth, nil + } else { + return jumpStart, 0, append(paths[jumpStart:], jump) + } + case preMatch != nil && jump == nil: + return preStart, preMatch.depth, nil + case preMatch == nil && jump != nil: + return i - jump.depth + 1, 0, append(paths[i-jump.depth+1:], jump) + case preMatch == nil && jump == nil: + return i + 1, 0, nil } - - return i + 1, nil, false } } - - // longest matched node - if matchedNode != nil { - return matchedNode.depth, nil, true + // this longest matched node + if longestMatched != nil { + return 0, longestMatched.depth, nil } - - // last matched node if n.end { - return start, nil, true + return 0, n.depth, nil } - - return len(chars), nil, false + match, start := findMatch(paths) + if match != nil { + return start, match.depth, nil + } + return len(chars), 0, nil } diff --git a/core/stringx/node_test.go b/core/stringx/node_test.go index 260e21ab..d4895d5f 100644 --- a/core/stringx/node_test.go +++ b/core/stringx/node_test.go @@ -9,10 +9,10 @@ import ( func TestLongestMatchGuardedCondition(t *testing.T) { n := new(node) n.end = true - used, jump, matched := n.longestMatch([]rune(""), 0) - assert.Equal(t, 0, used) + uselessLen, matchLen, jump := n.longestMatch([]rune(""), nil) + assert.Equal(t, 0, uselessLen) assert.Nil(t, jump) - assert.True(t, matched) + assert.Equal(t, 0, matchLen) } func TestFuzzNodeCase1(t *testing.T) { @@ -202,3 +202,228 @@ func BenchmarkNodeFind(b *testing.B) { trie.find([]rune("日本AV演员兼电视、电影演员。无名氏AV女优是xx出道, 日本AV女优们最精彩的表演是AV演员色情表演")) } } + +func TestNode_longestMatchCase0(t *testing.T) { + // match the longest word + keywords := []string{ + "a", + "ab", + "abc", + "abcd", + } + trie := new(node) + for _, keyword := range keywords { + trie.add(keyword) + } + trie.build() + + uselessLen, matchLen, jump := trie.longestMatch([]rune("abcef"), nil) + assert.Equal(t, 0, uselessLen) + assert.Equal(t, 3, matchLen) + assert.Nil(t, jump) +} + +func TestNode_longestMatchCase1(t *testing.T) { + keywords := []string{ + "abcde", + "bcde", + "cde", + "de", + + "b", + "bc", + } + trie := new(node) + for _, keyword := range keywords { + trie.add(keyword) + } + trie.build() + + uselessLen, matchLen, jump := trie.longestMatch([]rune("abcdf"), nil) + assert.Equal(t, 1, uselessLen) + assert.Equal(t, 2, matchLen) + assert.Nil(t, jump) +} + +func TestNode_longestMatchCase2(t *testing.T) { + keywords := []string{ + "abcde", + "bcde", + "cde", + "de", + + "c", + "cd", + } + trie := new(node) + for _, keyword := range keywords { + trie.add(keyword) + } + trie.build() + + uselessLen, matchLen, jump := trie.longestMatch([]rune("abcdf"), nil) + assert.Equal(t, 2, uselessLen) + assert.Equal(t, 2, matchLen) + assert.Nil(t, jump) +} + +func TestNode_longestMatchCase3(t *testing.T) { + keywords := []string{ + "abcde", + "bcde", + "cde", + "de", + + "b", + "bc", + "c", + "cd", + } + trie := new(node) + for _, keyword := range keywords { + trie.add(keyword) + } + trie.build() + + uselessLen, matchLen, jump := trie.longestMatch([]rune("abcdf"), nil) + assert.Equal(t, 1, uselessLen) + assert.Equal(t, 2, matchLen) + assert.Nil(t, jump) +} + +func TestNode_longestMatchCase4(t *testing.T) { + keywords := []string{ + "abcde", + "bcdf", + "bcd", + } + trie := new(node) + for _, keyword := range keywords { + trie.add(keyword) + } + trie.build() + + uselessLen, matchLen, paths := trie.longestMatch([]rune("abcdf"), nil) + assert.Equal(t, 1, uselessLen) + assert.Equal(t, 0, matchLen) + assert.Equal(t, 4, len(paths)) +} + +func TestNode_longestMatchCase5(t *testing.T) { + keywords := []string{ + "abcdef", + "bcde", + } + trie := new(node) + for _, keyword := range keywords { + trie.add(keyword) + } + trie.build() + + uselessLen, matchLen, paths := trie.longestMatch([]rune("abcde"), nil) + assert.Equal(t, 1, uselessLen) + assert.Equal(t, 4, matchLen) + assert.Nil(t, paths) +} + +func TestNode_longestMatchCase6(t *testing.T) { + keywords := []string{ + "abcde", + "bc", + "d", + } + trie := new(node) + for _, keyword := range keywords { + trie.add(keyword) + } + trie.build() + + uselessLen, matchLen, jump := trie.longestMatch([]rune("abcd"), nil) + assert.Equal(t, 1, uselessLen) + assert.Equal(t, 2, matchLen) + assert.Nil(t, jump) +} + +func TestNode_longestMatchCase7(t *testing.T) { + keywords := []string{ + "abcdeg", + "cdef", + "bcde", + } + trie := new(node) + for _, keyword := range keywords { + trie.add(keyword) + } + trie.build() + + word := []rune("abcdef") + uselessLen, matchLen, paths := trie.longestMatch(word, nil) + assert.Equal(t, 1, uselessLen) + assert.Equal(t, 4, matchLen) + assert.Nil(t, paths) + uselessLen, matchLen, paths = trie.longestMatch(word[uselessLen+matchLen:], paths) + assert.Equal(t, 1, uselessLen) + assert.Equal(t, 0, matchLen) + assert.Nil(t, paths) +} + +func TestNode_longestMatchCase8(t *testing.T) { + keywords := []string{ + "abcdeg", + "cdef", + "cde", + } + trie := new(node) + for _, keyword := range keywords { + trie.add(keyword) + } + trie.build() + + word := []rune("abcdef") + uselessLen, matchLen, paths := trie.longestMatch(word, nil) + assert.Equal(t, 2, uselessLen) + assert.Equal(t, 0, matchLen) + assert.NotNil(t, paths) +} + +func TestNode_longestMatchCase9(t *testing.T) { + keywords := []string{ + "abcdeg", + "cdef", + "cde", + "cd", + } + trie := new(node) + for _, keyword := range keywords { + trie.add(keyword) + } + trie.build() + + word := []rune("abcde") + uselessLen, matchLen, paths := trie.longestMatch(word, nil) + assert.Equal(t, 2, uselessLen) + assert.Equal(t, 3, matchLen) + assert.Nil(t, paths) +} + +func TestNode_jump(t *testing.T) { + keywords := []string{ + "de", + "fe", + } + trie := new(node) + for _, keyword := range keywords { + trie.add(keyword) + } + trie.build() + target := []rune("dfe") + + uselessLen, matchLen, paths := trie.longestMatch(target, nil) + assert.Equal(t, 1, uselessLen) + assert.Equal(t, 0, matchLen) + assert.NotNil(t, paths) + uselessLen, matchLen, paths = paths[len(paths)-1].longestMatch(target[uselessLen+matchLen:], paths) + assert.Equal(t, 0, uselessLen) + assert.Equal(t, 2, matchLen) + assert.Nil(t, paths) +} diff --git a/core/stringx/replacer.go b/core/stringx/replacer.go index 5447b12d..29c7ded5 100644 --- a/core/stringx/replacer.go +++ b/core/stringx/replacer.go @@ -33,29 +33,26 @@ func NewReplacer(mapping map[string]string) Replacer { // Replace replaces text with given substitutes. func (r *replacer) Replace(text string) string { var buf strings.Builder - var nextStart int target := []rune(text) cur := r.node - + var paths []*node for len(target) != 0 { - used, jump, matched := cur.longestMatch(target, nextStart) - if matched { - replaced := r.mapping[string(target[:used])] - target = append([]rune(replaced), target[used:]...) - cur = r.node - nextStart = 0 + uselessLen, matchLen, nextPaths := cur.longestMatch(target, paths) + if uselessLen > 0 { + buf.WriteString(string(target[:uselessLen])) + target = target[uselessLen:] + } + if matchLen > 0 { + replaced := r.mapping[string(target[:matchLen])] + target = append([]rune(replaced), target[matchLen:]...) + } + if len(nextPaths) != 0 { + cur = nextPaths[len(nextPaths)-1] + paths = nextPaths } else { - buf.WriteString(string(target[:used])) - target = target[used:] - if jump != nil { - cur = jump - nextStart = jump.depth - } else { - cur = r.node - nextStart = 0 - } + cur = r.node + paths = nil } } - return buf.String() } diff --git a/core/stringx/replacer_test.go b/core/stringx/replacer_test.go index df35448b..0f8953f5 100644 --- a/core/stringx/replacer_test.go +++ b/core/stringx/replacer_test.go @@ -15,6 +15,15 @@ func TestReplacer_Replace(t *testing.T) { assert.Equal(t, "零1234五", NewReplacer(mapping).Replace("零一二三四五")) } +func TestReplacer_ReplaceJumpMatch(t *testing.T) { + mapping := map[string]string{ + "abcdeg": "ABCDEG", + "cdef": "CDEF", + "cde": "CDE", + } + assert.Equal(t, "abCDEF", NewReplacer(mapping).Replace("abcdef")) +} + func TestReplacer_ReplaceOverlap(t *testing.T) { mapping := map[string]string{ "3d": "34", @@ -44,6 +53,14 @@ func TestReplacer_ReplacePartialMatch(t *testing.T) { assert.Equal(t, "零一二三四五", NewReplacer(mapping).Replace("零一二三四五")) } +func TestReplacer_ReplacePartialMatchEnds(t *testing.T) { + mapping := map[string]string{ + "二三四七": "2347", + "三四": "34", + } + assert.Equal(t, "零一二34", NewReplacer(mapping).Replace("零一二三四")) +} + func TestReplacer_ReplaceMultiMatches(t *testing.T) { mapping := map[string]string{ "二三": "23", @@ -60,6 +77,29 @@ func TestReplacer_ReplaceLongestMatching(t *testing.T) { assert.Equal(t, "东京在japan", replacer.Replace("日本的首都在日本")) } +func TestReplacer_ReplaceSuffixMatch(t *testing.T) { + // case1 + { + keywords := map[string]string{ + "abcde": "ABCDE", + "bcde": "BCDE", + "bcd": "BCD", + } + assert.Equal(t, "aBCDf", NewReplacer(keywords).Replace("abcdf")) + } + // case2 + { + keywords := map[string]string{ + "abcde": "ABCDE", + "bcde": "BCDE", + "cde": "CDE", + "c": "C", + "cd": "CD", + } + assert.Equal(t, "abCDf", NewReplacer(keywords).Replace("abcdf")) + } +} + func TestReplacer_ReplaceLongestOverlap(t *testing.T) { keywords := map[string]string{ "456": "def",