fix Replacer suffix match, and add test case (#2867)

* fix: replace should replace the longest match

* feat: revert bytes.Buffer to strings.Builder

* fix: loop reset nextStart

* feat: add node longest match test

* feat: add replacer suffix match test case

* feat: multiple match

* fix: partial match ends

* fix: replace look back upon error

* feat: rm unnecessary branch

---------

Co-authored-by: hudahai <hscxrzs@gmail.com>
Co-authored-by: hushichang <hushichang@sensetime.com>
master
dahaihu 2 years ago committed by GitHub
parent 3736dacf1e
commit cacd5dc91a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -14,7 +14,6 @@ func (n *node) add(word string) {
} }
nd := n nd := n
var depth int
for i, char := range chars { for i, char := range chars {
if nd.children == nil { if nd.children == nil {
child := new(node) child := new(node)
@ -23,7 +22,6 @@ func (n *node) add(word string) {
nd = child nd = child
} else if child, ok := nd.children[char]; ok { } else if child, ok := nd.children[char]; ok {
nd = child nd = child
depth++
} else { } else {
child := new(node) child := new(node)
child.depth = i + 1 child.depth = i + 1
@ -99,51 +97,91 @@ func (n *node) find(chars []rune) []scope {
return scopes return scopes
} }
func (n *node) longestMatch(chars []rune, start int) (used int, jump *node, matched bool) { func (n *node) longestMatch(chars []rune, paths []*node) (uselessLen, matchLen int, nextPaths []*node) {
cur := n cur := n
var matchedNode *node var longestMatched *node
findMatch := func(path []*node) (*node, int) {
var (
result *node
start int
)
for i := len(path) - 1; i >= 0; i-- {
icur := path[i]
var cur *node
for icur.fail != nil {
if icur.fail.end {
cur = icur.fail
break
}
icur = icur.fail
}
if cur != nil {
if result == nil {
result = cur
start = i - result.depth + 1
} else if curStart := i - cur.depth + 1; curStart < start {
result = cur
start = curStart
}
}
}
return result, start
}
for i := start; i < len(chars); i++ { for i := len(paths); i < len(chars); i++ {
child, ok := cur.children[chars[i]] char := chars[i]
child, ok := cur.children[char]
if ok { if ok {
cur = child cur = child
if cur.end { if cur.end {
matchedNode = cur longestMatched = cur
} }
paths = append(paths, cur)
} else { } else {
if matchedNode != nil { if longestMatched != nil {
return matchedNode.depth, nil, true return 0, longestMatched.depth, nil
} }
if n.end { if n.end {
return start, nil, true return 0, n.depth, nil
} }
// old path pre longest preMatch
preMatch, preStart := findMatch(paths)
// new path match
var jump *node var jump *node
for cur.fail != nil { icur := cur
jump, ok = cur.fail.children[chars[i]] for icur.fail != nil {
jump, ok = icur.fail.children[char]
if ok { if ok {
break break
} }
cur = cur.fail icur = icur.fail
} }
if jump != nil { switch {
return i + 1 - jump.depth, jump, false case preMatch != nil && jump != nil:
if jumpStart := i - jump.depth + 1; preStart < jumpStart {
return preStart, preMatch.depth, nil
} else {
return jumpStart, 0, append(paths[jumpStart:], jump)
} }
case preMatch != nil && jump == nil:
return i + 1, nil, false return preStart, preMatch.depth, nil
case preMatch == nil && jump != nil:
return i - jump.depth + 1, 0, append(paths[i-jump.depth+1:], jump)
case preMatch == nil && jump == nil:
return i + 1, 0, nil
} }
} }
// longest matched node
if matchedNode != nil {
return matchedNode.depth, nil, true
} }
// this longest matched node
// last matched node if longestMatched != nil {
return 0, longestMatched.depth, nil
}
if n.end { if n.end {
return start, nil, true return 0, n.depth, nil
} }
match, start := findMatch(paths)
return len(chars), nil, false if match != nil {
return start, match.depth, nil
}
return len(chars), 0, nil
} }

@ -9,10 +9,10 @@ import (
func TestLongestMatchGuardedCondition(t *testing.T) { func TestLongestMatchGuardedCondition(t *testing.T) {
n := new(node) n := new(node)
n.end = true n.end = true
used, jump, matched := n.longestMatch([]rune(""), 0) uselessLen, matchLen, jump := n.longestMatch([]rune(""), nil)
assert.Equal(t, 0, used) assert.Equal(t, 0, uselessLen)
assert.Nil(t, jump) assert.Nil(t, jump)
assert.True(t, matched) assert.Equal(t, 0, matchLen)
} }
func TestFuzzNodeCase1(t *testing.T) { func TestFuzzNodeCase1(t *testing.T) {
@ -202,3 +202,228 @@ func BenchmarkNodeFind(b *testing.B) {
trie.find([]rune("日本AV演员兼电视、电影演员。无名氏AV女优是xx出道, 日本AV女优们最精彩的表演是AV演员色情表演")) trie.find([]rune("日本AV演员兼电视、电影演员。无名氏AV女优是xx出道, 日本AV女优们最精彩的表演是AV演员色情表演"))
} }
} }
// TestNode_longestMatchCase0 checks that when several keywords share a
// prefix, the longest keyword found at the start of the input is reported.
func TestNode_longestMatchCase0(t *testing.T) {
	root := &node{}
	for _, kw := range []string{"a", "ab", "abc", "abcd"} {
		root.add(kw)
	}
	root.build()

	// "abcef" begins with "a", "ab" and "abc"; the expectation is the
	// three-rune match, nothing skipped, and no pending path.
	skipped, matched, pending := root.longestMatch([]rune("abcef"), nil)
	assert.Equal(t, 0, skipped)
	assert.Equal(t, 3, matched)
	assert.Nil(t, pending)
}
// TestNode_longestMatchCase1 covers an input that follows the keyword
// "abcde" for four runes and then diverges; the shorter keyword "bc" that
// starts one rune in is expected to be reported instead.
func TestNode_longestMatchCase1(t *testing.T) {
	root := &node{}
	for _, kw := range []string{"abcde", "bcde", "cde", "de", "b", "bc"} {
		root.add(kw)
	}
	root.build()

	// Expected: skip "a" (1 rune), match "bc" (2 runes), no pending path.
	skipped, matched, pending := root.longestMatch([]rune("abcdf"), nil)
	assert.Equal(t, 1, skipped)
	assert.Equal(t, 2, matched)
	assert.Nil(t, pending)
}
// TestNode_longestMatchCase2 is the same scenario as Case1 but with the
// short keywords starting at "c", so two runes must be skipped before the
// reported match.
func TestNode_longestMatchCase2(t *testing.T) {
	root := &node{}
	for _, kw := range []string{"abcde", "bcde", "cde", "de", "c", "cd"} {
		root.add(kw)
	}
	root.build()

	// Expected: skip "ab" (2 runes), match "cd" (2 runes), no pending path.
	skipped, matched, pending := root.longestMatch([]rune("abcdf"), nil)
	assert.Equal(t, 2, skipped)
	assert.Equal(t, 2, matched)
	assert.Nil(t, pending)
}
// TestNode_longestMatchCase3 combines the short keywords of Case1 and Case2.
// With candidates starting at both "b" and "c", the one that starts earlier
// in the input is expected to be reported.
func TestNode_longestMatchCase3(t *testing.T) {
	root := &node{}
	for _, kw := range []string{"abcde", "bcde", "cde", "de", "b", "bc", "c", "cd"} {
		root.add(kw)
	}
	root.build()

	// Expected: skip "a" (1 rune), match "bc" (2 runes), no pending path.
	skipped, matched, pending := root.longestMatch([]rune("abcdf"), nil)
	assert.Equal(t, 1, skipped)
	assert.Equal(t, 2, matched)
	assert.Nil(t, pending)
}
// TestNode_longestMatchCase4 covers the case where the call does not report
// a match itself but hands back a path for the caller to continue from.
func TestNode_longestMatchCase4(t *testing.T) {
	root := &node{}
	for _, kw := range []string{"abcde", "bcdf", "bcd"} {
		root.add(kw)
	}
	root.build()

	// Expected: one rune skipped, no match length yet, and a four-node path
	// returned (presumably the nodes along "bcdf" — verify against
	// longestMatch).
	skipped, matched, pending := root.longestMatch([]rune("abcdf"), nil)
	assert.Equal(t, 1, skipped)
	assert.Equal(t, 0, matched)
	assert.Equal(t, 4, len(pending))
}
// TestNode_longestMatchCase5 covers an input that ends while still inside
// the longer keyword "abcdef"; the keyword "bcde", which is a suffix of the
// consumed input, is expected to be reported.
func TestNode_longestMatchCase5(t *testing.T) {
	root := &node{}
	for _, kw := range []string{"abcdef", "bcde"} {
		root.add(kw)
	}
	root.build()

	// Expected: skip "a" (1 rune), match "bcde" (4 runes), no pending path.
	skipped, matched, pending := root.longestMatch([]rune("abcde"), nil)
	assert.Equal(t, 1, skipped)
	assert.Equal(t, 4, matched)
	assert.Nil(t, pending)
}
// TestNode_longestMatchCase6 covers an input that ends inside "abcde" while
// two shorter keywords ("bc" and "d") both occur in the consumed input; the
// earlier-starting one is expected to be reported.
func TestNode_longestMatchCase6(t *testing.T) {
	root := &node{}
	for _, kw := range []string{"abcde", "bc", "d"} {
		root.add(kw)
	}
	root.build()

	// Expected: skip "a" (1 rune), match "bc" (2 runes), no pending path.
	skipped, matched, pending := root.longestMatch([]rune("abcd"), nil)
	assert.Equal(t, 1, skipped)
	assert.Equal(t, 2, matched)
	assert.Nil(t, pending)
}
// TestNode_longestMatchCase7 runs longestMatch twice over the same input:
// once from the start, and once on the remainder left after the first
// result has been consumed.
func TestNode_longestMatchCase7(t *testing.T) {
	root := &node{}
	for _, kw := range []string{"abcdeg", "cdef", "bcde"} {
		root.add(kw)
	}
	root.build()

	input := []rune("abcdef")

	// First pass: skip "a" (1 rune), match "bcde" (4 runes), no pending path.
	skipped, matched, pending := root.longestMatch(input, nil)
	assert.Equal(t, 1, skipped)
	assert.Equal(t, 4, matched)
	assert.Nil(t, pending)

	// Second pass over what is left after the first result: the single
	// remaining rune is expected to be skipped with no match.
	skipped, matched, pending = root.longestMatch(input[skipped+matched:], pending)
	assert.Equal(t, 1, skipped)
	assert.Equal(t, 0, matched)
	assert.Nil(t, pending)
}
// TestNode_longestMatchCase8 covers an input that diverges from "abcdeg" at
// its last rune while "cdef" is still a candidate; the call is expected to
// return a path to continue from rather than a match.
func TestNode_longestMatchCase8(t *testing.T) {
	root := &node{}
	for _, kw := range []string{"abcdeg", "cdef", "cde"} {
		root.add(kw)
	}
	root.build()

	input := []rune("abcdef")

	// Expected: skip "ab" (2 runes), no match length yet, non-nil path.
	skipped, matched, pending := root.longestMatch(input, nil)
	assert.Equal(t, 2, skipped)
	assert.Equal(t, 0, matched)
	assert.NotNil(t, pending)
}
// TestNode_longestMatchCase9 covers an input that ends inside "abcdeg" while
// both "cd" and "cde" occur at the same offset; the longer of the two is
// expected to be reported.
func TestNode_longestMatchCase9(t *testing.T) {
	root := &node{}
	for _, kw := range []string{"abcdeg", "cdef", "cde", "cd"} {
		root.add(kw)
	}
	root.build()

	input := []rune("abcde")

	// Expected: skip "ab" (2 runes), match "cde" (3 runes), no pending path.
	skipped, matched, pending := root.longestMatch(input, nil)
	assert.Equal(t, 2, skipped)
	assert.Equal(t, 3, matched)
	assert.Nil(t, pending)
}
// TestNode_jump checks the two-step flow in which the first longestMatch
// call returns a path instead of a match, and matching is then resumed from
// the last node of that path.
func TestNode_jump(t *testing.T) {
	keywords := []string{
		"de",
		"fe",
	}
	trie := new(node)
	for _, keyword := range keywords {
		trie.add(keyword)
	}
	trie.build()

	target := []rune("dfe")

	// First call: "d" is expected to be skipped with no match, and a path
	// is expected back so matching can continue.
	uselessLen, matchLen, paths := trie.longestMatch(target, nil)
	assert.Equal(t, 1, uselessLen)
	assert.Equal(t, 0, matchLen)
	assert.NotNil(t, paths)
	// assert.NotNil records a failure but does not stop the test, and it
	// passes for an empty non-nil slice. Stop here so the index expression
	// below cannot panic and hide the real failure.
	if len(paths) == 0 {
		t.Fatal("longestMatch returned no jump path; cannot resume matching")
	}

	// Second call: resume from the last node of the returned path over the
	// remaining input; "fe" (2 runes) is expected to match.
	uselessLen, matchLen, paths = paths[len(paths)-1].longestMatch(target[uselessLen+matchLen:], paths)
	assert.Equal(t, 0, uselessLen)
	assert.Equal(t, 2, matchLen)
	assert.Nil(t, paths)
}

@ -33,29 +33,26 @@ func NewReplacer(mapping map[string]string) Replacer {
// Replace replaces text with given substitutes. // Replace replaces text with given substitutes.
func (r *replacer) Replace(text string) string { func (r *replacer) Replace(text string) string {
var buf strings.Builder var buf strings.Builder
var nextStart int
target := []rune(text) target := []rune(text)
cur := r.node cur := r.node
var paths []*node
for len(target) != 0 { for len(target) != 0 {
used, jump, matched := cur.longestMatch(target, nextStart) uselessLen, matchLen, nextPaths := cur.longestMatch(target, paths)
if matched { if uselessLen > 0 {
replaced := r.mapping[string(target[:used])] buf.WriteString(string(target[:uselessLen]))
target = append([]rune(replaced), target[used:]...) target = target[uselessLen:]
cur = r.node }
nextStart = 0 if matchLen > 0 {
} else { replaced := r.mapping[string(target[:matchLen])]
buf.WriteString(string(target[:used])) target = append([]rune(replaced), target[matchLen:]...)
target = target[used:] }
if jump != nil { if len(nextPaths) != 0 {
cur = jump cur = nextPaths[len(nextPaths)-1]
nextStart = jump.depth paths = nextPaths
} else { } else {
cur = r.node cur = r.node
nextStart = 0 paths = nil
} }
} }
}
return buf.String() return buf.String()
} }

@ -15,6 +15,15 @@ func TestReplacer_Replace(t *testing.T) {
assert.Equal(t, "零1234五", NewReplacer(mapping).Replace("零一二三四五")) assert.Equal(t, "零1234五", NewReplacer(mapping).Replace("零一二三四五"))
} }
func TestReplacer_ReplaceJumpMatch(t *testing.T) {
mapping := map[string]string{
"abcdeg": "ABCDEG",
"cdef": "CDEF",
"cde": "CDE",
}
assert.Equal(t, "abCDEF", NewReplacer(mapping).Replace("abcdef"))
}
func TestReplacer_ReplaceOverlap(t *testing.T) { func TestReplacer_ReplaceOverlap(t *testing.T) {
mapping := map[string]string{ mapping := map[string]string{
"3d": "34", "3d": "34",
@ -44,6 +53,14 @@ func TestReplacer_ReplacePartialMatch(t *testing.T) {
assert.Equal(t, "零一二三四五", NewReplacer(mapping).Replace("零一二三四五")) assert.Equal(t, "零一二三四五", NewReplacer(mapping).Replace("零一二三四五"))
} }
func TestReplacer_ReplacePartialMatchEnds(t *testing.T) {
mapping := map[string]string{
"二三四七": "2347",
"三四": "34",
}
assert.Equal(t, "零一二34", NewReplacer(mapping).Replace("零一二三四"))
}
func TestReplacer_ReplaceMultiMatches(t *testing.T) { func TestReplacer_ReplaceMultiMatches(t *testing.T) {
mapping := map[string]string{ mapping := map[string]string{
"二三": "23", "二三": "23",
@ -60,6 +77,29 @@ func TestReplacer_ReplaceLongestMatching(t *testing.T) {
assert.Equal(t, "东京在japan", replacer.Replace("日本的首都在日本")) assert.Equal(t, "东京在japan", replacer.Replace("日本的首都在日本"))
} }
// TestReplacer_ReplaceSuffixMatch checks replacement when the input follows
// a long keyword and then diverges, so that a shorter keyword starting
// later in the already-consumed input has to be used instead.
func TestReplacer_ReplaceSuffixMatch(t *testing.T) {
	scenarios := []struct {
		rules map[string]string
		want  string
	}{
		// case1: the fallback keyword starts one rune in.
		{
			rules: map[string]string{"abcde": "ABCDE", "bcde": "BCDE", "bcd": "BCD"},
			want:  "aBCDf",
		},
		// case2: the fallback keyword starts two runes in.
		{
			rules: map[string]string{"abcde": "ABCDE", "bcde": "BCDE", "cde": "CDE", "c": "C", "cd": "CD"},
			want:  "abCDf",
		},
	}
	for _, sc := range scenarios {
		assert.Equal(t, sc.want, NewReplacer(sc.rules).Replace("abcdf"))
	}
}
func TestReplacer_ReplaceLongestOverlap(t *testing.T) { func TestReplacer_ReplaceLongestOverlap(t *testing.T) {
keywords := map[string]string{ keywords := map[string]string{
"456": "def", "456": "def",

Loading…
Cancel
Save