diff --git a/doc/images/trie.png b/doc/images/trie.png new file mode 100644 index 00000000..fbcd2294 Binary files /dev/null and b/doc/images/trie.png differ diff --git a/doc/keywords.md b/doc/keywords.md new file mode 100644 index 00000000..5c8a0db9 --- /dev/null +++ b/doc/keywords.md @@ -0,0 +1,61 @@ +# 高效的关键词替换和敏感词过滤工具 + +## 1. 算法介绍 + +利用高效的Trie树建立关键词树,如下图所示,然后依次查找字符串中的相连字符是否形成树的一条路径。 + +<img src="images/trie.png" alt="trie" width="250" height="250"> + +发现掘金上[这篇文章](https://juejin.im/post/6844903750490914829)写得比较详细,可以一读,具体原理在此不详述。 + +## 2. 关键词替换 + +```go + replacer := stringx.NewReplacer(map[string]string{ + "PHP": "PPT", + "世界上": "吹牛", + }) + fmt.Println(replacer.Replace("PHP是世界上最好的语言!")) +``` + +可以得到: +``` +PPT是吹牛最好的语言! +``` + +示例代码见`example/stringx/replace/replace.go` + +## 3. 敏感词过滤 + +```go + filter := stringx.NewTrie([]string{ + "AV演员", + "苍井空", + "AV", + "日本AV女优", + "AV演员色情", + }, stringx.WithMask('?')) + safe, keywords, found := filter.Filter("日本AV演员兼电视、电影演员。苍井空AV女优是xx出道, 日本AV女优们最精彩的表演是AV演员色情表演") + fmt.Println(safe) + fmt.Println(keywords) + fmt.Println(found) +``` + +可以得到: + +``` +日本????兼电视、电影演员。?????女优是xx出道, ??????们最精彩的表演是??????表演 +[苍井空 日本AV女优 AV演员色情 AV AV演员] +true +``` + +示例代码见`example/stringx/filter/filter.go` + +## 4. 
Benchmark + +``` +| Sentences | Keywords | Regex | Go-Zero | +|-----------|----------|----------|----------| +| 10000 | 10000 | 16min10s | 27.2ms | +``` + diff --git a/example/stringx/filter/filter.go b/example/stringx/filter/filter.go new file mode 100644 index 00000000..9d4d6647 --- /dev/null +++ b/example/stringx/filter/filter.go @@ -0,0 +1,21 @@ +package main + +import ( + "fmt" + + "github.com/tal-tech/go-zero/core/stringx" +) + +func main() { + filter := stringx.NewTrie([]string{ + "AV演员", + "苍井空", + "AV", + "日本AV女优", + "AV演员色情", + }, stringx.WithMask('?')) + safe, keywords, found := filter.Filter("日本AV演员兼电视、电影演员。苍井空AV女优是xx出道, 日本AV女优们最精彩的表演是AV演员色情表演") + fmt.Println(safe) + fmt.Println(keywords) + fmt.Println(found) +} diff --git a/example/stringx/replace/replace.go b/example/stringx/replace/replace.go new file mode 100644 index 00000000..4c6961c7 --- /dev/null +++ b/example/stringx/replace/replace.go @@ -0,0 +1,15 @@ +package main + +import ( + "fmt" + + "github.com/tal-tech/go-zero/core/stringx" +) + +func main() { + replacer := stringx.NewReplacer(map[string]string{ + "PHP": "PPT", + "世界上": "吹牛", + }) + fmt.Println(replacer.Replace("PHP是世界上最好的语言!")) +} diff --git a/readme.md b/readme.md index 8b2cf2ef..539c19ea 100644 --- a/readme.md +++ b/readme.md @@ -182,7 +182,8 @@ go-zero是一个集成了各种工程实践的包含web和rpc框架,有如下 ## 9. 文档 * [goctl使用帮助](doc/goctl.md) +* [关键词替换和敏感词过滤工具](doc/keywords.md) -### 微信交流群 +## 10. 微信交流群 添加我的微信:kevwan,请注明go-zero,我拉进go-zero社区群🤝