直接上测试结果:
1000000数据集。 1000000关键词(匹配词)
装载消耗时间:20869 毫秒
匹配消耗时间:6599 毫秒
代码和测试案例:
- package com.baian.tggroupmessagematchkeyword.ac;
-
- import lombok.Data;
-
- import java.util.*;
-
- /**
- * @program: tg-parent
- * @description: ac
- * @author: <发哥讲Java-694204477@qq.com>
- * @create: 2023-09-19 17:20
- **/
- @Data
- public class AhoCorasick {
- private TrieNode root;
-
- public AhoCorasick() {
- root = new TrieNode();
- }
-
- public void addKeyword(String keyword) {
- TrieNode current = root;
-
- for (char ch : keyword.toCharArray()) {
- current = current.getChildren().computeIfAbsent(ch, c -> new TrieNode());
- }
-
- current.setEndOfWord(true);
- current.addKeyword(keyword);
- }
-
- public void buildFailureLinks() {
- Queue
queue = new LinkedList<>(); - root.setFailure(null);
- queue.offer(root);
-
- while (!queue.isEmpty()) {
- TrieNode current = queue.poll();
-
- for (TrieNode child : current.getChildren().values()) {
- TrieNode failure = current.getFailure();
-
- while (failure != null && !failure.getChildren().containsKey(child.getKey())) {
- failure = failure.getFailure();
- }
-
- if (failure == null) {
- child.setFailure(root);
- } else {
- child.setFailure(failure.getChildren().get(child.getKey()));
- child.addAllKeywords(child.getFailure().getKeywords());
- }
-
- queue.offer(child);
- }
- }
- }
-
- public List
searchKeywords(String text) { - List
result = new ArrayList<>(); - TrieNode current = root;
-
- for (int i = 0; i < text.length(); i++) {
- char ch = text.charAt(i);
-
- while (current != null && !current.getChildren().containsKey(ch)) {
- current = current.getFailure();
- }
-
- if (current == null) {
- current = root;
- } else {
- current = current.getChildren().get(ch);
- if (current.isEndOfWord()) {
- result.addAll(current.getKeywords());
- }
-
- TrieNode failure = current.getFailure();
- while (failure != null) {
- if (failure.isEndOfWord()) {
- result.addAll(failure.getKeywords());
- }
- failure = failure.getFailure();
- }
- }
- }
-
- return result;
- }
-
- public static class TrieNode {
- private char key;
- private boolean endOfWord;
- private TrieNode failure;
- private Map
children; - private List
keywords; -
- public TrieNode() {
- children = new HashMap<>();
- keywords = new ArrayList<>();
- }
-
- public char getKey() {
- return key;
- }
-
- public void setKey(char key) {
- this.key = key;
- }
-
- public boolean isEndOfWord() {
- return endOfWord;
- }
-
- public void setEndOfWord(boolean endOfWord) {
- this.endOfWord = endOfWord;
- }
-
- public TrieNode getFailure() {
- return failure;
- }
-
- public void setFailure(TrieNode failure) {
- this.failure = failure;
- }
-
- public Map
getChildren() { - return children;
- }
-
- public List
getKeywords() { - return keywords;
- }
-
- public void addKeyword(String keyword) {
- keywords.add(keyword);
- }
-
- public void addAllKeywords(List
keywords) { - this.keywords.addAll(keywords);
- }
- }
- }
main:
- package test;
-
- import com.baian.tggroupmessagematchkeyword.ac.AhoCorasick;
-
- import java.util.ArrayList;
- import java.util.List;
- import java.util.UUID;
-
- /**
- * @program: tg-parent
- * @description: 多样本数据集 测试。
- * @author: <发哥讲Java-694204477@qq.com>
- * @create: 2023-09-19 14:11
- **/
- public class TestMain001 {
- public static void main(String[] args) {
- long start0 = System.currentTimeMillis();
- List
datas = new ArrayList<>(1000000); - for (int i = 0; i < 1000000; i++) {
- datas.add(UUID.randomUUID().toString() + UUID.randomUUID().toString());
- }
-
- AhoCorasick ahoCorasick2 = new AhoCorasick();
- for (int i = 0; i < 1000000; i++) {
- ahoCorasick2.addKeyword(UUID.randomUUID().toString());
- }
- ahoCorasick2.addKeyword("11");
- ahoCorasick2.addKeyword("22");
- ahoCorasick2.buildFailureLinks();
- long end0 = System.currentTimeMillis();
- System.out.println("装载消耗时间:" + (end0 - start0));
-
- long start = System.currentTimeMillis();
- for (String message : datas) {
- List
stringList = ahoCorasick2.searchKeywords(message); - if (stringList.size() > 0) {
- // System.out.println(stringList + " message:" + message + " size:" + stringList.size());
- }
- }
-
- long end = System.currentTimeMillis();
- System.out.println("消耗时间:" + (end - start));
-
- }
- }