• 高性能AC算法多关键词匹配文本功能Java实现


    直接上测试结果:

    1000000数据集。
    1000000关键词(匹配词)

    装载消耗时间:20869 毫秒
    匹配消耗时间:6599 毫秒

    代码和测试案例:

    1. package com.baian.tggroupmessagematchkeyword.ac;
    2. import lombok.Data;
    3. import java.util.*;
    4. /**
    5. * @program: tg-parent
    6. * @description: ac
    7. * @author: <发哥讲Java-694204477@qq.com>
    8. * @create: 2023-09-19 17:20
    9. **/
    10. @Data
    11. public class AhoCorasick {
    12. private TrieNode root;
    13. public AhoCorasick() {
    14. root = new TrieNode();
    15. }
    16. public void addKeyword(String keyword) {
    17. TrieNode current = root;
    18. for (char ch : keyword.toCharArray()) {
    19. current = current.getChildren().computeIfAbsent(ch, c -> new TrieNode());
    20. }
    21. current.setEndOfWord(true);
    22. current.addKeyword(keyword);
    23. }
    24. public void buildFailureLinks() {
    25. Queue queue = new LinkedList<>();
    26. root.setFailure(null);
    27. queue.offer(root);
    28. while (!queue.isEmpty()) {
    29. TrieNode current = queue.poll();
    30. for (TrieNode child : current.getChildren().values()) {
    31. TrieNode failure = current.getFailure();
    32. while (failure != null && !failure.getChildren().containsKey(child.getKey())) {
    33. failure = failure.getFailure();
    34. }
    35. if (failure == null) {
    36. child.setFailure(root);
    37. } else {
    38. child.setFailure(failure.getChildren().get(child.getKey()));
    39. child.addAllKeywords(child.getFailure().getKeywords());
    40. }
    41. queue.offer(child);
    42. }
    43. }
    44. }
    45. public List searchKeywords(String text) {
    46. List result = new ArrayList<>();
    47. TrieNode current = root;
    48. for (int i = 0; i < text.length(); i++) {
    49. char ch = text.charAt(i);
    50. while (current != null && !current.getChildren().containsKey(ch)) {
    51. current = current.getFailure();
    52. }
    53. if (current == null) {
    54. current = root;
    55. } else {
    56. current = current.getChildren().get(ch);
    57. if (current.isEndOfWord()) {
    58. result.addAll(current.getKeywords());
    59. }
    60. TrieNode failure = current.getFailure();
    61. while (failure != null) {
    62. if (failure.isEndOfWord()) {
    63. result.addAll(failure.getKeywords());
    64. }
    65. failure = failure.getFailure();
    66. }
    67. }
    68. }
    69. return result;
    70. }
    71. public static class TrieNode {
    72. private char key;
    73. private boolean endOfWord;
    74. private TrieNode failure;
    75. private Map children;
    76. private List keywords;
    77. public TrieNode() {
    78. children = new HashMap<>();
    79. keywords = new ArrayList<>();
    80. }
    81. public char getKey() {
    82. return key;
    83. }
    84. public void setKey(char key) {
    85. this.key = key;
    86. }
    87. public boolean isEndOfWord() {
    88. return endOfWord;
    89. }
    90. public void setEndOfWord(boolean endOfWord) {
    91. this.endOfWord = endOfWord;
    92. }
    93. public TrieNode getFailure() {
    94. return failure;
    95. }
    96. public void setFailure(TrieNode failure) {
    97. this.failure = failure;
    98. }
    99. public Map getChildren() {
    100. return children;
    101. }
    102. public List getKeywords() {
    103. return keywords;
    104. }
    105. public void addKeyword(String keyword) {
    106. keywords.add(keyword);
    107. }
    108. public void addAllKeywords(List keywords) {
    109. this.keywords.addAll(keywords);
    110. }
    111. }
    112. }

    main:

    1. package test;
    2. import com.baian.tggroupmessagematchkeyword.ac.AhoCorasick;
    3. import java.util.ArrayList;
    4. import java.util.List;
    5. import java.util.UUID;
    6. /**
    7. * @program: tg-parent
    8. * @description: 多样本数据集 测试。
    9. * @author: <发哥讲Java-694204477@qq.com>
    10. * @create: 2023-09-19 14:11
    11. **/
    12. public class TestMain001 {
    13. public static void main(String[] args) {
    14. long start0 = System.currentTimeMillis();
    15. List datas = new ArrayList<>(1000000);
    16. for (int i = 0; i < 1000000; i++) {
    17. datas.add(UUID.randomUUID().toString() + UUID.randomUUID().toString());
    18. }
    19. AhoCorasick ahoCorasick2 = new AhoCorasick();
    20. for (int i = 0; i < 1000000; i++) {
    21. ahoCorasick2.addKeyword(UUID.randomUUID().toString());
    22. }
    23. ahoCorasick2.addKeyword("11");
    24. ahoCorasick2.addKeyword("22");
    25. ahoCorasick2.buildFailureLinks();
    26. long end0 = System.currentTimeMillis();
    27. System.out.println("装载消耗时间:" + (end0 - start0));
    28. long start = System.currentTimeMillis();
    29. for (String message : datas) {
    30. List stringList = ahoCorasick2.searchKeywords(message);
    31. if (stringList.size() > 0) {
    32. // System.out.println(stringList + " message:" + message + " size:" + stringList.size());
    33. }
    34. }
    35. long end = System.currentTimeMillis();
    36. System.out.println("消耗时间:" + (end - start));
    37. }
    38. }

  • 相关阅读:
    代码随想录一刷last day|84.柱状图中最大的矩形
    Zookeeper实战案例(1)
    【uniapp】确认弹出框,选择确定和取消
    急救医学-习题集-复习资料-题库
    java开发之个人微信的二次开发
    Mybatis简介
    Spring、MyBatis框架和Redis数据库介绍 第1关:Spring 框架简介
    设计模式与应用:备忘录模式
    Android四大组件详解
    springboot+高校失物招领系统 毕业设计-附源码121441
  • 原文地址:https://blog.csdn.net/qq_42390636/article/details/133044598