Java实现AC自动机全文检索示例

本篇文章主要介绍了Java实现AC自动机全文检索示例,小编觉得挺不错的,现在分享给大家,也给大家做个参考。一起跟随小编过来看看吧

第一步,构建Trie树,定义Node类型:

/**
 * Created by zhaoyy on 2017/2/7.
 *
 * <p>A single node of the trie that backs the automaton. Besides the usual
 * parent/child links, every node carries a "fail" link that can be followed
 * when no child matches the next input character.
 */
interface Node {

    /** Returns the character stored in this node. */
    char value();

    /**
     * Returns the flag last stored through {@link #setExists(boolean)}.
     * NOTE(review): presumably marks that a dictionary word ends at this
     * node - confirm against the trie-building code.
     */
    boolean exists();

    /** Returns {@code true} if this node is the root of the trie. */
    boolean isRoot();

    /** Returns the parent node, or {@code null} when there is none. */
    Node parent();

    /**
     * Returns the child holding character {@code c}, or {@code null} if this
     * node has no such child.
     */
    Node childOf(char c);

    /** Returns the fail link, or {@code null} if none has been set. */
    Node fail();

    /** Sets the fail link of this node. */
    void setFail(Node node);

    /** Sets the flag returned by {@link #exists()}. */
    void setExists(boolean exists);

    /** Registers {@code child} under the character it holds. */
    void add(Node child);

    /** Returns the children of this node; the element type is {@code Node}, not a raw list. */
    List<Node> children();
}

第二步,实现两种Node,如果词汇全是可打印的ASCII字符,就采用AsciiNode,否则(比如包含汉字),使用基于hash表的MapNode;这两种Node均继承自AbstractNode:

/** * Created by zhaoyy on 2017/2/8. */ abstract class AbstractNode implements Node { private static final char EMPTY = ''; private final char value; private final Node parent; private boolean exists; private Node fail; public AbstractNode(Node parent, char value) { this.parent = parent; this.value = value; this.exists = false; this.fail = null; } public AbstractNode() { this(null, EMPTY); } private static String tab(int n) { StringBuilder builder = new StringBuilder(); for (int i = 0; i rn"); for (Node child : node.children()) builder.append(toString(child, depth + 1)); builder .append(tab) .append("")>rn"); return builder.toString(); } @Override public char value() { return value; } @Override public boolean exists() { return exists; } @Override public boolean isRoot() { return value == EMPTY; } @Override public Node parent() { return parent; } @Override public Node fail() { return fail; } @Override public void setFail(Node node) { this.fail = node; } @Override public void setExists(boolean exists) { this.exists = exists; } @Override public String toString() { return toString(this, 0); } } ///////////////////////////////////////////////////////////////////////////////////////////// /** * Created by zhaoyy on 2017/2/8. */ final class AsciiNode extends AbstractNode implements Node { private static final char FROM = 32; private static final char TO = 126; private final Node[] children; public AsciiNode() { super(); this.children = new Node[TO - FROM + 1]; } public AsciiNode(Node parent, char value) { super(parent, value); this.children = new Node[TO - FROM + 1]; } @Override public Node childOf(char c) { if (c >= FROM && c children() { List nodes = new ArrayList(); for (Node child : children) if (child != null) nodes.add(child); return nodes; } } ////////////////////////////////////////////////////////////////////////////////////////////// /** * Created by zhaoyy on 2017/2/8. 
*/ final class MapNode extends AbstractNode implements Node { private final Map children; public MapNode() { super(); this.children = new HashMap(); } public MapNode(Node parent, char value) { super(parent, value); this.children = new HashMap(); } @Override public Node childOf(char c) { return children.get(c); } @Override public void add(Node child) { children.put(child.value(), child); } @Override public List children() { List nodes = new ArrayList(); nodes.addAll(children.values()); return nodes; } }

第三步,

首先定义一个Node构造器:

/**
 * Created by zhaoyy on 2017/2/8.
 *
 * <p>Factory for trie nodes, so the code that builds the trie does not need
 * to know which concrete {@code Node} implementation is in use.
 */
public interface NodeMaker {

    /** Creates the root node of a new trie. */
    Node makeRoot();

    /**
     * Creates a non-root node.
     *
     * @param parent the node the new node hangs below
     * @param value the character the new node holds
     * @return the newly created node
     */
    Node make(Node parent, char value);
}

然后构建AC自动机,实现创建及查找方法

/** * Created by zhaoyy on 2017/2/7. */ public final class WordTable { private final Node root; private WordTable(Collection extends CharSequence> words, NodeMaker maker) { Node root = buildTrie(words, maker); setFailNode(root); this.root = root; } public static WordTable compile(Collection extends CharSequence> words) { if (words == null || words.isEmpty()) throw new IllegalArgumentException(); final NodeMaker maker; if (isAscii(words)) maker = new NodeMaker() { @Override public Node make(Node parent, char value) { return new AsciiNode(parent, value); } @Override public Node makeRoot() { return new AsciiNode(); } }; else maker = new NodeMaker() { @Override public Node make(Node parent, char value) { return new MapNode(parent, value); } @Override public Node makeRoot() { return new MapNode(); } }; return new WordTable(words, maker); } private static boolean isAscii(Collection extends CharSequence> words) { for (CharSequence word : words) { int len = word.length(); for (int i = 0; i 126) return false; } } return true; } private static Node buildTrie(Collection extends CharSequence> sequences, NodeMaker maker) { Node root = maker.makeRoot(); for (CharSequence sequence : sequences) { int len = sequence.length(); Node current = root; for (int i = 0; i queue = new LinkedList(); queue.add(root); while (!queue.isEmpty()) { Node parent = queue.poll(); Node temp; for (Node child : parent.children()) { if (parent.isRoot()) child.setFail(root); else { temp = parent.fail(); while (temp != null) { Node node = temp.childOf(child.value()); if (node != null) { child.setFail(node); break; } temp = temp.fail(); } if (temp == null) child.setFail(root); } queue.add(child); } } } public boolean findAnyIn(CharSequence cs) { int len = cs.length(); Node node = root; for (int i = 0; i search(CharSequence cs) { if (cs == null || cs.length() == 0) return Collections.emptyList(); List result = new ArrayList(); int len = cs.length(); Node node = root; for (int i = 0; i

定义一个保存查找结果的实体:

/**
 * Created by zhaoyy on 2017/2/7.
 *
 * <p>Immutable record of one match: the matched word and the index it is
 * reported at.
 */
public final class MatchInfo {

    private final int index;
    private final String word;

    /**
     * Creates a match from an already known word.
     *
     * @param index index stored for the match
     * @param word the matched word
     */
    public MatchInfo(int index, String word) {
        this.index = index;
        this.word = word;
    }

    /**
     * Creates a match from a trie node. The word is rebuilt from the node's
     * ancestors, and the stored index is {@code index + 1 - word.length()},
     * i.e. {@code index} is treated as the position of the word's last
     * character.
     *
     * @param index position of the last matched character
     * @param node trie node to rebuild the word from
     */
    public MatchInfo(int index, Node node) {
        String rebuilt = spell(node);
        this.word = rebuilt;
        this.index = index + 1 - rebuilt.length();
    }

    /**
     * Walks the parent links from {@code start} until they run out, collecting
     * the character of every non-root node, and returns them in root-to-node
     * order.
     */
    private static String spell(Node start) {
        StringBuilder backwards = new StringBuilder();
        for (Node cursor = start; cursor != null; cursor = cursor.parent()) {
            if (cursor.isRoot()) {
                continue;
            }
            backwards.append(cursor.value());
        }
        // Characters were gathered deepest-first; flip them into reading order.
        return backwards.reverse().toString();
    }

    /** Returns the stored index of the match. */
    public int getIndex() {
        return index;
    }

    /** Returns the matched word. */
    public String getWord() {
        return word;
    }

    /** Formats the match as {@code index:word}. */
    @Override
    public String toString() {
        return new StringBuilder()
                .append(index)
                .append(":")
                .append(word)
                .toString();
    }
}

第四步,调用Demo:

/**
 * Demo: compiles a small word list into a {@code WordTable}, prints the
 * table, then prints every match found in a sample string.
 */
public static void main(String[] args) {
    // Typed list instead of a raw List, so compile() receives a checked
    // collection of CharSequence elements.
    List<String> list = Arrays.asList("say", "her", "he", "she", "shr", "alone");
    WordTable table = WordTable.compile(list);
    System.out.println(table);
    System.out.println(table.search("1shesaynothingabouthislivinghimalone"));
}

以下是输出结果:

> [1:she, 4:say, 31:alone]

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持编程之家。

相关文章

HashMap是Java中最常用的集合类框架,也是Java语言中非常典型...
在EffectiveJava中的第 36条中建议 用 EnumSet 替代位字段,...
介绍 注解是JDK1.5版本开始引入的一个特性,用于对代码进行说...
介绍 LinkedList同时实现了List接口和Deque接口,也就是说它...
介绍 TreeSet和TreeMap在Java里有着相同的实现,前者仅仅是对...
HashMap为什么线程不安全 put的不安全 由于多线程对HashMap进...