1 /*
2 * Copyright 2020 The Netty Project
3 *
4 * The Netty Project licenses this file to you under the Apache License, version 2.0 (the
5 * "License"); you may not use this file except in compliance with the License. You may obtain a
6 * copy of the License at:
7 *
8 * https://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software distributed under the License
11 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12 * or implied. See the License for the specific language governing permissions and limitations under
13 * the License.
14 */
15 package io.netty.buffer.search;
16
17 /**
18 * Base class for precomputed factories that create {@link MultiSearchProcessor}s.
19 * <br>
20 * The purpose of {@link MultiSearchProcessor} is to perform efficient simultaneous search for multiple {@code needles}
21 * in the {@code haystack}, while scanning every byte of the input sequentially, only once. While it can also be used
22 * to search for just a single {@code needle}, using a {@link SearchProcessorFactory} would be more efficient for
23 * doing that.
24 * <br>
25 * See the documentation of {@link AbstractSearchProcessorFactory} for a comprehensive description of common usage.
26 * In addition to the functionality provided by {@link SearchProcessor}, {@link MultiSearchProcessor} adds
27 * a method to get the index of the {@code needle} found at the current position of the {@link MultiSearchProcessor} -
28 * {@link MultiSearchProcessor#getFoundNeedleId()}.
29 * <br>
30 * <b>Note:</b> in some cases one {@code needle} can be a suffix of another {@code needle}, eg. {@code {"BC", "ABC"}},
31 * and there can potentially be multiple {@code needles} found ending at the same position of the {@code haystack}.
32 * In such case {@link MultiSearchProcessor#getFoundNeedleId()} returns the index of the longest matching {@code needle}
33 * in the array of {@code needles}.
34 * <br>
35 * Usage example (given that the {@code haystack} is a {@link io.netty.buffer.ByteBuf} containing "ABCD" and the
36 * {@code needles} are "AB", "BC" and "CD"):
37 * <pre>
38 * MultiSearchProcessorFactory factory = MultiSearchProcessorFactory.newAhoCorasicSearchProcessorFactory(
39 * "AB".getBytes(CharsetUtil.UTF_8), "BC".getBytes(CharsetUtil.UTF_8), "CD".getBytes(CharsetUtil.UTF_8));
40 * MultiSearchProcessor processor = factory.newSearchProcessor();
41 *
42 * int idx1 = haystack.forEachByte(processor);
43 * // idx1 is 1 (index of the last character of the occurrence of "AB" in the haystack)
44 * // processor.getFoundNeedleId() is 0 (index of "AB" in needles[])
45 *
46 * int continueFrom1 = idx1 + 1;
47 * // continue the search starting from the next character
48 *
49 * int idx2 = haystack.forEachByte(continueFrom1, haystack.readableBytes() - continueFrom1, processor);
50 * // idx2 is 2 (index of the last character of the occurrence of "BC" in the haystack)
51 * // processor.getFoundNeedleId() is 1 (index of "BC" in needles[])
52 *
53 * int continueFrom2 = idx2 + 1;
54 *
55 * int idx3 = haystack.forEachByte(continueFrom2, haystack.readableBytes() - continueFrom2, processor);
56 * // idx3 is 3 (index of the last character of the occurrence of "CD" in the haystack)
57 * // processor.getFoundNeedleId() is 2 (index of "CD" in needles[])
58 *
59 * int continueFrom3 = idx3 + 1;
60 *
61 * int idx4 = haystack.forEachByte(continueFrom3, haystack.readableBytes() - continueFrom3, processor);
62 * // idx4 is -1 (no more occurrences of any of the needles)
63 *
64 * // This search session is complete, processor should be discarded.
65 * // To search for the same needles again, reuse the same {@link AbstractMultiSearchProcessorFactory}
66 * // to get a new MultiSearchProcessor.
67 * </pre>
68 */
69 public abstract class AbstractMultiSearchProcessorFactory implements MultiSearchProcessorFactory {
70
71 /**
72 * Creates a {@link MultiSearchProcessorFactory} based on
73 * <a href="https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm">Aho–Corasick</a>
74 * string search algorithm.
75 * <br>
76 * Precomputation (this method) time is linear in the size of input ({@code O(Σ|needles|)}).
77 * <br>
78 * The factory allocates and retains an array of 256 * X ints plus another array of X ints, where X
79 * is the sum of lengths of each entry of {@code needles} minus the sum of lengths of repeated
80 * prefixes of the {@code needles}.
81 * <br>
82 * Search (the actual application of {@link MultiSearchProcessor}) time is linear in the size of
83 * {@link io.netty.buffer.ByteBuf} on which the search is performed ({@code O(|haystack|)}).
84 * Every byte of {@link io.netty.buffer.ByteBuf} is processed only once, sequentually, regardles of
85 * the number of {@code needles} being searched for.
86 *
87 * @param needles a varargs array of arrays of bytes to search for
88 * @return a new instance of {@link AhoCorasicSearchProcessorFactory} precomputed for the given {@code needles}
89 */
90 public static AhoCorasicSearchProcessorFactory newAhoCorasicSearchProcessorFactory(byte[] ...needles) {
91 return new AhoCorasicSearchProcessorFactory(needles);
92 }
93
94 }