View Javadoc
1   /*
2    * Copyright 2020 The Netty Project
3    *
4    * The Netty Project licenses this file to you under the Apache License, version 2.0 (the
5    * "License"); you may not use this file except in compliance with the License. You may obtain a
6    * copy of the License at:
7    *
8    * https://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software distributed under the License
11   * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12   * or implied. See the License for the specific language governing permissions and limitations under
13   * the License.
14   */
15  package io.netty.buffer.search;
16  
17  /**
18   * Base class for precomputed factories that create {@link MultiSearchProcessor}s.
19   * <br>
20   * The purpose of {@link MultiSearchProcessor} is to perform efficient simultaneous search for multiple {@code needles}
21   * in the {@code haystack}, while scanning every byte of the input sequentially, only once. While it can also be used
22   * to search for just a single {@code needle}, using a {@link SearchProcessorFactory} would be more efficient for
23   * doing that.
24   * <br>
25   * See the documentation of {@link AbstractSearchProcessorFactory} for a comprehensive description of common usage.
26   * In addition to the functionality provided by {@link SearchProcessor}, {@link MultiSearchProcessor} adds
27   * a method to get the index of the {@code needle} found at the current position of the {@link MultiSearchProcessor} -
28   * {@link MultiSearchProcessor#getFoundNeedleId()}.
29   * <br>
30   * <b>Note:</b> in some cases one {@code needle} can be a suffix of another {@code needle}, eg. {@code {"BC", "ABC"}},
31   * and there can potentially be multiple {@code needles} found ending at the same position of the {@code haystack}.
32   * In such case {@link MultiSearchProcessor#getFoundNeedleId()} returns the index of the longest matching {@code needle}
33   * in the array of {@code needles}.
34   * <br>
35   * Usage example (given that the {@code haystack} is a {@link io.netty.buffer.ByteBuf} containing "ABCD" and the
36   * {@code needles} are "AB", "BC" and "CD"):
37   * <pre>
38   *      MultiSearchProcessorFactory factory = MultiSearchProcessorFactory.newAhoCorasicSearchProcessorFactory(
39   *          "AB".getBytes(CharsetUtil.UTF_8), "BC".getBytes(CharsetUtil.UTF_8), "CD".getBytes(CharsetUtil.UTF_8));
40   *      MultiSearchProcessor processor = factory.newSearchProcessor();
41   *
42   *      int idx1 = haystack.forEachByte(processor);
43   *      // idx1 is 1 (index of the last character of the occurrence of "AB" in the haystack)
44   *      // processor.getFoundNeedleId() is 0 (index of "AB" in needles[])
45   *
46   *      int continueFrom1 = idx1 + 1;
47   *      // continue the search starting from the next character
48   *
49   *      int idx2 = haystack.forEachByte(continueFrom1, haystack.readableBytes() - continueFrom1, processor);
50   *      // idx2 is 2 (index of the last character of the occurrence of "BC" in the haystack)
51   *      // processor.getFoundNeedleId() is 1 (index of "BC" in needles[])
52   *
53   *      int continueFrom2 = idx2 + 1;
54   *
55   *      int idx3 = haystack.forEachByte(continueFrom2, haystack.readableBytes() - continueFrom2, processor);
56   *      // idx3 is 3 (index of the last character of the occurrence of "CD" in the haystack)
57   *      // processor.getFoundNeedleId() is 2 (index of "CD" in needles[])
58   *
59   *      int continueFrom3 = idx3 + 1;
60   *
61   *      int idx4 = haystack.forEachByte(continueFrom3, haystack.readableBytes() - continueFrom3, processor);
62   *      // idx4 is -1 (no more occurrences of any of the needles)
63   *
64   *      // This search session is complete, processor should be discarded.
65   *      // To search for the same needles again, reuse the same {@link AbstractMultiSearchProcessorFactory}
66   *      // to get a new MultiSearchProcessor.
67   * </pre>
68   */
69  public abstract class AbstractMultiSearchProcessorFactory implements MultiSearchProcessorFactory {
70  
71      /**
72       * Creates a {@link MultiSearchProcessorFactory} based on
73       * <a href="https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm">Aho–Corasick</a>
74       * string search algorithm.
75       * <br>
76       * Precomputation (this method) time is linear in the size of input ({@code O(Σ|needles|)}).
77       * <br>
78       * The factory allocates and retains an array of 256 * X ints plus another array of X ints, where X
79       * is the sum of lengths of each entry of {@code needles} minus the sum of lengths of repeated
80       * prefixes of the {@code needles}.
81       * <br>
82       * Search (the actual application of {@link MultiSearchProcessor}) time is linear in the size of
83       * {@link io.netty.buffer.ByteBuf} on which the search is performed ({@code O(|haystack|)}).
84       * Every byte of {@link io.netty.buffer.ByteBuf} is processed only once, sequentually, regardles of
85       * the number of {@code needles} being searched for.
86       *
87       * @param needles a varargs array of arrays of bytes to search for
88       * @return a new instance of {@link AhoCorasicSearchProcessorFactory} precomputed for the given {@code needles}
89       */
90      public static AhoCorasicSearchProcessorFactory newAhoCorasicSearchProcessorFactory(byte[] ...needles) {
91          return new AhoCorasicSearchProcessorFactory(needles);
92      }
93  
94  }