View Javadoc
1   /*
2    * Copyright 2020 The Netty Project
3    *
4    * The Netty Project licenses this file to you under the Apache License, version 2.0 (the
5    * "License"); you may not use this file except in compliance with the License. You may obtain a
6    * copy of the License at:
7    *
8    * https://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software distributed under the License
11   * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12   * or implied. See the License for the specific language governing permissions and limitations under
13   * the License.
14   */
15  package io.netty.buffer.search;
16  
17  /**
18   * Base class for precomputed factories that create {@link SearchProcessor}s.
19   * <br>
20   * Different factories implement different search algorithms with performance characteristics that
21   * depend on a use case, so it is advisable to benchmark a concrete use case with different algorithms
22   * before choosing one of them.
23   * <br>
24   * A concrete instance of {@link AbstractSearchProcessorFactory} is built for searching for a concrete sequence of bytes
25   * (the {@code needle}), it contains precomputed data needed to perform the search, and is meant to be reused
26   * whenever searching for the same {@code needle}.
27   * <br>
28   * <b>Note:</b> implementations of {@link SearchProcessor} scan the {@link io.netty.buffer.ByteBuf} sequentially,
29   * one byte after another, without doing any random access. As a result, when using {@link SearchProcessor}
30   * with such methods as {@link io.netty.buffer.ByteBuf#forEachByte}, these methods return the index of the last byte
31   * of the found byte sequence within the {@link io.netty.buffer.ByteBuf} (which might feel counterintuitive,
32   * and different from {@link io.netty.buffer.ByteBufUtil#indexOf} which returns the index of the first byte
33   * of found sequence).
34   * <br>
35   * A {@link SearchProcessor} is implemented as a
36   * <a href="https://en.wikipedia.org/wiki/Finite-state_machine">Finite State Automaton</a> that contains a
37   * small internal state which is updated with every byte processed. As a result, an instance of {@link SearchProcessor}
38   * should not be reused across independent search sessions (eg. for searching in different
39   * {@link io.netty.buffer.ByteBuf}s). A new instance should be created with {@link AbstractSearchProcessorFactory} for
40   * every search session. However, a {@link SearchProcessor} can (and should) be reused within the search session,
41   * eg. when searching for all occurrences of the {@code needle} within the same {@code haystack}. That way, it can
42   * also detect overlapping occurrences of the {@code needle} (eg. a string "ABABAB" contains two occurrences of "BAB"
43   * that overlap by one character "B"). For this to work correctly, after an occurrence of the {@code needle} is
44   * found ending at index {@code idx}, the search should continue starting from the index {@code idx + 1}.
45   * <br>
46   * Example (given that the {@code haystack} is a {@link io.netty.buffer.ByteBuf} containing "ABABAB" and
47   * the {@code needle} is "BAB"):
48   * <pre>
49   *     SearchProcessorFactory factory =
50   *         SearchProcessorFactory.newKmpSearchProcessorFactory(needle.getBytes(CharsetUtil.UTF_8));
51   *     SearchProcessor processor = factory.newSearchProcessor();
52   *
53   *     int idx1 = haystack.forEachByte(processor);
54   *     // idx1 is 3 (index of the last character of the first occurrence of the needle in the haystack)
55   *
56   *     int continueFrom1 = idx1 + 1;
57   *     // continue the search starting from the next character
58   *
59   *     int idx2 = haystack.forEachByte(continueFrom1, haystack.readableBytes() - continueFrom1, processor);
60   *     // idx2 is 5 (index of the last character of the second occurrence of the needle in the haystack)
61   *
62   *     int continueFrom2 = idx2 + 1;
63   *     // continue the search starting from the next character
64   *
65   *     int idx3 = haystack.forEachByte(continueFrom2, haystack.readableBytes() - continueFrom2, processor);
66   *     // idx3 is -1 (no more occurrences of the needle)
67   *
68   *     // After this search session is complete, processor should be discarded.
69   *     // To search for the same needle again, reuse the same factory to get a new SearchProcessor.
70   * </pre>
71   */
72  public abstract class AbstractSearchProcessorFactory implements SearchProcessorFactory {
73  
74      /**
75       * Creates a {@link SearchProcessorFactory} based on
76       * <a href="https://en.wikipedia.org/wiki/Knuth%E2%80%93Morris%E2%80%93Pratt_algorithm">Knuth-Morris-Pratt</a>
77       * string search algorithm. It is a reasonable default choice among the provided algorithms.
78       * <br>
79       * Precomputation (this method) time is linear in the size of input ({@code O(|needle|)}).
80       * <br>
81       * The factory allocates and retains an int array of size {@code needle.length + 1}, and retains a reference
82       * to the {@code needle} itself.
83       * <br>
84       * Search (the actual application of {@link SearchProcessor}) time is linear in the size of
85       * {@link io.netty.buffer.ByteBuf} on which the search is performed ({@code O(|haystack|)}).
86       * Every byte of {@link io.netty.buffer.ByteBuf} is processed only once, sequentially.
87       *
88       * @param needle an array of bytes to search for
89       * @return a new instance of {@link KmpSearchProcessorFactory} precomputed for the given {@code needle}
90       */
91      public static KmpSearchProcessorFactory newKmpSearchProcessorFactory(byte[] needle) {
92          return new KmpSearchProcessorFactory(needle);
93      }
94  
95      /**
96       * Creates a {@link SearchProcessorFactory} based on Bitap string search algorithm.
97       * It is a jump free algorithm that has very stable performance (the contents of the inputs have a minimal
98       * effect on it). The limitation is that the {@code needle} can be no more than 64 bytes long.
99       * <br>
100      * Precomputation (this method) time is linear in the size of the input ({@code O(|needle|)}).
101      * <br>
102      * The factory allocates and retains a long[256] array.
103      * <br>
104      * Search (the actual application of {@link SearchProcessor}) time is linear in the size of
105      * {@link io.netty.buffer.ByteBuf} on which the search is performed ({@code O(|haystack|)}).
106      * Every byte of {@link io.netty.buffer.ByteBuf} is processed only once, sequentially.
107      *
108      * @param needle an array <b>of no more than 64 bytes</b> to search for
109      * @return a new instance of {@link BitapSearchProcessorFactory} precomputed for the given {@code needle}
110      */
111     public static BitapSearchProcessorFactory newBitapSearchProcessorFactory(byte[] needle) {
112         return new BitapSearchProcessorFactory(needle);
113     }
114 
115 }