View Javadoc
1   /*
2    * Copyright 2014 The Netty Project
3    *
4    * The Netty Project licenses this file to you under the Apache License,
5    * version 2.0 (the "License"); you may not use this file except in compliance
6    * with the License. You may obtain a copy of the License at:
7    *
8    *   https://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13   * License for the specific language governing permissions and limitations
14   * under the License.
15   */
16  package io.netty5.handler.codec.compression;
17  
18  import io.netty5.buffer.api.Buffer;
19  
20  import java.util.Arrays;
21  
22  import static io.netty5.handler.codec.compression.Bzip2Constants.HUFFMAN_ENCODE_MAX_CODE_LENGTH;
23  import static io.netty5.handler.codec.compression.Bzip2Constants.HUFFMAN_GROUP_RUN_LENGTH;
24  
25  /**
26   * An encoder for the Bzip2 Huffman encoding stage.
27   */
28  final class Bzip2HuffmanStageEncoder {
29      /**
30       * Used in initial Huffman table generation.
31       */
32      private static final int HUFFMAN_HIGH_SYMBOL_COST = 15;
33  
34      /**
35       * The {@link Bzip2BitWriter} to which the Huffman tables and data is written.
36       */
37      private final Bzip2BitWriter writer;
38  
39      /**
40       * The output of the Move To Front Transform and Run Length Encoding[2] stages.
41       */
42      private final char[] mtfBlock;
43  
44      /**
45       * The actual number of values contained in the {@link #mtfBlock} array.
46       */
47      private final int mtfLength;
48  
49      /**
50       * The number of unique values in the {@link #mtfBlock} array.
51       */
52      private final int mtfAlphabetSize;
53  
54      /**
55       * The global frequencies of values within the {@link #mtfBlock} array.
56       */
57      private final int[] mtfSymbolFrequencies;
58  
59      /**
60       * The Canonical Huffman code lengths for each table.
61       */
62      private final int[][] huffmanCodeLengths;
63  
64      /**
65       * Merged code symbols for each table. The value at each position is ((code length << 24) | code).
66       */
67      private final int[][] huffmanMergedCodeSymbols;
68  
69      /**
70       * The selectors for each segment.
71       */
72      private final byte[] selectors;
73  
74      /**
75       * @param writer The {@link Bzip2BitWriter} which provides bit-level writes
76       * @param mtfBlock The MTF block data
77       * @param mtfLength The actual length of the MTF block
78       * @param mtfAlphabetSize The size of the MTF block's alphabet
79       * @param mtfSymbolFrequencies The frequencies the MTF block's symbols
80       */
81      Bzip2HuffmanStageEncoder(final Bzip2BitWriter writer, final char[] mtfBlock,
82                               final int mtfLength, final int mtfAlphabetSize, final int[] mtfSymbolFrequencies) {
83          this.writer = writer;
84          this.mtfBlock = mtfBlock;
85          this.mtfLength = mtfLength;
86          this.mtfAlphabetSize = mtfAlphabetSize;
87          this.mtfSymbolFrequencies = mtfSymbolFrequencies;
88  
89          final int totalTables = selectTableCount(mtfLength);
90  
91          huffmanCodeLengths = new int[totalTables][mtfAlphabetSize];
92          huffmanMergedCodeSymbols = new int[totalTables][mtfAlphabetSize];
93          selectors = new byte[(mtfLength + HUFFMAN_GROUP_RUN_LENGTH - 1) / HUFFMAN_GROUP_RUN_LENGTH];
94      }
95  
96      /**
97       * Selects an appropriate table count for a given MTF length.
98       * @param mtfLength The length to select a table count for
99       * @return The selected table count
100      */
101     private static int selectTableCount(final int mtfLength) {
102         if (mtfLength >= 2400) {
103             return 6;
104         }
105         if (mtfLength >= 1200) {
106             return 5;
107         }
108         if (mtfLength >= 600) {
109             return 4;
110         }
111         if (mtfLength >= 200) {
112             return 3;
113         }
114         return 2;
115     }
116 
117     /**
118      * Generate a Huffman code length table for a given list of symbol frequencies.
119      * @param alphabetSize The total number of symbols
120      * @param symbolFrequencies The frequencies of the symbols
121      * @param codeLengths The array to which the generated code lengths should be written
122      */
123     private static void generateHuffmanCodeLengths(final int alphabetSize,
124                                                    final int[] symbolFrequencies, final int[] codeLengths) {
125 
126         final int[] mergedFrequenciesAndIndices = new int[alphabetSize];
127         final int[] sortedFrequencies = new int[alphabetSize];
128 
129         // The Huffman allocator needs its input symbol frequencies to be sorted, but we need to
130         // return code lengths in the same order as the corresponding frequencies are passed in.
131 
132         // The symbol frequency and index are merged into a single array of
133         // integers - frequency in the high 23 bits, index in the low 9 bits.
134         //     2^23 = 8,388,608 which is higher than the maximum possible frequency for one symbol in a block
135         //     2^9 = 512 which is higher than the maximum possible alphabet size (== 258)
136         // Sorting this array simultaneously sorts the frequencies and
137         // leaves a lookup that can be used to cheaply invert the sort.
138         for (int i = 0; i < alphabetSize; i++) {
139             mergedFrequenciesAndIndices[i] = (symbolFrequencies[i] << 9) | i;
140         }
141         Arrays.sort(mergedFrequenciesAndIndices);
142         for (int i = 0; i < alphabetSize; i++) {
143             sortedFrequencies[i] = mergedFrequenciesAndIndices[i] >>> 9;
144         }
145 
146         // Allocate code lengths - the allocation is in place,
147         // so the code lengths will be in the sortedFrequencies array afterwards
148         Bzip2HuffmanAllocator.allocateHuffmanCodeLengths(sortedFrequencies, HUFFMAN_ENCODE_MAX_CODE_LENGTH);
149 
150         // Reverse the sort to place the code lengths in the same order as the symbols whose frequencies were passed in
151         for (int i = 0; i < alphabetSize; i++) {
152             codeLengths[mergedFrequenciesAndIndices[i] & 0x1ff] = sortedFrequencies[i];
153         }
154     }
155 
156     /**
157      * Generate initial Huffman code length tables, giving each table a different low cost section
158      * of the alphabet that is roughly equal in overall cumulative frequency. Note that the initial
159      * tables are invalid for actual Huffman code generation, and only serve as the seed for later
160      * iterative optimisation in {@link #optimiseSelectorsAndHuffmanTables(boolean)}.
161      */
162     private void generateHuffmanOptimisationSeeds() {
163         final int[][] huffmanCodeLengths = this.huffmanCodeLengths;
164         final int[] mtfSymbolFrequencies = this.mtfSymbolFrequencies;
165         final int mtfAlphabetSize = this.mtfAlphabetSize;
166 
167         final int totalTables = huffmanCodeLengths.length;
168 
169         int remainingLength = mtfLength;
170         int lowCostEnd = -1;
171 
172         for (int i = 0; i < totalTables; i++) {
173 
174             final int targetCumulativeFrequency = remainingLength / (totalTables - i);
175             final int lowCostStart = lowCostEnd + 1;
176             int actualCumulativeFrequency = 0;
177 
178             while (actualCumulativeFrequency < targetCumulativeFrequency && lowCostEnd < mtfAlphabetSize - 1) {
179                 actualCumulativeFrequency += mtfSymbolFrequencies[++lowCostEnd];
180             }
181 
182             if (lowCostEnd > lowCostStart && i != 0 && i != totalTables - 1 && (totalTables - i & 1) == 0) {
183                 actualCumulativeFrequency -= mtfSymbolFrequencies[lowCostEnd--];
184             }
185 
186             final int[] tableCodeLengths = huffmanCodeLengths[i];
187             for (int j = 0; j < mtfAlphabetSize; j++) {
188                 if (j < lowCostStart || j > lowCostEnd) {
189                     tableCodeLengths[j] = HUFFMAN_HIGH_SYMBOL_COST;
190                 }
191             }
192 
193             remainingLength -= actualCumulativeFrequency;
194         }
195     }
196 
197     /**
198      * Co-optimise the selector list and the alternative Huffman table code lengths. This method is
199      * called repeatedly in the hope that the total encoded size of the selectors, the Huffman code
200      * lengths and the block data encoded with them will converge towards a minimum.<br>
201      * If the data is highly incompressible, it is possible that the total encoded size will
202      * instead diverge (increase) slightly.<br>
203      * @param storeSelectors If {@code true}, write out the (final) chosen selectors
204      */
205     private void optimiseSelectorsAndHuffmanTables(final boolean storeSelectors) {
206         final char[] mtfBlock = this.mtfBlock;
207         final byte[] selectors = this.selectors;
208         final int[][] huffmanCodeLengths = this.huffmanCodeLengths;
209         final int mtfLength = this.mtfLength;
210         final int mtfAlphabetSize = this.mtfAlphabetSize;
211 
212         final int totalTables = huffmanCodeLengths.length;
213         final int[][] tableFrequencies = new int[totalTables][mtfAlphabetSize];
214 
215         int selectorIndex = 0;
216 
217         // Find the best table for each group of 50 block bytes based on the current Huffman code lengths
218         for (int groupStart = 0; groupStart < mtfLength;) {
219 
220             final int groupEnd = Math.min(groupStart + HUFFMAN_GROUP_RUN_LENGTH, mtfLength) - 1;
221 
222             // Calculate the cost of this group when encoded by each table
223             int[] cost = new int[totalTables];
224             for (int i = groupStart; i <= groupEnd; i++) {
225                 final int value = mtfBlock[i];
226                 for (int j = 0; j < totalTables; j++) {
227                     cost[j] += huffmanCodeLengths[j][value];
228                 }
229             }
230 
231             // Find the table with the least cost for this group
232             byte bestTable = 0;
233             int bestCost = cost[0];
234             for (byte i = 1 ; i < totalTables; i++) {
235                 final int tableCost = cost[i];
236                 if (tableCost < bestCost) {
237                     bestCost = tableCost;
238                     bestTable = i;
239                 }
240             }
241 
242             // Accumulate symbol frequencies for the table chosen for this block
243             final int[] bestGroupFrequencies = tableFrequencies[bestTable];
244             for (int i = groupStart; i <= groupEnd; i++) {
245                 bestGroupFrequencies[mtfBlock[i]]++;
246             }
247 
248             // Store a selector indicating the table chosen for this block
249             if (storeSelectors) {
250                 selectors[selectorIndex++] = bestTable;
251             }
252             groupStart = groupEnd + 1;
253         }
254 
255         // Generate new Huffman code lengths based on the frequencies for each table accumulated in this iteration
256         for (int i = 0; i < totalTables; i++) {
257             generateHuffmanCodeLengths(mtfAlphabetSize, tableFrequencies[i], huffmanCodeLengths[i]);
258         }
259     }
260 
261     /**
262      * Assigns Canonical Huffman codes based on the calculated lengths.
263      */
264     private void assignHuffmanCodeSymbols() {
265         final int[][] huffmanMergedCodeSymbols = this.huffmanMergedCodeSymbols;
266         final int[][] huffmanCodeLengths = this.huffmanCodeLengths;
267         final int mtfAlphabetSize = this.mtfAlphabetSize;
268 
269         final int totalTables = huffmanCodeLengths.length;
270 
271         for (int i = 0; i < totalTables; i++) {
272             final int[] tableLengths = huffmanCodeLengths[i];
273 
274             int minimumLength = 32;
275             int maximumLength = 0;
276             for (int j = 0; j < mtfAlphabetSize; j++) {
277                 final int length = tableLengths[j];
278                 if (length > maximumLength) {
279                     maximumLength = length;
280                 }
281                 if (length < minimumLength) {
282                     minimumLength = length;
283                 }
284             }
285 
286             int code = 0;
287             for (int j = minimumLength; j <= maximumLength; j++) {
288                 for (int k = 0; k < mtfAlphabetSize; k++) {
289                     if ((huffmanCodeLengths[i][k] & 0xff) == j) {
290                         huffmanMergedCodeSymbols[i][k] = (j << 24) | code;
291                         code++;
292                     }
293                 }
294                 code <<= 1;
295             }
296         }
297     }
298 
299     /**
300      * Write out the selector list and Huffman tables.
301      */
302     private void writeSelectorsAndHuffmanTables(Buffer out) {
303         final Bzip2BitWriter writer = this.writer;
304         final byte[] selectors = this.selectors;
305         final int totalSelectors = selectors.length;
306         final int[][] huffmanCodeLengths = this.huffmanCodeLengths;
307         final int totalTables = huffmanCodeLengths.length;
308         final int mtfAlphabetSize = this.mtfAlphabetSize;
309 
310         writer.writeBits(out, 3, totalTables);
311         writer.writeBits(out, 15, totalSelectors);
312 
313         // Write the selectors
314         Bzip2MoveToFrontTable selectorMTF = new Bzip2MoveToFrontTable();
315         for (byte selector : selectors) {
316             writer.writeUnary(out, selectorMTF.valueToFront(selector));
317         }
318 
319         // Write the Huffman tables
320         for (final int[] tableLengths : huffmanCodeLengths) {
321             int currentLength = tableLengths[0];
322 
323             writer.writeBits(out, 5, currentLength);
324 
325             for (int j = 0; j < mtfAlphabetSize; j++) {
326                 final int codeLength = tableLengths[j];
327                 final int value = currentLength < codeLength ? 2 : 3;
328                 int delta = Math.abs(codeLength - currentLength);
329                 while (delta-- > 0) {
330                     writer.writeBits(out, 2, value);
331                 }
332                 writer.writeBoolean(out, false);
333                 currentLength = codeLength;
334             }
335         }
336     }
337 
338     /**
339      * Writes out the encoded block data.
340      */
341     private void writeBlockData(Buffer out) {
342         final Bzip2BitWriter writer = this.writer;
343         final int[][] huffmanMergedCodeSymbols = this.huffmanMergedCodeSymbols;
344         final byte[] selectors = this.selectors;
345         final int mtfLength = this.mtfLength;
346 
347         int selectorIndex = 0;
348         for (int mtfIndex = 0; mtfIndex < mtfLength;) {
349             final int groupEnd = Math.min(mtfIndex + HUFFMAN_GROUP_RUN_LENGTH, mtfLength) - 1;
350             final int[] tableMergedCodeSymbols = huffmanMergedCodeSymbols[selectors[selectorIndex++]];
351 
352             while (mtfIndex <= groupEnd) {
353                 final int mergedCodeSymbol = tableMergedCodeSymbols[mtfBlock[mtfIndex++]];
354                 writer.writeBits(out, mergedCodeSymbol >>> 24, mergedCodeSymbol);
355             }
356         }
357     }
358 
359     /**
360      * Encodes and writes the block data.
361      */
362     void encode(Buffer out) {
363         // Create optimised selector list and Huffman tables
364         generateHuffmanOptimisationSeeds();
365         for (int i = 3; i >= 0; i--) {
366             optimiseSelectorsAndHuffmanTables(i == 0);
367         }
368         assignHuffmanCodeSymbols();
369 
370         // Write out the tables and the block data encoded with them
371         writeSelectorsAndHuffmanTables(out);
372         writeBlockData(out);
373     }
374 }