View Javadoc
1   /*
2    * Copyright 2014 The Netty Project
3    *
4    * The Netty Project licenses this file to you under the Apache License,
5    * version 2.0 (the "License"); you may not use this file except in compliance
6    * with the License. You may obtain a copy of the License at:
7    *
8    *   http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13   * License for the specific language governing permissions and limitations
14   * under the License.
15   */
16  package io.netty.handler.codec.compression;
17  
18  import io.netty.buffer.ByteBuf;
19  
20  import java.util.Arrays;
21  
22  import static io.netty.handler.codec.compression.Bzip2Constants.*;
23  
24  /**
25   * An encoder for the Bzip2 Huffman encoding stage.
26   */
27  final class Bzip2HuffmanStageEncoder {
28      /**
29       * Used in initial Huffman table generation.
30       */
31      private static final int HUFFMAN_HIGH_SYMBOL_COST = 15;
32  
33      /**
34       * The {@link Bzip2BitWriter} to which the Huffman tables and data is written.
35       */
36      private final Bzip2BitWriter writer;
37  
38      /**
39       * The output of the Move To Front Transform and Run Length Encoding[2] stages.
40       */
41      private final char[] mtfBlock;
42  
43      /**
44       * The actual number of values contained in the {@link #mtfBlock} array.
45       */
46      private final int mtfLength;
47  
48      /**
49       * The number of unique values in the {@link #mtfBlock} array.
50       */
51      private final int mtfAlphabetSize;
52  
53      /**
54       * The global frequencies of values within the {@link #mtfBlock} array.
55       */
56      private final int[] mtfSymbolFrequencies;
57  
58      /**
59       * The Canonical Huffman code lengths for each table.
60       */
61      private final int[][] huffmanCodeLengths;
62  
63      /**
64       * Merged code symbols for each table. The value at each position is ((code length << 24) | code).
65       */
66      private final int[][] huffmanMergedCodeSymbols;
67  
68      /**
69       * The selectors for each segment.
70       */
71      private final byte[] selectors;
72  
73      /**
74       * @param writer The {@link Bzip2BitWriter} which provides bit-level writes
75       * @param mtfBlock The MTF block data
76       * @param mtfLength The actual length of the MTF block
77       * @param mtfAlphabetSize The size of the MTF block's alphabet
78       * @param mtfSymbolFrequencies The frequencies the MTF block's symbols
79       */
80      Bzip2HuffmanStageEncoder(final Bzip2BitWriter writer, final char[] mtfBlock,
81                               final int mtfLength, final int mtfAlphabetSize, final int[] mtfSymbolFrequencies) {
82          this.writer = writer;
83          this.mtfBlock = mtfBlock;
84          this.mtfLength = mtfLength;
85          this.mtfAlphabetSize = mtfAlphabetSize;
86          this.mtfSymbolFrequencies = mtfSymbolFrequencies;
87  
88          final int totalTables = selectTableCount(mtfLength);
89  
90          huffmanCodeLengths = new int[totalTables][mtfAlphabetSize];
91          huffmanMergedCodeSymbols = new int[totalTables][mtfAlphabetSize];
92          selectors = new byte[(mtfLength + HUFFMAN_GROUP_RUN_LENGTH - 1) / HUFFMAN_GROUP_RUN_LENGTH];
93      }
94  
95      /**
96       * Selects an appropriate table count for a given MTF length.
97       * @param mtfLength The length to select a table count for
98       * @return The selected table count
99       */
100     private static int selectTableCount(final int mtfLength) {
101         if (mtfLength >= 2400) {
102             return 6;
103         }
104         if (mtfLength >= 1200) {
105             return 5;
106         }
107         if (mtfLength >= 600) {
108             return 4;
109         }
110         if (mtfLength >= 200) {
111             return 3;
112         }
113         return 2;
114     }
115 
116     /**
117      * Generate a Huffman code length table for a given list of symbol frequencies.
118      * @param alphabetSize The total number of symbols
119      * @param symbolFrequencies The frequencies of the symbols
120      * @param codeLengths The array to which the generated code lengths should be written
121      */
122     private static void generateHuffmanCodeLengths(final int alphabetSize,
123                                                    final int[] symbolFrequencies, final int[] codeLengths) {
124 
125         final int[] mergedFrequenciesAndIndices = new int[alphabetSize];
126         final int[] sortedFrequencies = new int[alphabetSize];
127 
128         // The Huffman allocator needs its input symbol frequencies to be sorted, but we need to
129         // return code lengths in the same order as the corresponding frequencies are passed in.
130 
131         // The symbol frequency and index are merged into a single array of
132         // integers - frequency in the high 23 bits, index in the low 9 bits.
133         //     2^23 = 8,388,608 which is higher than the maximum possible frequency for one symbol in a block
134         //     2^9 = 512 which is higher than the maximum possible alphabet size (== 258)
135         // Sorting this array simultaneously sorts the frequencies and
136         // leaves a lookup that can be used to cheaply invert the sort.
137         for (int i = 0; i < alphabetSize; i++) {
138             mergedFrequenciesAndIndices[i] = (symbolFrequencies[i] << 9) | i;
139         }
140         Arrays.sort(mergedFrequenciesAndIndices);
141         for (int i = 0; i < alphabetSize; i++) {
142             sortedFrequencies[i] = mergedFrequenciesAndIndices[i] >>> 9;
143         }
144 
145         // Allocate code lengths - the allocation is in place,
146         // so the code lengths will be in the sortedFrequencies array afterwards
147         Bzip2HuffmanAllocator.allocateHuffmanCodeLengths(sortedFrequencies, HUFFMAN_ENCODE_MAX_CODE_LENGTH);
148 
149         // Reverse the sort to place the code lengths in the same order as the symbols whose frequencies were passed in
150         for (int i = 0; i < alphabetSize; i++) {
151             codeLengths[mergedFrequenciesAndIndices[i] & 0x1ff] = sortedFrequencies[i];
152         }
153     }
154 
155     /**
156      * Generate initial Huffman code length tables, giving each table a different low cost section
157      * of the alphabet that is roughly equal in overall cumulative frequency. Note that the initial
158      * tables are invalid for actual Huffman code generation, and only serve as the seed for later
159      * iterative optimisation in {@link #optimiseSelectorsAndHuffmanTables(boolean)}.
160      */
161     private void generateHuffmanOptimisationSeeds() {
162         final int[][] huffmanCodeLengths = this.huffmanCodeLengths;
163         final int[] mtfSymbolFrequencies = this.mtfSymbolFrequencies;
164         final int mtfAlphabetSize = this.mtfAlphabetSize;
165 
166         final int totalTables = huffmanCodeLengths.length;
167 
168         int remainingLength = mtfLength;
169         int lowCostEnd = -1;
170 
171         for (int i = 0; i < totalTables; i++) {
172 
173             final int targetCumulativeFrequency = remainingLength / (totalTables - i);
174             final int lowCostStart = lowCostEnd + 1;
175             int actualCumulativeFrequency = 0;
176 
177             while (actualCumulativeFrequency < targetCumulativeFrequency && lowCostEnd < mtfAlphabetSize - 1) {
178                 actualCumulativeFrequency += mtfSymbolFrequencies[++lowCostEnd];
179             }
180 
181             if (lowCostEnd > lowCostStart && i != 0 && i != totalTables - 1 && (totalTables - i & 1) == 0) {
182                 actualCumulativeFrequency -= mtfSymbolFrequencies[lowCostEnd--];
183             }
184 
185             final int[] tableCodeLengths = huffmanCodeLengths[i];
186             for (int j = 0; j < mtfAlphabetSize; j++) {
187                 if (j < lowCostStart || j > lowCostEnd) {
188                     tableCodeLengths[j] = HUFFMAN_HIGH_SYMBOL_COST;
189                 }
190             }
191 
192             remainingLength -= actualCumulativeFrequency;
193         }
194     }
195 
196     /**
197      * Co-optimise the selector list and the alternative Huffman table code lengths. This method is
198      * called repeatedly in the hope that the total encoded size of the selectors, the Huffman code
199      * lengths and the block data encoded with them will converge towards a minimum.<br>
200      * If the data is highly incompressible, it is possible that the total encoded size will
201      * instead diverge (increase) slightly.<br>
202      * @param storeSelectors If {@code true}, write out the (final) chosen selectors
203      */
204     private void optimiseSelectorsAndHuffmanTables(final boolean storeSelectors) {
205         final char[] mtfBlock = this.mtfBlock;
206         final byte[] selectors = this.selectors;
207         final int[][] huffmanCodeLengths = this.huffmanCodeLengths;
208         final int mtfLength = this.mtfLength;
209         final int mtfAlphabetSize = this.mtfAlphabetSize;
210 
211         final int totalTables = huffmanCodeLengths.length;
212         final int[][] tableFrequencies = new int[totalTables][mtfAlphabetSize];
213 
214         int selectorIndex = 0;
215 
216         // Find the best table for each group of 50 block bytes based on the current Huffman code lengths
217         for (int groupStart = 0; groupStart < mtfLength;) {
218 
219             final int groupEnd = Math.min(groupStart + HUFFMAN_GROUP_RUN_LENGTH, mtfLength) - 1;
220 
221             // Calculate the cost of this group when encoded by each table
222             short[] cost = new short[totalTables];
223             for (int i = groupStart; i <= groupEnd; i++) {
224                 final int value = mtfBlock[i];
225                 for (int j = 0; j < totalTables; j++) {
226                     cost[j] += huffmanCodeLengths[j][value];
227                 }
228             }
229 
230             // Find the table with the least cost for this group
231             byte bestTable = 0;
232             int bestCost = cost[0];
233             for (byte i = 1 ; i < totalTables; i++) {
234                 final int tableCost = cost[i];
235                 if (tableCost < bestCost) {
236                     bestCost = tableCost;
237                     bestTable = i;
238                 }
239             }
240 
241             // Accumulate symbol frequencies for the table chosen for this block
242             final int[] bestGroupFrequencies = tableFrequencies[bestTable];
243             for (int i = groupStart; i <= groupEnd; i++) {
244                 bestGroupFrequencies[mtfBlock[i]]++;
245             }
246 
247             // Store a selector indicating the table chosen for this block
248             if (storeSelectors) {
249                 selectors[selectorIndex++] = bestTable;
250             }
251             groupStart = groupEnd + 1;
252         }
253 
254         // Generate new Huffman code lengths based on the frequencies for each table accumulated in this iteration
255         for (int i = 0; i < totalTables; i++) {
256             generateHuffmanCodeLengths(mtfAlphabetSize, tableFrequencies[i], huffmanCodeLengths[i]);
257         }
258     }
259 
260     /**
261      * Assigns Canonical Huffman codes based on the calculated lengths.
262      */
263     private void assignHuffmanCodeSymbols() {
264         final int[][] huffmanMergedCodeSymbols = this.huffmanMergedCodeSymbols;
265         final int[][] huffmanCodeLengths = this.huffmanCodeLengths;
266         final int mtfAlphabetSize = this.mtfAlphabetSize;
267 
268         final int totalTables = huffmanCodeLengths.length;
269 
270         for (int i = 0; i < totalTables; i++) {
271             final int[] tableLengths = huffmanCodeLengths[i];
272 
273             int minimumLength = 32;
274             int maximumLength = 0;
275             for (int j = 0; j < mtfAlphabetSize; j++) {
276                 final int length = tableLengths[j];
277                 if (length > maximumLength) {
278                     maximumLength = length;
279                 }
280                 if (length < minimumLength) {
281                     minimumLength = length;
282                 }
283             }
284 
285             int code = 0;
286             for (int j = minimumLength; j <= maximumLength; j++) {
287                 for (int k = 0; k < mtfAlphabetSize; k++) {
288                     if ((huffmanCodeLengths[i][k] & 0xff) == j) {
289                         huffmanMergedCodeSymbols[i][k] = (j << 24) | code;
290                         code++;
291                     }
292                 }
293                 code <<= 1;
294             }
295         }
296     }
297 
298     /**
299      * Write out the selector list and Huffman tables.
300      */
301     private void writeSelectorsAndHuffmanTables(ByteBuf out) {
302         final Bzip2BitWriter writer = this.writer;
303         final byte[] selectors = this.selectors;
304         final int totalSelectors = selectors.length;
305         final int[][] huffmanCodeLengths = this.huffmanCodeLengths;
306         final int totalTables = huffmanCodeLengths.length;
307         final int mtfAlphabetSize = this.mtfAlphabetSize;
308 
309         writer.writeBits(out, 3, totalTables);
310         writer.writeBits(out, 15, totalSelectors);
311 
312         // Write the selectors
313         Bzip2MoveToFrontTable selectorMTF = new Bzip2MoveToFrontTable();
314         for (byte selector : selectors) {
315             writer.writeUnary(out, selectorMTF.valueToFront(selector));
316         }
317 
318         // Write the Huffman tables
319         for (final int[] tableLengths : huffmanCodeLengths) {
320             int currentLength = tableLengths[0];
321 
322             writer.writeBits(out, 5, currentLength);
323 
324             for (int j = 0; j < mtfAlphabetSize; j++) {
325                 final int codeLength = tableLengths[j];
326                 final int value = currentLength < codeLength ? 2 : 3;
327                 int delta = Math.abs(codeLength - currentLength);
328                 while (delta-- > 0) {
329                     writer.writeBits(out, 2, value);
330                 }
331                 writer.writeBoolean(out, false);
332                 currentLength = codeLength;
333             }
334         }
335     }
336 
337     /**
338      * Writes out the encoded block data.
339      */
340     private void writeBlockData(ByteBuf out) {
341         final Bzip2BitWriter writer = this.writer;
342         final int[][] huffmanMergedCodeSymbols = this.huffmanMergedCodeSymbols;
343         final byte[] selectors = this.selectors;
344         final char[] mtf = mtfBlock;
345         final int mtfLength = this.mtfLength;
346 
347         int selectorIndex = 0;
348         for (int mtfIndex = 0; mtfIndex < mtfLength;) {
349             final int groupEnd = Math.min(mtfIndex + HUFFMAN_GROUP_RUN_LENGTH, mtfLength) - 1;
350             final int[] tableMergedCodeSymbols = huffmanMergedCodeSymbols[selectors[selectorIndex++]];
351 
352             while (mtfIndex <= groupEnd) {
353                 final int mergedCodeSymbol = tableMergedCodeSymbols[mtf[mtfIndex++]];
354                 writer.writeBits(out, mergedCodeSymbol >>> 24, mergedCodeSymbol);
355             }
356         }
357     }
358 
359     /**
360      * Encodes and writes the block data.
361      */
362     void encode(ByteBuf out) {
363         // Create optimised selector list and Huffman tables
364         generateHuffmanOptimisationSeeds();
365         for (int i = 3; i >= 0; i--) {
366             optimiseSelectorsAndHuffmanTables(i == 0);
367         }
368         assignHuffmanCodeSymbols();
369 
370         // Write out the tables and the block data encoded with them
371         writeSelectorsAndHuffmanTables(out);
372         writeBlockData(out);
373     }
374 }