View Javadoc
1   /*
2    * Copyright 2014 The Netty Project
3    *
4    * The Netty Project licenses this file to you under the Apache License,
5    * version 2.0 (the "License"); you may not use this file except in compliance
6    * with the License. You may obtain a copy of the License at:
7    *
8    *   https://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13   * License for the specific language governing permissions and limitations
14   * under the License.
15   */
16  package io.netty.handler.codec.compression;
17  
18  import static io.netty.handler.codec.compression.Bzip2Constants.HUFFMAN_DECODE_MAX_CODE_LENGTH;
19  import static io.netty.handler.codec.compression.Bzip2Constants.HUFFMAN_SYMBOL_RUNA;
20  import static io.netty.handler.codec.compression.Bzip2Constants.HUFFMAN_SYMBOL_RUNB;
21  import static io.netty.handler.codec.compression.Bzip2Constants.MAX_BLOCK_LENGTH;
22  
23  /**
24   * Reads and decompresses a single Bzip2 block.<br><br>
25   *
26   * Block decoding consists of the following stages:<br>
27   * 1. Read block header<br>
28   * 2. Read Huffman tables<br>
29   * 3. Read and decode Huffman encoded data - {@link #decodeHuffmanData(Bzip2HuffmanStageDecoder)}<br>
30   * 4. Run-Length Decoding[2] - {@link #decodeHuffmanData(Bzip2HuffmanStageDecoder)}<br>
31   * 5. Inverse Move To Front Transform - {@link #decodeHuffmanData(Bzip2HuffmanStageDecoder)}<br>
32   * 6. Inverse Burrows Wheeler Transform - {@link #initialiseInverseBWT()}<br>
33   * 7. Run-Length Decoding[1] - {@link #read()}<br>
34   * 8. Optional Block De-Randomisation - {@link #read()} (through {@link #decodeNextBWTByte()})
35   */
36  final class Bzip2BlockDecompressor {
37      /**
38       * A reader that provides bit-level reads.
39       */
40      private final Bzip2BitReader reader;
41  
42      /**
43       * Calculates the block CRC from the fully decoded bytes of the block.
44       */
45      private final Crc32 crc = new Crc32();
46  
47      /**
48       * The CRC of the current block as read from the block header.
49       */
50      private final int blockCRC;
51  
52      /**
53       * {@code true} if the current block is randomised, otherwise {@code false}.
54       */
55      private final boolean blockRandomised;
56  
57      /* Huffman Decoding stage */
58      /**
59       * The end-of-block Huffman symbol. Decoding of the block ends when this is encountered.
60       */
61      int huffmanEndOfBlockSymbol;
62  
63      /**
64       * Bitmap, of ranges of 16 bytes, present/not present.
65       */
66      int huffmanInUse16;
67  
68      /**
69       * A map from Huffman symbol index to output character. Some types of data (e.g. ASCII text)
70       * may contain only a limited number of byte values; Huffman symbols are only allocated to
71       * those values that actually occur in the uncompressed data.
72       */
73      final byte[] huffmanSymbolMap = new byte[256];
74  
75      /* Move To Front stage */
76      /**
77       * Counts of each byte value within the {@link Bzip2BlockDecompressor#huffmanSymbolMap} data.
78       * Collected at the Move To Front stage, consumed by the Inverse Burrows Wheeler Transform stage.
79       */
80      private final int[] bwtByteCounts = new int[256];
81  
82      /**
83       * The Burrows-Wheeler Transform processed data. Read at the Move To Front stage, consumed by the
84       * Inverse Burrows Wheeler Transform stage.
85       */
86      private final byte[] bwtBlock;
87  
88      /**
89       * Starting pointer into BWT for after untransform.
90       */
91      private final int bwtStartPointer;
92  
93      /* Inverse Burrows-Wheeler Transform stage */
94      /**
95       * At each position contains the union of :-
96       *   An output character (8 bits)
97       *   A pointer from each position to its successor (24 bits, left shifted 8 bits)
98       * As the pointer cannot exceed the maximum block size of 900k, 24 bits is more than enough to
99       * hold it; Folding the character data into the spare bits while performing the inverse BWT,
100      * when both pieces of information are available, saves a large number of memory accesses in
101      * the final decoding stages.
102      */
103     private int[] bwtMergedPointers;
104 
105     /**
106      * The current merged pointer into the Burrow-Wheeler Transform array.
107      */
108     private int bwtCurrentMergedPointer;
109 
110     /**
111      * The actual length in bytes of the current block at the Inverse Burrows Wheeler Transform
112      * stage (before final Run-Length Decoding).
113      */
114     private int bwtBlockLength;
115 
116     /**
117      * The number of output bytes that have been decoded up to the Inverse Burrows Wheeler Transform stage.
118      */
119     private int bwtBytesDecoded;
120 
121     /* Run-Length Encoding and Random Perturbation stage */
122     /**
123      * The most recently RLE decoded byte.
124      */
125     private int rleLastDecodedByte = -1;
126 
127     /**
128      * The number of previous identical output bytes decoded. After 4 identical bytes, the next byte
129      * decoded is an RLE repeat count.
130      */
131     private int rleAccumulator;
132 
133     /**
134      * The RLE repeat count of the current decoded byte. When this reaches zero, a new byte is decoded.
135      */
136     private int rleRepeat;
137 
138     /**
139      * If the current block is randomised, the position within the RNUMS randomisation array.
140      */
141     private int randomIndex;
142 
143     /**
144      * If the current block is randomised, the remaining count at the current RNUMS position.
145      */
146     private int randomCount = Bzip2Rand.rNums(0) - 1;
147 
148     /**
149      * Table for Move To Front transformations.
150      */
151     private final Bzip2MoveToFrontTable symbolMTF = new Bzip2MoveToFrontTable();
152 
153     // This variables is used to save current state if we haven't got enough readable bits
154     private int repeatCount;
155     private int repeatIncrement = 1;
156     private int mtfValue;
157 
158     Bzip2BlockDecompressor(final int blockSize, final int blockCRC, final boolean blockRandomised,
159                            final int bwtStartPointer, final Bzip2BitReader reader) {
160 
161         bwtBlock = new byte[blockSize];
162 
163         this.blockCRC = blockCRC;
164         this.blockRandomised = blockRandomised;
165         this.bwtStartPointer = bwtStartPointer;
166 
167         this.reader = reader;
168     }
169 
170     /**
171      * Reads the Huffman encoded data from the input stream, performs Run-Length Decoding and
172      * applies the Move To Front transform to reconstruct the Burrows-Wheeler Transform array.
173      */
174     boolean decodeHuffmanData(final Bzip2HuffmanStageDecoder huffmanDecoder) {
175         final Bzip2BitReader reader = this.reader;
176         final byte[] bwtBlock = this.bwtBlock;
177         final byte[] huffmanSymbolMap = this.huffmanSymbolMap;
178         final int streamBlockSize = this.bwtBlock.length;
179         final int huffmanEndOfBlockSymbol = this.huffmanEndOfBlockSymbol;
180         final int[] bwtByteCounts = this.bwtByteCounts;
181         final Bzip2MoveToFrontTable symbolMTF = this.symbolMTF;
182 
183         int bwtBlockLength = this.bwtBlockLength;
184         int repeatCount = this.repeatCount;
185         int repeatIncrement = this.repeatIncrement;
186         int mtfValue = this.mtfValue;
187 
188         for (;;) {
189             if (!reader.hasReadableBits(HUFFMAN_DECODE_MAX_CODE_LENGTH)) {
190                 this.bwtBlockLength = bwtBlockLength;
191                 this.repeatCount = repeatCount;
192                 this.repeatIncrement = repeatIncrement;
193                 this.mtfValue = mtfValue;
194                 return false;
195             }
196             final int nextSymbol = huffmanDecoder.nextSymbol();
197 
198             if (nextSymbol == HUFFMAN_SYMBOL_RUNA) {
199                 repeatCount += repeatIncrement;
200                 repeatIncrement <<= 1;
201             } else if (nextSymbol == HUFFMAN_SYMBOL_RUNB) {
202                 repeatCount += repeatIncrement << 1;
203                 repeatIncrement <<= 1;
204             } else {
205                 if (repeatCount > 0) {
206                     if (bwtBlockLength + repeatCount > streamBlockSize) {
207                         throw new DecompressionException("block exceeds declared block size");
208                     }
209                     final byte nextByte = huffmanSymbolMap[mtfValue];
210                     bwtByteCounts[nextByte & 0xff] += repeatCount;
211                     while (--repeatCount >= 0) {
212                         bwtBlock[bwtBlockLength++] = nextByte;
213                     }
214 
215                     repeatCount = 0;
216                     repeatIncrement = 1;
217                 }
218 
219                 if (nextSymbol == huffmanEndOfBlockSymbol) {
220                     break;
221                 }
222 
223                 if (bwtBlockLength >= streamBlockSize) {
224                     throw new DecompressionException("block exceeds declared block size");
225                 }
226 
227                 mtfValue = symbolMTF.indexToFront(nextSymbol - 1) & 0xff;
228 
229                 final byte nextByte = huffmanSymbolMap[mtfValue];
230                 bwtByteCounts[nextByte & 0xff]++;
231                 bwtBlock[bwtBlockLength++] = nextByte;
232             }
233         }
234         if (bwtBlockLength > MAX_BLOCK_LENGTH) {
235             throw new DecompressionException("block length exceeds max block length: "
236                     + bwtBlockLength + " > " + MAX_BLOCK_LENGTH);
237         }
238 
239         this.bwtBlockLength = bwtBlockLength;
240         initialiseInverseBWT();
241         return true;
242     }
243 
244     /**
245      * Set up the Inverse Burrows-Wheeler Transform merged pointer array.
246      */
247     private void initialiseInverseBWT() {
248         final int bwtStartPointer = this.bwtStartPointer;
249         final byte[] bwtBlock  = this.bwtBlock;
250         final int[] bwtMergedPointers = new int[bwtBlockLength];
251         final int[] characterBase = new int[256];
252 
253         if (bwtStartPointer < 0 || bwtStartPointer >= bwtBlockLength) {
254             throw new DecompressionException("start pointer invalid");
255         }
256 
257         // Cumulative character counts
258         System.arraycopy(bwtByteCounts, 0, characterBase, 1, 255);
259         for (int i = 2; i <= 255; i++) {
260             characterBase[i] += characterBase[i - 1];
261         }
262 
263         // Merged-Array Inverse Burrows-Wheeler Transform
264         // Combining the output characters and forward pointers into a single array here, where we
265         // have already read both of the corresponding values, cuts down on memory accesses in the
266         // final walk through the array
267         for (int i = 0; i < bwtBlockLength; i++) {
268             int value = bwtBlock[i] & 0xff;
269             bwtMergedPointers[characterBase[value]++] = (i << 8) + value;
270         }
271 
272         this.bwtMergedPointers = bwtMergedPointers;
273         bwtCurrentMergedPointer = bwtMergedPointers[bwtStartPointer];
274     }
275 
276     /**
277      * Decodes a byte from the final Run-Length Encoding stage, pulling a new byte from the
278      * Burrows-Wheeler Transform stage when required.
279      * @return The decoded byte, or -1 if there are no more bytes
280      */
281     public int read() {
282         while (rleRepeat < 1) {
283             if (bwtBytesDecoded == bwtBlockLength) {
284                 return -1;
285             }
286 
287             int nextByte = decodeNextBWTByte();
288             if (nextByte != rleLastDecodedByte) {
289                 // New byte, restart accumulation
290                 rleLastDecodedByte = nextByte;
291                 rleRepeat = 1;
292                 rleAccumulator = 1;
293                 crc.updateCRC(nextByte);
294             } else {
295                 if (++rleAccumulator == 4) {
296                     // Accumulation complete, start repetition
297                     int rleRepeat = decodeNextBWTByte() + 1;
298                     this.rleRepeat = rleRepeat;
299                     rleAccumulator = 0;
300                     crc.updateCRC(nextByte, rleRepeat);
301                 } else {
302                     rleRepeat = 1;
303                     crc.updateCRC(nextByte);
304                 }
305             }
306         }
307         rleRepeat--;
308 
309         return rleLastDecodedByte;
310     }
311 
312     /**
313      * Decodes a byte from the Burrows-Wheeler Transform stage. If the block has randomisation
314      * applied, reverses the randomisation.
315      * @return The decoded byte
316      */
317     private int decodeNextBWTByte() {
318         int mergedPointer = bwtCurrentMergedPointer;
319         int nextDecodedByte =  mergedPointer & 0xff;
320         bwtCurrentMergedPointer = bwtMergedPointers[mergedPointer >>> 8];
321 
322         if (blockRandomised) {
323             if (--randomCount == 0) {
324                 nextDecodedByte ^= 1;
325                 randomIndex = (randomIndex + 1) % 512;
326                 randomCount = Bzip2Rand.rNums(randomIndex);
327             }
328         }
329         bwtBytesDecoded++;
330 
331         return nextDecodedByte;
332     }
333 
334     public int blockLength() {
335         return bwtBlockLength;
336     }
337 
338     /**
339      * Verify and return the block CRC. This method may only be called
340      * after all of the block's bytes have been read.
341      * @return The block CRC
342      */
343     int checkCRC() {
344         final int computedBlockCRC = crc.getCRC();
345         if (blockCRC != computedBlockCRC) {
346             throw new DecompressionException("block CRC error");
347         }
348         return computedBlockCRC;
349     }
350 }