View Javadoc
1   /*
2    * Copyright 2014 The Netty Project
3    *
4    * The Netty Project licenses this file to you under the Apache License,
5    * version 2.0 (the "License"); you may not use this file except in compliance
6    * with the License. You may obtain a copy of the License at:
7    *
8    *   http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13   * License for the specific language governing permissions and limitations
14   * under the License.
15   */
16  package io.netty.handler.codec.compression;
17  
18  import static io.netty.handler.codec.compression.Bzip2Constants.*;
19  
20  /**
21   * Reads and decompresses a single Bzip2 block.<br><br>
22   *
23   * Block decoding consists of the following stages:<br>
24   * 1. Read block header<br>
25   * 2. Read Huffman tables<br>
26   * 3. Read and decode Huffman encoded data - {@link #decodeHuffmanData(Bzip2HuffmanStageDecoder)}<br>
27   * 4. Run-Length Decoding[2] - {@link #decodeHuffmanData(Bzip2HuffmanStageDecoder)}<br>
28   * 5. Inverse Move To Front Transform - {@link #decodeHuffmanData(Bzip2HuffmanStageDecoder)}<br>
29   * 6. Inverse Burrows Wheeler Transform - {@link #initialiseInverseBWT()}<br>
30   * 7. Run-Length Decoding[1] - {@link #read()}<br>
31   * 8. Optional Block De-Randomisation - {@link #read()} (through {@link #decodeNextBWTByte()})
32   */
33  final class Bzip2BlockDecompressor {
34      /**
35       * A reader that provides bit-level reads.
36       */
37      private final Bzip2BitReader reader;
38  
39      /**
40       * Calculates the block CRC from the fully decoded bytes of the block.
41       */
42      private final Crc32 crc = new Crc32();
43  
44      /**
45       * The CRC of the current block as read from the block header.
46       */
47      private final int blockCRC;
48  
49      /**
50       * {@code true} if the current block is randomised, otherwise {@code false}.
51       */
52      private final boolean blockRandomised;
53  
54      /* Huffman Decoding stage */
55      /**
56       * The end-of-block Huffman symbol. Decoding of the block ends when this is encountered.
57       */
58      int huffmanEndOfBlockSymbol;
59  
60      /**
61       * Bitmap, of ranges of 16 bytes, present/not present.
62       */
63      int huffmanInUse16;
64  
65      /**
66       * A map from Huffman symbol index to output character. Some types of data (e.g. ASCII text)
67       * may contain only a limited number of byte values; Huffman symbols are only allocated to
68       * those values that actually occur in the uncompressed data.
69       */
70      final byte[] huffmanSymbolMap = new byte[256];
71  
72      /* Move To Front stage */
73      /**
74       * Counts of each byte value within the {@link Bzip2BlockDecompressor#huffmanSymbolMap} data.
75       * Collected at the Move To Front stage, consumed by the Inverse Burrows Wheeler Transform stage.
76       */
77      private final int[] bwtByteCounts = new int[256];
78  
79      /**
80       * The Burrows-Wheeler Transform processed data. Read at the Move To Front stage, consumed by the
81       * Inverse Burrows Wheeler Transform stage.
82       */
83      private final byte[] bwtBlock;
84  
85      /**
86       * Starting pointer into BWT for after untransform.
87       */
88      private final int bwtStartPointer;
89  
90      /* Inverse Burrows-Wheeler Transform stage */
91      /**
92       * At each position contains the union of :-
93       *   An output character (8 bits)
94       *   A pointer from each position to its successor (24 bits, left shifted 8 bits)
95       * As the pointer cannot exceed the maximum block size of 900k, 24 bits is more than enough to
96       * hold it; Folding the character data into the spare bits while performing the inverse BWT,
97       * when both pieces of information are available, saves a large number of memory accesses in
98       * the final decoding stages.
99       */
100     private int[] bwtMergedPointers;
101 
102     /**
103      * The current merged pointer into the Burrow-Wheeler Transform array.
104      */
105     private int bwtCurrentMergedPointer;
106 
107     /**
108      * The actual length in bytes of the current block at the Inverse Burrows Wheeler Transform
109      * stage (before final Run-Length Decoding).
110      */
111     private int bwtBlockLength;
112 
113     /**
114      * The number of output bytes that have been decoded up to the Inverse Burrows Wheeler Transform stage.
115      */
116     private int bwtBytesDecoded;
117 
118     /* Run-Length Encoding and Random Perturbation stage */
119     /**
120      * The most recently RLE decoded byte.
121      */
122     private int rleLastDecodedByte = -1;
123 
124     /**
125      * The number of previous identical output bytes decoded. After 4 identical bytes, the next byte
126      * decoded is an RLE repeat count.
127      */
128     private int rleAccumulator;
129 
130     /**
131      * The RLE repeat count of the current decoded byte. When this reaches zero, a new byte is decoded.
132      */
133     private int rleRepeat;
134 
135     /**
136      * If the current block is randomised, the position within the RNUMS randomisation array.
137      */
138     private int randomIndex;
139 
140     /**
141      * If the current block is randomised, the remaining count at the current RNUMS position.
142      */
143     private int randomCount = Bzip2Rand.rNums(0) - 1;
144 
145     /**
146      * Table for Move To Front transformations.
147      */
148     private final Bzip2MoveToFrontTable symbolMTF = new Bzip2MoveToFrontTable();
149 
150     // This variables is used to save current state if we haven't got enough readable bits
151     private int repeatCount;
152     private int repeatIncrement = 1;
153     private int mtfValue;
154 
155     Bzip2BlockDecompressor(final int blockSize, final int blockCRC, final boolean blockRandomised,
156                            final int bwtStartPointer, final Bzip2BitReader reader) {
157 
158         bwtBlock = new byte[blockSize];
159 
160         this.blockCRC = blockCRC;
161         this.blockRandomised = blockRandomised;
162         this.bwtStartPointer = bwtStartPointer;
163 
164         this.reader = reader;
165     }
166 
167     /**
168      * Reads the Huffman encoded data from the input stream, performs Run-Length Decoding and
169      * applies the Move To Front transform to reconstruct the Burrows-Wheeler Transform array.
170      */
171     boolean decodeHuffmanData(final Bzip2HuffmanStageDecoder huffmanDecoder) {
172         final Bzip2BitReader reader = this.reader;
173         final byte[] bwtBlock = this.bwtBlock;
174         final byte[] huffmanSymbolMap = this.huffmanSymbolMap;
175         final int streamBlockSize = this.bwtBlock.length;
176         final int huffmanEndOfBlockSymbol = this.huffmanEndOfBlockSymbol;
177         final int[] bwtByteCounts = this.bwtByteCounts;
178         final Bzip2MoveToFrontTable symbolMTF = this.symbolMTF;
179 
180         int bwtBlockLength = this.bwtBlockLength;
181         int repeatCount = this.repeatCount;
182         int repeatIncrement = this.repeatIncrement;
183         int mtfValue = this.mtfValue;
184 
185         for (;;) {
186             if (!reader.hasReadableBits(HUFFMAN_DECODE_MAX_CODE_LENGTH)) {
187                 this.bwtBlockLength = bwtBlockLength;
188                 this.repeatCount = repeatCount;
189                 this.repeatIncrement = repeatIncrement;
190                 this.mtfValue = mtfValue;
191                 return false;
192             }
193             final int nextSymbol = huffmanDecoder.nextSymbol();
194 
195             if (nextSymbol == HUFFMAN_SYMBOL_RUNA) {
196                 repeatCount += repeatIncrement;
197                 repeatIncrement <<= 1;
198             } else if (nextSymbol == HUFFMAN_SYMBOL_RUNB) {
199                 repeatCount += repeatIncrement << 1;
200                 repeatIncrement <<= 1;
201             } else {
202                 if (repeatCount > 0) {
203                     if (bwtBlockLength + repeatCount > streamBlockSize) {
204                         throw new DecompressionException("block exceeds declared block size");
205                     }
206                     final byte nextByte = huffmanSymbolMap[mtfValue];
207                     bwtByteCounts[nextByte & 0xff] += repeatCount;
208                     while (--repeatCount >= 0) {
209                         bwtBlock[bwtBlockLength++] = nextByte;
210                     }
211 
212                     repeatCount = 0;
213                     repeatIncrement = 1;
214                 }
215 
216                 if (nextSymbol == huffmanEndOfBlockSymbol) {
217                     break;
218                 }
219 
220                 if (bwtBlockLength >= streamBlockSize) {
221                     throw new DecompressionException("block exceeds declared block size");
222                 }
223 
224                 mtfValue = symbolMTF.indexToFront(nextSymbol - 1) & 0xff;
225 
226                 final byte nextByte = huffmanSymbolMap[mtfValue];
227                 bwtByteCounts[nextByte & 0xff]++;
228                 bwtBlock[bwtBlockLength++] = nextByte;
229             }
230         }
231         this.bwtBlockLength = bwtBlockLength;
232         initialiseInverseBWT();
233         return true;
234     }
235 
236     /**
237      * Set up the Inverse Burrows-Wheeler Transform merged pointer array.
238      */
239     private void initialiseInverseBWT() {
240         final int bwtStartPointer = this.bwtStartPointer;
241         final byte[] bwtBlock  = this.bwtBlock;
242         final int[] bwtMergedPointers = new int[bwtBlockLength];
243         final int[] characterBase = new int[256];
244 
245         if (bwtStartPointer < 0 || bwtStartPointer >= bwtBlockLength) {
246             throw new DecompressionException("start pointer invalid");
247         }
248 
249         // Cumulative character counts
250         System.arraycopy(bwtByteCounts, 0, characterBase, 1, 255);
251         for (int i = 2; i <= 255; i++) {
252             characterBase[i] += characterBase[i - 1];
253         }
254 
255         // Merged-Array Inverse Burrows-Wheeler Transform
256         // Combining the output characters and forward pointers into a single array here, where we
257         // have already read both of the corresponding values, cuts down on memory accesses in the
258         // final walk through the array
259         for (int i = 0; i < bwtBlockLength; i++) {
260             int value = bwtBlock[i] & 0xff;
261             bwtMergedPointers[characterBase[value]++] = (i << 8) + value;
262         }
263 
264         this.bwtMergedPointers = bwtMergedPointers;
265         bwtCurrentMergedPointer = bwtMergedPointers[bwtStartPointer];
266     }
267 
268     /**
269      * Decodes a byte from the final Run-Length Encoding stage, pulling a new byte from the
270      * Burrows-Wheeler Transform stage when required.
271      * @return The decoded byte, or -1 if there are no more bytes
272      */
273     public int read() {
274         while (rleRepeat < 1) {
275             if (bwtBytesDecoded == bwtBlockLength) {
276                 return -1;
277             }
278 
279             int nextByte = decodeNextBWTByte();
280             if (nextByte != rleLastDecodedByte) {
281                 // New byte, restart accumulation
282                 rleLastDecodedByte = nextByte;
283                 rleRepeat = 1;
284                 rleAccumulator = 1;
285                 crc.updateCRC(nextByte);
286             } else {
287                 if (++rleAccumulator == 4) {
288                     // Accumulation complete, start repetition
289                     int rleRepeat = decodeNextBWTByte() + 1;
290                     this.rleRepeat = rleRepeat;
291                     rleAccumulator = 0;
292                     crc.updateCRC(nextByte, rleRepeat);
293                 } else {
294                     rleRepeat = 1;
295                     crc.updateCRC(nextByte);
296                 }
297             }
298         }
299         rleRepeat--;
300 
301         return rleLastDecodedByte;
302     }
303 
304     /**
305      * Decodes a byte from the Burrows-Wheeler Transform stage. If the block has randomisation
306      * applied, reverses the randomisation.
307      * @return The decoded byte
308      */
309     private int decodeNextBWTByte() {
310         int mergedPointer = bwtCurrentMergedPointer;
311         int nextDecodedByte =  mergedPointer & 0xff;
312         bwtCurrentMergedPointer = bwtMergedPointers[mergedPointer >>> 8];
313 
314         if (blockRandomised) {
315             if (--randomCount == 0) {
316                 nextDecodedByte ^= 1;
317                 randomIndex = (randomIndex + 1) % 512;
318                 randomCount = Bzip2Rand.rNums(randomIndex);
319             }
320         }
321         bwtBytesDecoded++;
322 
323         return nextDecodedByte;
324     }
325 
326     public int blockLength() {
327         return bwtBlockLength;
328     }
329 
330     /**
331      * Verify and return the block CRC. This method may only be called
332      * after all of the block's bytes have been read.
333      * @return The block CRC
334      */
335     int checkCRC() {
336         final int computedBlockCRC = crc.getCRC();
337         if (blockCRC != computedBlockCRC) {
338             throw new DecompressionException("block CRC error");
339         }
340         return computedBlockCRC;
341     }
342 }