View Javadoc
1   /*
2    * Copyright 2012 The Netty Project
3    *
4    * The Netty Project licenses this file to you under the Apache License,
5    * version 2.0 (the "License"); you may not use this file except in compliance
6    * with the License. You may obtain a copy of the License at:
7    *
8    *   http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13   * License for the specific language governing permissions and limitations
14   * under the License.
15   */
16  package io.netty.handler.codec.compression;
17  
18  import io.netty.buffer.ByteBuf;
19  
20  /**
21   * Uncompresses an input {@link ByteBuf} encoded with Snappy compression into an
22   * output {@link ByteBuf}.
23   *
24   * See <a href="https://github.com/google/snappy/blob/master/format_description.txt">snappy format</a>.
25   */
26  public final class Snappy {
27  
28      private static final int MAX_HT_SIZE = 1 << 14;
29      private static final int MIN_COMPRESSIBLE_BYTES = 15;
30  
31      // used as a return value to indicate that we haven't yet read our full preamble
32      private static final int PREAMBLE_NOT_FULL = -1;
33      private static final int NOT_ENOUGH_INPUT = -1;
34  
35      // constants for the tag types
36      private static final int LITERAL = 0;
37      private static final int COPY_1_BYTE_OFFSET = 1;
38      private static final int COPY_2_BYTE_OFFSET = 2;
39      private static final int COPY_4_BYTE_OFFSET = 3;
40  
41      private State state = State.READY;
42      private byte tag;
43      private int written;
44  
45      private enum State {
46          READY,
47          READING_PREAMBLE,
48          READING_TAG,
49          READING_LITERAL,
50          READING_COPY
51      }
52  
53      public void reset() {
54          state = State.READY;
55          tag = 0;
56          written = 0;
57      }
58  
59      public void encode(final ByteBuf in, final ByteBuf out, final int length) {
60          // Write the preamble length to the output buffer
61          for (int i = 0;; i ++) {
62              int b = length >>> i * 7;
63              if ((b & 0xFFFFFF80) != 0) {
64                  out.writeByte(b & 0x7f | 0x80);
65              } else {
66                  out.writeByte(b);
67                  break;
68              }
69          }
70  
71          int inIndex = in.readerIndex();
72          final int baseIndex = inIndex;
73  
74          final short[] table = getHashTable(length);
75          final int shift = Integer.numberOfLeadingZeros(table.length) + 1;
76  
77          int nextEmit = inIndex;
78  
79          if (length - inIndex >= MIN_COMPRESSIBLE_BYTES) {
80              int nextHash = hash(in, ++inIndex, shift);
81              outer: while (true) {
82                  int skip = 32;
83  
84                  int candidate;
85                  int nextIndex = inIndex;
86                  do {
87                      inIndex = nextIndex;
88                      int hash = nextHash;
89                      int bytesBetweenHashLookups = skip++ >> 5;
90                      nextIndex = inIndex + bytesBetweenHashLookups;
91  
92                      // We need at least 4 remaining bytes to read the hash
93                      if (nextIndex > length - 4) {
94                          break outer;
95                      }
96  
97                      nextHash = hash(in, nextIndex, shift);
98  
99                      candidate = baseIndex + table[hash];
100 
101                     table[hash] = (short) (inIndex - baseIndex);
102                 }
103                 while (in.getInt(inIndex) != in.getInt(candidate));
104 
105                 encodeLiteral(in, out, inIndex - nextEmit);
106 
107                 int insertTail;
108                 do {
109                     int base = inIndex;
110                     int matched = 4 + findMatchingLength(in, candidate + 4, inIndex + 4, length);
111                     inIndex += matched;
112                     int offset = base - candidate;
113                     encodeCopy(out, offset, matched);
114                     in.readerIndex(in.readerIndex() + matched);
115                     insertTail = inIndex - 1;
116                     nextEmit = inIndex;
117                     if (inIndex >= length - 4) {
118                         break outer;
119                     }
120 
121                     int prevHash = hash(in, insertTail, shift);
122                     table[prevHash] = (short) (inIndex - baseIndex - 1);
123                     int currentHash = hash(in, insertTail + 1, shift);
124                     candidate = baseIndex + table[currentHash];
125                     table[currentHash] = (short) (inIndex - baseIndex);
126                 }
127                 while (in.getInt(insertTail + 1) == in.getInt(candidate));
128 
129                 nextHash = hash(in, insertTail + 2, shift);
130                 ++inIndex;
131             }
132         }
133 
134         // If there are any remaining characters, write them out as a literal
135         if (nextEmit < length) {
136             encodeLiteral(in, out, length - nextEmit);
137         }
138     }
139 
140     /**
141      * Hashes the 4 bytes located at index, shifting the resulting hash into
142      * the appropriate range for our hash table.
143      *
144      * @param in The input buffer to read 4 bytes from
145      * @param index The index to read at
146      * @param shift The shift value, for ensuring that the resulting value is
147      *     withing the range of our hash table size
148      * @return A 32-bit hash of 4 bytes located at index
149      */
150     private static int hash(ByteBuf in, int index, int shift) {
151         return in.getInt(index) * 0x1e35a7bd >>> shift;
152     }
153 
154     /**
155      * Creates an appropriately sized hashtable for the given input size
156      *
157      * @param inputSize The size of our input, ie. the number of bytes we need to encode
158      * @return An appropriately sized empty hashtable
159      */
160     private static short[] getHashTable(int inputSize) {
161         int htSize = 256;
162         while (htSize < MAX_HT_SIZE && htSize < inputSize) {
163             htSize <<= 1;
164         }
165         return new short[htSize];
166     }
167 
168     /**
169      * Iterates over the supplied input buffer between the supplied minIndex and
170      * maxIndex to find how long our matched copy overlaps with an already-written
171      * literal value.
172      *
173      * @param in The input buffer to scan over
174      * @param minIndex The index in the input buffer to start scanning from
175      * @param inIndex The index of the start of our copy
176      * @param maxIndex The length of our input buffer
177      * @return The number of bytes for which our candidate copy is a repeat of
178      */
179     private static int findMatchingLength(ByteBuf in, int minIndex, int inIndex, int maxIndex) {
180         int matched = 0;
181 
182         while (inIndex <= maxIndex - 4 &&
183                 in.getInt(inIndex) == in.getInt(minIndex + matched)) {
184             inIndex += 4;
185             matched += 4;
186         }
187 
188         while (inIndex < maxIndex && in.getByte(minIndex + matched) == in.getByte(inIndex)) {
189             ++inIndex;
190             ++matched;
191         }
192 
193         return matched;
194     }
195 
196     /**
197      * Calculates the minimum number of bits required to encode a value.  This can
198      * then in turn be used to calculate the number of septets or octets (as
199      * appropriate) to use to encode a length parameter.
200      *
201      * @param value The value to calculate the minimum number of bits required to encode
202      * @return The minimum number of bits required to encode the supplied value
203      */
204     private static int bitsToEncode(int value) {
205         int highestOneBit = Integer.highestOneBit(value);
206         int bitLength = 0;
207         while ((highestOneBit >>= 1) != 0) {
208             bitLength++;
209         }
210 
211         return bitLength;
212     }
213 
214     /**
215      * Writes a literal to the supplied output buffer by directly copying from
216      * the input buffer.  The literal is taken from the current readerIndex
217      * up to the supplied length.
218      *
219      * @param in The input buffer to copy from
220      * @param out The output buffer to copy to
221      * @param length The length of the literal to copy
222      */
223     static void encodeLiteral(ByteBuf in, ByteBuf out, int length) {
224         if (length < 61) {
225             out.writeByte(length - 1 << 2);
226         } else {
227             int bitLength = bitsToEncode(length - 1);
228             int bytesToEncode = 1 + bitLength / 8;
229             out.writeByte(59 + bytesToEncode << 2);
230             for (int i = 0; i < bytesToEncode; i++) {
231                 out.writeByte(length - 1 >> i * 8 & 0x0ff);
232             }
233         }
234 
235         out.writeBytes(in, length);
236     }
237 
238     private static void encodeCopyWithOffset(ByteBuf out, int offset, int length) {
239         if (length < 12 && offset < 2048) {
240             out.writeByte(COPY_1_BYTE_OFFSET | length - 4 << 2 | offset >> 8 << 5);
241             out.writeByte(offset & 0x0ff);
242         } else {
243             out.writeByte(COPY_2_BYTE_OFFSET | length - 1 << 2);
244             out.writeByte(offset & 0x0ff);
245             out.writeByte(offset >> 8 & 0x0ff);
246         }
247     }
248 
249     /**
250      * Encodes a series of copies, each at most 64 bytes in length.
251      *
252      * @param out The output buffer to write the copy pointer to
253      * @param offset The offset at which the original instance lies
254      * @param length The length of the original instance
255      */
256     private static void encodeCopy(ByteBuf out, int offset, int length) {
257         while (length >= 68) {
258             encodeCopyWithOffset(out, offset, 64);
259             length -= 64;
260         }
261 
262         if (length > 64) {
263             encodeCopyWithOffset(out, offset, 60);
264             length -= 60;
265         }
266 
267         encodeCopyWithOffset(out, offset, length);
268     }
269 
270     public void decode(ByteBuf in, ByteBuf out) {
271         while (in.isReadable()) {
272             switch (state) {
273             case READY:
274                 state = State.READING_PREAMBLE;
275                 // fall through
276             case READING_PREAMBLE:
277                 int uncompressedLength = readPreamble(in);
278                 if (uncompressedLength == PREAMBLE_NOT_FULL) {
279                     // We've not yet read all of the preamble, so wait until we can
280                     return;
281                 }
282                 if (uncompressedLength == 0) {
283                     // Should never happen, but it does mean we have nothing further to do
284                     state = State.READY;
285                     return;
286                 }
287                 out.ensureWritable(uncompressedLength);
288                 state = State.READING_TAG;
289                 // fall through
290             case READING_TAG:
291                 if (!in.isReadable()) {
292                     return;
293                 }
294                 tag = in.readByte();
295                 switch (tag & 0x03) {
296                 case LITERAL:
297                     state = State.READING_LITERAL;
298                     break;
299                 case COPY_1_BYTE_OFFSET:
300                 case COPY_2_BYTE_OFFSET:
301                 case COPY_4_BYTE_OFFSET:
302                     state = State.READING_COPY;
303                     break;
304                 }
305                 break;
306             case READING_LITERAL:
307                 int literalWritten = decodeLiteral(tag, in, out);
308                 if (literalWritten != NOT_ENOUGH_INPUT) {
309                     state = State.READING_TAG;
310                     written += literalWritten;
311                 } else {
312                     // Need to wait for more data
313                     return;
314                 }
315                 break;
316             case READING_COPY:
317                 int decodeWritten;
318                 switch (tag & 0x03) {
319                 case COPY_1_BYTE_OFFSET:
320                     decodeWritten = decodeCopyWith1ByteOffset(tag, in, out, written);
321                     if (decodeWritten != NOT_ENOUGH_INPUT) {
322                         state = State.READING_TAG;
323                         written += decodeWritten;
324                     } else {
325                         // Need to wait for more data
326                         return;
327                     }
328                     break;
329                 case COPY_2_BYTE_OFFSET:
330                     decodeWritten = decodeCopyWith2ByteOffset(tag, in, out, written);
331                     if (decodeWritten != NOT_ENOUGH_INPUT) {
332                         state = State.READING_TAG;
333                         written += decodeWritten;
334                     } else {
335                         // Need to wait for more data
336                         return;
337                     }
338                     break;
339                 case COPY_4_BYTE_OFFSET:
340                     decodeWritten = decodeCopyWith4ByteOffset(tag, in, out, written);
341                     if (decodeWritten != NOT_ENOUGH_INPUT) {
342                         state = State.READING_TAG;
343                         written += decodeWritten;
344                     } else {
345                         // Need to wait for more data
346                         return;
347                     }
348                     break;
349                 }
350             }
351         }
352     }
353 
354     /**
355      * Reads the length varint (a series of bytes, where the lower 7 bits
356      * are data and the upper bit is a flag to indicate more bytes to be
357      * read).
358      *
359      * @param in The input buffer to read the preamble from
360      * @return The calculated length based on the input buffer, or 0 if
361      *   no preamble is able to be calculated
362      */
363     private static int readPreamble(ByteBuf in) {
364         int length = 0;
365         int byteIndex = 0;
366         while (in.isReadable()) {
367             int current = in.readUnsignedByte();
368             length |= (current & 0x7f) << byteIndex++ * 7;
369             if ((current & 0x80) == 0) {
370                 return length;
371             }
372 
373             if (byteIndex >= 4) {
374                 throw new DecompressionException("Preamble is greater than 4 bytes");
375             }
376         }
377 
378         return 0;
379     }
380 
381     /**
382      * Reads a literal from the input buffer directly to the output buffer.
383      * A "literal" is an uncompressed segment of data stored directly in the
384      * byte stream.
385      *
386      * @param tag The tag that identified this segment as a literal is also
387      *            used to encode part of the length of the data
388      * @param in The input buffer to read the literal from
389      * @param out The output buffer to write the literal to
390      * @return The number of bytes appended to the output buffer, or -1 to indicate "try again later"
391      */
392     static int decodeLiteral(byte tag, ByteBuf in, ByteBuf out) {
393         in.markReaderIndex();
394         int length;
395         switch(tag >> 2 & 0x3F) {
396         case 60:
397             if (!in.isReadable()) {
398                 return NOT_ENOUGH_INPUT;
399             }
400             length = in.readUnsignedByte();
401             break;
402         case 61:
403             if (in.readableBytes() < 2) {
404                 return NOT_ENOUGH_INPUT;
405             }
406             length = in.readUnsignedShortLE();
407             break;
408         case 62:
409             if (in.readableBytes() < 3) {
410                 return NOT_ENOUGH_INPUT;
411             }
412             length = in.readUnsignedMediumLE();
413             break;
414         case 63:
415             if (in.readableBytes() < 4) {
416                 return NOT_ENOUGH_INPUT;
417             }
418             length = in.readIntLE();
419             break;
420         default:
421             length = tag >> 2 & 0x3F;
422         }
423         length += 1;
424 
425         if (in.readableBytes() < length) {
426             in.resetReaderIndex();
427             return NOT_ENOUGH_INPUT;
428         }
429 
430         out.writeBytes(in, length);
431         return length;
432     }
433 
434     /**
435      * Reads a compressed reference offset and length from the supplied input
436      * buffer, seeks back to the appropriate place in the input buffer and
437      * writes the found data to the supplied output stream.
438      *
439      * @param tag The tag used to identify this as a copy is also used to encode
440      *     the length and part of the offset
441      * @param in The input buffer to read from
442      * @param out The output buffer to write to
443      * @return The number of bytes appended to the output buffer, or -1 to indicate
444      *     "try again later"
445      * @throws DecompressionException If the read offset is invalid
446      */
447     private static int decodeCopyWith1ByteOffset(byte tag, ByteBuf in, ByteBuf out, int writtenSoFar) {
448         if (!in.isReadable()) {
449             return NOT_ENOUGH_INPUT;
450         }
451 
452         int initialIndex = out.writerIndex();
453         int length = 4 + ((tag & 0x01c) >> 2);
454         int offset = (tag & 0x0e0) << 8 >> 5 | in.readUnsignedByte();
455 
456         validateOffset(offset, writtenSoFar);
457 
458         out.markReaderIndex();
459         if (offset < length) {
460             int copies = length / offset;
461             for (; copies > 0; copies--) {
462                 out.readerIndex(initialIndex - offset);
463                 out.readBytes(out, offset);
464             }
465             if (length % offset != 0) {
466                 out.readerIndex(initialIndex - offset);
467                 out.readBytes(out, length % offset);
468             }
469         } else {
470             out.readerIndex(initialIndex - offset);
471             out.readBytes(out, length);
472         }
473         out.resetReaderIndex();
474 
475         return length;
476     }
477 
478     /**
479      * Reads a compressed reference offset and length from the supplied input
480      * buffer, seeks back to the appropriate place in the input buffer and
481      * writes the found data to the supplied output stream.
482      *
483      * @param tag The tag used to identify this as a copy is also used to encode
484      *     the length and part of the offset
485      * @param in The input buffer to read from
486      * @param out The output buffer to write to
487      * @throws DecompressionException If the read offset is invalid
488      * @return The number of bytes appended to the output buffer, or -1 to indicate
489      *     "try again later"
490      */
491     private static int decodeCopyWith2ByteOffset(byte tag, ByteBuf in, ByteBuf out, int writtenSoFar) {
492         if (in.readableBytes() < 2) {
493             return NOT_ENOUGH_INPUT;
494         }
495 
496         int initialIndex = out.writerIndex();
497         int length = 1 + (tag >> 2 & 0x03f);
498         int offset = in.readUnsignedShortLE();
499 
500         validateOffset(offset, writtenSoFar);
501 
502         out.markReaderIndex();
503         if (offset < length) {
504             int copies = length / offset;
505             for (; copies > 0; copies--) {
506                 out.readerIndex(initialIndex - offset);
507                 out.readBytes(out, offset);
508             }
509             if (length % offset != 0) {
510                 out.readerIndex(initialIndex - offset);
511                 out.readBytes(out, length % offset);
512             }
513         } else {
514             out.readerIndex(initialIndex - offset);
515             out.readBytes(out, length);
516         }
517         out.resetReaderIndex();
518 
519         return length;
520     }
521 
522     /**
523      * Reads a compressed reference offset and length from the supplied input
524      * buffer, seeks back to the appropriate place in the input buffer and
525      * writes the found data to the supplied output stream.
526      *
527      * @param tag The tag used to identify this as a copy is also used to encode
528      *     the length and part of the offset
529      * @param in The input buffer to read from
530      * @param out The output buffer to write to
531      * @return The number of bytes appended to the output buffer, or -1 to indicate
532      *     "try again later"
533      * @throws DecompressionException If the read offset is invalid
534      */
535     private static int decodeCopyWith4ByteOffset(byte tag, ByteBuf in, ByteBuf out, int writtenSoFar) {
536         if (in.readableBytes() < 4) {
537             return NOT_ENOUGH_INPUT;
538         }
539 
540         int initialIndex = out.writerIndex();
541         int length = 1 + (tag >> 2 & 0x03F);
542         int offset = in.readIntLE();
543 
544         validateOffset(offset, writtenSoFar);
545 
546         out.markReaderIndex();
547         if (offset < length) {
548             int copies = length / offset;
549             for (; copies > 0; copies--) {
550                 out.readerIndex(initialIndex - offset);
551                 out.readBytes(out, offset);
552             }
553             if (length % offset != 0) {
554                 out.readerIndex(initialIndex - offset);
555                 out.readBytes(out, length % offset);
556             }
557         } else {
558             out.readerIndex(initialIndex - offset);
559             out.readBytes(out, length);
560         }
561         out.resetReaderIndex();
562 
563         return length;
564     }
565 
566     /**
567      * Validates that the offset extracted from a compressed reference is within
568      * the permissible bounds of an offset (0 < offset < Integer.MAX_VALUE), and does not
569      * exceed the length of the chunk currently read so far.
570      *
571      * @param offset The offset extracted from the compressed reference
572      * @param chunkSizeSoFar The number of bytes read so far from this chunk
573      * @throws DecompressionException if the offset is invalid
574      */
575     private static void validateOffset(int offset, int chunkSizeSoFar) {
576         if (offset == 0) {
577             throw new DecompressionException("Offset is less than minimum permissible value");
578         }
579 
580         if (offset < 0) {
581             // Due to arithmetic overflow
582             throw new DecompressionException("Offset is greater than maximum value supported by this implementation");
583         }
584 
585         if (offset > chunkSizeSoFar) {
586             throw new DecompressionException("Offset exceeds size of chunk");
587         }
588     }
589 
590     /**
591      * Computes the CRC32C checksum of the supplied data and performs the "mask" operation
592      * on the computed checksum
593      *
594      * @param data The input data to calculate the CRC32C checksum of
595      */
596     static int calculateChecksum(ByteBuf data) {
597         return calculateChecksum(data, data.readerIndex(), data.readableBytes());
598     }
599 
600     /**
601      * Computes the CRC32C checksum of the supplied data and performs the "mask" operation
602      * on the computed checksum
603      *
604      * @param data The input data to calculate the CRC32C checksum of
605      */
606     static int calculateChecksum(ByteBuf data, int offset, int length) {
607         Crc32c crc32 = new Crc32c();
608         try {
609             crc32.update(data, offset, length);
610             return maskChecksum((int) crc32.getValue());
611         } finally {
612             crc32.reset();
613         }
614     }
615 
616     /**
617      * Computes the CRC32C checksum of the supplied data, performs the "mask" operation
618      * on the computed checksum, and then compares the resulting masked checksum to the
619      * supplied checksum.
620      *
621      * @param expectedChecksum The checksum decoded from the stream to compare against
622      * @param data The input data to calculate the CRC32C checksum of
623      * @throws DecompressionException If the calculated and supplied checksums do not match
624      */
625     static void validateChecksum(int expectedChecksum, ByteBuf data) {
626         validateChecksum(expectedChecksum, data, data.readerIndex(), data.readableBytes());
627     }
628 
629     /**
630      * Computes the CRC32C checksum of the supplied data, performs the "mask" operation
631      * on the computed checksum, and then compares the resulting masked checksum to the
632      * supplied checksum.
633      *
634      * @param expectedChecksum The checksum decoded from the stream to compare against
635      * @param data The input data to calculate the CRC32C checksum of
636      * @throws DecompressionException If the calculated and supplied checksums do not match
637      */
638     static void validateChecksum(int expectedChecksum, ByteBuf data, int offset, int length) {
639         final int actualChecksum = calculateChecksum(data, offset, length);
640         if (actualChecksum != expectedChecksum) {
641             throw new DecompressionException(
642                     "mismatching checksum: " + Integer.toHexString(actualChecksum) +
643                             " (expected: " + Integer.toHexString(expectedChecksum) + ')');
644         }
645     }
646 
647     /**
648      * From the spec:
649      *
650      * "Checksums are not stored directly, but masked, as checksumming data and
651      * then its own checksum can be problematic. The masking is the same as used
652      * in Apache Hadoop: Rotate the checksum by 15 bits, then add the constant
653      * 0xa282ead8 (using wraparound as normal for unsigned integers)."
654      *
655      * @param checksum The actual checksum of the data
656      * @return The masked checksum
657      */
658     static int maskChecksum(int checksum) {
659         return (checksum >> 15 | checksum << 17) + 0xa282ead8;
660     }
661 }