View Javadoc

1   /*
2    * Copyright 2012 The Netty Project
3    *
4    * The Netty Project licenses this file to you under the Apache License,
5    * version 2.0 (the "License"); you may not use this file except in compliance
6    * with the License. You may obtain a copy of the License at:
7    *
8    *   http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13   * License for the specific language governing permissions and limitations
14   * under the License.
15   */
16  package io.netty.handler.codec.compression;
17  
18  import io.netty.buffer.ByteBuf;
19  import io.netty.buffer.ByteBufUtil;
20  
21  /**
22   * Uncompresses an input {@link ByteBuf} encoded with Snappy compression into an
23   * output {@link ByteBuf}.
24   *
25   * See <a href="https://github.com/google/snappy/blob/master/format_description.txt">snappy format</a>.
26   */
27  class Snappy {
28  
29      private static final int MAX_HT_SIZE = 1 << 14;
30      private static final int MIN_COMPRESSIBLE_BYTES = 15;
31  
32      // used as a return value to indicate that we haven't yet read our full preamble
33      private static final int PREAMBLE_NOT_FULL = -1;
34      private static final int NOT_ENOUGH_INPUT = -1;
35  
36      // constants for the tag types
37      private static final int LITERAL = 0;
38      private static final int COPY_1_BYTE_OFFSET = 1;
39      private static final int COPY_2_BYTE_OFFSET = 2;
40      private static final int COPY_4_BYTE_OFFSET = 3;
41  
42      private State state = State.READY;
43      private byte tag;
44      private int written;
45  
46      private enum State {
47          READY,
48          READING_PREAMBLE,
49          READING_TAG,
50          READING_LITERAL,
51          READING_COPY
52      }
53  
54      public void reset() {
55          state = State.READY;
56          tag = 0;
57          written = 0;
58      }
59  
60      public void encode(final ByteBuf in, final ByteBuf out, final int length) {
61          // Write the preamble length to the output buffer
62          for (int i = 0;; i ++) {
63              int b = length >>> i * 7;
64              if ((b & 0xFFFFFF80) != 0) {
65                  out.writeByte(b & 0x7f | 0x80);
66              } else {
67                  out.writeByte(b);
68                  break;
69              }
70          }
71  
72          int inIndex = in.readerIndex();
73          final int baseIndex = inIndex;
74  
75          final short[] table = getHashTable(length);
76          final int shift = Integer.numberOfLeadingZeros(table.length) + 1;
77  
78          int nextEmit = inIndex;
79  
80          if (length - inIndex >= MIN_COMPRESSIBLE_BYTES) {
81              int nextHash = hash(in, ++inIndex, shift);
82              outer: while (true) {
83                  int skip = 32;
84  
85                  int candidate;
86                  int nextIndex = inIndex;
87                  do {
88                      inIndex = nextIndex;
89                      int hash = nextHash;
90                      int bytesBetweenHashLookups = skip++ >> 5;
91                      nextIndex = inIndex + bytesBetweenHashLookups;
92  
93                      // We need at least 4 remaining bytes to read the hash
94                      if (nextIndex > length - 4) {
95                          break outer;
96                      }
97  
98                      nextHash = hash(in, nextIndex, shift);
99  
100                     candidate = baseIndex + table[hash];
101 
102                     table[hash] = (short) (inIndex - baseIndex);
103                 }
104                 while (in.getInt(inIndex) != in.getInt(candidate));
105 
106                 encodeLiteral(in, out, inIndex - nextEmit);
107 
108                 int insertTail;
109                 do {
110                     int base = inIndex;
111                     int matched = 4 + findMatchingLength(in, candidate + 4, inIndex + 4, length);
112                     inIndex += matched;
113                     int offset = base - candidate;
114                     encodeCopy(out, offset, matched);
115                     in.readerIndex(in.readerIndex() + matched);
116                     insertTail = inIndex - 1;
117                     nextEmit = inIndex;
118                     if (inIndex >= length - 4) {
119                         break outer;
120                     }
121 
122                     int prevHash = hash(in, insertTail, shift);
123                     table[prevHash] = (short) (inIndex - baseIndex - 1);
124                     int currentHash = hash(in, insertTail + 1, shift);
125                     candidate = baseIndex + table[currentHash];
126                     table[currentHash] = (short) (inIndex - baseIndex);
127                 }
128                 while (in.getInt(insertTail + 1) == in.getInt(candidate));
129 
130                 nextHash = hash(in, insertTail + 2, shift);
131                 ++inIndex;
132             }
133         }
134 
135         // If there are any remaining characters, write them out as a literal
136         if (nextEmit < length) {
137             encodeLiteral(in, out, length - nextEmit);
138         }
139     }
140 
141     /**
142      * Hashes the 4 bytes located at index, shifting the resulting hash into
143      * the appropriate range for our hash table.
144      *
145      * @param in The input buffer to read 4 bytes from
146      * @param index The index to read at
147      * @param shift The shift value, for ensuring that the resulting value is
148      *     withing the range of our hash table size
149      * @return A 32-bit hash of 4 bytes located at index
150      */
151     private static int hash(ByteBuf in, int index, int shift) {
152         return in.getInt(index) * 0x1e35a7bd >>> shift;
153     }
154 
155     /**
156      * Creates an appropriately sized hashtable for the given input size
157      *
158      * @param inputSize The size of our input, ie. the number of bytes we need to encode
159      * @return An appropriately sized empty hashtable
160      */
161     private static short[] getHashTable(int inputSize) {
162         int htSize = 256;
163         while (htSize < MAX_HT_SIZE && htSize < inputSize) {
164             htSize <<= 1;
165         }
166         return new short[htSize];
167     }
168 
169     /**
170      * Iterates over the supplied input buffer between the supplied minIndex and
171      * maxIndex to find how long our matched copy overlaps with an already-written
172      * literal value.
173      *
174      * @param in The input buffer to scan over
175      * @param minIndex The index in the input buffer to start scanning from
176      * @param inIndex The index of the start of our copy
177      * @param maxIndex The length of our input buffer
178      * @return The number of bytes for which our candidate copy is a repeat of
179      */
180     private static int findMatchingLength(ByteBuf in, int minIndex, int inIndex, int maxIndex) {
181         int matched = 0;
182 
183         while (inIndex <= maxIndex - 4 &&
184                 in.getInt(inIndex) == in.getInt(minIndex + matched)) {
185             inIndex += 4;
186             matched += 4;
187         }
188 
189         while (inIndex < maxIndex && in.getByte(minIndex + matched) == in.getByte(inIndex)) {
190             ++inIndex;
191             ++matched;
192         }
193 
194         return matched;
195     }
196 
197     /**
198      * Calculates the minimum number of bits required to encode a value.  This can
199      * then in turn be used to calculate the number of septets or octets (as
200      * appropriate) to use to encode a length parameter.
201      *
202      * @param value The value to calculate the minimum number of bits required to encode
203      * @return The minimum number of bits required to encode the supplied value
204      */
205     private static int bitsToEncode(int value) {
206         int highestOneBit = Integer.highestOneBit(value);
207         int bitLength = 0;
208         while ((highestOneBit >>= 1) != 0) {
209             bitLength++;
210         }
211 
212         return bitLength;
213     }
214 
215     /**
216      * Writes a literal to the supplied output buffer by directly copying from
217      * the input buffer.  The literal is taken from the current readerIndex
218      * up to the supplied length.
219      *
220      * @param in The input buffer to copy from
221      * @param out The output buffer to copy to
222      * @param length The length of the literal to copy
223      */
224     static void encodeLiteral(ByteBuf in, ByteBuf out, int length) {
225         if (length < 61) {
226             out.writeByte(length - 1 << 2);
227         } else {
228             int bitLength = bitsToEncode(length - 1);
229             int bytesToEncode = 1 + bitLength / 8;
230             out.writeByte(59 + bytesToEncode << 2);
231             for (int i = 0; i < bytesToEncode; i++) {
232                 out.writeByte(length - 1 >> i * 8 & 0x0ff);
233             }
234         }
235 
236         out.writeBytes(in, length);
237     }
238 
239     private static void encodeCopyWithOffset(ByteBuf out, int offset, int length) {
240         if (length < 12 && offset < 2048) {
241             out.writeByte(COPY_1_BYTE_OFFSET | length - 4 << 2 | offset >> 8 << 5);
242             out.writeByte(offset & 0x0ff);
243         } else {
244             out.writeByte(COPY_2_BYTE_OFFSET | length - 1 << 2);
245             out.writeByte(offset & 0x0ff);
246             out.writeByte(offset >> 8 & 0x0ff);
247         }
248     }
249 
250     /**
251      * Encodes a series of copies, each at most 64 bytes in length.
252      *
253      * @param out The output buffer to write the copy pointer to
254      * @param offset The offset at which the original instance lies
255      * @param length The length of the original instance
256      */
257     private static void encodeCopy(ByteBuf out, int offset, int length) {
258         while (length >= 68) {
259             encodeCopyWithOffset(out, offset, 64);
260             length -= 64;
261         }
262 
263         if (length > 64) {
264             encodeCopyWithOffset(out, offset, 60);
265             length -= 60;
266         }
267 
268         encodeCopyWithOffset(out, offset, length);
269     }
270 
271     public void decode(ByteBuf in, ByteBuf out) {
272         while (in.isReadable()) {
273             switch (state) {
274             case READY:
275                 state = State.READING_PREAMBLE;
276                 // fall through
277             case READING_PREAMBLE:
278                 int uncompressedLength = readPreamble(in);
279                 if (uncompressedLength == PREAMBLE_NOT_FULL) {
280                     // We've not yet read all of the preamble, so wait until we can
281                     return;
282                 }
283                 if (uncompressedLength == 0) {
284                     // Should never happen, but it does mean we have nothing further to do
285                     state = State.READY;
286                     return;
287                 }
288                 out.ensureWritable(uncompressedLength);
289                 state = State.READING_TAG;
290                 // fall through
291             case READING_TAG:
292                 if (!in.isReadable()) {
293                     return;
294                 }
295                 tag = in.readByte();
296                 switch (tag & 0x03) {
297                 case LITERAL:
298                     state = State.READING_LITERAL;
299                     break;
300                 case COPY_1_BYTE_OFFSET:
301                 case COPY_2_BYTE_OFFSET:
302                 case COPY_4_BYTE_OFFSET:
303                     state = State.READING_COPY;
304                     break;
305                 }
306                 break;
307             case READING_LITERAL:
308                 int literalWritten = decodeLiteral(tag, in, out);
309                 if (literalWritten != NOT_ENOUGH_INPUT) {
310                     state = State.READING_TAG;
311                     written += literalWritten;
312                 } else {
313                     // Need to wait for more data
314                     return;
315                 }
316                 break;
317             case READING_COPY:
318                 int decodeWritten;
319                 switch (tag & 0x03) {
320                 case COPY_1_BYTE_OFFSET:
321                     decodeWritten = decodeCopyWith1ByteOffset(tag, in, out, written);
322                     if (decodeWritten != NOT_ENOUGH_INPUT) {
323                         state = State.READING_TAG;
324                         written += decodeWritten;
325                     } else {
326                         // Need to wait for more data
327                         return;
328                     }
329                     break;
330                 case COPY_2_BYTE_OFFSET:
331                     decodeWritten = decodeCopyWith2ByteOffset(tag, in, out, written);
332                     if (decodeWritten != NOT_ENOUGH_INPUT) {
333                         state = State.READING_TAG;
334                         written += decodeWritten;
335                     } else {
336                         // Need to wait for more data
337                         return;
338                     }
339                     break;
340                 case COPY_4_BYTE_OFFSET:
341                     decodeWritten = decodeCopyWith4ByteOffset(tag, in, out, written);
342                     if (decodeWritten != NOT_ENOUGH_INPUT) {
343                         state = State.READING_TAG;
344                         written += decodeWritten;
345                     } else {
346                         // Need to wait for more data
347                         return;
348                     }
349                     break;
350                 }
351             }
352         }
353     }
354 
355     /**
356      * Reads the length varint (a series of bytes, where the lower 7 bits
357      * are data and the upper bit is a flag to indicate more bytes to be
358      * read).
359      *
360      * @param in The input buffer to read the preamble from
361      * @return The calculated length based on the input buffer, or 0 if
362      *   no preamble is able to be calculated
363      */
364     private static int readPreamble(ByteBuf in) {
365         int length = 0;
366         int byteIndex = 0;
367         while (in.isReadable()) {
368             int current = in.readUnsignedByte();
369             length |= (current & 0x7f) << byteIndex++ * 7;
370             if ((current & 0x80) == 0) {
371                 return length;
372             }
373 
374             if (byteIndex >= 4) {
375                 throw new DecompressionException("Preamble is greater than 4 bytes");
376             }
377         }
378 
379         return 0;
380     }
381 
382     /**
383      * Reads a literal from the input buffer directly to the output buffer.
384      * A "literal" is an uncompressed segment of data stored directly in the
385      * byte stream.
386      *
387      * @param tag The tag that identified this segment as a literal is also
388      *            used to encode part of the length of the data
389      * @param in The input buffer to read the literal from
390      * @param out The output buffer to write the literal to
391      * @return The number of bytes appended to the output buffer, or -1 to indicate "try again later"
392      */
393     static int decodeLiteral(byte tag, ByteBuf in, ByteBuf out) {
394         in.markReaderIndex();
395         int length;
396         switch(tag >> 2 & 0x3F) {
397         case 60:
398             if (!in.isReadable()) {
399                 return NOT_ENOUGH_INPUT;
400             }
401             length = in.readUnsignedByte();
402             break;
403         case 61:
404             if (in.readableBytes() < 2) {
405                 return NOT_ENOUGH_INPUT;
406             }
407             length = ByteBufUtil.swapShort(in.readShort());
408             break;
409         case 62:
410             if (in.readableBytes() < 3) {
411                 return NOT_ENOUGH_INPUT;
412             }
413             length = ByteBufUtil.swapMedium(in.readUnsignedMedium());
414             break;
415         case 63:
416             if (in.readableBytes() < 4) {
417                 return NOT_ENOUGH_INPUT;
418             }
419             length = ByteBufUtil.swapInt(in.readInt());
420             break;
421         default:
422             length = tag >> 2 & 0x3F;
423         }
424         length += 1;
425 
426         if (in.readableBytes() < length) {
427             in.resetReaderIndex();
428             return NOT_ENOUGH_INPUT;
429         }
430 
431         out.writeBytes(in, length);
432         return length;
433     }
434 
435     /**
436      * Reads a compressed reference offset and length from the supplied input
437      * buffer, seeks back to the appropriate place in the input buffer and
438      * writes the found data to the supplied output stream.
439      *
440      * @param tag The tag used to identify this as a copy is also used to encode
441      *     the length and part of the offset
442      * @param in The input buffer to read from
443      * @param out The output buffer to write to
444      * @return The number of bytes appended to the output buffer, or -1 to indicate
445      *     "try again later"
446      * @throws DecompressionException If the read offset is invalid
447      */
448     private static int decodeCopyWith1ByteOffset(byte tag, ByteBuf in, ByteBuf out, int writtenSoFar) {
449         if (!in.isReadable()) {
450             return NOT_ENOUGH_INPUT;
451         }
452 
453         int initialIndex = out.writerIndex();
454         int length = 4 + ((tag & 0x01c) >> 2);
455         int offset = (tag & 0x0e0) << 8 >> 5 | in.readUnsignedByte();
456 
457         validateOffset(offset, writtenSoFar);
458 
459         out.markReaderIndex();
460         if (offset < length) {
461             int copies = length / offset;
462             for (; copies > 0; copies--) {
463                 out.readerIndex(initialIndex - offset);
464                 out.readBytes(out, offset);
465             }
466             if (length % offset != 0) {
467                 out.readerIndex(initialIndex - offset);
468                 out.readBytes(out, length % offset);
469             }
470         } else {
471             out.readerIndex(initialIndex - offset);
472             out.readBytes(out, length);
473         }
474         out.resetReaderIndex();
475 
476         return length;
477     }
478 
479     /**
480      * Reads a compressed reference offset and length from the supplied input
481      * buffer, seeks back to the appropriate place in the input buffer and
482      * writes the found data to the supplied output stream.
483      *
484      * @param tag The tag used to identify this as a copy is also used to encode
485      *     the length and part of the offset
486      * @param in The input buffer to read from
487      * @param out The output buffer to write to
488      * @throws DecompressionException If the read offset is invalid
489      * @return The number of bytes appended to the output buffer, or -1 to indicate
490      *     "try again later"
491      */
492     private static int decodeCopyWith2ByteOffset(byte tag, ByteBuf in, ByteBuf out, int writtenSoFar) {
493         if (in.readableBytes() < 2) {
494             return NOT_ENOUGH_INPUT;
495         }
496 
497         int initialIndex = out.writerIndex();
498         int length = 1 + (tag >> 2 & 0x03f);
499         int offset = ByteBufUtil.swapShort(in.readShort());
500 
501         validateOffset(offset, writtenSoFar);
502 
503         out.markReaderIndex();
504         if (offset < length) {
505             int copies = length / offset;
506             for (; copies > 0; copies--) {
507                 out.readerIndex(initialIndex - offset);
508                 out.readBytes(out, offset);
509             }
510             if (length % offset != 0) {
511                 out.readerIndex(initialIndex - offset);
512                 out.readBytes(out, length % offset);
513             }
514         } else {
515             out.readerIndex(initialIndex - offset);
516             out.readBytes(out, length);
517         }
518         out.resetReaderIndex();
519 
520         return length;
521     }
522 
523     /**
524      * Reads a compressed reference offset and length from the supplied input
525      * buffer, seeks back to the appropriate place in the input buffer and
526      * writes the found data to the supplied output stream.
527      *
528      * @param tag The tag used to identify this as a copy is also used to encode
529      *     the length and part of the offset
530      * @param in The input buffer to read from
531      * @param out The output buffer to write to
532      * @return The number of bytes appended to the output buffer, or -1 to indicate
533      *     "try again later"
534      * @throws DecompressionException If the read offset is invalid
535      */
536     private static int decodeCopyWith4ByteOffset(byte tag, ByteBuf in, ByteBuf out, int writtenSoFar) {
537         if (in.readableBytes() < 4) {
538             return NOT_ENOUGH_INPUT;
539         }
540 
541         int initialIndex = out.writerIndex();
542         int length = 1 + (tag >> 2 & 0x03F);
543         int offset = ByteBufUtil.swapInt(in.readInt());
544 
545         validateOffset(offset, writtenSoFar);
546 
547         out.markReaderIndex();
548         if (offset < length) {
549             int copies = length / offset;
550             for (; copies > 0; copies--) {
551                 out.readerIndex(initialIndex - offset);
552                 out.readBytes(out, offset);
553             }
554             if (length % offset != 0) {
555                 out.readerIndex(initialIndex - offset);
556                 out.readBytes(out, length % offset);
557             }
558         } else {
559             out.readerIndex(initialIndex - offset);
560             out.readBytes(out, length);
561         }
562         out.resetReaderIndex();
563 
564         return length;
565     }
566 
567     /**
568      * Validates that the offset extracted from a compressed reference is within
569      * the permissible bounds of an offset (4 <= offset <= 32768), and does not
570      * exceed the length of the chunk currently read so far.
571      *
572      * @param offset The offset extracted from the compressed reference
573      * @param chunkSizeSoFar The number of bytes read so far from this chunk
574      * @throws DecompressionException if the offset is invalid
575      */
576     private static void validateOffset(int offset, int chunkSizeSoFar) {
577         if (offset > Short.MAX_VALUE) {
578             throw new DecompressionException("Offset exceeds maximum permissible value");
579         }
580 
581         if (offset <= 0) {
582             throw new DecompressionException("Offset is less than minimum permissible value");
583         }
584 
585         if (offset > chunkSizeSoFar) {
586             throw new DecompressionException("Offset exceeds size of chunk");
587         }
588     }
589 
590     /**
591      * Computes the CRC32C checksum of the supplied data and performs the "mask" operation
592      * on the computed checksum
593      *
594      * @param data The input data to calculate the CRC32C checksum of
595      */
596     public static int calculateChecksum(ByteBuf data) {
597         return calculateChecksum(data, data.readerIndex(), data.readableBytes());
598     }
599 
600     /**
601      * Computes the CRC32C checksum of the supplied data and performs the "mask" operation
602      * on the computed checksum
603      *
604      * @param data The input data to calculate the CRC32C checksum of
605      */
606     public static int calculateChecksum(ByteBuf data, int offset, int length) {
607         Crc32c crc32 = new Crc32c();
608         try {
609             if (data.hasArray()) {
610                 crc32.update(data.array(), data.arrayOffset() + offset, length);
611             } else {
612                 byte[] array = new byte[length];
613                 data.getBytes(offset, array);
614                 crc32.update(array, 0, length);
615             }
616 
617             return maskChecksum((int) crc32.getValue());
618         } finally {
619             crc32.reset();
620         }
621     }
622 
623     /**
624      * Computes the CRC32C checksum of the supplied data, performs the "mask" operation
625      * on the computed checksum, and then compares the resulting masked checksum to the
626      * supplied checksum.
627      *
628      * @param expectedChecksum The checksum decoded from the stream to compare against
629      * @param data The input data to calculate the CRC32C checksum of
630      * @throws DecompressionException If the calculated and supplied checksums do not match
631      */
632     static void validateChecksum(int expectedChecksum, ByteBuf data) {
633         validateChecksum(expectedChecksum, data, data.readerIndex(), data.readableBytes());
634     }
635 
636     /**
637      * Computes the CRC32C checksum of the supplied data, performs the "mask" operation
638      * on the computed checksum, and then compares the resulting masked checksum to the
639      * supplied checksum.
640      *
641      * @param expectedChecksum The checksum decoded from the stream to compare against
642      * @param data The input data to calculate the CRC32C checksum of
643      * @throws DecompressionException If the calculated and supplied checksums do not match
644      */
645     static void validateChecksum(int expectedChecksum, ByteBuf data, int offset, int length) {
646         final int actualChecksum = calculateChecksum(data, offset, length);
647         if (actualChecksum != expectedChecksum) {
648             throw new DecompressionException(
649                     "mismatching checksum: " + Integer.toHexString(actualChecksum) +
650                             " (expected: " + Integer.toHexString(expectedChecksum) + ')');
651         }
652     }
653 
654     /**
655      * From the spec:
656      *
657      * "Checksums are not stored directly, but masked, as checksumming data and
658      * then its own checksum can be problematic. The masking is the same as used
659      * in Apache Hadoop: Rotate the checksum by 15 bits, then add the constant
660      * 0xa282ead8 (using wraparound as normal for unsigned integers)."
661      *
662      * @param checksum The actual checksum of the data
663      * @return The masked checksum
664      */
665     static int maskChecksum(int checksum) {
666         return (checksum >> 15 | checksum << 17) + 0xa282ead8;
667     }
668 }