View Javadoc
1   /*
2    * Copyright 2012 The Netty Project
3    *
4    * The Netty Project licenses this file to you under the Apache License,
5    * version 2.0 (the "License"); you may not use this file except in compliance
6    * with the License. You may obtain a copy of the License at:
7    *
8    *   https://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13   * License for the specific language governing permissions and limitations
14   * under the License.
15   */
16  package io.netty.handler.codec.compression;
17  
18  import io.netty.buffer.ByteBuf;
19  
20  /**
21   * Uncompresses an input {@link ByteBuf} encoded with Snappy compression into an
22   * output {@link ByteBuf}.
23   *
24   * See <a href="https://github.com/google/snappy/blob/master/format_description.txt">snappy format</a>.
25   */
26  public final class Snappy {
27  
28      private static final int MAX_HT_SIZE = 1 << 14;
29      private static final int MIN_COMPRESSIBLE_BYTES = 15;
30  
31      // used as a return value to indicate that we haven't yet read our full preamble
32      private static final int PREAMBLE_NOT_FULL = -1;
33      private static final int NOT_ENOUGH_INPUT = -1;
34  
35      // constants for the tag types
36      private static final int LITERAL = 0;
37      private static final int COPY_1_BYTE_OFFSET = 1;
38      private static final int COPY_2_BYTE_OFFSET = 2;
39      private static final int COPY_4_BYTE_OFFSET = 3;
40  
41      private State state = State.READING_PREAMBLE;
42      private byte tag;
43      private int written;
44  
45      private enum State {
46          READING_PREAMBLE,
47          READING_TAG,
48          READING_LITERAL,
49          READING_COPY
50      }
51  
52      public void reset() {
53          state = State.READING_PREAMBLE;
54          tag = 0;
55          written = 0;
56      }
57  
58      public void encode(final ByteBuf in, final ByteBuf out, final int length) {
59          // Write the preamble length to the output buffer
60          for (int i = 0;; i ++) {
61              int b = length >>> i * 7;
62              if ((b & 0xFFFFFF80) != 0) {
63                  out.writeByte(b & 0x7f | 0x80);
64              } else {
65                  out.writeByte(b);
66                  break;
67              }
68          }
69  
70          int inIndex = in.readerIndex();
71          final int baseIndex = inIndex;
72  
73          final short[] table = getHashTable(length);
74          final int shift = Integer.numberOfLeadingZeros(table.length) + 1;
75  
76          int nextEmit = inIndex;
77  
78          if (length - inIndex >= MIN_COMPRESSIBLE_BYTES) {
79              int nextHash = hash(in, ++inIndex, shift);
80              outer: while (true) {
81                  int skip = 32;
82  
83                  int candidate;
84                  int nextIndex = inIndex;
85                  do {
86                      inIndex = nextIndex;
87                      int hash = nextHash;
88                      int bytesBetweenHashLookups = skip++ >> 5;
89                      nextIndex = inIndex + bytesBetweenHashLookups;
90  
91                      // We need at least 4 remaining bytes to read the hash
92                      if (nextIndex > length - 4) {
93                          break outer;
94                      }
95  
96                      nextHash = hash(in, nextIndex, shift);
97  
98                      candidate = baseIndex + table[hash];
99  
100                     table[hash] = (short) (inIndex - baseIndex);
101                 }
102                 while (in.getInt(inIndex) != in.getInt(candidate));
103 
104                 encodeLiteral(in, out, inIndex - nextEmit);
105 
106                 int insertTail;
107                 do {
108                     int base = inIndex;
109                     int matched = 4 + findMatchingLength(in, candidate + 4, inIndex + 4, length);
110                     inIndex += matched;
111                     int offset = base - candidate;
112                     encodeCopy(out, offset, matched);
113                     in.readerIndex(in.readerIndex() + matched);
114                     insertTail = inIndex - 1;
115                     nextEmit = inIndex;
116                     if (inIndex >= length - 4) {
117                         break outer;
118                     }
119 
120                     int prevHash = hash(in, insertTail, shift);
121                     table[prevHash] = (short) (inIndex - baseIndex - 1);
122                     int currentHash = hash(in, insertTail + 1, shift);
123                     candidate = baseIndex + table[currentHash];
124                     table[currentHash] = (short) (inIndex - baseIndex);
125                 }
126                 while (in.getInt(insertTail + 1) == in.getInt(candidate));
127 
128                 nextHash = hash(in, insertTail + 2, shift);
129                 ++inIndex;
130             }
131         }
132 
133         // If there are any remaining characters, write them out as a literal
134         if (nextEmit < length) {
135             encodeLiteral(in, out, length - nextEmit);
136         }
137     }
138 
139     /**
140      * Hashes the 4 bytes located at index, shifting the resulting hash into
141      * the appropriate range for our hash table.
142      *
143      * @param in The input buffer to read 4 bytes from
144      * @param index The index to read at
145      * @param shift The shift value, for ensuring that the resulting value is
146      *     within the range of our hash table size
147      * @return A 32-bit hash of 4 bytes located at index
148      */
149     private static int hash(ByteBuf in, int index, int shift) {
150         return in.getInt(index) * 0x1e35a7bd >>> shift;
151     }
152 
153     /**
154      * Creates an appropriately sized hashtable for the given input size
155      *
156      * @param inputSize The size of our input, ie. the number of bytes we need to encode
157      * @return An appropriately sized empty hashtable
158      */
159     private static short[] getHashTable(int inputSize) {
160         int htSize = 256;
161         while (htSize < MAX_HT_SIZE && htSize < inputSize) {
162             htSize <<= 1;
163         }
164         return new short[htSize];
165     }
166 
167     /**
168      * Iterates over the supplied input buffer between the supplied minIndex and
169      * maxIndex to find how long our matched copy overlaps with an already-written
170      * literal value.
171      *
172      * @param in The input buffer to scan over
173      * @param minIndex The index in the input buffer to start scanning from
174      * @param inIndex The index of the start of our copy
175      * @param maxIndex The length of our input buffer
176      * @return The number of bytes for which our candidate copy is a repeat of
177      */
178     private static int findMatchingLength(ByteBuf in, int minIndex, int inIndex, int maxIndex) {
179         int matched = 0;
180 
181         while (inIndex <= maxIndex - 4 &&
182                 in.getInt(inIndex) == in.getInt(minIndex + matched)) {
183             inIndex += 4;
184             matched += 4;
185         }
186 
187         while (inIndex < maxIndex && in.getByte(minIndex + matched) == in.getByte(inIndex)) {
188             ++inIndex;
189             ++matched;
190         }
191 
192         return matched;
193     }
194 
195     /**
196      * Calculates the minimum number of bits required to encode a value.  This can
197      * then in turn be used to calculate the number of septets or octets (as
198      * appropriate) to use to encode a length parameter.
199      *
200      * @param value The value to calculate the minimum number of bits required to encode
201      * @return The minimum number of bits required to encode the supplied value
202      */
203     private static int bitsToEncode(int value) {
204         int highestOneBit = Integer.highestOneBit(value);
205         int bitLength = 0;
206         while ((highestOneBit >>= 1) != 0) {
207             bitLength++;
208         }
209 
210         return bitLength;
211     }
212 
213     /**
214      * Writes a literal to the supplied output buffer by directly copying from
215      * the input buffer.  The literal is taken from the current readerIndex
216      * up to the supplied length.
217      *
218      * @param in The input buffer to copy from
219      * @param out The output buffer to copy to
220      * @param length The length of the literal to copy
221      */
222     static void encodeLiteral(ByteBuf in, ByteBuf out, int length) {
223         if (length < 61) {
224             out.writeByte(length - 1 << 2);
225         } else {
226             int bitLength = bitsToEncode(length - 1);
227             int bytesToEncode = 1 + bitLength / 8;
228             out.writeByte(59 + bytesToEncode << 2);
229             for (int i = 0; i < bytesToEncode; i++) {
230                 out.writeByte(length - 1 >> i * 8 & 0x0ff);
231             }
232         }
233 
234         out.writeBytes(in, length);
235     }
236 
237     private static void encodeCopyWithOffset(ByteBuf out, int offset, int length) {
238         if (length < 12 && offset < 2048) {
239             out.writeByte(COPY_1_BYTE_OFFSET | length - 4 << 2 | offset >> 8 << 5);
240             out.writeByte(offset & 0x0ff);
241         } else {
242             out.writeByte(COPY_2_BYTE_OFFSET | length - 1 << 2);
243             out.writeByte(offset & 0x0ff);
244             out.writeByte(offset >> 8 & 0x0ff);
245         }
246     }
247 
248     /**
249      * Encodes a series of copies, each at most 64 bytes in length.
250      *
251      * @param out The output buffer to write the copy pointer to
252      * @param offset The offset at which the original instance lies
253      * @param length The length of the original instance
254      */
255     private static void encodeCopy(ByteBuf out, int offset, int length) {
256         while (length >= 68) {
257             encodeCopyWithOffset(out, offset, 64);
258             length -= 64;
259         }
260 
261         if (length > 64) {
262             encodeCopyWithOffset(out, offset, 60);
263             length -= 60;
264         }
265 
266         encodeCopyWithOffset(out, offset, length);
267     }
268 
269     public void decode(ByteBuf in, ByteBuf out) {
270         while (in.isReadable()) {
271             switch (state) {
272             case READING_PREAMBLE:
273                 int uncompressedLength = readPreamble(in);
274                 if (uncompressedLength == PREAMBLE_NOT_FULL) {
275                     // We've not yet read all of the preamble, so wait until we can
276                     return;
277                 }
278                 if (uncompressedLength == 0) {
279                     // Should never happen, but it does mean we have nothing further to do
280                     return;
281                 }
282                 out.ensureWritable(uncompressedLength);
283                 state = State.READING_TAG;
284                 // fall through
285             case READING_TAG:
286                 if (!in.isReadable()) {
287                     return;
288                 }
289                 tag = in.readByte();
290                 switch (tag & 0x03) {
291                 case LITERAL:
292                     state = State.READING_LITERAL;
293                     break;
294                 case COPY_1_BYTE_OFFSET:
295                 case COPY_2_BYTE_OFFSET:
296                 case COPY_4_BYTE_OFFSET:
297                     state = State.READING_COPY;
298                     break;
299                 }
300                 break;
301             case READING_LITERAL:
302                 int literalWritten = decodeLiteral(tag, in, out);
303                 if (literalWritten != NOT_ENOUGH_INPUT) {
304                     state = State.READING_TAG;
305                     written += literalWritten;
306                 } else {
307                     // Need to wait for more data
308                     return;
309                 }
310                 break;
311             case READING_COPY:
312                 int decodeWritten;
313                 switch (tag & 0x03) {
314                 case COPY_1_BYTE_OFFSET:
315                     decodeWritten = decodeCopyWith1ByteOffset(tag, in, out, written);
316                     if (decodeWritten != NOT_ENOUGH_INPUT) {
317                         state = State.READING_TAG;
318                         written += decodeWritten;
319                     } else {
320                         // Need to wait for more data
321                         return;
322                     }
323                     break;
324                 case COPY_2_BYTE_OFFSET:
325                     decodeWritten = decodeCopyWith2ByteOffset(tag, in, out, written);
326                     if (decodeWritten != NOT_ENOUGH_INPUT) {
327                         state = State.READING_TAG;
328                         written += decodeWritten;
329                     } else {
330                         // Need to wait for more data
331                         return;
332                     }
333                     break;
334                 case COPY_4_BYTE_OFFSET:
335                     decodeWritten = decodeCopyWith4ByteOffset(tag, in, out, written);
336                     if (decodeWritten != NOT_ENOUGH_INPUT) {
337                         state = State.READING_TAG;
338                         written += decodeWritten;
339                     } else {
340                         // Need to wait for more data
341                         return;
342                     }
343                     break;
344                 }
345             }
346         }
347     }
348 
349     /**
350      * Reads the length varint (a series of bytes, where the lower 7 bits
351      * are data and the upper bit is a flag to indicate more bytes to be
352      * read).
353      *
354      * @param in The input buffer to read the preamble from
355      * @return The calculated length based on the input buffer, or 0 if
356      *   no preamble is able to be calculated
357      */
358     private static int readPreamble(ByteBuf in) {
359         int length = 0;
360         int byteIndex = 0;
361         while (in.isReadable()) {
362             int current = in.readUnsignedByte();
363             length |= (current & 0x7f) << byteIndex++ * 7;
364             if ((current & 0x80) == 0) {
365                 return length;
366             }
367 
368             if (byteIndex >= 4) {
369                 throw new DecompressionException("Preamble is greater than 4 bytes");
370             }
371         }
372 
373         return 0;
374     }
375 
376     /**
377      * Get the length varint (a series of bytes, where the lower 7 bits
378      * are data and the upper bit is a flag to indicate more bytes to be
379      * read).
380      *
381      * @param in The input buffer to get the preamble from
382      * @return The calculated length based on the input buffer, or 0 if
383      *   no preamble is able to be calculated
384      */
385     int getPreamble(ByteBuf in) {
386         if (state == State.READING_PREAMBLE) {
387             int readerIndex = in.readerIndex();
388             try {
389                 return readPreamble(in);
390             } finally {
391                 in.readerIndex(readerIndex);
392             }
393         }
394         return 0;
395     }
396 
397     /**
398      * Reads a literal from the input buffer directly to the output buffer.
399      * A "literal" is an uncompressed segment of data stored directly in the
400      * byte stream.
401      *
402      * @param tag The tag that identified this segment as a literal is also
403      *            used to encode part of the length of the data
404      * @param in The input buffer to read the literal from
405      * @param out The output buffer to write the literal to
406      * @return The number of bytes appended to the output buffer, or -1 to indicate "try again later"
407      */
408     static int decodeLiteral(byte tag, ByteBuf in, ByteBuf out) {
409         in.markReaderIndex();
410         int length;
411         switch(tag >> 2 & 0x3F) {
412         case 60:
413             if (!in.isReadable()) {
414                 return NOT_ENOUGH_INPUT;
415             }
416             length = in.readUnsignedByte();
417             break;
418         case 61:
419             if (in.readableBytes() < 2) {
420                 return NOT_ENOUGH_INPUT;
421             }
422             length = in.readUnsignedShortLE();
423             break;
424         case 62:
425             if (in.readableBytes() < 3) {
426                 return NOT_ENOUGH_INPUT;
427             }
428             length = in.readUnsignedMediumLE();
429             break;
430         case 63:
431             if (in.readableBytes() < 4) {
432                 return NOT_ENOUGH_INPUT;
433             }
434             length = in.readIntLE();
435             break;
436         default:
437             length = tag >> 2 & 0x3F;
438         }
439         length += 1;
440 
441         if (in.readableBytes() < length) {
442             in.resetReaderIndex();
443             return NOT_ENOUGH_INPUT;
444         }
445 
446         out.writeBytes(in, length);
447         return length;
448     }
449 
450     /**
451      * Reads a compressed reference offset and length from the supplied input
452      * buffer, seeks back to the appropriate place in the input buffer and
453      * writes the found data to the supplied output stream.
454      *
455      * @param tag The tag used to identify this as a copy is also used to encode
456      *     the length and part of the offset
457      * @param in The input buffer to read from
458      * @param out The output buffer to write to
459      * @return The number of bytes appended to the output buffer, or -1 to indicate
460      *     "try again later"
461      * @throws DecompressionException If the read offset is invalid
462      */
463     private static int decodeCopyWith1ByteOffset(byte tag, ByteBuf in, ByteBuf out, int writtenSoFar) {
464         if (!in.isReadable()) {
465             return NOT_ENOUGH_INPUT;
466         }
467 
468         int initialIndex = out.writerIndex();
469         int length = 4 + ((tag & 0x01c) >> 2);
470         int offset = (tag & 0x0e0) << 8 >> 5 | in.readUnsignedByte();
471 
472         validateOffset(offset, writtenSoFar);
473 
474         out.markReaderIndex();
475         if (offset < length) {
476             int copies = length / offset;
477             for (; copies > 0; copies--) {
478                 out.readerIndex(initialIndex - offset);
479                 out.readBytes(out, offset);
480             }
481             if (length % offset != 0) {
482                 out.readerIndex(initialIndex - offset);
483                 out.readBytes(out, length % offset);
484             }
485         } else {
486             out.readerIndex(initialIndex - offset);
487             out.readBytes(out, length);
488         }
489         out.resetReaderIndex();
490 
491         return length;
492     }
493 
494     /**
495      * Reads a compressed reference offset and length from the supplied input
496      * buffer, seeks back to the appropriate place in the input buffer and
497      * writes the found data to the supplied output stream.
498      *
499      * @param tag The tag used to identify this as a copy is also used to encode
500      *     the length and part of the offset
501      * @param in The input buffer to read from
502      * @param out The output buffer to write to
503      * @throws DecompressionException If the read offset is invalid
504      * @return The number of bytes appended to the output buffer, or -1 to indicate
505      *     "try again later"
506      */
507     private static int decodeCopyWith2ByteOffset(byte tag, ByteBuf in, ByteBuf out, int writtenSoFar) {
508         if (in.readableBytes() < 2) {
509             return NOT_ENOUGH_INPUT;
510         }
511 
512         int initialIndex = out.writerIndex();
513         int length = 1 + (tag >> 2 & 0x03f);
514         int offset = in.readUnsignedShortLE();
515 
516         validateOffset(offset, writtenSoFar);
517 
518         out.markReaderIndex();
519         if (offset < length) {
520             int copies = length / offset;
521             for (; copies > 0; copies--) {
522                 out.readerIndex(initialIndex - offset);
523                 out.readBytes(out, offset);
524             }
525             if (length % offset != 0) {
526                 out.readerIndex(initialIndex - offset);
527                 out.readBytes(out, length % offset);
528             }
529         } else {
530             out.readerIndex(initialIndex - offset);
531             out.readBytes(out, length);
532         }
533         out.resetReaderIndex();
534 
535         return length;
536     }
537 
538     /**
539      * Reads a compressed reference offset and length from the supplied input
540      * buffer, seeks back to the appropriate place in the input buffer and
541      * writes the found data to the supplied output stream.
542      *
543      * @param tag The tag used to identify this as a copy is also used to encode
544      *     the length and part of the offset
545      * @param in The input buffer to read from
546      * @param out The output buffer to write to
547      * @return The number of bytes appended to the output buffer, or -1 to indicate
548      *     "try again later"
549      * @throws DecompressionException If the read offset is invalid
550      */
551     private static int decodeCopyWith4ByteOffset(byte tag, ByteBuf in, ByteBuf out, int writtenSoFar) {
552         if (in.readableBytes() < 4) {
553             return NOT_ENOUGH_INPUT;
554         }
555 
556         int initialIndex = out.writerIndex();
557         int length = 1 + (tag >> 2 & 0x03F);
558         int offset = in.readIntLE();
559 
560         validateOffset(offset, writtenSoFar);
561 
562         out.markReaderIndex();
563         if (offset < length) {
564             int copies = length / offset;
565             for (; copies > 0; copies--) {
566                 out.readerIndex(initialIndex - offset);
567                 out.readBytes(out, offset);
568             }
569             if (length % offset != 0) {
570                 out.readerIndex(initialIndex - offset);
571                 out.readBytes(out, length % offset);
572             }
573         } else {
574             out.readerIndex(initialIndex - offset);
575             out.readBytes(out, length);
576         }
577         out.resetReaderIndex();
578 
579         return length;
580     }
581 
582     /**
583      * Validates that the offset extracted from a compressed reference is within
584      * the permissible bounds of an offset (0 < offset < Integer.MAX_VALUE), and does not
585      * exceed the length of the chunk currently read so far.
586      *
587      * @param offset The offset extracted from the compressed reference
588      * @param chunkSizeSoFar The number of bytes read so far from this chunk
589      * @throws DecompressionException if the offset is invalid
590      */
591     private static void validateOffset(int offset, int chunkSizeSoFar) {
592         if (offset == 0) {
593             throw new DecompressionException("Offset is less than minimum permissible value");
594         }
595 
596         if (offset < 0) {
597             // Due to arithmetic overflow
598             throw new DecompressionException("Offset is greater than maximum value supported by this implementation");
599         }
600 
601         if (offset > chunkSizeSoFar) {
602             throw new DecompressionException("Offset exceeds size of chunk");
603         }
604     }
605 
606     /**
607      * Computes the CRC32C checksum of the supplied data and performs the "mask" operation
608      * on the computed checksum
609      *
610      * @param data The input data to calculate the CRC32C checksum of
611      */
612     static int calculateChecksum(ByteBuf data) {
613         return calculateChecksum(data, data.readerIndex(), data.readableBytes());
614     }
615 
616     /**
617      * Computes the CRC32C checksum of the supplied data and performs the "mask" operation
618      * on the computed checksum
619      *
620      * @param data The input data to calculate the CRC32C checksum of
621      */
622     static int calculateChecksum(ByteBuf data, int offset, int length) {
623         Crc32c crc32 = new Crc32c();
624         try {
625             crc32.update(data, offset, length);
626             return maskChecksum(crc32.getValue());
627         } finally {
628             crc32.reset();
629         }
630     }
631 
632     /**
633      * Computes the CRC32C checksum of the supplied data, performs the "mask" operation
634      * on the computed checksum, and then compares the resulting masked checksum to the
635      * supplied checksum.
636      *
637      * @param expectedChecksum The checksum decoded from the stream to compare against
638      * @param data The input data to calculate the CRC32C checksum of
639      * @throws DecompressionException If the calculated and supplied checksums do not match
640      */
641     static void validateChecksum(int expectedChecksum, ByteBuf data) {
642         validateChecksum(expectedChecksum, data, data.readerIndex(), data.readableBytes());
643     }
644 
645     /**
646      * Computes the CRC32C checksum of the supplied data, performs the "mask" operation
647      * on the computed checksum, and then compares the resulting masked checksum to the
648      * supplied checksum.
649      *
650      * @param expectedChecksum The checksum decoded from the stream to compare against
651      * @param data The input data to calculate the CRC32C checksum of
652      * @throws DecompressionException If the calculated and supplied checksums do not match
653      */
654     static void validateChecksum(int expectedChecksum, ByteBuf data, int offset, int length) {
655         final int actualChecksum = calculateChecksum(data, offset, length);
656         if (actualChecksum != expectedChecksum) {
657             throw new DecompressionException(
658                     "mismatching checksum: " + Integer.toHexString(actualChecksum) +
659                             " (expected: " + Integer.toHexString(expectedChecksum) + ')');
660         }
661     }
662 
663     /**
664      * From the spec:
665      *
666      * "Checksums are not stored directly, but masked, as checksumming data and
667      * then its own checksum can be problematic. The masking is the same as used
668      * in Apache Hadoop: Rotate the checksum by 15 bits, then add the constant
669      * 0xa282ead8 (using wraparound as normal for unsigned integers)."
670      *
671      * @param checksum The actual checksum of the data
672      * @return The masked checksum
673      */
674     static int maskChecksum(long checksum) {
675         return (int) ((checksum >> 15 | checksum << 17) + 0xa282ead8);
676     }
677 }