View Javadoc
1   /*
2    * Copyright 2012 The Netty Project
3    *
4    * The Netty Project licenses this file to you under the Apache License,
5    * version 2.0 (the "License"); you may not use this file except in compliance
6    * with the License. You may obtain a copy of the License at:
7    *
8    *   https://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13   * License for the specific language governing permissions and limitations
14   * under the License.
15   */
16  package io.netty5.handler.codec.compression;
17  
18  import io.netty5.buffer.BufferUtil;
19  import io.netty5.buffer.api.Buffer;
20  
21  /**
22   * Uncompresses an input {@link Buffer} encoded with Snappy compression into an
23   * output {@link Buffer}.
24   *
25   * See <a href="https://github.com/google/snappy/blob/master/format_description.txt">snappy format</a>.
26   */
27  public final class Snappy {
28  
29      private static final int MAX_HT_SIZE = 1 << 14;
30      private static final int MIN_COMPRESSIBLE_BYTES = 15;
31  
32      // used as a return value to indicate that we haven't yet read our full preamble
33      private static final int PREAMBLE_NOT_FULL = -1;
34      private static final int NOT_ENOUGH_INPUT = -1;
35  
36      // constants for the tag types
37      private static final int LITERAL = 0;
38      private static final int COPY_1_BYTE_OFFSET = 1;
39      private static final int COPY_2_BYTE_OFFSET = 2;
40      private static final int COPY_4_BYTE_OFFSET = 3;
41  
42      private State state = State.READING_PREAMBLE;
43      private byte tag;
44      private int written;
45  
46      private enum State {
47          READING_PREAMBLE,
48          READING_TAG,
49          READING_LITERAL,
50          READING_COPY
51      }
52  
53      public void reset() {
54          state = State.READING_PREAMBLE;
55          tag = 0;
56          written = 0;
57      }
58  
59      public void encode(final Buffer in, final Buffer out, final int length) {
60          // Write the preamble length to the output buffer
61          for (int i = 0;; i ++) {
62              int b = length >>> i * 7;
63              if ((b & 0xFFFFFF80) != 0) {
64                  out.writeByte((byte) (b & 0x7f | 0x80));
65              } else {
66                  out.writeByte((byte) b);
67                  break;
68              }
69          }
70  
71          int inIndex = in.readerOffset();
72          final int baseIndex = inIndex;
73  
74          final short[] table = getHashTable(length);
75          final int shift = Integer.numberOfLeadingZeros(table.length) + 1;
76  
77          int nextEmit = inIndex;
78  
79          if (length - inIndex >= MIN_COMPRESSIBLE_BYTES) {
80              int nextHash = hash(in, ++inIndex, shift);
81              outer: while (true) {
82                  int skip = 32;
83  
84                  int candidate;
85                  int nextIndex = inIndex;
86                  do {
87                      inIndex = nextIndex;
88                      int hash = nextHash;
89                      int bytesBetweenHashLookups = skip++ >> 5;
90                      nextIndex = inIndex + bytesBetweenHashLookups;
91  
92                      // We need at least 4 remaining bytes to read the hash
93                      if (nextIndex > length - 4) {
94                          break outer;
95                      }
96  
97                      nextHash = hash(in, nextIndex, shift);
98  
99                      candidate = baseIndex + table[hash];
100 
101                     table[hash] = (short) (inIndex - baseIndex);
102                 }
103                 while (in.getInt(inIndex) != in.getInt(candidate));
104 
105                 encodeLiteral(in, out, inIndex - nextEmit);
106 
107                 int insertTail;
108                 do {
109                     int base = inIndex;
110                     int matched = 4 + findMatchingLength(in, candidate + 4, inIndex + 4, length);
111                     inIndex += matched;
112                     int offset = base - candidate;
113                     encodeCopy(out, offset, matched);
114                     in.readerOffset(in.readerOffset() + matched);
115                     insertTail = inIndex - 1;
116                     nextEmit = inIndex;
117                     if (inIndex >= length - 4) {
118                         break outer;
119                     }
120 
121                     int prevHash = hash(in, insertTail, shift);
122                     table[prevHash] = (short) (inIndex - baseIndex - 1);
123                     int currentHash = hash(in, insertTail + 1, shift);
124                     candidate = baseIndex + table[currentHash];
125                     table[currentHash] = (short) (inIndex - baseIndex);
126                 }
127                 while (in.getInt(insertTail + 1) == in.getInt(candidate));
128 
129                 nextHash = hash(in, insertTail + 2, shift);
130                 ++inIndex;
131             }
132         }
133 
134         // If there are any remaining characters, write them out as a literal
135         if (nextEmit < length) {
136             encodeLiteral(in, out, length - nextEmit);
137         }
138     }
139 
140     /**
141      * Hashes the 4 bytes located at index, shifting the resulting hash into
142      * the appropriate range for our hash table.
143      *
144      * @param in The input buffer to read 4 bytes from
145      * @param index The index to read at
146      * @param shift The shift value, for ensuring that the resulting value is
147      *     within the range of our hash table size
148      * @return A 32-bit hash of 4 bytes located at index
149      */
150     private static int hash(Buffer in, int index, int shift) {
151         return in.getInt(index) * 0x1e35a7bd >>> shift;
152     }
153 
154     /**
155      * Creates an appropriately sized hashtable for the given input size
156      *
157      * @param inputSize The size of our input, ie. the number of bytes we need to encode
158      * @return An appropriately sized empty hashtable
159      */
160     private static short[] getHashTable(int inputSize) {
161         int htSize = 256;
162         while (htSize < MAX_HT_SIZE && htSize < inputSize) {
163             htSize <<= 1;
164         }
165         return new short[htSize];
166     }
167 
168     /**
169      * Iterates over the supplied input buffer between the supplied minIndex and
170      * maxIndex to find how long our matched copy overlaps with an already-written
171      * literal value.
172      *
173      * @param in The input buffer to scan over
174      * @param minIndex The index in the input buffer to start scanning from
175      * @param inIndex The index of the start of our copy
176      * @param maxIndex The length of our input buffer
177      * @return The number of bytes for which our candidate copy is a repeat of
178      */
179     private static int findMatchingLength(Buffer in, int minIndex, int inIndex, int maxIndex) {
180         int matched = 0;
181 
182         while (inIndex <= maxIndex - 4 &&
183                 in.getInt(inIndex) == in.getInt(minIndex + matched)) {
184             inIndex += 4;
185             matched += 4;
186         }
187 
188         while (inIndex < maxIndex && in.getByte(minIndex + matched) == in.getByte(inIndex)) {
189             ++inIndex;
190             ++matched;
191         }
192 
193         return matched;
194     }
195 
196     /**
197      * Calculates the minimum number of bits required to encode a value.  This can
198      * then in turn be used to calculate the number of septets or octets (as
199      * appropriate) to use to encode a length parameter.
200      *
201      * @param value The value to calculate the minimum number of bits required to encode
202      * @return The minimum number of bits required to encode the supplied value
203      */
204     private static int bitsToEncode(int value) {
205         int highestOneBit = Integer.highestOneBit(value);
206         int bitLength = 0;
207         while ((highestOneBit >>= 1) != 0) {
208             bitLength++;
209         }
210 
211         return bitLength;
212     }
213 
214     /**
215      * Writes a literal to the supplied output buffer by directly copying from
216      * the input buffer.  The literal is taken from the current readerIndex
217      * up to the supplied length.
218      *
219      * @param in The input buffer to copy from
220      * @param out The output buffer to copy to
221      * @param length The length of the literal to copy
222      */
223     static void encodeLiteral(Buffer in, Buffer out, int length) {
224         if (length < 61) {
225             out.writeByte((byte) (length - 1 << 2));
226         } else {
227             int bitLength = bitsToEncode(length - 1);
228             int bytesToEncode = 1 + bitLength / 8;
229             out.writeByte((byte) (59 + bytesToEncode << 2));
230             for (int i = 0; i < bytesToEncode; i++) {
231                 out.writeByte((byte) (length - 1 >> i * 8 & 0x0ff));
232             }
233         }
234 
235         out.ensureWritable(length);
236         in.copyInto(in.readerOffset(), out, out.writerOffset(), length);
237         in.skipReadableBytes(length);
238         out.skipWritableBytes(length);
239     }
240 
241     private static void encodeCopyWithOffset(Buffer out, int offset, int length) {
242         if (length < 12 && offset < 2048) {
243             out.writeByte((byte) (COPY_1_BYTE_OFFSET | length - 4 << 2 | offset >> 8 << 5));
244             out.writeByte((byte) (offset & 0x0ff));
245         } else {
246             out.writeByte((byte) (COPY_2_BYTE_OFFSET | length - 1 << 2));
247             out.writeByte((byte) (offset & 0x0ff));
248             out.writeByte((byte) (offset >> 8 & 0x0ff));
249         }
250     }
251 
252     /**
253      * Encodes a series of copies, each at most 64 bytes in length.
254      *
255      * @param out The output buffer to write the copy pointer to
256      * @param offset The offset at which the original instance lies
257      * @param length The length of the original instance
258      */
259     private static void encodeCopy(Buffer out, int offset, int length) {
260         while (length >= 68) {
261             encodeCopyWithOffset(out, offset, 64);
262             length -= 64;
263         }
264 
265         if (length > 64) {
266             encodeCopyWithOffset(out, offset, 60);
267             length -= 60;
268         }
269 
270         encodeCopyWithOffset(out, offset, length);
271     }
272 
273     public void decode(Buffer in, Buffer out) {
274         while (in.readableBytes() > 0) {
275             switch (state) {
276             case READING_PREAMBLE:
277                 int uncompressedLength = readPreamble(in);
278                 if (uncompressedLength == PREAMBLE_NOT_FULL) {
279                     // We've not yet read all of the preamble, so wait until we can
280                     return;
281                 }
282                 if (uncompressedLength == 0) {
283                     // Should never happen, but it does mean we have nothing further to do
284                     return;
285                 }
286                 out.ensureWritable(uncompressedLength);
287                 state = State.READING_TAG;
288                 // fall through
289             case READING_TAG:
290                 if (in.readableBytes() == 0) {
291                     return;
292                 }
293                 tag = in.readByte();
294                 switch (tag & 0x03) {
295                 case LITERAL:
296                     state = State.READING_LITERAL;
297                     break;
298                 case COPY_1_BYTE_OFFSET:
299                 case COPY_2_BYTE_OFFSET:
300                 case COPY_4_BYTE_OFFSET:
301                     state = State.READING_COPY;
302                     break;
303                 }
304                 break;
305             case READING_LITERAL:
306                 int literalWritten = decodeLiteral(tag, in, out);
307                 if (literalWritten != NOT_ENOUGH_INPUT) {
308                     state = State.READING_TAG;
309                     written += literalWritten;
310                 } else {
311                     // Need to wait for more data
312                     return;
313                 }
314                 break;
315             case READING_COPY:
316                 int decodeWritten;
317                 switch (tag & 0x03) {
318                 case COPY_1_BYTE_OFFSET:
319                     decodeWritten = decodeCopyWith1ByteOffset(tag, in, out, written);
320                     if (decodeWritten != NOT_ENOUGH_INPUT) {
321                         state = State.READING_TAG;
322                         written += decodeWritten;
323                     } else {
324                         // Need to wait for more data
325                         return;
326                     }
327                     break;
328                 case COPY_2_BYTE_OFFSET:
329                     decodeWritten = decodeCopyWith2ByteOffset(tag, in, out, written);
330                     if (decodeWritten != NOT_ENOUGH_INPUT) {
331                         state = State.READING_TAG;
332                         written += decodeWritten;
333                     } else {
334                         // Need to wait for more data
335                         return;
336                     }
337                     break;
338                 case COPY_4_BYTE_OFFSET:
339                     decodeWritten = decodeCopyWith4ByteOffset(tag, in, out, written);
340                     if (decodeWritten != NOT_ENOUGH_INPUT) {
341                         state = State.READING_TAG;
342                         written += decodeWritten;
343                     } else {
344                         // Need to wait for more data
345                         return;
346                     }
347                     break;
348                 }
349             }
350         }
351     }
352 
353     /**
354      * Reads the length varint (a series of bytes, where the lower 7 bits
355      * are data and the upper bit is a flag to indicate more bytes to be
356      * read).
357      *
358      * @param in The input buffer to read the preamble from
359      * @return The calculated length based on the input buffer, or 0 if
360      *   no preamble is able to be calculated
361      */
362     private static int readPreamble(Buffer in) {
363         int length = 0;
364         int byteIndex = 0;
365         while (in.readableBytes() > 0) {
366             int current = in.readUnsignedByte();
367             length |= (current & 0x7f) << byteIndex++ * 7;
368             if ((current & 0x80) == 0) {
369                 return length;
370             }
371 
372             if (byteIndex >= 4) {
373                 throw new DecompressionException("Preamble is greater than 4 bytes");
374             }
375         }
376 
377         return 0;
378     }
379 
380     /**
381      * Get the length varint (a series of bytes, where the lower 7 bits
382      * are data and the upper bit is a flag to indicate more bytes to be
383      * read).
384      *
385      * @param in The input buffer to get the preamble from
386      * @return The calculated length based on the input buffer, or 0 if
387      *   no preamble is able to be calculated
388      */
389     int getPreamble(Buffer in) {
390         if (state == State.READING_PREAMBLE) {
391             int readerIndex = in.readerOffset();
392             try {
393                 return readPreamble(in);
394             } finally {
395                 in.readerOffset(readerIndex);
396             }
397         }
398         return 0;
399     }
400 
401     /**
402      * Reads a literal from the input buffer directly to the output buffer.
403      * A "literal" is an uncompressed segment of data stored directly in the
404      * byte stream.
405      *
406      * @param tag The tag that identified this segment as a literal is also
407      *            used to encode part of the length of the data
408      * @param in The input buffer to read the literal from
409      * @param out The output buffer to write the literal to
410      * @return The number of bytes appended to the output buffer, or -1 to indicate "try again later"
411      */
412     static int decodeLiteral(byte tag, Buffer in, Buffer out) {
413         int readerIndex = in.readerOffset();
414         int length;
415         switch(tag >> 2 & 0x3F) {
416         case 60:
417             if (in.readableBytes() == 0) {
418                 return NOT_ENOUGH_INPUT;
419             }
420             length = in.readUnsignedByte();
421             break;
422         case 61:
423             if (in.readableBytes() < 2) {
424                 return NOT_ENOUGH_INPUT;
425             }
426             length = BufferUtil.reverseUnsignedShort(in.readUnsignedShort());
427             break;
428         case 62:
429             if (in.readableBytes() < 3) {
430                 return NOT_ENOUGH_INPUT;
431             }
432             length = BufferUtil.reverseUnsignedMedium(in.readUnsignedMedium());
433             break;
434         case 63:
435             if (in.readableBytes() < 4) {
436                 return NOT_ENOUGH_INPUT;
437             }
438             length = Integer.reverseBytes(in.readInt());
439             break;
440         default:
441             length = tag >> 2 & 0x3F;
442         }
443         length += 1;
444 
445         if (in.readableBytes() < length) {
446             in.readerOffset(readerIndex);
447             return NOT_ENOUGH_INPUT;
448         }
449 
450         out.ensureWritable(length);
451         in.copyInto(in.readerOffset(), out, out.writerOffset(), length);
452         in.skipReadableBytes(length);
453         out.skipWritableBytes(length);
454         return length;
455     }
456 
457     /**
458      * Reads a compressed reference offset and length from the supplied input
459      * buffer, seeks back to the appropriate place in the input buffer and
460      * writes the found data to the supplied output stream.
461      *
462      * @param tag The tag used to identify this as a copy is also used to encode
463      *     the length and part of the offset
464      * @param in The input buffer to read from
465      * @param out The output buffer to write to
466      * @return The number of bytes appended to the output buffer, or -1 to indicate
467      *     "try again later"
468      * @throws DecompressionException If the read offset is invalid
469      */
470     private static int decodeCopyWith1ByteOffset(byte tag, Buffer in, Buffer out, int writtenSoFar) {
471         if (in.readableBytes() == 0) {
472             return NOT_ENOUGH_INPUT;
473         }
474 
475         int initialIndex = out.writerOffset();
476         int length = 4 + ((tag & 0x01c) >> 2);
477         int offset = (tag & 0x0e0) << 8 >> 5 | in.readUnsignedByte();
478 
479         validateOffset(offset, writtenSoFar);
480 
481         int readerIndex = out.readerOffset();
482         if (offset < length) {
483             int copies = length / offset;
484             for (; copies > 0; copies--) {
485                 copyRegion(out, initialIndex, offset, offset);
486             }
487             if (length % offset != 0) {
488                 copyRegion(out, initialIndex, offset, length % offset);
489             }
490         } else {
491             copyRegion(out, initialIndex, offset, length);
492         }
493         out.readerOffset(readerIndex);
494 
495         return length;
496     }
497 
498     private static void copyRegion(Buffer out, int initialIndex, int offset, int length) {
499         out.readerOffset(initialIndex - offset);
500         out.copyInto(out.readerOffset(), out, out.writerOffset(), length);
501         out.skipWritableBytes(length).skipReadableBytes(length);
502     }
503 
504     /**
505      * Reads a compressed reference offset and length from the supplied input
506      * buffer, seeks back to the appropriate place in the input buffer and
507      * writes the found data to the supplied output stream.
508      *
509      * @param tag The tag used to identify this as a copy is also used to encode
510      *     the length and part of the offset
511      * @param in The input buffer to read from
512      * @param out The output buffer to write to
513      * @throws DecompressionException If the read offset is invalid
514      * @return The number of bytes appended to the output buffer, or -1 to indicate
515      *     "try again later"
516      */
517     private static int decodeCopyWith2ByteOffset(byte tag, Buffer in, Buffer out, int writtenSoFar) {
518         if (in.readableBytes() < 2) {
519             return NOT_ENOUGH_INPUT;
520         }
521 
522         int initialIndex = out.writerOffset();
523         int length = 1 + (tag >> 2 & 0x03f);
524         int offset = BufferUtil.reverseUnsignedShort(in.readUnsignedShort());
525 
526         validateOffset(offset, writtenSoFar);
527 
528         int readerIndex = out.readerOffset();
529         if (offset < length) {
530             int copies = length / offset;
531             for (; copies > 0; copies--) {
532                 copyRegion(out, initialIndex, offset, offset);
533             }
534             if (length % offset != 0) {
535                 copyRegion(out, initialIndex, offset, length % offset);
536             }
537         } else {
538             copyRegion(out, initialIndex, offset, length);
539         }
540         out.readerOffset(readerIndex);
541 
542         return length;
543     }
544 
545     /**
546      * Reads a compressed reference offset and length from the supplied input
547      * buffer, seeks back to the appropriate place in the input buffer and
548      * writes the found data to the supplied output stream.
549      *
550      * @param tag The tag used to identify this as a copy is also used to encode
551      *     the length and part of the offset
552      * @param in The input buffer to read from
553      * @param out The output buffer to write to
554      * @return The number of bytes appended to the output buffer, or -1 to indicate
555      *     "try again later"
556      * @throws DecompressionException If the read offset is invalid
557      */
558     private static int decodeCopyWith4ByteOffset(byte tag, Buffer in, Buffer out, int writtenSoFar) {
559         if (in.readableBytes() < 4) {
560             return NOT_ENOUGH_INPUT;
561         }
562 
563         int initialIndex = out.writerOffset();
564         int length = 1 + (tag >> 2 & 0x03F);
565         int offset = Integer.reverseBytes(in.readInt());
566 
567         validateOffset(offset, writtenSoFar);
568 
569         int readerIndex = out.readerOffset();
570         if (offset < length) {
571             int copies = length / offset;
572             for (; copies > 0; copies--) {
573                 copyRegion(out, initialIndex, offset, offset);
574             }
575             if (length % offset != 0) {
576                 copyRegion(out, initialIndex, offset, length % offset);
577             }
578         } else {
579             copyRegion(out, initialIndex, offset, length);
580         }
581         out.readerOffset(readerIndex);
582 
583         return length;
584     }
585 
586     /**
587      * Validates that the offset extracted from a compressed reference is within
588      * the permissible bounds of an offset (0 < offset < Integer.MAX_VALUE), and does not
589      * exceed the length of the chunk currently read so far.
590      *
591      * @param offset The offset extracted from the compressed reference
592      * @param chunkSizeSoFar The number of bytes read so far from this chunk
593      * @throws DecompressionException if the offset is invalid
594      */
595     private static void validateOffset(int offset, int chunkSizeSoFar) {
596         if (offset == 0) {
597             throw new DecompressionException("Offset is less than minimum permissible value");
598         }
599 
600         if (offset < 0) {
601             // Due to arithmetic overflow
602             throw new DecompressionException("Offset is greater than maximum value supported by this implementation");
603         }
604 
605         if (offset > chunkSizeSoFar) {
606             throw new DecompressionException("Offset exceeds size of chunk");
607         }
608     }
609 
610     /**
611      * Computes the CRC32C checksum of the supplied data and performs the "mask" operation
612      * on the computed checksum
613      *
614      * @param data The input data to calculate the CRC32C checksum of
615      */
616     static int calculateChecksum(Buffer data) {
617         return calculateChecksum(data, data.readerOffset(), data.readableBytes());
618     }
619 
620     /**
621      * Computes the CRC32C checksum of the supplied data and performs the "mask" operation
622      * on the computed checksum
623      *
624      * @param data The input data to calculate the CRC32C checksum of
625      */
626     static int calculateChecksum(Buffer data, int offset, int length) {
627         Crc32c crc32 = new Crc32c();
628         try {
629             crc32.update(data, offset, length);
630             return maskChecksum(crc32.getValue());
631         } finally {
632             crc32.reset();
633         }
634     }
635 
636     /**
637      * Computes the CRC32C checksum of the supplied data, performs the "mask" operation
638      * on the computed checksum, and then compares the resulting masked checksum to the
639      * supplied checksum.
640      *
641      * @param expectedChecksum The checksum decoded from the stream to compare against
642      * @param data The input data to calculate the CRC32C checksum of
643      * @throws DecompressionException If the calculated and supplied checksums do not match
644      */
645     static void validateChecksum(int expectedChecksum, Buffer data) {
646         validateChecksum(expectedChecksum, data, data.readerOffset(), data.readableBytes());
647     }
648 
649     /**
650      * Computes the CRC32C checksum of the supplied data, performs the "mask" operation
651      * on the computed checksum, and then compares the resulting masked checksum to the
652      * supplied checksum.
653      *
654      * @param expectedChecksum The checksum decoded from the stream to compare against
655      * @param data The input data to calculate the CRC32C checksum of
656      * @throws DecompressionException If the calculated and supplied checksums do not match
657      */
658     static void validateChecksum(int expectedChecksum, Buffer data, int offset, int length) {
659         final int actualChecksum = calculateChecksum(data, offset, length);
660         if (actualChecksum != expectedChecksum) {
661             throw new DecompressionException(
662                     "mismatching checksum: " + Integer.toHexString(actualChecksum) +
663                             " (expected: " + Integer.toHexString(expectedChecksum) + ')');
664         }
665     }
666 
667     /**
668      * From the spec:
669      *
670      * "Checksums are not stored directly, but masked, as checksumming data and
671      * then its own checksum can be problematic. The masking is the same as used
672      * in Apache Hadoop: Rotate the checksum by 15 bits, then add the constant
673      * 0xa282ead8 (using wraparound as normal for unsigned integers)."
674      *
675      * @param checksum The actual checksum of the data
676      * @return The masked checksum
677      */
678     static int maskChecksum(long checksum) {
679         return (int) ((checksum >> 15 | checksum << 17) + 0xa282ead8);
680     }
681 }