View Javadoc
1   /*
2    * Copyright 2012 The Netty Project
3    *
4    * The Netty Project licenses this file to you under the Apache License,
5    * version 2.0 (the "License"); you may not use this file except in compliance
6    * with the License. You may obtain a copy of the License at:
7    *
8    *   http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13   * License for the specific language governing permissions and limitations
14   * under the License.
15   */
16  package io.netty.handler.codec.compression;
17  
18  import io.netty.buffer.ByteBuf;
19  import io.netty.buffer.ByteBufUtil;
20  
21  /**
22   * Uncompresses an input {@link ByteBuf} encoded with Snappy compression into an
23   * output {@link ByteBuf}.
24   *
25   * See http://code.google.com/p/snappy/source/browse/trunk/format_description.txt
26   */
27  class Snappy {
28  
29      private static final int MAX_HT_SIZE = 1 << 14;
30      private static final int MIN_COMPRESSIBLE_BYTES = 15;
31  
32      // used as a return value to indicate that we haven't yet read our full preamble
33      private static final int PREAMBLE_NOT_FULL = -1;
34      private static final int NOT_ENOUGH_INPUT = -1;
35  
36      // constants for the tag types
37      private static final int LITERAL = 0;
38      private static final int COPY_1_BYTE_OFFSET = 1;
39      private static final int COPY_2_BYTE_OFFSET = 2;
40      private static final int COPY_4_BYTE_OFFSET = 3;
41  
42      private State state = State.READY;
43      private byte tag;
44      private int written;
45  
46      private enum State {
47          READY,
48          READING_PREAMBLE,
49          READING_TAG,
50          READING_LITERAL,
51          READING_COPY
52      }
53  
54      public void reset() {
55          state = State.READY;
56          tag = 0;
57          written = 0;
58      }
59  
60      public void encode(final ByteBuf in, final ByteBuf out, final int length) {
61          // Write the preamble length to the output buffer
62          for (int i = 0;; i ++) {
63              int b = length >>> i * 7;
64              if ((b & 0xFFFFFF80) != 0) {
65                  out.writeByte(b & 0x7f | 0x80);
66              } else {
67                  out.writeByte(b);
68                  break;
69              }
70          }
71  
72          int inIndex = in.readerIndex();
73          final int baseIndex = inIndex;
74  
75          final short[] table = getHashTable(length);
76          final int shift = 32 - (int) Math.floor(Math.log(table.length) / Math.log(2));
77  
78          int nextEmit = inIndex;
79  
80          if (length - inIndex >= MIN_COMPRESSIBLE_BYTES) {
81              int nextHash = hash(in, ++inIndex, shift);
82              outer: while (true) {
83                  int skip = 32;
84  
85                  int candidate;
86                  int nextIndex = inIndex;
87                  do {
88                      inIndex = nextIndex;
89                      int hash = nextHash;
90                      int bytesBetweenHashLookups = skip++ >> 5;
91                      nextIndex = inIndex + bytesBetweenHashLookups;
92  
93                      // We need at least 4 remaining bytes to read the hash
94                      if (nextIndex > length - 4) {
95                          break outer;
96                      }
97  
98                      nextHash = hash(in, nextIndex, shift);
99  
100                     candidate = baseIndex + table[hash];
101 
102                     table[hash] = (short) (inIndex - baseIndex);
103                 }
104                 while (in.getInt(inIndex) != in.getInt(candidate));
105 
106                 encodeLiteral(in, out, inIndex - nextEmit);
107 
108                 int insertTail;
109                 do {
110                     int base = inIndex;
111                     int matched = 4 + findMatchingLength(in, candidate + 4, inIndex + 4, length);
112                     inIndex += matched;
113                     int offset = base - candidate;
114                     encodeCopy(out, offset, matched);
115                     in.readerIndex(in.readerIndex() + matched);
116                     insertTail = inIndex - 1;
117                     nextEmit = inIndex;
118                     if (inIndex >= length - 4) {
119                         break outer;
120                     }
121 
122                     int prevHash = hash(in, insertTail, shift);
123                     table[prevHash] = (short) (inIndex - baseIndex - 1);
124                     int currentHash = hash(in, insertTail + 1, shift);
125                     candidate = baseIndex + table[currentHash];
126                     table[currentHash] = (short) (inIndex - baseIndex);
127                 }
128                 while (in.getInt(insertTail + 1) == in.getInt(candidate));
129 
130                 nextHash = hash(in, insertTail + 2, shift);
131                 ++inIndex;
132             }
133         }
134 
135         // If there are any remaining characters, write them out as a literal
136         if (nextEmit < length) {
137             encodeLiteral(in, out, length - nextEmit);
138         }
139     }
140 
141     /**
142      * Hashes the 4 bytes located at index, shifting the resulting hash into
143      * the appropriate range for our hash table.
144      *
145      * @param in The input buffer to read 4 bytes from
146      * @param index The index to read at
147      * @param shift The shift value, for ensuring that the resulting value is
148      *     withing the range of our hash table size
149      * @return A 32-bit hash of 4 bytes located at index
150      */
151     private static int hash(ByteBuf in, int index, int shift) {
152         return in.getInt(index) + 0x1e35a7bd >>> shift;
153     }
154 
155     /**
156      * Creates an appropriately sized hashtable for the given input size
157      *
158      * @param inputSize The size of our input, ie. the number of bytes we need to encode
159      * @return An appropriately sized empty hashtable
160      */
161     private static short[] getHashTable(int inputSize) {
162         int htSize = 256;
163         while (htSize < MAX_HT_SIZE && htSize < inputSize) {
164             htSize <<= 1;
165         }
166 
167         short[] table;
168         if (htSize <= 256) {
169             table = new short[256];
170         } else {
171             table = new short[MAX_HT_SIZE];
172         }
173 
174         return table;
175     }
176 
177     /**
178      * Iterates over the supplied input buffer between the supplied minIndex and
179      * maxIndex to find how long our matched copy overlaps with an already-written
180      * literal value.
181      *
182      * @param in The input buffer to scan over
183      * @param minIndex The index in the input buffer to start scanning from
184      * @param inIndex The index of the start of our copy
185      * @param maxIndex The length of our input buffer
186      * @return The number of bytes for which our candidate copy is a repeat of
187      */
188     private static int findMatchingLength(ByteBuf in, int minIndex, int inIndex, int maxIndex) {
189         int matched = 0;
190 
191         while (inIndex <= maxIndex - 4 &&
192                 in.getInt(inIndex) == in.getInt(minIndex + matched)) {
193             inIndex += 4;
194             matched += 4;
195         }
196 
197         while (inIndex < maxIndex && in.getByte(minIndex + matched) == in.getByte(inIndex)) {
198             ++inIndex;
199             ++matched;
200         }
201 
202         return matched;
203     }
204 
205     /**
206      * Calculates the minimum number of bits required to encode a value.  This can
207      * then in turn be used to calculate the number of septets or octets (as
208      * appropriate) to use to encode a length parameter.
209      *
210      * @param value The value to calculate the minimum number of bits required to encode
211      * @return The minimum number of bits required to encode the supplied value
212      */
213     private static int bitsToEncode(int value) {
214         int highestOneBit = Integer.highestOneBit(value);
215         int bitLength = 0;
216         while ((highestOneBit >>= 1) != 0) {
217             bitLength++;
218         }
219 
220         return bitLength;
221     }
222 
223     /**
224      * Writes a literal to the supplied output buffer by directly copying from
225      * the input buffer.  The literal is taken from the current readerIndex
226      * up to the supplied length.
227      *
228      * @param in The input buffer to copy from
229      * @param out The output buffer to copy to
230      * @param length The length of the literal to copy
231      */
232     private static void encodeLiteral(ByteBuf in, ByteBuf out, int length) {
233         if (length < 61) {
234             out.writeByte(length - 1 << 2);
235         } else {
236             int bitLength = bitsToEncode(length - 1);
237             int bytesToEncode = 1 + bitLength / 8;
238             out.writeByte(59 + bytesToEncode << 2);
239             for (int i = 0; i < bytesToEncode; i++) {
240                 out.writeByte(length - 1 >> i * 8 & 0x0ff);
241             }
242         }
243 
244         out.writeBytes(in, length);
245     }
246 
247     private static void encodeCopyWithOffset(ByteBuf out, int offset, int length) {
248         if (length < 12 && offset < 2048) {
249             out.writeByte(COPY_1_BYTE_OFFSET | length - 4 << 2 | offset >> 8 << 5);
250             out.writeByte(offset & 0x0ff);
251         } else {
252             out.writeByte(COPY_2_BYTE_OFFSET | length - 1 << 2);
253             out.writeByte(offset & 0x0ff);
254             out.writeByte(offset >> 8 & 0x0ff);
255         }
256     }
257 
258     /**
259      * Encodes a series of copies, each at most 64 bytes in length.
260      *
261      * @param out The output buffer to write the copy pointer to
262      * @param offset The offset at which the original instance lies
263      * @param length The length of the original instance
264      */
265     private static void encodeCopy(ByteBuf out, int offset, int length) {
266         while (length >= 68) {
267             encodeCopyWithOffset(out, offset, 64);
268             length -= 64;
269         }
270 
271         if (length > 64) {
272             encodeCopyWithOffset(out, offset, 60);
273             length -= 60;
274         }
275 
276         encodeCopyWithOffset(out, offset, length);
277     }
278 
279     public void decode(ByteBuf in, ByteBuf out) {
280         while (in.isReadable()) {
281             switch (state) {
282             case READY:
283                 state = State.READING_PREAMBLE;
284             case READING_PREAMBLE:
285                 int uncompressedLength = readPreamble(in);
286                 if (uncompressedLength == PREAMBLE_NOT_FULL) {
287                     // We've not yet read all of the preamble, so wait until we can
288                     return;
289                 }
290                 if (uncompressedLength == 0) {
291                     // Should never happen, but it does mean we have nothing further to do
292                     state = State.READY;
293                     return;
294                 }
295                 out.ensureWritable(uncompressedLength);
296                 state = State.READING_TAG;
297             case READING_TAG:
298                 if (!in.isReadable()) {
299                     return;
300                 }
301                 tag = in.readByte();
302                 switch (tag & 0x03) {
303                 case LITERAL:
304                     state = State.READING_LITERAL;
305                     break;
306                 case COPY_1_BYTE_OFFSET:
307                 case COPY_2_BYTE_OFFSET:
308                 case COPY_4_BYTE_OFFSET:
309                     state = State.READING_COPY;
310                     break;
311                 }
312                 break;
313             case READING_LITERAL:
314                 int literalWritten = decodeLiteral(tag, in, out);
315                 if (literalWritten != NOT_ENOUGH_INPUT) {
316                     state = State.READING_TAG;
317                     written += literalWritten;
318                 } else {
319                     // Need to wait for more data
320                     return;
321                 }
322                 break;
323             case READING_COPY:
324                 int decodeWritten;
325                 switch (tag & 0x03) {
326                 case COPY_1_BYTE_OFFSET:
327                     decodeWritten = decodeCopyWith1ByteOffset(tag, in, out, written);
328                     if (decodeWritten != NOT_ENOUGH_INPUT) {
329                         state = State.READING_TAG;
330                         written += decodeWritten;
331                     } else {
332                         // Need to wait for more data
333                         return;
334                     }
335                     break;
336                 case COPY_2_BYTE_OFFSET:
337                     decodeWritten = decodeCopyWith2ByteOffset(tag, in, out, written);
338                     if (decodeWritten != NOT_ENOUGH_INPUT) {
339                         state = State.READING_TAG;
340                         written += decodeWritten;
341                     } else {
342                         // Need to wait for more data
343                         return;
344                     }
345                     break;
346                 case COPY_4_BYTE_OFFSET:
347                     decodeWritten = decodeCopyWith4ByteOffset(tag, in, out, written);
348                     if (decodeWritten != NOT_ENOUGH_INPUT) {
349                         state = State.READING_TAG;
350                         written += decodeWritten;
351                     } else {
352                         // Need to wait for more data
353                         return;
354                     }
355                     break;
356                 }
357             }
358         }
359     }
360 
361     /**
362      * Reads the length varint (a series of bytes, where the lower 7 bits
363      * are data and the upper bit is a flag to indicate more bytes to be
364      * read).
365      *
366      * @param in The input buffer to read the preamble from
367      * @return The calculated length based on the input buffer, or 0 if
368      *   no preamble is able to be calculated
369      */
370     private static int readPreamble(ByteBuf in) {
371         int length = 0;
372         int byteIndex = 0;
373         while (in.isReadable()) {
374             int current = in.readUnsignedByte();
375             length |= (current & 0x7f) << byteIndex++ * 7;
376             if ((current & 0x80) == 0) {
377                 return length;
378             }
379 
380             if (byteIndex >= 4) {
381                 throw new DecompressionException("Preamble is greater than 4 bytes");
382             }
383         }
384 
385         return 0;
386     }
387 
388     /**
389      * Reads a literal from the input buffer directly to the output buffer.
390      * A "literal" is an uncompressed segment of data stored directly in the
391      * byte stream.
392      *
393      * @param tag The tag that identified this segment as a literal is also
394      *            used to encode part of the length of the data
395      * @param in The input buffer to read the literal from
396      * @param out The output buffer to write the literal to
397      * @return The number of bytes appended to the output buffer, or -1 to indicate "try again later"
398      */
399     private static int decodeLiteral(byte tag, ByteBuf in, ByteBuf out) {
400         in.markReaderIndex();
401         int length;
402         switch(tag >> 2 & 0x3F) {
403         case 60:
404             if (!in.isReadable()) {
405                 return NOT_ENOUGH_INPUT;
406             }
407             length = in.readUnsignedByte();
408             break;
409         case 61:
410             if (in.readableBytes() < 2) {
411                 return NOT_ENOUGH_INPUT;
412             }
413             length = ByteBufUtil.swapShort(in.readShort());
414             break;
415         case 62:
416             if (in.readableBytes() < 3) {
417                 return NOT_ENOUGH_INPUT;
418             }
419             length = ByteBufUtil.swapMedium(in.readUnsignedMedium());
420             break;
421         case 64:
422             if (in.readableBytes() < 4) {
423                 return NOT_ENOUGH_INPUT;
424             }
425             length = ByteBufUtil.swapInt(in.readInt());
426             break;
427         default:
428             length = tag >> 2 & 0x3F;
429         }
430         length += 1;
431 
432         if (in.readableBytes() < length) {
433             in.resetReaderIndex();
434             return NOT_ENOUGH_INPUT;
435         }
436 
437         out.writeBytes(in, length);
438         return length;
439     }
440 
441     /**
442      * Reads a compressed reference offset and length from the supplied input
443      * buffer, seeks back to the appropriate place in the input buffer and
444      * writes the found data to the supplied output stream.
445      *
446      * @param tag The tag used to identify this as a copy is also used to encode
447      *     the length and part of the offset
448      * @param in The input buffer to read from
449      * @param out The output buffer to write to
450      * @return The number of bytes appended to the output buffer, or -1 to indicate
451      *     "try again later"
452      * @throws DecompressionException If the read offset is invalid
453      */
454     private static int decodeCopyWith1ByteOffset(byte tag, ByteBuf in, ByteBuf out, int writtenSoFar) {
455         if (!in.isReadable()) {
456             return NOT_ENOUGH_INPUT;
457         }
458 
459         int initialIndex = out.writerIndex();
460         int length = 4 + ((tag & 0x01c) >> 2);
461         int offset = (tag & 0x0e0) << 8 >> 5 | in.readUnsignedByte();
462 
463         validateOffset(offset, writtenSoFar);
464 
465         out.markReaderIndex();
466         if (offset < length) {
467             int copies = length / offset;
468             for (; copies > 0; copies--) {
469                 out.readerIndex(initialIndex - offset);
470                 out.readBytes(out, offset);
471             }
472             if (length % offset != 0) {
473                 out.readerIndex(initialIndex - offset);
474                 out.readBytes(out, length % offset);
475             }
476         } else {
477             out.readerIndex(initialIndex - offset);
478             out.readBytes(out, length);
479         }
480         out.resetReaderIndex();
481 
482         return length;
483     }
484 
485     /**
486      * Reads a compressed reference offset and length from the supplied input
487      * buffer, seeks back to the appropriate place in the input buffer and
488      * writes the found data to the supplied output stream.
489      *
490      * @param tag The tag used to identify this as a copy is also used to encode
491      *     the length and part of the offset
492      * @param in The input buffer to read from
493      * @param out The output buffer to write to
494      * @throws DecompressionException If the read offset is invalid
495      * @return The number of bytes appended to the output buffer, or -1 to indicate
496      *     "try again later"
497      */
498     private static int decodeCopyWith2ByteOffset(byte tag, ByteBuf in, ByteBuf out, int writtenSoFar) {
499         if (in.readableBytes() < 2) {
500             return NOT_ENOUGH_INPUT;
501         }
502 
503         int initialIndex = out.writerIndex();
504         int length = 1 + (tag >> 2 & 0x03f);
505         int offset = ByteBufUtil.swapShort(in.readShort());
506 
507         validateOffset(offset, writtenSoFar);
508 
509         out.markReaderIndex();
510         if (offset < length) {
511             int copies = length / offset;
512             for (; copies > 0; copies--) {
513                 out.readerIndex(initialIndex - offset);
514                 out.readBytes(out, offset);
515             }
516             if (length % offset != 0) {
517                 out.readerIndex(initialIndex - offset);
518                 out.readBytes(out, length % offset);
519             }
520         } else {
521             out.readerIndex(initialIndex - offset);
522             out.readBytes(out, length);
523         }
524         out.resetReaderIndex();
525 
526         return length;
527     }
528 
529     /**
530      * Reads a compressed reference offset and length from the supplied input
531      * buffer, seeks back to the appropriate place in the input buffer and
532      * writes the found data to the supplied output stream.
533      *
534      * @param tag The tag used to identify this as a copy is also used to encode
535      *     the length and part of the offset
536      * @param in The input buffer to read from
537      * @param out The output buffer to write to
538      * @return The number of bytes appended to the output buffer, or -1 to indicate
539      *     "try again later"
540      * @throws DecompressionException If the read offset is invalid
541      */
542     private static int decodeCopyWith4ByteOffset(byte tag, ByteBuf in, ByteBuf out, int writtenSoFar) {
543         if (in.readableBytes() < 4) {
544             return NOT_ENOUGH_INPUT;
545         }
546 
547         int initialIndex = out.writerIndex();
548         int length = 1 + (tag >> 2 & 0x03F);
549         int offset = ByteBufUtil.swapInt(in.readInt());
550 
551         validateOffset(offset, writtenSoFar);
552 
553         out.markReaderIndex();
554         if (offset < length) {
555             int copies = length / offset;
556             for (; copies > 0; copies--) {
557                 out.readerIndex(initialIndex - offset);
558                 out.readBytes(out, offset);
559             }
560             if (length % offset != 0) {
561                 out.readerIndex(initialIndex - offset);
562                 out.readBytes(out, length % offset);
563             }
564         } else {
565             out.readerIndex(initialIndex - offset);
566             out.readBytes(out, length);
567         }
568         out.resetReaderIndex();
569 
570         return length;
571     }
572 
573     /**
574      * Validates that the offset extracted from a compressed reference is within
575      * the permissible bounds of an offset (4 <= offset <= 32768), and does not
576      * exceed the length of the chunk currently read so far.
577      *
578      * @param offset The offset extracted from the compressed reference
579      * @param chunkSizeSoFar The number of bytes read so far from this chunk
580      * @throws DecompressionException if the offset is invalid
581      */
582     private static void validateOffset(int offset, int chunkSizeSoFar) {
583         if (offset > Short.MAX_VALUE) {
584             throw new DecompressionException("Offset exceeds maximum permissible value");
585         }
586 
587         if (offset <= 0) {
588             throw new DecompressionException("Offset is less than minimum permissible value");
589         }
590 
591         if (offset > chunkSizeSoFar) {
592             throw new DecompressionException("Offset exceeds size of chunk");
593         }
594     }
595 
596     /**
597      * Computes the CRC32C checksum of the supplied data and performs the "mask" operation
598      * on the computed checksum
599      *
600      * @param data The input data to calculate the CRC32C checksum of
601      */
602     public static int calculateChecksum(ByteBuf data) {
603         return calculateChecksum(data, data.readerIndex(), data.readableBytes());
604     }
605 
606     /**
607      * Computes the CRC32C checksum of the supplied data and performs the "mask" operation
608      * on the computed checksum
609      *
610      * @param data The input data to calculate the CRC32C checksum of
611      */
612     public static int calculateChecksum(ByteBuf data, int offset, int length) {
613         Crc32c crc32 = new Crc32c();
614         try {
615             if (data.hasArray()) {
616                 crc32.update(data.array(), data.arrayOffset() + offset, length);
617             } else {
618                 byte[] array = new byte[length];
619                 data.getBytes(offset, array);
620                 crc32.update(array, 0, length);
621             }
622 
623             return maskChecksum((int) crc32.getValue());
624         } finally {
625             crc32.reset();
626         }
627     }
628 
629     /**
630      * Computes the CRC32C checksum of the supplied data, performs the "mask" operation
631      * on the computed checksum, and then compares the resulting masked checksum to the
632      * supplied checksum.
633      *
634      * @param expectedChecksum The checksum decoded from the stream to compare against
635      * @param data The input data to calculate the CRC32C checksum of
636      * @throws DecompressionException If the calculated and supplied checksums do not match
637      */
638     static void validateChecksum(int expectedChecksum, ByteBuf data) {
639         validateChecksum(expectedChecksum, data, data.readerIndex(), data.readableBytes());
640     }
641 
642     /**
643      * Computes the CRC32C checksum of the supplied data, performs the "mask" operation
644      * on the computed checksum, and then compares the resulting masked checksum to the
645      * supplied checksum.
646      *
647      * @param expectedChecksum The checksum decoded from the stream to compare against
648      * @param data The input data to calculate the CRC32C checksum of
649      * @throws DecompressionException If the calculated and supplied checksums do not match
650      */
651     static void validateChecksum(int expectedChecksum, ByteBuf data, int offset, int length) {
652         final int actualChecksum = calculateChecksum(data, offset, length);
653         if (actualChecksum != expectedChecksum) {
654             throw new DecompressionException(
655                     "mismatching checksum: " + Integer.toHexString(actualChecksum) +
656                             " (expected: " + Integer.toHexString(expectedChecksum) + ')');
657         }
658     }
659 
660     /**
661      * From the spec:
662      *
663      * "Checksums are not stored directly, but masked, as checksumming data and
664      * then its own checksum can be problematic. The masking is the same as used
665      * in Apache Hadoop: Rotate the checksum by 15 bits, then add the constant
666      * 0xa282ead8 (using wraparound as normal for unsigned integers)."
667      *
668      * @param checksum The actual checksum of the data
669      * @return The masked checksum
670      */
671     static int maskChecksum(int checksum) {
672         return (checksum >> 15 | checksum << 17) + 0xa282ead8;
673     }
674 }