SnappyCompressorInputStream.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one
  3.  * or more contributor license agreements.  See the NOTICE file
  4.  * distributed with this work for additional information
  5.  * regarding copyright ownership.  The ASF licenses this file
  6.  * to you under the Apache License, Version 2.0 (the
  7.  * "License"); you may not use this file except in compliance
  8.  * with the License.  You may obtain a copy of the License at
  9.  *
  10.  * http://www.apache.org/licenses/LICENSE-2.0
  11.  *
  12.  * Unless required by applicable law or agreed to in writing,
  13.  * software distributed under the License is distributed on an
  14.  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  15.  * KIND, either express or implied.  See the License for the
  16.  * specific language governing permissions and limitations
  17.  * under the License.
  18.  */
  19. package org.apache.commons.compress.compressors.snappy;

  20. import java.io.IOException;
  21. import java.io.InputStream;

  22. import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream;
  23. import org.apache.commons.compress.utils.ByteUtils;

  24. /**
  25.  * CompressorInputStream for the raw Snappy format.
  26.  *
  27.  * <p>
  28.  * This implementation uses an internal buffer in order to handle the back-references that are at the heart of the LZ77 algorithm. The size of the buffer must
  29.  * be at least as big as the biggest offset used in the compressed stream. The current version of the Snappy algorithm as defined by Google works on 32k blocks
  30.  * and doesn't contain offsets bigger than 32k which is the default block size used by this class.
  31.  * </p>
  32.  *
  33.  * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a>
  34.  * @since 1.7
  35.  */
  36. public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream {

  37.     private enum State {
  38.         NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE
  39.     }

  40.     /** Mask used to determine the type of "tag" is being processed */
  41.     private static final int TAG_MASK = 0x03;

  42.     /** Default block size */
  43.     public static final int DEFAULT_BLOCK_SIZE = 32768;

  44.     /** The size of the uncompressed data */
  45.     private final int size;

  46.     /** Number of uncompressed bytes still to be read. */
  47.     private int uncompressedBytesRemaining;

  48.     /** Current state of the stream */
  49.     private State state = State.NO_BLOCK;

  50.     private boolean endReached;

  51.     /**
  52.      * Constructor using the default buffer size of 32k.
  53.      *
  54.      * @param is An InputStream to read compressed data from
  55.      *
  56.      * @throws IOException if reading fails
  57.      */
  58.     public SnappyCompressorInputStream(final InputStream is) throws IOException {
  59.         this(is, DEFAULT_BLOCK_SIZE);
  60.     }

  61.     /**
  62.      * Constructor using a configurable buffer size.
  63.      *
  64.      * @param is        An InputStream to read compressed data from
  65.      * @param blockSize The block size used in compression
  66.      *
  67.      * @throws IOException              if reading fails
  68.      * @throws IllegalArgumentException if blockSize is not bigger than 0
  69.      */
  70.     public SnappyCompressorInputStream(final InputStream is, final int blockSize) throws IOException {
  71.         super(is, blockSize);
  72.         uncompressedBytesRemaining = size = (int) readSize();
  73.     }

  74.     /**
  75.      * Try to fill the buffer with the next block of data.
  76.      */
  77.     private void fill() throws IOException {
  78.         if (uncompressedBytesRemaining == 0) {
  79.             endReached = true;
  80.             return;
  81.         }

  82.         int b = readOneByte();
  83.         if (b == -1) {
  84.             throw new IOException("Premature end of stream reading block start");
  85.         }
  86.         int length = 0;
  87.         int offset = 0;

  88.         switch (b & TAG_MASK) {

  89.         case 0x00:

  90.             length = readLiteralLength(b);
  91.             if (length < 0) {
  92.                 throw new IOException("Illegal block with a negative literal size found");
  93.             }
  94.             uncompressedBytesRemaining -= length;
  95.             startLiteral(length);
  96.             state = State.IN_LITERAL;
  97.             break;

  98.         case 0x01:

  99.             /*
  100.              * These elements can encode lengths between [4..11] bytes and offsets between [0..2047] bytes. (len-4) occupies three bits and is stored in bits
  101.              * [2..4] of the tag byte. The offset occupies 11 bits, of which the upper three are stored in the upper three bits ([5..7]) of the tag byte, and
  102.              * the lower eight are stored in a byte following the tag byte.
  103.              */

  104.             length = 4 + (b >> 2 & 0x07);
  105.             uncompressedBytesRemaining -= length;
  106.             offset = (b & 0xE0) << 3;
  107.             b = readOneByte();
  108.             if (b == -1) {
  109.                 throw new IOException("Premature end of stream reading back-reference length");
  110.             }
  111.             offset |= b;

  112.             try {
  113.                 startBackReference(offset, length);
  114.             } catch (final IllegalArgumentException ex) {
  115.                 throw new IOException("Illegal block with bad offset found", ex);
  116.             }
  117.             state = State.IN_BACK_REFERENCE;
  118.             break;

  119.         case 0x02:

  120.             /*
  121.              * These elements can encode lengths between [1..64] and offsets from [0..65535]. (len-1) occupies six bits and is stored in the upper six bits
  122.              * ([2..7]) of the tag byte. The offset is stored as a little-endian 16-bit integer in the two bytes following the tag byte.
  123.              */

  124.             length = (b >> 2) + 1;
  125.             if (length < 0) {
  126.                 throw new IOException("Illegal block with a negative match length found");
  127.             }
  128.             uncompressedBytesRemaining -= length;

  129.             offset = (int) ByteUtils.fromLittleEndian(supplier, 2);

  130.             try {
  131.                 startBackReference(offset, length);
  132.             } catch (final IllegalArgumentException ex) {
  133.                 throw new IOException("Illegal block with bad offset found", ex);
  134.             }
  135.             state = State.IN_BACK_REFERENCE;
  136.             break;

  137.         case 0x03:

  138.             /*
  139.              * These are like the copies with 2-byte offsets (see previous subsection), except that the offset is stored as a 32-bit integer instead of a 16-bit
  140.              * integer (and thus will occupy four bytes).
  141.              */

  142.             length = (b >> 2) + 1;
  143.             if (length < 0) {
  144.                 throw new IOException("Illegal block with a negative match length found");
  145.             }
  146.             uncompressedBytesRemaining -= length;

  147.             offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff;

  148.             try {
  149.                 startBackReference(offset, length);
  150.             } catch (final IllegalArgumentException ex) {
  151.                 throw new IOException("Illegal block with bad offset found", ex);
  152.             }
  153.             state = State.IN_BACK_REFERENCE;
  154.             break;
  155.         default:
  156.             // impossible as TAG_MASK is two bits and all four possible cases have been covered
  157.             break;
  158.         }
  159.     }

  160.     /**
  161.      * Gets the uncompressed size of the stream
  162.      *
  163.      * @return the uncompressed size
  164.      */
  165.     @Override
  166.     public int getSize() {
  167.         return size;
  168.     }

  169.     /**
  170.      * {@inheritDoc}
  171.      */
  172.     @Override
  173.     public int read(final byte[] b, final int off, final int len) throws IOException {
  174.         if (len == 0) {
  175.             return 0;
  176.         }
  177.         if (endReached) {
  178.             return -1;
  179.         }
  180.         switch (state) {
  181.         case NO_BLOCK:
  182.             fill();
  183.             return read(b, off, len);
  184.         case IN_LITERAL:
  185.             final int litLen = readLiteral(b, off, len);
  186.             if (!hasMoreDataInBlock()) {
  187.                 state = State.NO_BLOCK;
  188.             }
  189.             return litLen > 0 ? litLen : read(b, off, len);
  190.         case IN_BACK_REFERENCE:
  191.             final int backReferenceLen = readBackReference(b, off, len);
  192.             if (!hasMoreDataInBlock()) {
  193.                 state = State.NO_BLOCK;
  194.             }
  195.             return backReferenceLen > 0 ? backReferenceLen : read(b, off, len);
  196.         default:
  197.             throw new IOException("Unknown stream state " + state);
  198.         }
  199.     }

  200.     /*
  201.      * For literals up to and including 60 bytes in length, the upper six bits of the tag byte contain (len-1). The literal follows immediately thereafter in
  202.      * the bytestream. - For longer literals, the (len-1) value is stored after the tag byte, little-endian. The upper six bits of the tag byte describe how
  203.      * many bytes are used for the length; 60, 61, 62 or 63 for 1-4 bytes, respectively. The literal itself follows after the length.
  204.      */
  205.     private int readLiteralLength(final int b) throws IOException {
  206.         final int length;
  207.         switch (b >> 2) {
  208.         case 60:
  209.             length = readOneByte();
  210.             if (length == -1) {
  211.                 throw new IOException("Premature end of stream reading literal length");
  212.             }
  213.             break;
  214.         case 61:
  215.             length = (int) ByteUtils.fromLittleEndian(supplier, 2);
  216.             break;
  217.         case 62:
  218.             length = (int) ByteUtils.fromLittleEndian(supplier, 3);
  219.             break;
  220.         case 63:
  221.             length = (int) ByteUtils.fromLittleEndian(supplier, 4);
  222.             break;
  223.         default:
  224.             length = b >> 2;
  225.             break;
  226.         }

  227.         return length + 1;
  228.     }

  229.     /**
  230.      * The stream starts with the uncompressed length (up to a maximum of 2^32 - 1), stored as a little-endian varint. Varints consist of a series of bytes,
  231.      * where the lower 7 bits are data and the upper bit is set iff there are more bytes to be read. In other words, an uncompressed length of 64 would be
  232.      * stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE) would be stored as 0xFE 0xFF 0x7F.
  233.      *
  234.      * @return The size of the uncompressed data
  235.      *
  236.      * @throws IOException Could not read a byte
  237.      */
  238.     private long readSize() throws IOException {
  239.         int index = 0;
  240.         long sz = 0;
  241.         int b = 0;

  242.         do {
  243.             b = readOneByte();
  244.             if (b == -1) {
  245.                 throw new IOException("Premature end of stream reading size");
  246.             }
  247.             sz |= (b & 0x7f) << index++ * 7;
  248.         } while (0 != (b & 0x80));
  249.         return sz;
  250.     }
  251. }