SnappyCompressorInputStream.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one
  3.  * or more contributor license agreements.  See the NOTICE file
  4.  * distributed with this work for additional information
  5.  * regarding copyright ownership.  The ASF licenses this file
  6.  * to you under the Apache License, Version 2.0 (the
  7.  * "License"); you may not use this file except in compliance
  8.  * with the License.  You may obtain a copy of the License at
  9.  *
  10.  *   https://www.apache.org/licenses/LICENSE-2.0
  11.  *
  12.  * Unless required by applicable law or agreed to in writing,
  13.  * software distributed under the License is distributed on an
  14.  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  15.  * KIND, either express or implied.  See the License for the
  16.  * specific language governing permissions and limitations
  17.  * under the License.
  18.  */
  19. package org.apache.commons.compress.compressors.snappy;

  20. import java.io.IOException;
  21. import java.io.InputStream;

  22. import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream;
  23. import org.apache.commons.compress.utils.ByteUtils;

  24. /**
  25.  * CompressorInputStream for the raw Snappy format.
  26.  *
  27.  * <p>
  28.  * This implementation uses an internal buffer in order to handle the back-references that are at the heart of the LZ77 algorithm. The size of the buffer must
  29.  * be at least as big as the biggest offset used in the compressed stream. The current version of the Snappy algorithm as defined by Google works on 32k blocks
  30.  * and doesn't contain offsets bigger than 32k which is the default block size used by this class.
  31.  * </p>
  32.  *
  33.  * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a>
  34.  * @since 1.7
  35.  */
  36. public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream {

  37.     private enum State {
  38.         NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE
  39.     }

  40.     /** Mask used to determine the type of "tag" is being processed */
  41.     private static final int TAG_MASK = 0x03;

  42.     /** Default block size */
  43.     public static final int DEFAULT_BLOCK_SIZE = 32768;

  44.     /** The size of the uncompressed data */
  45.     private final int size;

  46.     /** Number of uncompressed bytes still to be read. */
  47.     private int uncompressedBytesRemaining;

  48.     /** Current state of the stream */
  49.     private State state = State.NO_BLOCK;

  50.     private boolean endReached;

  51.     /**
  52.      * Constructor using the default buffer size of 32k.
  53.      *
  54.      * @param is An InputStream to read compressed data from
  55.      * @throws IOException if reading fails
  56.      */
  57.     public SnappyCompressorInputStream(final InputStream is) throws IOException {
  58.         this(is, DEFAULT_BLOCK_SIZE);
  59.     }

  60.     /**
  61.      * Constructor using a configurable buffer size.
  62.      *
  63.      * @param is        An InputStream to read compressed data from
  64.      * @param blockSize The block size used in compression
  65.      * @throws IOException              if reading fails
  66.      * @throws IllegalArgumentException if blockSize is not bigger than 0
  67.      */
  68.     public SnappyCompressorInputStream(final InputStream is, final int blockSize) throws IOException {
  69.         super(is, blockSize);
  70.         uncompressedBytesRemaining = size = (int) readSize();
  71.     }

  72.     /**
  73.      * Try to fill the buffer with the next block of data.
  74.      */
  75.     private void fill() throws IOException {
  76.         if (uncompressedBytesRemaining == 0) {
  77.             endReached = true;
  78.             return;
  79.         }

  80.         int b = readOneByte();
  81.         if (b == -1) {
  82.             throw new IOException("Premature end of stream reading block start");
  83.         }
  84.         int length = 0;
  85.         int offset = 0;

  86.         switch (b & TAG_MASK) {

  87.         case 0x00:

  88.             length = readLiteralLength(b);
  89.             if (length < 0) {
  90.                 throw new IOException("Illegal block with a negative literal size found");
  91.             }
  92.             uncompressedBytesRemaining -= length;
  93.             startLiteral(length);
  94.             state = State.IN_LITERAL;
  95.             break;

  96.         case 0x01:

  97.             /*
  98.              * These elements can encode lengths between [4..11] bytes and offsets between [0..2047] bytes. (len-4) occupies three bits and is stored in bits
  99.              * [2..4] of the tag byte. The offset occupies 11 bits, of which the upper three are stored in the upper three bits ([5..7]) of the tag byte, and
  100.              * the lower eight are stored in a byte following the tag byte.
  101.              */

  102.             length = 4 + (b >> 2 & 0x07);
  103.             uncompressedBytesRemaining -= length;
  104.             offset = (b & 0xE0) << 3;
  105.             b = readOneByte();
  106.             if (b == -1) {
  107.                 throw new IOException("Premature end of stream reading back-reference length");
  108.             }
  109.             offset |= b;

  110.             try {
  111.                 startBackReference(offset, length);
  112.             } catch (final IllegalArgumentException ex) {
  113.                 throw new IOException("Illegal block with bad offset found", ex);
  114.             }
  115.             state = State.IN_BACK_REFERENCE;
  116.             break;

  117.         case 0x02:

  118.             /*
  119.              * These elements can encode lengths between [1..64] and offsets from [0..65535]. (len-1) occupies six bits and is stored in the upper six bits
  120.              * ([2..7]) of the tag byte. The offset is stored as a little-endian 16-bit integer in the two bytes following the tag byte.
  121.              */

  122.             length = (b >> 2) + 1;
  123.             if (length < 0) {
  124.                 throw new IOException("Illegal block with a negative match length found");
  125.             }
  126.             uncompressedBytesRemaining -= length;

  127.             offset = (int) ByteUtils.fromLittleEndian(supplier, 2);

  128.             try {
  129.                 startBackReference(offset, length);
  130.             } catch (final IllegalArgumentException ex) {
  131.                 throw new IOException("Illegal block with bad offset found", ex);
  132.             }
  133.             state = State.IN_BACK_REFERENCE;
  134.             break;

  135.         case 0x03:

  136.             /*
  137.              * These are like the copies with 2-byte offsets (see previous subsection), except that the offset is stored as a 32-bit integer instead of a 16-bit
  138.              * integer (and thus will occupy four bytes).
  139.              */

  140.             length = (b >> 2) + 1;
  141.             if (length < 0) {
  142.                 throw new IOException("Illegal block with a negative match length found");
  143.             }
  144.             uncompressedBytesRemaining -= length;

  145.             offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff;

  146.             try {
  147.                 startBackReference(offset, length);
  148.             } catch (final IllegalArgumentException ex) {
  149.                 throw new IOException("Illegal block with bad offset found", ex);
  150.             }
  151.             state = State.IN_BACK_REFERENCE;
  152.             break;
  153.         default:
  154.             // impossible as TAG_MASK is two bits and all four possible cases have been covered
  155.             break;
  156.         }
  157.     }

  158.     /**
  159.      * Gets the uncompressed size of the stream
  160.      *
  161.      * @return the uncompressed size
  162.      */
  163.     @Override
  164.     public int getSize() {
  165.         return size;
  166.     }

  167.     /**
  168.      * {@inheritDoc}
  169.      */
  170.     @Override
  171.     public int read(final byte[] b, final int off, final int len) throws IOException {
  172.         if (len == 0) {
  173.             return 0;
  174.         }
  175.         if (endReached) {
  176.             return -1;
  177.         }
  178.         switch (state) {
  179.         case NO_BLOCK:
  180.             fill();
  181.             return read(b, off, len);
  182.         case IN_LITERAL:
  183.             final int litLen = readLiteral(b, off, len);
  184.             if (!hasMoreDataInBlock()) {
  185.                 state = State.NO_BLOCK;
  186.             }
  187.             return litLen > 0 ? litLen : read(b, off, len);
  188.         case IN_BACK_REFERENCE:
  189.             final int backReferenceLen = readBackReference(b, off, len);
  190.             if (!hasMoreDataInBlock()) {
  191.                 state = State.NO_BLOCK;
  192.             }
  193.             return backReferenceLen > 0 ? backReferenceLen : read(b, off, len);
  194.         default:
  195.             throw new IOException("Unknown stream state " + state);
  196.         }
  197.     }

  198.     /*
  199.      * For literals up to and including 60 bytes in length, the upper six bits of the tag byte contain (len-1). The literal follows immediately thereafter in
  200.      * the bytestream. - For longer literals, the (len-1) value is stored after the tag byte, little-endian. The upper six bits of the tag byte describe how
  201.      * many bytes are used for the length; 60, 61, 62 or 63 for 1-4 bytes, respectively. The literal itself follows after the length.
  202.      */
  203.     private int readLiteralLength(final int b) throws IOException {
  204.         final int length;
  205.         switch (b >> 2) {
  206.         case 60:
  207.             length = readOneByte();
  208.             if (length == -1) {
  209.                 throw new IOException("Premature end of stream reading literal length");
  210.             }
  211.             break;
  212.         case 61:
  213.             length = (int) ByteUtils.fromLittleEndian(supplier, 2);
  214.             break;
  215.         case 62:
  216.             length = (int) ByteUtils.fromLittleEndian(supplier, 3);
  217.             break;
  218.         case 63:
  219.             length = (int) ByteUtils.fromLittleEndian(supplier, 4);
  220.             break;
  221.         default:
  222.             length = b >> 2;
  223.             break;
  224.         }

  225.         return length + 1;
  226.     }

  227.     /**
  228.      * The stream starts with the uncompressed length (up to a maximum of 2^32 - 1), stored as a little-endian varint. Varints consist of a series of bytes,
  229.      * where the lower 7 bits are data and the upper bit is set iff there are more bytes to be read. In other words, an uncompressed length of 64 would be
  230.      * stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE) would be stored as 0xFE 0xFF 0x7F.
  231.      *
  232.      * @return The size of the uncompressed data
  233.      * @throws IOException Could not read a byte
  234.      */
  235.     private long readSize() throws IOException {
  236.         int index = 0;
  237.         long sz = 0;
  238.         int b = 0;

  239.         do {
  240.             b = readOneByte();
  241.             if (b == -1) {
  242.                 throw new IOException("Premature end of stream reading size");
  243.             }
  244.             sz |= (b & 0x7f) << index++ * 7;
  245.         } while (0 != (b & 0x80));
  246.         return sz;
  247.     }
  248. }