SnappyCompressorInputStream.java
- /*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
- package org.apache.commons.compress.compressors.snappy;
- import java.io.IOException;
- import java.io.InputStream;
- import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream;
- import org.apache.commons.compress.utils.ByteUtils;
- /**
- * CompressorInputStream for the raw Snappy format.
- *
- * <p>
- * This implementation uses an internal buffer in order to handle the back-references that are at the heart of the LZ77 algorithm. The size of the buffer must
- * be at least as big as the biggest offset used in the compressed stream. The current version of the Snappy algorithm as defined by Google works on 32k blocks
- * and doesn't contain offsets bigger than 32k which is the default block size used by this class.
- * </p>
- *
- * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a>
- * @since 1.7
- */
- public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream {
- private enum State {
- NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE
- }
- /** Mask used to determine the type of "tag" is being processed */
- private static final int TAG_MASK = 0x03;
- /** Default block size */
- public static final int DEFAULT_BLOCK_SIZE = 32768;
- /** The size of the uncompressed data */
- private final int size;
- /** Number of uncompressed bytes still to be read. */
- private int uncompressedBytesRemaining;
- /** Current state of the stream */
- private State state = State.NO_BLOCK;
- private boolean endReached;
- /**
- * Constructor using the default buffer size of 32k.
- *
- * @param is An InputStream to read compressed data from
- *
- * @throws IOException if reading fails
- */
- public SnappyCompressorInputStream(final InputStream is) throws IOException {
- this(is, DEFAULT_BLOCK_SIZE);
- }
- /**
- * Constructor using a configurable buffer size.
- *
- * @param is An InputStream to read compressed data from
- * @param blockSize The block size used in compression
- *
- * @throws IOException if reading fails
- * @throws IllegalArgumentException if blockSize is not bigger than 0
- */
- public SnappyCompressorInputStream(final InputStream is, final int blockSize) throws IOException {
- super(is, blockSize);
- uncompressedBytesRemaining = size = (int) readSize();
- }
- /**
- * Try to fill the buffer with the next block of data.
- */
- private void fill() throws IOException {
- if (uncompressedBytesRemaining == 0) {
- endReached = true;
- return;
- }
- int b = readOneByte();
- if (b == -1) {
- throw new IOException("Premature end of stream reading block start");
- }
- int length = 0;
- int offset = 0;
- switch (b & TAG_MASK) {
- case 0x00:
- length = readLiteralLength(b);
- if (length < 0) {
- throw new IOException("Illegal block with a negative literal size found");
- }
- uncompressedBytesRemaining -= length;
- startLiteral(length);
- state = State.IN_LITERAL;
- break;
- case 0x01:
- /*
- * These elements can encode lengths between [4..11] bytes and offsets between [0..2047] bytes. (len-4) occupies three bits and is stored in bits
- * [2..4] of the tag byte. The offset occupies 11 bits, of which the upper three are stored in the upper three bits ([5..7]) of the tag byte, and
- * the lower eight are stored in a byte following the tag byte.
- */
- length = 4 + (b >> 2 & 0x07);
- uncompressedBytesRemaining -= length;
- offset = (b & 0xE0) << 3;
- b = readOneByte();
- if (b == -1) {
- throw new IOException("Premature end of stream reading back-reference length");
- }
- offset |= b;
- try {
- startBackReference(offset, length);
- } catch (final IllegalArgumentException ex) {
- throw new IOException("Illegal block with bad offset found", ex);
- }
- state = State.IN_BACK_REFERENCE;
- break;
- case 0x02:
- /*
- * These elements can encode lengths between [1..64] and offsets from [0..65535]. (len-1) occupies six bits and is stored in the upper six bits
- * ([2..7]) of the tag byte. The offset is stored as a little-endian 16-bit integer in the two bytes following the tag byte.
- */
- length = (b >> 2) + 1;
- if (length < 0) {
- throw new IOException("Illegal block with a negative match length found");
- }
- uncompressedBytesRemaining -= length;
- offset = (int) ByteUtils.fromLittleEndian(supplier, 2);
- try {
- startBackReference(offset, length);
- } catch (final IllegalArgumentException ex) {
- throw new IOException("Illegal block with bad offset found", ex);
- }
- state = State.IN_BACK_REFERENCE;
- break;
- case 0x03:
- /*
- * These are like the copies with 2-byte offsets (see previous subsection), except that the offset is stored as a 32-bit integer instead of a 16-bit
- * integer (and thus will occupy four bytes).
- */
- length = (b >> 2) + 1;
- if (length < 0) {
- throw new IOException("Illegal block with a negative match length found");
- }
- uncompressedBytesRemaining -= length;
- offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff;
- try {
- startBackReference(offset, length);
- } catch (final IllegalArgumentException ex) {
- throw new IOException("Illegal block with bad offset found", ex);
- }
- state = State.IN_BACK_REFERENCE;
- break;
- default:
- // impossible as TAG_MASK is two bits and all four possible cases have been covered
- break;
- }
- }
- /**
- * Gets the uncompressed size of the stream
- *
- * @return the uncompressed size
- */
- @Override
- public int getSize() {
- return size;
- }
- /**
- * {@inheritDoc}
- */
- @Override
- public int read(final byte[] b, final int off, final int len) throws IOException {
- if (len == 0) {
- return 0;
- }
- if (endReached) {
- return -1;
- }
- switch (state) {
- case NO_BLOCK:
- fill();
- return read(b, off, len);
- case IN_LITERAL:
- final int litLen = readLiteral(b, off, len);
- if (!hasMoreDataInBlock()) {
- state = State.NO_BLOCK;
- }
- return litLen > 0 ? litLen : read(b, off, len);
- case IN_BACK_REFERENCE:
- final int backReferenceLen = readBackReference(b, off, len);
- if (!hasMoreDataInBlock()) {
- state = State.NO_BLOCK;
- }
- return backReferenceLen > 0 ? backReferenceLen : read(b, off, len);
- default:
- throw new IOException("Unknown stream state " + state);
- }
- }
- /*
- * For literals up to and including 60 bytes in length, the upper six bits of the tag byte contain (len-1). The literal follows immediately thereafter in
- * the bytestream. - For longer literals, the (len-1) value is stored after the tag byte, little-endian. The upper six bits of the tag byte describe how
- * many bytes are used for the length; 60, 61, 62 or 63 for 1-4 bytes, respectively. The literal itself follows after the length.
- */
- private int readLiteralLength(final int b) throws IOException {
- final int length;
- switch (b >> 2) {
- case 60:
- length = readOneByte();
- if (length == -1) {
- throw new IOException("Premature end of stream reading literal length");
- }
- break;
- case 61:
- length = (int) ByteUtils.fromLittleEndian(supplier, 2);
- break;
- case 62:
- length = (int) ByteUtils.fromLittleEndian(supplier, 3);
- break;
- case 63:
- length = (int) ByteUtils.fromLittleEndian(supplier, 4);
- break;
- default:
- length = b >> 2;
- break;
- }
- return length + 1;
- }
- /**
- * The stream starts with the uncompressed length (up to a maximum of 2^32 - 1), stored as a little-endian varint. Varints consist of a series of bytes,
- * where the lower 7 bits are data and the upper bit is set iff there are more bytes to be read. In other words, an uncompressed length of 64 would be
- * stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE) would be stored as 0xFE 0xFF 0x7F.
- *
- * @return The size of the uncompressed data
- *
- * @throws IOException Could not read a byte
- */
- private long readSize() throws IOException {
- int index = 0;
- long sz = 0;
- int b = 0;
- do {
- b = readOneByte();
- if (b == -1) {
- throw new IOException("Premature end of stream reading size");
- }
- sz |= (b & 0x7f) << index++ * 7;
- } while (0 != (b & 0x80));
- return sz;
- }
- }