001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.snappy;
020
021import java.io.IOException;
022import java.io.InputStream;
023
024import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream;
025import org.apache.commons.compress.utils.ByteUtils;
026
027/**
028 * CompressorInputStream for the raw Snappy format.
029 *
030 * <p>
031 * This implementation uses an internal buffer in order to handle the back-references that are at the heart of the LZ77 algorithm. The size of the buffer must
032 * be at least as big as the biggest offset used in the compressed stream. The current version of the Snappy algorithm as defined by Google works on 32k blocks
033 * and doesn't contain offsets bigger than 32k which is the default block size used by this class.
034 * </p>
035 *
036 * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a>
037 * @since 1.7
038 */
039public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream {
040
041    private enum State {
042        NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE
043    }
044
045    /** Mask used to determine the type of "tag" is being processed */
046    private static final int TAG_MASK = 0x03;
047
048    /** Default block size */
049    public static final int DEFAULT_BLOCK_SIZE = 32768;
050
051    /** The size of the uncompressed data */
052    private final int size;
053
054    /** Number of uncompressed bytes still to be read. */
055    private int uncompressedBytesRemaining;
056
057    /** Current state of the stream */
058    private State state = State.NO_BLOCK;
059
060    private boolean endReached;
061
062    /**
063     * Constructor using the default buffer size of 32k.
064     *
065     * @param is An InputStream to read compressed data from
066     *
067     * @throws IOException if reading fails
068     */
069    public SnappyCompressorInputStream(final InputStream is) throws IOException {
070        this(is, DEFAULT_BLOCK_SIZE);
071    }
072
073    /**
074     * Constructor using a configurable buffer size.
075     *
076     * @param is        An InputStream to read compressed data from
077     * @param blockSize The block size used in compression
078     *
079     * @throws IOException              if reading fails
080     * @throws IllegalArgumentException if blockSize is not bigger than 0
081     */
082    public SnappyCompressorInputStream(final InputStream is, final int blockSize) throws IOException {
083        super(is, blockSize);
084        uncompressedBytesRemaining = size = (int) readSize();
085    }
086
087    /**
088     * Try to fill the buffer with the next block of data.
089     */
090    private void fill() throws IOException {
091        if (uncompressedBytesRemaining == 0) {
092            endReached = true;
093            return;
094        }
095
096        int b = readOneByte();
097        if (b == -1) {
098            throw new IOException("Premature end of stream reading block start");
099        }
100        int length = 0;
101        int offset = 0;
102
103        switch (b & TAG_MASK) {
104
105        case 0x00:
106
107            length = readLiteralLength(b);
108            if (length < 0) {
109                throw new IOException("Illegal block with a negative literal size found");
110            }
111            uncompressedBytesRemaining -= length;
112            startLiteral(length);
113            state = State.IN_LITERAL;
114            break;
115
116        case 0x01:
117
118            /*
119             * These elements can encode lengths between [4..11] bytes and offsets between [0..2047] bytes. (len-4) occupies three bits and is stored in bits
120             * [2..4] of the tag byte. The offset occupies 11 bits, of which the upper three are stored in the upper three bits ([5..7]) of the tag byte, and
121             * the lower eight are stored in a byte following the tag byte.
122             */
123
124            length = 4 + (b >> 2 & 0x07);
125            uncompressedBytesRemaining -= length;
126            offset = (b & 0xE0) << 3;
127            b = readOneByte();
128            if (b == -1) {
129                throw new IOException("Premature end of stream reading back-reference length");
130            }
131            offset |= b;
132
133            try {
134                startBackReference(offset, length);
135            } catch (final IllegalArgumentException ex) {
136                throw new IOException("Illegal block with bad offset found", ex);
137            }
138            state = State.IN_BACK_REFERENCE;
139            break;
140
141        case 0x02:
142
143            /*
144             * These elements can encode lengths between [1..64] and offsets from [0..65535]. (len-1) occupies six bits and is stored in the upper six bits
145             * ([2..7]) of the tag byte. The offset is stored as a little-endian 16-bit integer in the two bytes following the tag byte.
146             */
147
148            length = (b >> 2) + 1;
149            if (length < 0) {
150                throw new IOException("Illegal block with a negative match length found");
151            }
152            uncompressedBytesRemaining -= length;
153
154            offset = (int) ByteUtils.fromLittleEndian(supplier, 2);
155
156            try {
157                startBackReference(offset, length);
158            } catch (final IllegalArgumentException ex) {
159                throw new IOException("Illegal block with bad offset found", ex);
160            }
161            state = State.IN_BACK_REFERENCE;
162            break;
163
164        case 0x03:
165
166            /*
167             * These are like the copies with 2-byte offsets (see previous subsection), except that the offset is stored as a 32-bit integer instead of a 16-bit
168             * integer (and thus will occupy four bytes).
169             */
170
171            length = (b >> 2) + 1;
172            if (length < 0) {
173                throw new IOException("Illegal block with a negative match length found");
174            }
175            uncompressedBytesRemaining -= length;
176
177            offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff;
178
179            try {
180                startBackReference(offset, length);
181            } catch (final IllegalArgumentException ex) {
182                throw new IOException("Illegal block with bad offset found", ex);
183            }
184            state = State.IN_BACK_REFERENCE;
185            break;
186        default:
187            // impossible as TAG_MASK is two bits and all four possible cases have been covered
188            break;
189        }
190    }
191
192    /**
193     * Gets the uncompressed size of the stream
194     *
195     * @return the uncompressed size
196     */
197    @Override
198    public int getSize() {
199        return size;
200    }
201
202    /**
203     * {@inheritDoc}
204     */
205    @Override
206    public int read(final byte[] b, final int off, final int len) throws IOException {
207        if (len == 0) {
208            return 0;
209        }
210        if (endReached) {
211            return -1;
212        }
213        switch (state) {
214        case NO_BLOCK:
215            fill();
216            return read(b, off, len);
217        case IN_LITERAL:
218            final int litLen = readLiteral(b, off, len);
219            if (!hasMoreDataInBlock()) {
220                state = State.NO_BLOCK;
221            }
222            return litLen > 0 ? litLen : read(b, off, len);
223        case IN_BACK_REFERENCE:
224            final int backReferenceLen = readBackReference(b, off, len);
225            if (!hasMoreDataInBlock()) {
226                state = State.NO_BLOCK;
227            }
228            return backReferenceLen > 0 ? backReferenceLen : read(b, off, len);
229        default:
230            throw new IOException("Unknown stream state " + state);
231        }
232    }
233
234    /*
235     * For literals up to and including 60 bytes in length, the upper six bits of the tag byte contain (len-1). The literal follows immediately thereafter in
236     * the bytestream. - For longer literals, the (len-1) value is stored after the tag byte, little-endian. The upper six bits of the tag byte describe how
237     * many bytes are used for the length; 60, 61, 62 or 63 for 1-4 bytes, respectively. The literal itself follows after the length.
238     */
239    private int readLiteralLength(final int b) throws IOException {
240        final int length;
241        switch (b >> 2) {
242        case 60:
243            length = readOneByte();
244            if (length == -1) {
245                throw new IOException("Premature end of stream reading literal length");
246            }
247            break;
248        case 61:
249            length = (int) ByteUtils.fromLittleEndian(supplier, 2);
250            break;
251        case 62:
252            length = (int) ByteUtils.fromLittleEndian(supplier, 3);
253            break;
254        case 63:
255            length = (int) ByteUtils.fromLittleEndian(supplier, 4);
256            break;
257        default:
258            length = b >> 2;
259            break;
260        }
261
262        return length + 1;
263    }
264
265    /**
266     * The stream starts with the uncompressed length (up to a maximum of 2^32 - 1), stored as a little-endian varint. Varints consist of a series of bytes,
267     * where the lower 7 bits are data and the upper bit is set iff there are more bytes to be read. In other words, an uncompressed length of 64 would be
268     * stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE) would be stored as 0xFE 0xFF 0x7F.
269     *
270     * @return The size of the uncompressed data
271     *
272     * @throws IOException Could not read a byte
273     */
274    private long readSize() throws IOException {
275        int index = 0;
276        long sz = 0;
277        int b = 0;
278
279        do {
280            b = readOneByte();
281            if (b == -1) {
282                throw new IOException("Premature end of stream reading size");
283            }
284            sz |= (b & 0x7f) << index++ * 7;
285        } while (0 != (b & 0x80));
286        return sz;
287    }
288}