001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.snappy;
020
021import java.io.IOException;
022import java.io.InputStream;
023
024import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream;
025import org.apache.commons.compress.utils.ByteUtils;
026
027/**
028 * CompressorInputStream for the raw Snappy format.
029 *
030 * <p>This implementation uses an internal buffer in order to handle
031 * the back-references that are at the heart of the LZ77 algorithm.
032 * The size of the buffer must be at least as big as the biggest
033 * offset used in the compressed stream.  The current version of the
034 * Snappy algorithm as defined by Google works on 32k blocks and
035 * doesn't contain offsets bigger than 32k which is the default block
036 * size used by this class.</p>
037 *
038 * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a>
039 * @since 1.7
040 */
041public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream {
042
043    /** Mask used to determine the type of "tag" is being processed */
044    private static final int TAG_MASK = 0x03;
045
046    /** Default block size */
047    public static final int DEFAULT_BLOCK_SIZE = 32768;
048
049    /** The size of the uncompressed data */
050    private final int size;
051
052    /** Number of uncompressed bytes still to be read. */
053    private int uncompressedBytesRemaining;
054
055    /** Current state of the stream */
056    private State state = State.NO_BLOCK;
057
058    private boolean endReached = false;
059
060    /**
061     * Constructor using the default buffer size of 32k.
062     *
063     * @param is
064     *            An InputStream to read compressed data from
065     *
066     * @throws IOException if reading fails
067     */
068    public SnappyCompressorInputStream(final InputStream is) throws IOException {
069        this(is, DEFAULT_BLOCK_SIZE);
070    }
071
072    /**
073     * Constructor using a configurable buffer size.
074     *
075     * @param is
076     *            An InputStream to read compressed data from
077     * @param blockSize
078     *            The block size used in compression
079     *
080     * @throws IOException if reading fails
081     * @throws IllegalArgumentException if blockSize is not bigger than 0
082     */
083    public SnappyCompressorInputStream(final InputStream is, final int blockSize)
084            throws IOException {
085        super(is, blockSize);
086        uncompressedBytesRemaining = size = (int) readSize();
087    }
088
089    /**
090     * {@inheritDoc}
091     */
092    @Override
093    public int read(final byte[] b, final int off, final int len) throws IOException {
094        if (endReached) {
095            return -1;
096        }
097        switch (state) {
098        case NO_BLOCK:
099            fill();
100            return read(b, off, len);
101        case IN_LITERAL:
102            int litLen = readLiteral(b, off, len);
103            if (!hasMoreDataInBlock()) {
104                state = State.NO_BLOCK;
105            }
106            return litLen > 0 ? litLen : read(b, off, len);
107        case IN_BACK_REFERENCE:
108            int backReferenceLen = readBackReference(b, off, len);
109            if (!hasMoreDataInBlock()) {
110                state = State.NO_BLOCK;
111            }
112            return backReferenceLen > 0 ? backReferenceLen : read(b, off, len);
113        default:
114            throw new IOException("Unknown stream state " + state);
115        }
116    }
117
118    /**
119     * Try to fill the buffer with the next block of data.
120     */
121    private void fill() throws IOException {
122        if (uncompressedBytesRemaining == 0) {
123            endReached = true;
124            return;
125        }
126
127        int b = readOneByte();
128        if (b == -1) {
129            throw new IOException("Premature end of stream reading block start");
130        }
131        int length = 0;
132        int offset = 0;
133
134        switch (b & TAG_MASK) {
135
136        case 0x00:
137
138            length = readLiteralLength(b);
139            if (length < 0) {
140                throw new IOException("Illegal block with a negative literal size found");
141            }
142            uncompressedBytesRemaining -= length;
143            startLiteral(length);
144            state = State.IN_LITERAL;
145            break;
146
147        case 0x01:
148
149            /*
150             * These elements can encode lengths between [4..11] bytes and
151             * offsets between [0..2047] bytes. (len-4) occupies three bits
152             * and is stored in bits [2..4] of the tag byte. The offset
153             * occupies 11 bits, of which the upper three are stored in the
154             * upper three bits ([5..7]) of the tag byte, and the lower
155             * eight are stored in a byte following the tag byte.
156             */
157
158            length = 4 + ((b >> 2) & 0x07);
159            if (length < 0) {
160                throw new IOException("Illegal block with a negative match length found");
161            }
162            uncompressedBytesRemaining -= length;
163            offset = (b & 0xE0) << 3;
164            b = readOneByte();
165            if (b == -1) {
166                throw new IOException("Premature end of stream reading back-reference length");
167            }
168            offset |= b;
169
170            try {
171                startBackReference(offset, length);
172            } catch (IllegalArgumentException ex) {
173                throw new IOException("Illegal block with bad offset found", ex);
174            }
175            state = State.IN_BACK_REFERENCE;
176            break;
177
178        case 0x02:
179
180            /*
181             * These elements can encode lengths between [1..64] and offsets
182             * from [0..65535]. (len-1) occupies six bits and is stored in
183             * the upper six bits ([2..7]) of the tag byte. The offset is
184             * stored as a little-endian 16-bit integer in the two bytes
185             * following the tag byte.
186             */
187
188            length = (b >> 2) + 1;
189            if (length < 0) {
190                throw new IOException("Illegal block with a negative match length found");
191            }
192            uncompressedBytesRemaining -= length;
193
194            offset = (int) ByteUtils.fromLittleEndian(supplier, 2);
195
196            try {
197                startBackReference(offset, length);
198            } catch (IllegalArgumentException ex) {
199                throw new IOException("Illegal block with bad offset found", ex);
200            }
201            state = State.IN_BACK_REFERENCE;
202            break;
203
204        case 0x03:
205
206            /*
207             * These are like the copies with 2-byte offsets (see previous
208             * subsection), except that the offset is stored as a 32-bit
209             * integer instead of a 16-bit integer (and thus will occupy
210             * four bytes).
211             */
212
213            length = (b >> 2) + 1;
214            if (length < 0) {
215                throw new IOException("Illegal block with a negative match length found");
216            }
217            uncompressedBytesRemaining -= length;
218
219            offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff;
220
221            try {
222                startBackReference(offset, length);
223            } catch (IllegalArgumentException ex) {
224                throw new IOException("Illegal block with bad offset found", ex);
225            }
226            state = State.IN_BACK_REFERENCE;
227            break;
228        default:
229            // impossible as TAG_MASK is two bits and all four possible cases have been covered
230            break;
231        }
232    }
233
234    /*
235     * For literals up to and including 60 bytes in length, the
236     * upper six bits of the tag byte contain (len-1). The literal
237     * follows immediately thereafter in the bytestream. - For
238     * longer literals, the (len-1) value is stored after the tag
239     * byte, little-endian. The upper six bits of the tag byte
240     * describe how many bytes are used for the length; 60, 61, 62
241     * or 63 for 1-4 bytes, respectively. The literal itself follows
242     * after the length.
243     */
244    private int readLiteralLength(final int b) throws IOException {
245        int length;
246        switch (b >> 2) {
247        case 60:
248            length = readOneByte();
249            if (length == -1) {
250                throw new IOException("Premature end of stream reading literal length");
251            }
252            break;
253        case 61:
254            length = (int) ByteUtils.fromLittleEndian(supplier, 2);
255            break;
256        case 62:
257            length = (int) ByteUtils.fromLittleEndian(supplier, 3);
258            break;
259        case 63:
260            length = (int) ByteUtils.fromLittleEndian(supplier, 4);
261            break;
262        default:
263            length = b >> 2;
264            break;
265        }
266
267        return length + 1;
268    }
269
270    /**
271     * The stream starts with the uncompressed length (up to a maximum of 2^32 -
272     * 1), stored as a little-endian varint. Varints consist of a series of
273     * bytes, where the lower 7 bits are data and the upper bit is set iff there
274     * are more bytes to be read. In other words, an uncompressed length of 64
275     * would be stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE)
276     * would be stored as 0xFE 0xFF 0x7F.
277     *
278     * @return The size of the uncompressed data
279     *
280     * @throws IOException
281     *             Could not read a byte
282     */
283    private long readSize() throws IOException {
284        int index = 0;
285        long sz = 0;
286        int b = 0;
287
288        do {
289            b = readOneByte();
290            if (b == -1) {
291                throw new IOException("Premature end of stream reading size");
292            }
293            sz |= (b & 0x7f) << (index++ * 7);
294        } while (0 != (b & 0x80));
295        return sz;
296    }
297
298    /**
299     * Get the uncompressed size of the stream
300     *
301     * @return the uncompressed size
302     */
303    @Override
304    public int getSize() {
305        return size;
306    }
307
308    private enum State {
309        NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE
310    }
311}