Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *   https://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.snappy;
020
021import java.io.IOException;
022import java.io.InputStream;
023
024import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream;
025import org.apache.commons.compress.utils.ByteUtils;
026
027/**
028 * CompressorInputStream for the raw Snappy format.
029 *
030 * <p>
031 * This implementation uses an internal buffer in order to handle the back-references that are at the heart of the LZ77 algorithm. The size of the buffer must
032 * be at least as big as the biggest offset used in the compressed stream. The current version of the Snappy algorithm as defined by Google works on 32k blocks
033 * and doesn't contain offsets bigger than 32k which is the default block size used by this class.
034 * </p>
035 *
036 * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a>
037 * @since 1.7
038 */
039public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream {
040
041    private enum State {
042        NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE
043    }
044
045    /** Mask used to determine the type of "tag" is being processed */
046    private static final int TAG_MASK = 0x03;
047
048    /** Default block size */
049    public static final int DEFAULT_BLOCK_SIZE = 32768;
050
051    /** The size of the uncompressed data */
052    private final int size;
053
054    /** Number of uncompressed bytes still to be read. */
055    private int uncompressedBytesRemaining;
056
057    /** Current state of the stream */
058    private State state = State.NO_BLOCK;
059
060    private boolean endReached;
061
062    /**
063     * Constructor using the default buffer size of 32k.
064     *
065     * @param is An InputStream to read compressed data from
066     * @throws IOException if reading fails
067     */
068    public SnappyCompressorInputStream(final InputStream is) throws IOException {
069        this(is, DEFAULT_BLOCK_SIZE);
070    }
071
072    /**
073     * Constructor using a configurable buffer size.
074     *
075     * @param is        An InputStream to read compressed data from
076     * @param blockSize The block size used in compression
077     * @throws IOException              if reading fails
078     * @throws IllegalArgumentException if blockSize is not bigger than 0
079     */
080    public SnappyCompressorInputStream(final InputStream is, final int blockSize) throws IOException {
081        super(is, blockSize);
082        uncompressedBytesRemaining = size = (int) readSize();
083    }
084
085    /**
086     * Try to fill the buffer with the next block of data.
087     */
088    private void fill() throws IOException {
089        if (uncompressedBytesRemaining == 0) {
090            endReached = true;
091            return;
092        }
093
094        int b = readOneByte();
095        if (b == -1) {
096            throw new IOException("Premature end of stream reading block start");
097        }
098        int length = 0;
099        int offset = 0;
100
101        switch (b & TAG_MASK) {
102
103        case 0x00:
104
105            length = readLiteralLength(b);
106            if (length < 0) {
107                throw new IOException("Illegal block with a negative literal size found");
108            }
109            uncompressedBytesRemaining -= length;
110            startLiteral(length);
111            state = State.IN_LITERAL;
112            break;
113
114        case 0x01:
115
116            /*
117             * These elements can encode lengths between [4..11] bytes and offsets between [0..2047] bytes. (len-4) occupies three bits and is stored in bits
118             * [2..4] of the tag byte. The offset occupies 11 bits, of which the upper three are stored in the upper three bits ([5..7]) of the tag byte, and
119             * the lower eight are stored in a byte following the tag byte.
120             */
121
122            length = 4 + (b >> 2 & 0x07);
123            uncompressedBytesRemaining -= length;
124            offset = (b & 0xE0) << 3;
125            b = readOneByte();
126            if (b == -1) {
127                throw new IOException("Premature end of stream reading back-reference length");
128            }
129            offset |= b;
130
131            try {
132                startBackReference(offset, length);
133            } catch (final IllegalArgumentException ex) {
134                throw new IOException("Illegal block with bad offset found", ex);
135            }
136            state = State.IN_BACK_REFERENCE;
137            break;
138
139        case 0x02:
140
141            /*
142             * These elements can encode lengths between [1..64] and offsets from [0..65535]. (len-1) occupies six bits and is stored in the upper six bits
143             * ([2..7]) of the tag byte. The offset is stored as a little-endian 16-bit integer in the two bytes following the tag byte.
144             */
145
146            length = (b >> 2) + 1;
147            if (length < 0) {
148                throw new IOException("Illegal block with a negative match length found");
149            }
150            uncompressedBytesRemaining -= length;
151
152            offset = (int) ByteUtils.fromLittleEndian(supplier, 2);
153
154            try {
155                startBackReference(offset, length);
156            } catch (final IllegalArgumentException ex) {
157                throw new IOException("Illegal block with bad offset found", ex);
158            }
159            state = State.IN_BACK_REFERENCE;
160            break;
161
162        case 0x03:
163
164            /*
165             * These are like the copies with 2-byte offsets (see previous subsection), except that the offset is stored as a 32-bit integer instead of a 16-bit
166             * integer (and thus will occupy four bytes).
167             */
168
169            length = (b >> 2) + 1;
170            if (length < 0) {
171                throw new IOException("Illegal block with a negative match length found");
172            }
173            uncompressedBytesRemaining -= length;
174
175            offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff;
176
177            try {
178                startBackReference(offset, length);
179            } catch (final IllegalArgumentException ex) {
180                throw new IOException("Illegal block with bad offset found", ex);
181            }
182            state = State.IN_BACK_REFERENCE;
183            break;
184        default:
185            // impossible as TAG_MASK is two bits and all four possible cases have been covered
186            break;
187        }
188    }
189
190    /**
191     * Gets the uncompressed size of the stream
192     *
193     * @return the uncompressed size
194     */
195    @Override
196    public int getSize() {
197        return size;
198    }
199
200    /**
201     * {@inheritDoc}
202     */
203    @Override
204    public int read(final byte[] b, final int off, final int len) throws IOException {
205        if (len == 0) {
206            return 0;
207        }
208        if (endReached) {
209            return -1;
210        }
211        switch (state) {
212        case NO_BLOCK:
213            fill();
214            return read(b, off, len);
215        case IN_LITERAL:
216            final int litLen = readLiteral(b, off, len);
217            if (!hasMoreDataInBlock()) {
218                state = State.NO_BLOCK;
219            }
220            return litLen > 0 ? litLen : read(b, off, len);
221        case IN_BACK_REFERENCE:
222            final int backReferenceLen = readBackReference(b, off, len);
223            if (!hasMoreDataInBlock()) {
224                state = State.NO_BLOCK;
225            }
226            return backReferenceLen > 0 ? backReferenceLen : read(b, off, len);
227        default:
228            throw new IOException("Unknown stream state " + state);
229        }
230    }
231
232    /*
233     * For literals up to and including 60 bytes in length, the upper six bits of the tag byte contain (len-1). The literal follows immediately thereafter in
234     * the bytestream. - For longer literals, the (len-1) value is stored after the tag byte, little-endian. The upper six bits of the tag byte describe how
235     * many bytes are used for the length; 60, 61, 62 or 63 for 1-4 bytes, respectively. The literal itself follows after the length.
236     */
237    private int readLiteralLength(final int b) throws IOException {
238        final int length;
239        switch (b >> 2) {
240        case 60:
241            length = readOneByte();
242            if (length == -1) {
243                throw new IOException("Premature end of stream reading literal length");
244            }
245            break;
246        case 61:
247            length = (int) ByteUtils.fromLittleEndian(supplier, 2);
248            break;
249        case 62:
250            length = (int) ByteUtils.fromLittleEndian(supplier, 3);
251            break;
252        case 63:
253            length = (int) ByteUtils.fromLittleEndian(supplier, 4);
254            break;
255        default:
256            length = b >> 2;
257            break;
258        }
259
260        return length + 1;
261    }
262
263    /**
264     * The stream starts with the uncompressed length (up to a maximum of 2^32 - 1), stored as a little-endian varint. Varints consist of a series of bytes,
265     * where the lower 7 bits are data and the upper bit is set iff there are more bytes to be read. In other words, an uncompressed length of 64 would be
266     * stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE) would be stored as 0xFE 0xFF 0x7F.
267     *
268     * @return The size of the uncompressed data
269     * @throws IOException Could not read a byte
270     */
271    private long readSize() throws IOException {
272        int index = 0;
273        long sz = 0;
274        int b = 0;
275
276        do {
277            b = readOneByte();
278            if (b == -1) {
279                throw new IOException("Premature end of stream reading size");
280            }
281            sz |= (b & 0x7f) << index++ * 7;
282        } while (0 != (b & 0x80));
283        return sz;
284    }
285}