1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * https://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19 package org.apache.commons.compress.compressors.snappy;
20
21 import java.io.IOException;
22 import java.io.InputStream;
23
24 import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream;
25 import org.apache.commons.compress.utils.ByteUtils;
26
27 /**
28 * CompressorInputStream for the raw Snappy format.
29 *
30 * <p>
31 * This implementation uses an internal buffer in order to handle the back-references that are at the heart of the LZ77 algorithm. The size of the buffer must
32 * be at least as big as the biggest offset used in the compressed stream. The current version of the Snappy algorithm as defined by Google works on 32k blocks
33 * and doesn't contain offsets bigger than 32k which is the default block size used by this class.
34 * </p>
35 *
36 * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a>
37 * @since 1.7
38 */
39 public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream {
40
41 private enum State {
42 NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE
43 }
44
45 /** Mask used to determine the type of "tag" is being processed */
46 private static final int TAG_MASK = 0x03;
47
48 /** Default block size */
49 public static final int DEFAULT_BLOCK_SIZE = 32768;
50
51 /** The size of the uncompressed data */
52 private final int size;
53
54 /** Number of uncompressed bytes still to be read. */
55 private int uncompressedBytesRemaining;
56
57 /** Current state of the stream */
58 private State state = State.NO_BLOCK;
59
60 private boolean endReached;
61
62 /**
63 * Constructor using the default buffer size of 32k.
64 *
65 * @param is An InputStream to read compressed data from
66 * @throws IOException if reading fails
67 */
68 public SnappyCompressorInputStream(final InputStream is) throws IOException {
69 this(is, DEFAULT_BLOCK_SIZE);
70 }
71
72 /**
73 * Constructor using a configurable buffer size.
74 *
75 * @param is An InputStream to read compressed data from
76 * @param blockSize The block size used in compression
77 * @throws IOException if reading fails
78 * @throws IllegalArgumentException if blockSize is not bigger than 0
79 */
80 public SnappyCompressorInputStream(final InputStream is, final int blockSize) throws IOException {
81 super(is, blockSize);
82 uncompressedBytesRemaining = size = (int) readSize();
83 }
84
85 /**
86 * Try to fill the buffer with the next block of data.
87 */
88 private void fill() throws IOException {
89 if (uncompressedBytesRemaining == 0) {
90 endReached = true;
91 return;
92 }
93
94 int b = readOneByte();
95 if (b == -1) {
96 throw new IOException("Premature end of stream reading block start");
97 }
98 int length = 0;
99 int offset = 0;
100
101 switch (b & TAG_MASK) {
102
103 case 0x00:
104
105 length = readLiteralLength(b);
106 if (length < 0) {
107 throw new IOException("Illegal block with a negative literal size found");
108 }
109 uncompressedBytesRemaining -= length;
110 startLiteral(length);
111 state = State.IN_LITERAL;
112 break;
113
114 case 0x01:
115
116 /*
117 * These elements can encode lengths between [4..11] bytes and offsets between [0..2047] bytes. (len-4) occupies three bits and is stored in bits
118 * [2..4] of the tag byte. The offset occupies 11 bits, of which the upper three are stored in the upper three bits ([5..7]) of the tag byte, and
119 * the lower eight are stored in a byte following the tag byte.
120 */
121
122 length = 4 + (b >> 2 & 0x07);
123 uncompressedBytesRemaining -= length;
124 offset = (b & 0xE0) << 3;
125 b = readOneByte();
126 if (b == -1) {
127 throw new IOException("Premature end of stream reading back-reference length");
128 }
129 offset |= b;
130
131 try {
132 startBackReference(offset, length);
133 } catch (final IllegalArgumentException ex) {
134 throw new IOException("Illegal block with bad offset found", ex);
135 }
136 state = State.IN_BACK_REFERENCE;
137 break;
138
139 case 0x02:
140
141 /*
142 * These elements can encode lengths between [1..64] and offsets from [0..65535]. (len-1) occupies six bits and is stored in the upper six bits
143 * ([2..7]) of the tag byte. The offset is stored as a little-endian 16-bit integer in the two bytes following the tag byte.
144 */
145
146 length = (b >> 2) + 1;
147 if (length < 0) {
148 throw new IOException("Illegal block with a negative match length found");
149 }
150 uncompressedBytesRemaining -= length;
151
152 offset = (int) ByteUtils.fromLittleEndian(supplier, 2);
153
154 try {
155 startBackReference(offset, length);
156 } catch (final IllegalArgumentException ex) {
157 throw new IOException("Illegal block with bad offset found", ex);
158 }
159 state = State.IN_BACK_REFERENCE;
160 break;
161
162 case 0x03:
163
164 /*
165 * These are like the copies with 2-byte offsets (see previous subsection), except that the offset is stored as a 32-bit integer instead of a 16-bit
166 * integer (and thus will occupy four bytes).
167 */
168
169 length = (b >> 2) + 1;
170 if (length < 0) {
171 throw new IOException("Illegal block with a negative match length found");
172 }
173 uncompressedBytesRemaining -= length;
174
175 offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff;
176
177 try {
178 startBackReference(offset, length);
179 } catch (final IllegalArgumentException ex) {
180 throw new IOException("Illegal block with bad offset found", ex);
181 }
182 state = State.IN_BACK_REFERENCE;
183 break;
184 default:
185 // impossible as TAG_MASK is two bits and all four possible cases have been covered
186 break;
187 }
188 }
189
190 /**
191 * Gets the uncompressed size of the stream
192 *
193 * @return the uncompressed size
194 */
195 @Override
196 public int getSize() {
197 return size;
198 }
199
200 /**
201 * {@inheritDoc}
202 */
203 @Override
204 public int read(final byte[] b, final int off, final int len) throws IOException {
205 if (len == 0) {
206 return 0;
207 }
208 if (endReached) {
209 return -1;
210 }
211 switch (state) {
212 case NO_BLOCK:
213 fill();
214 return read(b, off, len);
215 case IN_LITERAL:
216 final int litLen = readLiteral(b, off, len);
217 if (!hasMoreDataInBlock()) {
218 state = State.NO_BLOCK;
219 }
220 return litLen > 0 ? litLen : read(b, off, len);
221 case IN_BACK_REFERENCE:
222 final int backReferenceLen = readBackReference(b, off, len);
223 if (!hasMoreDataInBlock()) {
224 state = State.NO_BLOCK;
225 }
226 return backReferenceLen > 0 ? backReferenceLen : read(b, off, len);
227 default:
228 throw new IOException("Unknown stream state " + state);
229 }
230 }
231
232 /*
233 * For literals up to and including 60 bytes in length, the upper six bits of the tag byte contain (len-1). The literal follows immediately thereafter in
234 * the bytestream. - For longer literals, the (len-1) value is stored after the tag byte, little-endian. The upper six bits of the tag byte describe how
235 * many bytes are used for the length; 60, 61, 62 or 63 for 1-4 bytes, respectively. The literal itself follows after the length.
236 */
237 private int readLiteralLength(final int b) throws IOException {
238 final int length;
239 switch (b >> 2) {
240 case 60:
241 length = readOneByte();
242 if (length == -1) {
243 throw new IOException("Premature end of stream reading literal length");
244 }
245 break;
246 case 61:
247 length = (int) ByteUtils.fromLittleEndian(supplier, 2);
248 break;
249 case 62:
250 length = (int) ByteUtils.fromLittleEndian(supplier, 3);
251 break;
252 case 63:
253 length = (int) ByteUtils.fromLittleEndian(supplier, 4);
254 break;
255 default:
256 length = b >> 2;
257 break;
258 }
259
260 return length + 1;
261 }
262
263 /**
264 * The stream starts with the uncompressed length (up to a maximum of 2^32 - 1), stored as a little-endian varint. Varints consist of a series of bytes,
265 * where the lower 7 bits are data and the upper bit is set iff there are more bytes to be read. In other words, an uncompressed length of 64 would be
266 * stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE) would be stored as 0xFE 0xFF 0x7F.
267 *
268 * @return The size of the uncompressed data
269 * @throws IOException Could not read a byte
270 */
271 private long readSize() throws IOException {
272 int index = 0;
273 long sz = 0;
274 int b = 0;
275
276 do {
277 b = readOneByte();
278 if (b == -1) {
279 throw new IOException("Premature end of stream reading size");
280 }
281 sz |= (b & 0x7f) << index++ * 7;
282 } while (0 != (b & 0x80));
283 return sz;
284 }
285 }