View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   * http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.commons.compress.compressors.snappy;
20  
21  import java.io.IOException;
22  import java.io.InputStream;
23  
24  import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream;
25  import org.apache.commons.compress.utils.ByteUtils;
26  
27  /**
28   * CompressorInputStream for the raw Snappy format.
29   *
30   * <p>
31   * This implementation uses an internal buffer in order to handle the back-references that are at the heart of the LZ77 algorithm. The size of the buffer must
32   * be at least as big as the biggest offset used in the compressed stream. The current version of the Snappy algorithm as defined by Google works on 32k blocks
33   * and doesn't contain offsets bigger than 32k which is the default block size used by this class.
34   * </p>
35   *
36   * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a>
37   * @since 1.7
38   */
39  public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream {
40  
41      private enum State {
42          NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE
43      }
44  
45      /** Mask used to determine the type of "tag" is being processed */
46      private static final int TAG_MASK = 0x03;
47  
48      /** Default block size */
49      public static final int DEFAULT_BLOCK_SIZE = 32768;
50  
51      /** The size of the uncompressed data */
52      private final int size;
53  
54      /** Number of uncompressed bytes still to be read. */
55      private int uncompressedBytesRemaining;
56  
57      /** Current state of the stream */
58      private State state = State.NO_BLOCK;
59  
60      private boolean endReached;
61  
62      /**
63       * Constructor using the default buffer size of 32k.
64       *
65       * @param is An InputStream to read compressed data from
66       *
67       * @throws IOException if reading fails
68       */
69      public SnappyCompressorInputStream(final InputStream is) throws IOException {
70          this(is, DEFAULT_BLOCK_SIZE);
71      }
72  
73      /**
74       * Constructor using a configurable buffer size.
75       *
76       * @param is        An InputStream to read compressed data from
77       * @param blockSize The block size used in compression
78       *
79       * @throws IOException              if reading fails
80       * @throws IllegalArgumentException if blockSize is not bigger than 0
81       */
82      public SnappyCompressorInputStream(final InputStream is, final int blockSize) throws IOException {
83          super(is, blockSize);
84          uncompressedBytesRemaining = size = (int) readSize();
85      }
86  
87      /**
88       * Try to fill the buffer with the next block of data.
89       */
90      private void fill() throws IOException {
91          if (uncompressedBytesRemaining == 0) {
92              endReached = true;
93              return;
94          }
95  
96          int b = readOneByte();
97          if (b == -1) {
98              throw new IOException("Premature end of stream reading block start");
99          }
100         int length = 0;
101         int offset = 0;
102 
103         switch (b & TAG_MASK) {
104 
105         case 0x00:
106 
107             length = readLiteralLength(b);
108             if (length < 0) {
109                 throw new IOException("Illegal block with a negative literal size found");
110             }
111             uncompressedBytesRemaining -= length;
112             startLiteral(length);
113             state = State.IN_LITERAL;
114             break;
115 
116         case 0x01:
117 
118             /*
119              * These elements can encode lengths between [4..11] bytes and offsets between [0..2047] bytes. (len-4) occupies three bits and is stored in bits
120              * [2..4] of the tag byte. The offset occupies 11 bits, of which the upper three are stored in the upper three bits ([5..7]) of the tag byte, and
121              * the lower eight are stored in a byte following the tag byte.
122              */
123 
124             length = 4 + (b >> 2 & 0x07);
125             uncompressedBytesRemaining -= length;
126             offset = (b & 0xE0) << 3;
127             b = readOneByte();
128             if (b == -1) {
129                 throw new IOException("Premature end of stream reading back-reference length");
130             }
131             offset |= b;
132 
133             try {
134                 startBackReference(offset, length);
135             } catch (final IllegalArgumentException ex) {
136                 throw new IOException("Illegal block with bad offset found", ex);
137             }
138             state = State.IN_BACK_REFERENCE;
139             break;
140 
141         case 0x02:
142 
143             /*
144              * These elements can encode lengths between [1..64] and offsets from [0..65535]. (len-1) occupies six bits and is stored in the upper six bits
145              * ([2..7]) of the tag byte. The offset is stored as a little-endian 16-bit integer in the two bytes following the tag byte.
146              */
147 
148             length = (b >> 2) + 1;
149             if (length < 0) {
150                 throw new IOException("Illegal block with a negative match length found");
151             }
152             uncompressedBytesRemaining -= length;
153 
154             offset = (int) ByteUtils.fromLittleEndian(supplier, 2);
155 
156             try {
157                 startBackReference(offset, length);
158             } catch (final IllegalArgumentException ex) {
159                 throw new IOException("Illegal block with bad offset found", ex);
160             }
161             state = State.IN_BACK_REFERENCE;
162             break;
163 
164         case 0x03:
165 
166             /*
167              * These are like the copies with 2-byte offsets (see previous subsection), except that the offset is stored as a 32-bit integer instead of a 16-bit
168              * integer (and thus will occupy four bytes).
169              */
170 
171             length = (b >> 2) + 1;
172             if (length < 0) {
173                 throw new IOException("Illegal block with a negative match length found");
174             }
175             uncompressedBytesRemaining -= length;
176 
177             offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff;
178 
179             try {
180                 startBackReference(offset, length);
181             } catch (final IllegalArgumentException ex) {
182                 throw new IOException("Illegal block with bad offset found", ex);
183             }
184             state = State.IN_BACK_REFERENCE;
185             break;
186         default:
187             // impossible as TAG_MASK is two bits and all four possible cases have been covered
188             break;
189         }
190     }
191 
192     /**
193      * Gets the uncompressed size of the stream
194      *
195      * @return the uncompressed size
196      */
197     @Override
198     public int getSize() {
199         return size;
200     }
201 
202     /**
203      * {@inheritDoc}
204      */
205     @Override
206     public int read(final byte[] b, final int off, final int len) throws IOException {
207         if (len == 0) {
208             return 0;
209         }
210         if (endReached) {
211             return -1;
212         }
213         switch (state) {
214         case NO_BLOCK:
215             fill();
216             return read(b, off, len);
217         case IN_LITERAL:
218             final int litLen = readLiteral(b, off, len);
219             if (!hasMoreDataInBlock()) {
220                 state = State.NO_BLOCK;
221             }
222             return litLen > 0 ? litLen : read(b, off, len);
223         case IN_BACK_REFERENCE:
224             final int backReferenceLen = readBackReference(b, off, len);
225             if (!hasMoreDataInBlock()) {
226                 state = State.NO_BLOCK;
227             }
228             return backReferenceLen > 0 ? backReferenceLen : read(b, off, len);
229         default:
230             throw new IOException("Unknown stream state " + state);
231         }
232     }
233 
234     /*
235      * For literals up to and including 60 bytes in length, the upper six bits of the tag byte contain (len-1). The literal follows immediately thereafter in
236      * the bytestream. - For longer literals, the (len-1) value is stored after the tag byte, little-endian. The upper six bits of the tag byte describe how
237      * many bytes are used for the length; 60, 61, 62 or 63 for 1-4 bytes, respectively. The literal itself follows after the length.
238      */
239     private int readLiteralLength(final int b) throws IOException {
240         final int length;
241         switch (b >> 2) {
242         case 60:
243             length = readOneByte();
244             if (length == -1) {
245                 throw new IOException("Premature end of stream reading literal length");
246             }
247             break;
248         case 61:
249             length = (int) ByteUtils.fromLittleEndian(supplier, 2);
250             break;
251         case 62:
252             length = (int) ByteUtils.fromLittleEndian(supplier, 3);
253             break;
254         case 63:
255             length = (int) ByteUtils.fromLittleEndian(supplier, 4);
256             break;
257         default:
258             length = b >> 2;
259             break;
260         }
261 
262         return length + 1;
263     }
264 
265     /**
266      * The stream starts with the uncompressed length (up to a maximum of 2^32 - 1), stored as a little-endian varint. Varints consist of a series of bytes,
267      * where the lower 7 bits are data and the upper bit is set iff there are more bytes to be read. In other words, an uncompressed length of 64 would be
268      * stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE) would be stored as 0xFE 0xFF 0x7F.
269      *
270      * @return The size of the uncompressed data
271      *
272      * @throws IOException Could not read a byte
273      */
274     private long readSize() throws IOException {
275         int index = 0;
276         long sz = 0;
277         int b = 0;
278 
279         do {
280             b = readOneByte();
281             if (b == -1) {
282                 throw new IOException("Premature end of stream reading size");
283             }
284             sz |= (b & 0x7f) << index++ * 7;
285         } while (0 != (b & 0x80));
286         return sz;
287     }
288 }