View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   https://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.commons.compress.compressors.snappy;
20  
21  import java.io.IOException;
22  import java.io.InputStream;
23  
24  import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream;
25  import org.apache.commons.compress.utils.ByteUtils;
26  
27  /**
28   * CompressorInputStream for the raw Snappy format.
29   *
30   * <p>
31   * This implementation uses an internal buffer in order to handle the back-references that are at the heart of the LZ77 algorithm. The size of the buffer must
32   * be at least as big as the biggest offset used in the compressed stream. The current version of the Snappy algorithm as defined by Google works on 32k blocks
33   * and doesn't contain offsets bigger than 32k which is the default block size used by this class.
34   * </p>
35   *
36   * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a>
37   * @since 1.7
38   */
39  public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream {
40  
41      private enum State {
42          NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE
43      }
44  
45      /** Mask used to determine the type of "tag" is being processed */
46      private static final int TAG_MASK = 0x03;
47  
48      /** Default block size */
49      public static final int DEFAULT_BLOCK_SIZE = 32768;
50  
51      /** The size of the uncompressed data */
52      private final int size;
53  
54      /** Number of uncompressed bytes still to be read. */
55      private int uncompressedBytesRemaining;
56  
57      /** Current state of the stream */
58      private State state = State.NO_BLOCK;
59  
60      private boolean endReached;
61  
62      /**
63       * Constructor using the default buffer size of 32k.
64       *
65       * @param is An InputStream to read compressed data from
66       * @throws IOException if reading fails
67       */
68      public SnappyCompressorInputStream(final InputStream is) throws IOException {
69          this(is, DEFAULT_BLOCK_SIZE);
70      }
71  
72      /**
73       * Constructor using a configurable buffer size.
74       *
75       * @param is        An InputStream to read compressed data from
76       * @param blockSize The block size used in compression
77       * @throws IOException              if reading fails
78       * @throws IllegalArgumentException if blockSize is not bigger than 0
79       */
80      public SnappyCompressorInputStream(final InputStream is, final int blockSize) throws IOException {
81          super(is, blockSize);
82          uncompressedBytesRemaining = size = (int) readSize();
83      }
84  
85      /**
86       * Try to fill the buffer with the next block of data.
87       */
88      private void fill() throws IOException {
89          if (uncompressedBytesRemaining == 0) {
90              endReached = true;
91              return;
92          }
93  
94          int b = readOneByte();
95          if (b == -1) {
96              throw new IOException("Premature end of stream reading block start");
97          }
98          int length = 0;
99          int offset = 0;
100 
101         switch (b & TAG_MASK) {
102 
103         case 0x00:
104 
105             length = readLiteralLength(b);
106             if (length < 0) {
107                 throw new IOException("Illegal block with a negative literal size found");
108             }
109             uncompressedBytesRemaining -= length;
110             startLiteral(length);
111             state = State.IN_LITERAL;
112             break;
113 
114         case 0x01:
115 
116             /*
117              * These elements can encode lengths between [4..11] bytes and offsets between [0..2047] bytes. (len-4) occupies three bits and is stored in bits
118              * [2..4] of the tag byte. The offset occupies 11 bits, of which the upper three are stored in the upper three bits ([5..7]) of the tag byte, and
119              * the lower eight are stored in a byte following the tag byte.
120              */
121 
122             length = 4 + (b >> 2 & 0x07);
123             uncompressedBytesRemaining -= length;
124             offset = (b & 0xE0) << 3;
125             b = readOneByte();
126             if (b == -1) {
127                 throw new IOException("Premature end of stream reading back-reference length");
128             }
129             offset |= b;
130 
131             try {
132                 startBackReference(offset, length);
133             } catch (final IllegalArgumentException ex) {
134                 throw new IOException("Illegal block with bad offset found", ex);
135             }
136             state = State.IN_BACK_REFERENCE;
137             break;
138 
139         case 0x02:
140 
141             /*
142              * These elements can encode lengths between [1..64] and offsets from [0..65535]. (len-1) occupies six bits and is stored in the upper six bits
143              * ([2..7]) of the tag byte. The offset is stored as a little-endian 16-bit integer in the two bytes following the tag byte.
144              */
145 
146             length = (b >> 2) + 1;
147             if (length < 0) {
148                 throw new IOException("Illegal block with a negative match length found");
149             }
150             uncompressedBytesRemaining -= length;
151 
152             offset = (int) ByteUtils.fromLittleEndian(supplier, 2);
153 
154             try {
155                 startBackReference(offset, length);
156             } catch (final IllegalArgumentException ex) {
157                 throw new IOException("Illegal block with bad offset found", ex);
158             }
159             state = State.IN_BACK_REFERENCE;
160             break;
161 
162         case 0x03:
163 
164             /*
165              * These are like the copies with 2-byte offsets (see previous subsection), except that the offset is stored as a 32-bit integer instead of a 16-bit
166              * integer (and thus will occupy four bytes).
167              */
168 
169             length = (b >> 2) + 1;
170             if (length < 0) {
171                 throw new IOException("Illegal block with a negative match length found");
172             }
173             uncompressedBytesRemaining -= length;
174 
175             offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff;
176 
177             try {
178                 startBackReference(offset, length);
179             } catch (final IllegalArgumentException ex) {
180                 throw new IOException("Illegal block with bad offset found", ex);
181             }
182             state = State.IN_BACK_REFERENCE;
183             break;
184         default:
185             // impossible as TAG_MASK is two bits and all four possible cases have been covered
186             break;
187         }
188     }
189 
190     /**
191      * Gets the uncompressed size of the stream
192      *
193      * @return the uncompressed size
194      */
195     @Override
196     public int getSize() {
197         return size;
198     }
199 
200     /**
201      * {@inheritDoc}
202      */
203     @Override
204     public int read(final byte[] b, final int off, final int len) throws IOException {
205         if (len == 0) {
206             return 0;
207         }
208         if (endReached) {
209             return -1;
210         }
211         switch (state) {
212         case NO_BLOCK:
213             fill();
214             return read(b, off, len);
215         case IN_LITERAL:
216             final int litLen = readLiteral(b, off, len);
217             if (!hasMoreDataInBlock()) {
218                 state = State.NO_BLOCK;
219             }
220             return litLen > 0 ? litLen : read(b, off, len);
221         case IN_BACK_REFERENCE:
222             final int backReferenceLen = readBackReference(b, off, len);
223             if (!hasMoreDataInBlock()) {
224                 state = State.NO_BLOCK;
225             }
226             return backReferenceLen > 0 ? backReferenceLen : read(b, off, len);
227         default:
228             throw new IOException("Unknown stream state " + state);
229         }
230     }
231 
232     /*
233      * For literals up to and including 60 bytes in length, the upper six bits of the tag byte contain (len-1). The literal follows immediately thereafter in
234      * the bytestream. - For longer literals, the (len-1) value is stored after the tag byte, little-endian. The upper six bits of the tag byte describe how
235      * many bytes are used for the length; 60, 61, 62 or 63 for 1-4 bytes, respectively. The literal itself follows after the length.
236      */
237     private int readLiteralLength(final int b) throws IOException {
238         final int length;
239         switch (b >> 2) {
240         case 60:
241             length = readOneByte();
242             if (length == -1) {
243                 throw new IOException("Premature end of stream reading literal length");
244             }
245             break;
246         case 61:
247             length = (int) ByteUtils.fromLittleEndian(supplier, 2);
248             break;
249         case 62:
250             length = (int) ByteUtils.fromLittleEndian(supplier, 3);
251             break;
252         case 63:
253             length = (int) ByteUtils.fromLittleEndian(supplier, 4);
254             break;
255         default:
256             length = b >> 2;
257             break;
258         }
259 
260         return length + 1;
261     }
262 
263     /**
264      * The stream starts with the uncompressed length (up to a maximum of 2^32 - 1), stored as a little-endian varint. Varints consist of a series of bytes,
265      * where the lower 7 bits are data and the upper bit is set iff there are more bytes to be read. In other words, an uncompressed length of 64 would be
266      * stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE) would be stored as 0xFE 0xFF 0x7F.
267      *
268      * @return The size of the uncompressed data
269      * @throws IOException Could not read a byte
270      */
271     private long readSize() throws IOException {
272         int index = 0;
273         long sz = 0;
274         int b = 0;
275 
276         do {
277             b = readOneByte();
278             if (b == -1) {
279                 throw new IOException("Premature end of stream reading size");
280             }
281             sz |= (b & 0x7f) << index++ * 7;
282         } while (0 != (b & 0x80));
283         return sz;
284     }
285 }