View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   * http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.commons.compress.compressors.snappy;
20  
21  import java.io.IOException;
22  import java.io.OutputStream;
23  
24  import org.apache.commons.compress.compressors.CompressorOutputStream;
25  import org.apache.commons.compress.compressors.lz77support.LZ77Compressor;
26  import org.apache.commons.compress.compressors.lz77support.Parameters;
27  import org.apache.commons.compress.utils.ByteUtils;
28  
29  /**
30   * CompressorOutputStream for the raw Snappy format.
31   *
32   * <p>
33   * This implementation uses an internal buffer in order to handle the back-references that are at the heart of the LZ77 algorithm. The size of the buffer must
34   * be at least as big as the biggest offset used in the compressed stream. The current version of the Snappy algorithm as defined by Google works on 32k blocks
35   * and doesn't contain offsets bigger than 32k which is the default block size used by this class.
36   * </p>
37   *
38   * <p>
39   * The raw Snappy format requires the uncompressed size to be written at the beginning of the stream using a varint representation, i.e. the number of bytes
40   * needed to write the information is not known before the uncompressed size is known. We've chosen to make the uncompressedSize a parameter of the constructor
41   * in favor of buffering the whole output until the size is known. When using the {@link FramedSnappyCompressorOutputStream} this limitation is taken care of by
42   * the warpping framing format.
43   * </p>
44   *
45   * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a>
46   * @since 1.14
47   * @NotThreadSafe
48   */
49  public class SnappyCompressorOutputStream extends CompressorOutputStream {
50      // literal length is stored as (len - 1) either inside the tag
51      // (six bits minus four flags) or in 1 to 4 bytes after the tag
52      private static final int MAX_LITERAL_SIZE_WITHOUT_SIZE_BYTES = 60;
53      private static final int MAX_LITERAL_SIZE_WITH_ONE_SIZE_BYTE = 1 << 8;
54      private static final int MAX_LITERAL_SIZE_WITH_TWO_SIZE_BYTES = 1 << 16;
55  
56      private static final int MAX_LITERAL_SIZE_WITH_THREE_SIZE_BYTES = 1 << 24;
57  
58      private static final int ONE_SIZE_BYTE_MARKER = 60 << 2;
59  
60      private static final int TWO_SIZE_BYTE_MARKER = 61 << 2;
61  
62      private static final int THREE_SIZE_BYTE_MARKER = 62 << 2;
63  
64      private static final int FOUR_SIZE_BYTE_MARKER = 63 << 2;
65  
66      // Back-references ("copies") have their offset/size information
67      // in two, three or five bytes.
68      private static final int MIN_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE = 4;
69  
70      private static final int MAX_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE = 11;
71  
72      private static final int MAX_OFFSET_WITH_ONE_OFFSET_BYTE = 1 << 11 - 1;
73  
74      private static final int MAX_OFFSET_WITH_TWO_OFFSET_BYTES = 1 << 16 - 1;
75  
76      private static final int ONE_BYTE_COPY_TAG = 1;
77  
78      private static final int TWO_BYTE_COPY_TAG = 2;
79      private static final int FOUR_BYTE_COPY_TAG = 3;
80      // technically the format could use shorter matches but with a
81      // length of three the offset would be encoded as at least two
82      // bytes in addition to the tag, so yield no compression at all
83      private static final int MIN_MATCH_LENGTH = 4;
84      // Snappy stores the match length in six bits of the tag
85      private static final int MAX_MATCH_LENGTH = 64;
86  
87      /**
88       * Returns a builder correctly configured for the Snappy algorithm using the gven block size.
89       *
90       * @param blockSize the block size.
91       * @return a builder correctly configured for the Snappy algorithm using the gven block size
92       */
93      public static Parameters.Builder createParameterBuilder(final int blockSize) {
94          // the max offset and max literal length defined by the format
95          // are 2^32 - 1 and 2^32 respectively - with blockSize being
96          // an integer we will never exceed that
97          return Parameters.builder(blockSize).withMinBackReferenceLength(MIN_MATCH_LENGTH).withMaxBackReferenceLength(MAX_MATCH_LENGTH).withMaxOffset(blockSize)
98                  .withMaxLiteralLength(blockSize);
99      }
100 
101     private final LZ77Compressor compressor;
102     private final OutputStream os;
103     private final ByteUtils.ByteConsumer consumer;
104 
105     // used in one-arg write method
106     private final byte[] oneByte = new byte[1];
107 
108     private boolean finished;
109 
110     /**
111      * Constructor using the default block size of 32k.
112      *
113      * @param os               the outputstream to write compressed data to
114      * @param uncompressedSize the uncompressed size of data
115      * @throws IOException if writing of the size fails
116      */
117     public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize) throws IOException {
118         this(os, uncompressedSize, SnappyCompressorInputStream.DEFAULT_BLOCK_SIZE);
119     }
120 
121     /**
122      * Constructor using a configurable block size.
123      *
124      * @param os               the outputstream to write compressed data to
125      * @param uncompressedSize the uncompressed size of data
126      * @param blockSize        the block size used - must be a power of two
127      * @throws IOException if writing of the size fails
128      */
129     public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize, final int blockSize) throws IOException {
130         this(os, uncompressedSize, createParameterBuilder(blockSize).build());
131     }
132 
133     /**
134      * Constructor providing full control over the underlying LZ77 compressor.
135      *
136      * @param os               the outputstream to write compressed data to
137      * @param uncompressedSize the uncompressed size of data
138      * @param params           the parameters to use by the compressor - note that the format itself imposes some limits like a maximum match length of 64 bytes
139      * @throws IOException if writing of the size fails
140      */
141     public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize, final Parameters params) throws IOException {
142         this.os = os;
143         consumer = new ByteUtils.OutputStreamByteConsumer(os);
144         compressor = new LZ77Compressor(params, block -> {
145             switch (block.getType()) {
146             case LITERAL:
147                 writeLiteralBlock((LZ77Compressor.LiteralBlock) block);
148                 break;
149             case BACK_REFERENCE:
150                 writeBackReference((LZ77Compressor.BackReference) block);
151                 break;
152             case EOD:
153                 break;
154             }
155         });
156         writeUncompressedSize(uncompressedSize);
157     }
158 
159     @Override
160     public void close() throws IOException {
161         try {
162             finish();
163         } finally {
164             os.close();
165         }
166     }
167 
168     /**
169      * Compresses all remaining data and writes it to the stream, doesn't close the underlying stream.
170      *
171      * @throws IOException if an error occurs
172      */
173     public void finish() throws IOException {
174         if (!finished) {
175             compressor.finish();
176             finished = true;
177         }
178     }
179 
180     @Override
181     public void write(final byte[] data, final int off, final int len) throws IOException {
182         compressor.compress(data, off, len);
183     }
184 
185     @Override
186     public void write(final int b) throws IOException {
187         oneByte[0] = (byte) (b & 0xff);
188         write(oneByte);
189     }
190 
191     private void writeBackReference(final LZ77Compressor.BackReference block) throws IOException {
192         final int len = block.getLength();
193         final int offset = block.getOffset();
194         if (len >= MIN_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE && len <= MAX_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE && offset <= MAX_OFFSET_WITH_ONE_OFFSET_BYTE) {
195             writeBackReferenceWithOneOffsetByte(len, offset);
196         } else if (offset < MAX_OFFSET_WITH_TWO_OFFSET_BYTES) {
197             writeBackReferenceWithTwoOffsetBytes(len, offset);
198         } else {
199             writeBackReferenceWithFourOffsetBytes(len, offset);
200         }
201     }
202 
203     private void writeBackReferenceWithFourOffsetBytes(final int len, final int offset) throws IOException {
204         writeBackReferenceWithLittleEndianOffset(FOUR_BYTE_COPY_TAG, 4, len, offset);
205     }
206 
207     private void writeBackReferenceWithLittleEndianOffset(final int tag, final int offsetBytes, final int len, final int offset) throws IOException {
208         os.write(tag | len - 1 << 2);
209         writeLittleEndian(offsetBytes, offset);
210     }
211 
212     private void writeBackReferenceWithOneOffsetByte(final int len, final int offset) throws IOException {
213         os.write(ONE_BYTE_COPY_TAG | len - 4 << 2 | (offset & 0x700) >> 3);
214         os.write(offset & 0xff);
215     }
216 
217     private void writeBackReferenceWithTwoOffsetBytes(final int len, final int offset) throws IOException {
218         writeBackReferenceWithLittleEndianOffset(TWO_BYTE_COPY_TAG, 2, len, offset);
219     }
220 
221     private void writeLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException {
222         final int len = block.getLength();
223         if (len <= MAX_LITERAL_SIZE_WITHOUT_SIZE_BYTES) {
224             writeLiteralBlockNoSizeBytes(block, len);
225         } else if (len <= MAX_LITERAL_SIZE_WITH_ONE_SIZE_BYTE) {
226             writeLiteralBlockOneSizeByte(block, len);
227         } else if (len <= MAX_LITERAL_SIZE_WITH_TWO_SIZE_BYTES) {
228             writeLiteralBlockTwoSizeBytes(block, len);
229         } else if (len <= MAX_LITERAL_SIZE_WITH_THREE_SIZE_BYTES) {
230             writeLiteralBlockThreeSizeBytes(block, len);
231         } else {
232             writeLiteralBlockFourSizeBytes(block, len);
233         }
234     }
235 
236     private void writeLiteralBlockFourSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException {
237         writeLiteralBlockWithSize(FOUR_SIZE_BYTE_MARKER, 4, len, block);
238     }
239 
240     private void writeLiteralBlockNoSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException {
241         writeLiteralBlockWithSize(len - 1 << 2, 0, len, block);
242     }
243 
244     private void writeLiteralBlockOneSizeByte(final LZ77Compressor.LiteralBlock block, final int len) throws IOException {
245         writeLiteralBlockWithSize(ONE_SIZE_BYTE_MARKER, 1, len, block);
246     }
247 
248     private void writeLiteralBlockThreeSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException {
249         writeLiteralBlockWithSize(THREE_SIZE_BYTE_MARKER, 3, len, block);
250     }
251 
252     private void writeLiteralBlockTwoSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException {
253         writeLiteralBlockWithSize(TWO_SIZE_BYTE_MARKER, 2, len, block);
254     }
255 
256     private void writeLiteralBlockWithSize(final int tagByte, final int sizeBytes, final int len, final LZ77Compressor.LiteralBlock block) throws IOException {
257         os.write(tagByte);
258         writeLittleEndian(sizeBytes, len - 1);
259         os.write(block.getData(), block.getOffset(), len);
260     }
261 
262     private void writeLittleEndian(final int numBytes, final int num) throws IOException {
263         ByteUtils.toLittleEndian(consumer, num, numBytes);
264     }
265 
266     private void writeUncompressedSize(long uncompressedSize) throws IOException {
267         boolean more;
268         do {
269             int currentByte = (int) (uncompressedSize & 0x7F);
270             more = uncompressedSize > currentByte;
271             if (more) {
272                 currentByte |= 0x80;
273             }
274             os.write(currentByte);
275             uncompressedSize >>= 7;
276         } while (more);
277     }
278 }