001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.snappy;
020
021import java.io.IOException;
022import java.io.OutputStream;
023
024import org.apache.commons.compress.compressors.CompressorOutputStream;
025import org.apache.commons.compress.compressors.lz77support.LZ77Compressor;
026import org.apache.commons.compress.compressors.lz77support.Parameters;
027import org.apache.commons.compress.utils.ByteUtils;
028
029/**
030 * CompressorOutputStream for the raw Snappy format.
031 *
032 * <p>
033 * This implementation uses an internal buffer in order to handle the back-references that are at the heart of the LZ77 algorithm. The size of the buffer must
034 * be at least as big as the biggest offset used in the compressed stream. The current version of the Snappy algorithm as defined by Google works on 32k blocks
035 * and doesn't contain offsets bigger than 32k which is the default block size used by this class.
036 * </p>
037 *
038 * <p>
039 * The raw Snappy format requires the uncompressed size to be written at the beginning of the stream using a varint representation, i.e. the number of bytes
040 * needed to write the information is not known before the uncompressed size is known. We've chosen to make the uncompressedSize a parameter of the constructor
041 * in favor of buffering the whole output until the size is known. When using the {@link FramedSnappyCompressorOutputStream} this limitation is taken care of by
042 * the warpping framing format.
043 * </p>
044 *
045 * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a>
046 * @since 1.14
047 * @NotThreadSafe
048 */
049public class SnappyCompressorOutputStream extends CompressorOutputStream {
050    // literal length is stored as (len - 1) either inside the tag
051    // (six bits minus four flags) or in 1 to 4 bytes after the tag
052    private static final int MAX_LITERAL_SIZE_WITHOUT_SIZE_BYTES = 60;
053    private static final int MAX_LITERAL_SIZE_WITH_ONE_SIZE_BYTE = 1 << 8;
054    private static final int MAX_LITERAL_SIZE_WITH_TWO_SIZE_BYTES = 1 << 16;
055
056    private static final int MAX_LITERAL_SIZE_WITH_THREE_SIZE_BYTES = 1 << 24;
057
058    private static final int ONE_SIZE_BYTE_MARKER = 60 << 2;
059
060    private static final int TWO_SIZE_BYTE_MARKER = 61 << 2;
061
062    private static final int THREE_SIZE_BYTE_MARKER = 62 << 2;
063
064    private static final int FOUR_SIZE_BYTE_MARKER = 63 << 2;
065
066    // Back-references ("copies") have their offset/size information
067    // in two, three or five bytes.
068    private static final int MIN_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE = 4;
069
070    private static final int MAX_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE = 11;
071
072    private static final int MAX_OFFSET_WITH_ONE_OFFSET_BYTE = 1 << 11 - 1;
073
074    private static final int MAX_OFFSET_WITH_TWO_OFFSET_BYTES = 1 << 16 - 1;
075
076    private static final int ONE_BYTE_COPY_TAG = 1;
077
078    private static final int TWO_BYTE_COPY_TAG = 2;
079    private static final int FOUR_BYTE_COPY_TAG = 3;
080    // technically the format could use shorter matches but with a
081    // length of three the offset would be encoded as at least two
082    // bytes in addition to the tag, so yield no compression at all
083    private static final int MIN_MATCH_LENGTH = 4;
084    // Snappy stores the match length in six bits of the tag
085    private static final int MAX_MATCH_LENGTH = 64;
086
087    /**
088     * Returns a builder correctly configured for the Snappy algorithm using the gven block size.
089     *
090     * @param blockSize the block size.
091     * @return a builder correctly configured for the Snappy algorithm using the gven block size
092     */
093    public static Parameters.Builder createParameterBuilder(final int blockSize) {
094        // the max offset and max literal length defined by the format
095        // are 2^32 - 1 and 2^32 respectively - with blockSize being
096        // an integer we will never exceed that
097        return Parameters.builder(blockSize).withMinBackReferenceLength(MIN_MATCH_LENGTH).withMaxBackReferenceLength(MAX_MATCH_LENGTH).withMaxOffset(blockSize)
098                .withMaxLiteralLength(blockSize);
099    }
100
101    private final LZ77Compressor compressor;
102    private final OutputStream os;
103    private final ByteUtils.ByteConsumer consumer;
104
105    // used in one-arg write method
106    private final byte[] oneByte = new byte[1];
107
108    private boolean finished;
109
110    /**
111     * Constructor using the default block size of 32k.
112     *
113     * @param os               the outputstream to write compressed data to
114     * @param uncompressedSize the uncompressed size of data
115     * @throws IOException if writing of the size fails
116     */
117    public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize) throws IOException {
118        this(os, uncompressedSize, SnappyCompressorInputStream.DEFAULT_BLOCK_SIZE);
119    }
120
121    /**
122     * Constructor using a configurable block size.
123     *
124     * @param os               the outputstream to write compressed data to
125     * @param uncompressedSize the uncompressed size of data
126     * @param blockSize        the block size used - must be a power of two
127     * @throws IOException if writing of the size fails
128     */
129    public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize, final int blockSize) throws IOException {
130        this(os, uncompressedSize, createParameterBuilder(blockSize).build());
131    }
132
133    /**
134     * Constructor providing full control over the underlying LZ77 compressor.
135     *
136     * @param os               the outputstream to write compressed data to
137     * @param uncompressedSize the uncompressed size of data
138     * @param params           the parameters to use by the compressor - note that the format itself imposes some limits like a maximum match length of 64 bytes
139     * @throws IOException if writing of the size fails
140     */
141    public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize, final Parameters params) throws IOException {
142        this.os = os;
143        consumer = new ByteUtils.OutputStreamByteConsumer(os);
144        compressor = new LZ77Compressor(params, block -> {
145            switch (block.getType()) {
146            case LITERAL:
147                writeLiteralBlock((LZ77Compressor.LiteralBlock) block);
148                break;
149            case BACK_REFERENCE:
150                writeBackReference((LZ77Compressor.BackReference) block);
151                break;
152            case EOD:
153                break;
154            }
155        });
156        writeUncompressedSize(uncompressedSize);
157    }
158
159    @Override
160    public void close() throws IOException {
161        try {
162            finish();
163        } finally {
164            os.close();
165        }
166    }
167
168    /**
169     * Compresses all remaining data and writes it to the stream, doesn't close the underlying stream.
170     *
171     * @throws IOException if an error occurs
172     */
173    public void finish() throws IOException {
174        if (!finished) {
175            compressor.finish();
176            finished = true;
177        }
178    }
179
180    @Override
181    public void write(final byte[] data, final int off, final int len) throws IOException {
182        compressor.compress(data, off, len);
183    }
184
185    @Override
186    public void write(final int b) throws IOException {
187        oneByte[0] = (byte) (b & 0xff);
188        write(oneByte);
189    }
190
191    private void writeBackReference(final LZ77Compressor.BackReference block) throws IOException {
192        final int len = block.getLength();
193        final int offset = block.getOffset();
194        if (len >= MIN_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE && len <= MAX_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE && offset <= MAX_OFFSET_WITH_ONE_OFFSET_BYTE) {
195            writeBackReferenceWithOneOffsetByte(len, offset);
196        } else if (offset < MAX_OFFSET_WITH_TWO_OFFSET_BYTES) {
197            writeBackReferenceWithTwoOffsetBytes(len, offset);
198        } else {
199            writeBackReferenceWithFourOffsetBytes(len, offset);
200        }
201    }
202
203    private void writeBackReferenceWithFourOffsetBytes(final int len, final int offset) throws IOException {
204        writeBackReferenceWithLittleEndianOffset(FOUR_BYTE_COPY_TAG, 4, len, offset);
205    }
206
207    private void writeBackReferenceWithLittleEndianOffset(final int tag, final int offsetBytes, final int len, final int offset) throws IOException {
208        os.write(tag | len - 1 << 2);
209        writeLittleEndian(offsetBytes, offset);
210    }
211
212    private void writeBackReferenceWithOneOffsetByte(final int len, final int offset) throws IOException {
213        os.write(ONE_BYTE_COPY_TAG | len - 4 << 2 | (offset & 0x700) >> 3);
214        os.write(offset & 0xff);
215    }
216
217    private void writeBackReferenceWithTwoOffsetBytes(final int len, final int offset) throws IOException {
218        writeBackReferenceWithLittleEndianOffset(TWO_BYTE_COPY_TAG, 2, len, offset);
219    }
220
221    private void writeLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException {
222        final int len = block.getLength();
223        if (len <= MAX_LITERAL_SIZE_WITHOUT_SIZE_BYTES) {
224            writeLiteralBlockNoSizeBytes(block, len);
225        } else if (len <= MAX_LITERAL_SIZE_WITH_ONE_SIZE_BYTE) {
226            writeLiteralBlockOneSizeByte(block, len);
227        } else if (len <= MAX_LITERAL_SIZE_WITH_TWO_SIZE_BYTES) {
228            writeLiteralBlockTwoSizeBytes(block, len);
229        } else if (len <= MAX_LITERAL_SIZE_WITH_THREE_SIZE_BYTES) {
230            writeLiteralBlockThreeSizeBytes(block, len);
231        } else {
232            writeLiteralBlockFourSizeBytes(block, len);
233        }
234    }
235
236    private void writeLiteralBlockFourSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException {
237        writeLiteralBlockWithSize(FOUR_SIZE_BYTE_MARKER, 4, len, block);
238    }
239
240    private void writeLiteralBlockNoSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException {
241        writeLiteralBlockWithSize(len - 1 << 2, 0, len, block);
242    }
243
244    private void writeLiteralBlockOneSizeByte(final LZ77Compressor.LiteralBlock block, final int len) throws IOException {
245        writeLiteralBlockWithSize(ONE_SIZE_BYTE_MARKER, 1, len, block);
246    }
247
248    private void writeLiteralBlockThreeSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException {
249        writeLiteralBlockWithSize(THREE_SIZE_BYTE_MARKER, 3, len, block);
250    }
251
252    private void writeLiteralBlockTwoSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException {
253        writeLiteralBlockWithSize(TWO_SIZE_BYTE_MARKER, 2, len, block);
254    }
255
256    private void writeLiteralBlockWithSize(final int tagByte, final int sizeBytes, final int len, final LZ77Compressor.LiteralBlock block) throws IOException {
257        os.write(tagByte);
258        writeLittleEndian(sizeBytes, len - 1);
259        os.write(block.getData(), block.getOffset(), len);
260    }
261
262    private void writeLittleEndian(final int numBytes, final int num) throws IOException {
263        ByteUtils.toLittleEndian(consumer, num, numBytes);
264    }
265
266    private void writeUncompressedSize(long uncompressedSize) throws IOException {
267        boolean more;
268        do {
269            int currentByte = (int) (uncompressedSize & 0x7F);
270            more = uncompressedSize > currentByte;
271            if (more) {
272                currentByte |= 0x80;
273            }
274            os.write(currentByte);
275            uncompressedSize >>= 7;
276        } while (more);
277    }
278}