SnappyCompressorOutputStream.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one
  3.  * or more contributor license agreements.  See the NOTICE file
  4.  * distributed with this work for additional information
  5.  * regarding copyright ownership.  The ASF licenses this file
  6.  * to you under the Apache License, Version 2.0 (the
  7.  * "License"); you may not use this file except in compliance
  8.  * with the License.  You may obtain a copy of the License at
  9.  *
  10.  * http://www.apache.org/licenses/LICENSE-2.0
  11.  *
  12.  * Unless required by applicable law or agreed to in writing,
  13.  * software distributed under the License is distributed on an
  14.  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  15.  * KIND, either express or implied.  See the License for the
  16.  * specific language governing permissions and limitations
  17.  * under the License.
  18.  */
  19. package org.apache.commons.compress.compressors.snappy;

  20. import java.io.IOException;
  21. import java.io.OutputStream;

  22. import org.apache.commons.compress.compressors.CompressorOutputStream;
  23. import org.apache.commons.compress.compressors.lz77support.LZ77Compressor;
  24. import org.apache.commons.compress.compressors.lz77support.Parameters;
  25. import org.apache.commons.compress.utils.ByteUtils;

  26. /**
  27.  * CompressorOutputStream for the raw Snappy format.
  28.  *
  29.  * <p>
  30.  * This implementation uses an internal buffer in order to handle the back-references that are at the heart of the LZ77 algorithm. The size of the buffer must
  31.  * be at least as big as the biggest offset used in the compressed stream. The current version of the Snappy algorithm as defined by Google works on 32k blocks
  32.  * and doesn't contain offsets bigger than 32k which is the default block size used by this class.
  33.  * </p>
  34.  *
  35.  * <p>
  36.  * The raw Snappy format requires the uncompressed size to be written at the beginning of the stream using a varint representation, i.e. the number of bytes
  37.  * needed to write the information is not known before the uncompressed size is known. We've chosen to make the uncompressedSize a parameter of the constructor
  38.  * in favor of buffering the whole output until the size is known. When using the {@link FramedSnappyCompressorOutputStream} this limitation is taken care of by
  39.  * the warpping framing format.
  40.  * </p>
  41.  *
  42.  * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a>
  43.  * @since 1.14
  44.  * @NotThreadSafe
  45.  */
  46. public class SnappyCompressorOutputStream extends CompressorOutputStream<OutputStream> {
  47.     // literal length is stored as (len - 1) either inside the tag
  48.     // (six bits minus four flags) or in 1 to 4 bytes after the tag
  49.     private static final int MAX_LITERAL_SIZE_WITHOUT_SIZE_BYTES = 60;
  50.     private static final int MAX_LITERAL_SIZE_WITH_ONE_SIZE_BYTE = 1 << 8;
  51.     private static final int MAX_LITERAL_SIZE_WITH_TWO_SIZE_BYTES = 1 << 16;

  52.     private static final int MAX_LITERAL_SIZE_WITH_THREE_SIZE_BYTES = 1 << 24;

  53.     private static final int ONE_SIZE_BYTE_MARKER = 60 << 2;

  54.     private static final int TWO_SIZE_BYTE_MARKER = 61 << 2;

  55.     private static final int THREE_SIZE_BYTE_MARKER = 62 << 2;

  56.     private static final int FOUR_SIZE_BYTE_MARKER = 63 << 2;

  57.     // Back-references ("copies") have their offset/size information
  58.     // in two, three or five bytes.
  59.     private static final int MIN_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE = 4;

  60.     private static final int MAX_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE = 11;

  61.     private static final int MAX_OFFSET_WITH_ONE_OFFSET_BYTE = 1 << 11 - 1;

  62.     private static final int MAX_OFFSET_WITH_TWO_OFFSET_BYTES = 1 << 16 - 1;

  63.     private static final int ONE_BYTE_COPY_TAG = 1;

  64.     private static final int TWO_BYTE_COPY_TAG = 2;
  65.     private static final int FOUR_BYTE_COPY_TAG = 3;
  66.     // technically the format could use shorter matches but with a
  67.     // length of three the offset would be encoded as at least two
  68.     // bytes in addition to the tag, so yield no compression at all
  69.     private static final int MIN_MATCH_LENGTH = 4;
  70.     // Snappy stores the match length in six bits of the tag
  71.     private static final int MAX_MATCH_LENGTH = 64;

  72.     /**
  73.      * Returns a builder correctly configured for the Snappy algorithm using the gven block size.
  74.      *
  75.      * @param blockSize the block size.
  76.      * @return a builder correctly configured for the Snappy algorithm using the gven block size
  77.      */
  78.     public static Parameters.Builder createParameterBuilder(final int blockSize) {
  79.         // the max offset and max literal length defined by the format
  80.         // are 2^32 - 1 and 2^32 respectively - with blockSize being
  81.         // an integer we will never exceed that
  82.         return Parameters.builder(blockSize).withMinBackReferenceLength(MIN_MATCH_LENGTH).withMaxBackReferenceLength(MAX_MATCH_LENGTH).withMaxOffset(blockSize)
  83.                 .withMaxLiteralLength(blockSize);
  84.     }

  85.     private final LZ77Compressor compressor;
  86.     private final ByteUtils.ByteConsumer consumer;

  87.     // used in one-arg write method
  88.     private final byte[] oneByte = new byte[1];

  89.     private boolean finished;

  90.     /**
  91.      * Constructor using the default block size of 32k.
  92.      *
  93.      * @param os               the outputstream to write compressed data to
  94.      * @param uncompressedSize the uncompressed size of data
  95.      * @throws IOException if writing of the size fails
  96.      */
  97.     public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize) throws IOException {
  98.         this(os, uncompressedSize, SnappyCompressorInputStream.DEFAULT_BLOCK_SIZE);
  99.     }

  100.     /**
  101.      * Constructor using a configurable block size.
  102.      *
  103.      * @param os               the outputstream to write compressed data to
  104.      * @param uncompressedSize the uncompressed size of data
  105.      * @param blockSize        the block size used - must be a power of two
  106.      * @throws IOException if writing of the size fails
  107.      */
  108.     public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize, final int blockSize) throws IOException {
  109.         this(os, uncompressedSize, createParameterBuilder(blockSize).build());
  110.     }

  111.     /**
  112.      * Constructor providing full control over the underlying LZ77 compressor.
  113.      *
  114.      * @param out               the outputstream to write compressed data to
  115.      * @param uncompressedSize the uncompressed size of data
  116.      * @param params           the parameters to use by the compressor - note that the format itself imposes some limits like a maximum match length of 64 bytes
  117.      * @throws IOException if writing of the size fails
  118.      */
  119.     public SnappyCompressorOutputStream(final OutputStream out, final long uncompressedSize, final Parameters params) throws IOException {
  120.         super(out);
  121.         consumer = new ByteUtils.OutputStreamByteConsumer(out);
  122.         compressor = new LZ77Compressor(params, block -> {
  123.             switch (block.getType()) {
  124.             case LITERAL:
  125.                 writeLiteralBlock((LZ77Compressor.LiteralBlock) block);
  126.                 break;
  127.             case BACK_REFERENCE:
  128.                 writeBackReference((LZ77Compressor.BackReference) block);
  129.                 break;
  130.             case EOD:
  131.                 break;
  132.             }
  133.         });
  134.         writeUncompressedSize(uncompressedSize);
  135.     }

  136.     @Override
  137.     public void close() throws IOException {
  138.         try {
  139.             finish();
  140.         } finally {
  141.             out.close();
  142.         }
  143.     }

  144.     /**
  145.      * Compresses all remaining data and writes it to the stream, doesn't close the underlying stream.
  146.      *
  147.      * @throws IOException if an error occurs
  148.      */
  149.     public void finish() throws IOException {
  150.         if (!finished) {
  151.             compressor.finish();
  152.             finished = true;
  153.         }
  154.     }

  155.     @Override
  156.     public void write(final byte[] data, final int off, final int len) throws IOException {
  157.         compressor.compress(data, off, len);
  158.     }

  159.     @Override
  160.     public void write(final int b) throws IOException {
  161.         oneByte[0] = (byte) (b & 0xff);
  162.         write(oneByte);
  163.     }

  164.     private void writeBackReference(final LZ77Compressor.BackReference block) throws IOException {
  165.         final int len = block.getLength();
  166.         final int offset = block.getOffset();
  167.         if (len >= MIN_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE && len <= MAX_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE && offset <= MAX_OFFSET_WITH_ONE_OFFSET_BYTE) {
  168.             writeBackReferenceWithOneOffsetByte(len, offset);
  169.         } else if (offset < MAX_OFFSET_WITH_TWO_OFFSET_BYTES) {
  170.             writeBackReferenceWithTwoOffsetBytes(len, offset);
  171.         } else {
  172.             writeBackReferenceWithFourOffsetBytes(len, offset);
  173.         }
  174.     }

  175.     private void writeBackReferenceWithFourOffsetBytes(final int len, final int offset) throws IOException {
  176.         writeBackReferenceWithLittleEndianOffset(FOUR_BYTE_COPY_TAG, 4, len, offset);
  177.     }

  178.     private void writeBackReferenceWithLittleEndianOffset(final int tag, final int offsetBytes, final int len, final int offset) throws IOException {
  179.         out.write(tag | len - 1 << 2);
  180.         writeLittleEndian(offsetBytes, offset);
  181.     }

  182.     private void writeBackReferenceWithOneOffsetByte(final int len, final int offset) throws IOException {
  183.         out.write(ONE_BYTE_COPY_TAG | len - 4 << 2 | (offset & 0x700) >> 3);
  184.         out.write(offset & 0xff);
  185.     }

  186.     private void writeBackReferenceWithTwoOffsetBytes(final int len, final int offset) throws IOException {
  187.         writeBackReferenceWithLittleEndianOffset(TWO_BYTE_COPY_TAG, 2, len, offset);
  188.     }

  189.     private void writeLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException {
  190.         final int len = block.getLength();
  191.         if (len <= MAX_LITERAL_SIZE_WITHOUT_SIZE_BYTES) {
  192.             writeLiteralBlockNoSizeBytes(block, len);
  193.         } else if (len <= MAX_LITERAL_SIZE_WITH_ONE_SIZE_BYTE) {
  194.             writeLiteralBlockOneSizeByte(block, len);
  195.         } else if (len <= MAX_LITERAL_SIZE_WITH_TWO_SIZE_BYTES) {
  196.             writeLiteralBlockTwoSizeBytes(block, len);
  197.         } else if (len <= MAX_LITERAL_SIZE_WITH_THREE_SIZE_BYTES) {
  198.             writeLiteralBlockThreeSizeBytes(block, len);
  199.         } else {
  200.             writeLiteralBlockFourSizeBytes(block, len);
  201.         }
  202.     }

  203.     private void writeLiteralBlockFourSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException {
  204.         writeLiteralBlockWithSize(FOUR_SIZE_BYTE_MARKER, 4, len, block);
  205.     }

  206.     private void writeLiteralBlockNoSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException {
  207.         writeLiteralBlockWithSize(len - 1 << 2, 0, len, block);
  208.     }

  209.     private void writeLiteralBlockOneSizeByte(final LZ77Compressor.LiteralBlock block, final int len) throws IOException {
  210.         writeLiteralBlockWithSize(ONE_SIZE_BYTE_MARKER, 1, len, block);
  211.     }

  212.     private void writeLiteralBlockThreeSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException {
  213.         writeLiteralBlockWithSize(THREE_SIZE_BYTE_MARKER, 3, len, block);
  214.     }

  215.     private void writeLiteralBlockTwoSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException {
  216.         writeLiteralBlockWithSize(TWO_SIZE_BYTE_MARKER, 2, len, block);
  217.     }

  218.     private void writeLiteralBlockWithSize(final int tagByte, final int sizeBytes, final int len, final LZ77Compressor.LiteralBlock block) throws IOException {
  219.         out.write(tagByte);
  220.         writeLittleEndian(sizeBytes, len - 1);
  221.         out.write(block.getData(), block.getOffset(), len);
  222.     }

  223.     private void writeLittleEndian(final int numBytes, final int num) throws IOException {
  224.         ByteUtils.toLittleEndian(consumer, num, numBytes);
  225.     }

  226.     private void writeUncompressedSize(long uncompressedSize) throws IOException {
  227.         boolean more;
  228.         do {
  229.             int currentByte = (int) (uncompressedSize & 0x7F);
  230.             more = uncompressedSize > currentByte;
  231.             if (more) {
  232.                 currentByte |= 0x80;
  233.             }
  234.             out.write(currentByte);
  235.             uncompressedSize >>= 7;
  236.         } while (more);
  237.     }
  238. }