FramedLZ4CompressorOutputStream.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one
  3.  * or more contributor license agreements.  See the NOTICE file
  4.  * distributed with this work for additional information
  5.  * regarding copyright ownership.  The ASF licenses this file
  6.  * to you under the Apache License, Version 2.0 (the
  7.  * "License"); you may not use this file except in compliance
  8.  * with the License.  You may obtain a copy of the License at
  9.  *
  10.  * http://www.apache.org/licenses/LICENSE-2.0
  11.  *
  12.  * Unless required by applicable law or agreed to in writing,
  13.  * software distributed under the License is distributed on an
  14.  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  15.  * KIND, either express or implied.  See the License for the
  16.  * specific language governing permissions and limitations
  17.  * under the License.
  18.  */
  19. package org.apache.commons.compress.compressors.lz4;

  20. import java.io.ByteArrayOutputStream;
  21. import java.io.IOException;
  22. import java.io.OutputStream;

  23. import org.apache.commons.compress.compressors.CompressorOutputStream;
  24. import org.apache.commons.compress.utils.ByteUtils;

  25. /**
  26.  * CompressorOutputStream for the LZ4 frame format.
  27.  *
  28.  * <p>
  29.  * Based on the "spec" in the version "1.5.1 (31/03/2015)"
  30.  * </p>
  31.  *
  32.  * @see <a href="https://lz4.github.io/lz4/lz4_Frame_format.html">LZ4 Frame Format Description</a>
  33.  * @since 1.14
  34.  * @NotThreadSafe
  35.  */
  36. public class FramedLZ4CompressorOutputStream extends CompressorOutputStream<OutputStream> {

  37.     /**
  38.      * The block sizes supported by the format.
  39.      */
  40.     public enum BlockSize {
  41.         /** Block size of 64K */
  42.         K64(64 * 1024, 4),
  43.         /** Block size of 256K */
  44.         K256(256 * 1024, 5),
  45.         /** Block size of 1M */
  46.         M1(1024 * 1024, 6),
  47.         /** Block size of 4M */
  48.         M4(4096 * 1024, 7);

  49.         private final int size, index;

  50.         BlockSize(final int size, final int index) {
  51.             this.size = size;
  52.             this.index = index;
  53.         }

  54.         int getIndex() {
  55.             return index;
  56.         }

  57.         int getSize() {
  58.             return size;
  59.         }
  60.     }

  61.     /**
  62.      * Parameters of the LZ4 frame format.
  63.      */
  64.     public static class Parameters {
  65.         /**
  66.          * The default parameters of 4M block size, enabled content checksum, disabled block checksums and independent blocks.
  67.          *
  68.          * <p>
  69.          * This matches the defaults of the lz4 command line utility.
  70.          * </p>
  71.          */
  72.         public static final Parameters DEFAULT = new Parameters(BlockSize.M4, true, false, false);
  73.         private final BlockSize blockSize;
  74.         private final boolean withContentChecksum, withBlockChecksum, withBlockDependency;

  75.         private final org.apache.commons.compress.compressors.lz77support.Parameters lz77params;

  76.         /**
  77.          * Sets up custom a custom block size for the LZ4 stream but otherwise uses the defaults of enabled content checksum, disabled block checksums and
  78.          * independent blocks.
  79.          *
  80.          * @param blockSize the size of a single block.
  81.          */
  82.         public Parameters(final BlockSize blockSize) {
  83.             this(blockSize, true, false, false);
  84.         }

  85.         /**
  86.          * Sets up custom parameters for the LZ4 stream.
  87.          *
  88.          * @param blockSize           the size of a single block.
  89.          * @param withContentChecksum whether to write a content checksum
  90.          * @param withBlockChecksum   whether to write a block checksum. Note that block checksums are not supported by the lz4 command line utility
  91.          * @param withBlockDependency whether a block may depend on the content of a previous block. Enabling this may improve compression ratio but makes it
  92.          *                            impossible to decompress the output in parallel.
  93.          */
  94.         public Parameters(final BlockSize blockSize, final boolean withContentChecksum, final boolean withBlockChecksum, final boolean withBlockDependency) {
  95.             this(blockSize, withContentChecksum, withBlockChecksum, withBlockDependency, BlockLZ4CompressorOutputStream.createParameterBuilder().build());
  96.         }

  97.         /**
  98.          * Sets up custom parameters for the LZ4 stream.
  99.          *
  100.          * @param blockSize           the size of a single block.
  101.          * @param withContentChecksum whether to write a content checksum
  102.          * @param withBlockChecksum   whether to write a block checksum. Note that block checksums are not supported by the lz4 command line utility
  103.          * @param withBlockDependency whether a block may depend on the content of a previous block. Enabling this may improve compression ratio but makes it
  104.          *                            impossible to decompress the output in parallel.
  105.          * @param lz77params          parameters used to fine-tune compression, in particular to balance compression ratio vs compression speed.
  106.          */
  107.         public Parameters(final BlockSize blockSize, final boolean withContentChecksum, final boolean withBlockChecksum, final boolean withBlockDependency,
  108.                 final org.apache.commons.compress.compressors.lz77support.Parameters lz77params) {
  109.             this.blockSize = blockSize;
  110.             this.withContentChecksum = withContentChecksum;
  111.             this.withBlockChecksum = withBlockChecksum;
  112.             this.withBlockDependency = withBlockDependency;
  113.             this.lz77params = lz77params;
  114.         }

  115.         /**
  116.          * Sets up custom a custom block size for the LZ4 stream but otherwise uses the defaults of enabled content checksum, disabled block checksums and
  117.          * independent blocks.
  118.          *
  119.          * @param blockSize  the size of a single block.
  120.          * @param lz77params parameters used to fine-tune compression, in particular to balance compression ratio vs compression speed.
  121.          */
  122.         public Parameters(final BlockSize blockSize, final org.apache.commons.compress.compressors.lz77support.Parameters lz77params) {
  123.             this(blockSize, true, false, false, lz77params);
  124.         }

  125.         @Override
  126.         public String toString() {
  127.             return "LZ4 Parameters with BlockSize " + blockSize + ", withContentChecksum " + withContentChecksum + ", withBlockChecksum " + withBlockChecksum
  128.                     + ", withBlockDependency " + withBlockDependency;
  129.         }
  130.     }

  131.     private static final byte[] END_MARK = new byte[4];
  132.     // used in one-arg write method
  133.     private final byte[] oneByte = new byte[1];
  134.     private final byte[] blockData;
  135.     private final Parameters params;

  136.     private boolean finished;

  137.     // used for frame header checksum and content checksum, if requested
  138.     private final org.apache.commons.codec.digest.XXHash32 contentHash = new org.apache.commons.codec.digest.XXHash32();
  139.     // used for block checksum, if requested
  140.     private final org.apache.commons.codec.digest.XXHash32 blockHash;

  141.     // only created if the config requires block dependency
  142.     private final byte[] blockDependencyBuffer;

  143.     private int collectedBlockDependencyBytes;
  144.     private int currentIndex;

  145.     /**
  146.      * Constructs a new output stream that compresses data using the LZ4 frame format using the default block size of 4MB.
  147.      *
  148.      * @param out the OutputStream to which to write the compressed data
  149.      * @throws IOException if writing the signature fails
  150.      */
  151.     public FramedLZ4CompressorOutputStream(final OutputStream out) throws IOException {
  152.         this(out, Parameters.DEFAULT);
  153.     }

  154.     /**
  155.      * Constructs a new output stream that compresses data using the LZ4 frame format using the given block size.
  156.      *
  157.      * @param out    the OutputStream to which to write the compressed data
  158.      * @param params the parameters to use
  159.      * @throws IOException if writing the signature fails
  160.      */
  161.     public FramedLZ4CompressorOutputStream(final OutputStream out, final Parameters params) throws IOException {
  162.         super(out);
  163.         this.params = params;
  164.         blockData = new byte[params.blockSize.getSize()];
  165.         blockHash = params.withBlockChecksum ? new org.apache.commons.codec.digest.XXHash32() : null;
  166.         out.write(FramedLZ4CompressorInputStream.LZ4_SIGNATURE);
  167.         writeFrameDescriptor();
  168.         blockDependencyBuffer = params.withBlockDependency ? new byte[BlockLZ4CompressorInputStream.WINDOW_SIZE] : null;
  169.     }

  170.     private void appendToBlockDependencyBuffer(final byte[] b, final int off, int len) {
  171.         len = Math.min(len, blockDependencyBuffer.length);
  172.         if (len > 0) {
  173.             final int keep = blockDependencyBuffer.length - len;
  174.             if (keep > 0) {
  175.                 // move last keep bytes towards the start of the buffer
  176.                 System.arraycopy(blockDependencyBuffer, len, blockDependencyBuffer, 0, keep);
  177.             }
  178.             // append new data
  179.             System.arraycopy(b, off, blockDependencyBuffer, keep, len);
  180.             collectedBlockDependencyBytes = Math.min(collectedBlockDependencyBytes + len, blockDependencyBuffer.length);
  181.         }
  182.     }

  183.     @Override
  184.     public void close() throws IOException {
  185.         try {
  186.             finish();
  187.         } finally {
  188.             out.close();
  189.         }
  190.     }

  191.     /**
  192.      * Compresses all blockDataRemaining data and writes it to the stream, doesn't close the underlying stream.
  193.      *
  194.      * @throws IOException if an error occurs
  195.      */
  196.     public void finish() throws IOException {
  197.         if (!finished) {
  198.             flushBlock();
  199.             writeTrailer();
  200.             finished = true;
  201.         }
  202.     }

  203.     private void flushBlock() throws IOException {
  204.         if (currentIndex == 0) {
  205.             return;
  206.         }
  207.         final boolean withBlockDependency = params.withBlockDependency;
  208.         final ByteArrayOutputStream baos = new ByteArrayOutputStream();
  209.         try (BlockLZ4CompressorOutputStream o = new BlockLZ4CompressorOutputStream(baos, params.lz77params)) {
  210.             if (withBlockDependency) {
  211.                 o.prefill(blockDependencyBuffer, blockDependencyBuffer.length - collectedBlockDependencyBytes, collectedBlockDependencyBytes);
  212.             }
  213.             o.write(blockData, 0, currentIndex);
  214.         }
  215.         if (withBlockDependency) {
  216.             appendToBlockDependencyBuffer(blockData, 0, currentIndex);
  217.         }
  218.         final byte[] b = baos.toByteArray();
  219.         if (b.length > currentIndex) { // compression increased size, maybe beyond blocksize
  220.             ByteUtils.toLittleEndian(out, currentIndex | FramedLZ4CompressorInputStream.UNCOMPRESSED_FLAG_MASK, 4);
  221.             out.write(blockData, 0, currentIndex);
  222.             if (params.withBlockChecksum) {
  223.                 blockHash.update(blockData, 0, currentIndex);
  224.             }
  225.         } else {
  226.             ByteUtils.toLittleEndian(out, b.length, 4);
  227.             out.write(b);
  228.             if (params.withBlockChecksum) {
  229.                 blockHash.update(b, 0, b.length);
  230.             }
  231.         }
  232.         if (params.withBlockChecksum) {
  233.             ByteUtils.toLittleEndian(out, blockHash.getValue(), 4);
  234.             blockHash.reset();
  235.         }
  236.         currentIndex = 0;
  237.     }

  238.     @Override
  239.     public void write(final byte[] data, int off, int len) throws IOException {
  240.         if (params.withContentChecksum) {
  241.             contentHash.update(data, off, len);
  242.         }
  243.         int blockDataRemaining = blockData.length - currentIndex;
  244.         while (len > 0) {
  245.             final int copyLen = Math.min(len, blockDataRemaining);
  246.             System.arraycopy(data, off, blockData, currentIndex, copyLen);
  247.             off += copyLen;
  248.             blockDataRemaining -= copyLen;
  249.             len -= copyLen;
  250.             currentIndex += copyLen;
  251.             if (blockDataRemaining == 0) {
  252.                 flushBlock();
  253.                 blockDataRemaining = blockData.length;
  254.             }
  255.         }
  256.     }

  257.     @Override
  258.     public void write(final int b) throws IOException {
  259.         oneByte[0] = (byte) (b & 0xff);
  260.         write(oneByte);
  261.     }

  262.     private void writeFrameDescriptor() throws IOException {
  263.         int flags = FramedLZ4CompressorInputStream.SUPPORTED_VERSION;
  264.         if (!params.withBlockDependency) {
  265.             flags |= FramedLZ4CompressorInputStream.BLOCK_INDEPENDENCE_MASK;
  266.         }
  267.         if (params.withContentChecksum) {
  268.             flags |= FramedLZ4CompressorInputStream.CONTENT_CHECKSUM_MASK;
  269.         }
  270.         if (params.withBlockChecksum) {
  271.             flags |= FramedLZ4CompressorInputStream.BLOCK_CHECKSUM_MASK;
  272.         }
  273.         out.write(flags);
  274.         contentHash.update(flags);
  275.         final int bd = params.blockSize.getIndex() << 4 & FramedLZ4CompressorInputStream.BLOCK_MAX_SIZE_MASK;
  276.         out.write(bd);
  277.         contentHash.update(bd);
  278.         out.write((int) (contentHash.getValue() >> 8 & 0xff));
  279.         contentHash.reset();
  280.     }

  281.     private void writeTrailer() throws IOException {
  282.         out.write(END_MARK);
  283.         if (params.withContentChecksum) {
  284.             ByteUtils.toLittleEndian(out, contentHash.getValue(), 4);
  285.         }
  286.     }

  287. }