BlockLZ4CompressorOutputStream.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one
  3.  * or more contributor license agreements.  See the NOTICE file
  4.  * distributed with this work for additional information
  5.  * regarding copyright ownership.  The ASF licenses this file
  6.  * to you under the Apache License, Version 2.0 (the
  7.  * "License"); you may not use this file except in compliance
  8.  * with the License.  You may obtain a copy of the License at
  9.  *
  10.  * http://www.apache.org/licenses/LICENSE-2.0
  11.  *
  12.  * Unless required by applicable law or agreed to in writing,
  13.  * software distributed under the License is distributed on an
  14.  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  15.  * KIND, either express or implied.  See the License for the
  16.  * specific language governing permissions and limitations
  17.  * under the License.
  18.  */
  19. package org.apache.commons.compress.compressors.lz4;

  20. import java.io.IOException;
  21. import java.io.OutputStream;
  22. import java.util.Arrays;
  23. import java.util.Deque;
  24. import java.util.Iterator;
  25. import java.util.LinkedList;

  26. import org.apache.commons.compress.compressors.CompressorOutputStream;
  27. import org.apache.commons.compress.compressors.lz77support.LZ77Compressor;
  28. import org.apache.commons.compress.compressors.lz77support.Parameters;
  29. import org.apache.commons.compress.utils.ByteUtils;

  30. /**
  31.  * CompressorOutputStream for the LZ4 block format.
  32.  *
  33.  * @see <a href="https://lz4.github.io/lz4/lz4_Block_format.html">LZ4 Block Format Description</a>
  34.  * @since 1.14
  35.  * @NotThreadSafe
  36.  */
  37. public class BlockLZ4CompressorOutputStream extends CompressorOutputStream<OutputStream> {

  38.     static final class Pair {

  39.         private static int lengths(final int litLength, final int brLength) {
  40.             final int l = Math.min(litLength, 15);
  41.             final int br = brLength < 4 ? 0 : brLength < 19 ? brLength - 4 : 15;
  42.             return l << BlockLZ4CompressorInputStream.SIZE_BITS | br;
  43.         }

  44.         private static void writeLength(int length, final OutputStream out) throws IOException {
  45.             while (length >= 255) {
  46.                 out.write(255);
  47.                 length -= 255;
  48.             }
  49.             out.write(length);
  50.         }

  51.         private final Deque<byte[]> literals = new LinkedList<>();

  52.         private int literalLength;

  53.         private int brOffset, brLength;

  54.         private boolean written;

  55.         byte[] addLiteral(final LZ77Compressor.LiteralBlock block) {
  56.             final byte[] copy = Arrays.copyOfRange(block.getData(), block.getOffset(), block.getOffset() + block.getLength());
  57.             literals.add(copy);
  58.             literalLength += copy.length;
  59.             return copy;
  60.         }

  61.         private int backReferenceLength() {
  62.             return brLength;
  63.         }

  64.         boolean canBeWritten(final int lengthOfBlocksAfterThisPair) {
  65.             return hasBackReference() && lengthOfBlocksAfterThisPair >= MIN_OFFSET_OF_LAST_BACK_REFERENCE + MIN_BACK_REFERENCE_LENGTH;
  66.         }

  67.         boolean hasBackReference() {
  68.             return brOffset > 0;
  69.         }

  70.         private boolean hasBeenWritten() {
  71.             return written;
  72.         }

  73.         int length() {
  74.             return literalLength() + brLength;
  75.         }

  76.         private int literalLength() {
  77.             // This method is performance sensitive
  78.             if (literalLength != 0) {
  79.                 return literalLength;
  80.             }
  81.             int length = 0;
  82.             for (final byte[] b : literals) {
  83.                 length += b.length;
  84.             }
  85.             return literalLength = length;
  86.         }

  87.         private void prependLiteral(final byte[] data) {
  88.             literals.addFirst(data);
  89.             literalLength += data.length;
  90.         }

  91.         private void prependTo(final Pair other) {
  92.             final Iterator<byte[]> listBackwards = literals.descendingIterator();
  93.             while (listBackwards.hasNext()) {
  94.                 other.prependLiteral(listBackwards.next());
  95.             }
  96.         }

  97.         void setBackReference(final LZ77Compressor.BackReference block) {
  98.             if (hasBackReference()) {
  99.                 throw new IllegalStateException();
  100.             }
  101.             brOffset = block.getOffset();
  102.             brLength = block.getLength();
  103.         }

  104.         private Pair splitWithNewBackReferenceLengthOf(final int newBackReferenceLength) {
  105.             final Pair p = new Pair();
  106.             p.literals.addAll(literals);
  107.             p.brOffset = brOffset;
  108.             p.brLength = newBackReferenceLength;
  109.             return p;
  110.         }

  111.         void writeTo(final OutputStream out) throws IOException {
  112.             final int litLength = literalLength();
  113.             out.write(lengths(litLength, brLength));
  114.             if (litLength >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
  115.                 writeLength(litLength - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
  116.             }
  117.             for (final byte[] b : literals) {
  118.                 out.write(b);
  119.             }
  120.             if (hasBackReference()) {
  121.                 ByteUtils.toLittleEndian(out, brOffset, 2);
  122.                 if (brLength - MIN_BACK_REFERENCE_LENGTH >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
  123.                     writeLength(brLength - MIN_BACK_REFERENCE_LENGTH - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
  124.                 }
  125.             }
  126.             written = true;
  127.         }
  128.     }

  129.     private static final int MIN_BACK_REFERENCE_LENGTH = 4;

  130.     /*
  131.      *
  132.      * The LZ4 block format has a few properties that make it less straight-forward than one would hope:
  133.      *
  134.      * literal blocks and back-references must come in pairs (except for the very last literal block), so consecutive literal blocks created by the compressor
  135.      * must be merged into a single block.
  136.      *
  137.      * the start of a literal/back-reference pair contains the length of the back-reference (at least some part of it) so we can't start writing the literal
  138.      * before we know how long the next back-reference is going to be.
  139.      *
  140.      * there are special rules for the final blocks
  141.      *
  142.      * > There are specific parsing rules to respect in order to remain > compatible with assumptions made by the decoder : > > 1. The last 5 bytes are always
  143.      * literals > > 2. The last match must start at least 12 bytes before end of > block. Consequently, a block with less than 13 bytes cannot be > compressed.
  144.      *
  145.      * which means any back-reference may need to get rewritten as a literal block unless we know the next block is at least of length 5 and the sum of this
  146.      * block's length and offset and the next block's length is at least twelve.
  147.      */

  148.     private static final int MIN_OFFSET_OF_LAST_BACK_REFERENCE = 12;

  149.     /**
  150.      * Returns a builder correctly configured for the LZ4 algorithm.
  151.      *
  152.      * @return a builder correctly configured for the LZ4 algorithm
  153.      */
  154.     public static Parameters.Builder createParameterBuilder() {
  155.         final int maxLen = BlockLZ4CompressorInputStream.WINDOW_SIZE - 1;
  156.         return Parameters.builder(BlockLZ4CompressorInputStream.WINDOW_SIZE).withMinBackReferenceLength(MIN_BACK_REFERENCE_LENGTH)
  157.                 .withMaxBackReferenceLength(maxLen).withMaxOffset(maxLen).withMaxLiteralLength(maxLen);
  158.     }

  159.     private final LZ77Compressor compressor;

  160.     // used in one-arg write method
  161.     private final byte[] oneByte = new byte[1];
  162.     private boolean finished;

  163.     private final Deque<Pair> pairs = new LinkedList<>();

  164.     // keeps track of the last window-size bytes (64k) in order to be
  165.     // able to expand back-references when needed
  166.     private final Deque<byte[]> expandedBlocks = new LinkedList<>();

  167.     /**
  168.      * Creates a new LZ4 output stream.
  169.      *
  170.      * @param out An OutputStream to read compressed data from
  171.      */
  172.     public BlockLZ4CompressorOutputStream(final OutputStream out) {
  173.         this(out, createParameterBuilder().build());
  174.     }

  175.     /**
  176.      * Creates a new LZ4 output stream.
  177.      *
  178.      * @param out     An OutputStream to read compressed data from
  179.      * @param params The parameters to use for LZ77 compression.
  180.      */
  181.     public BlockLZ4CompressorOutputStream(final OutputStream out, final Parameters params) {
  182.         super(out);
  183.         compressor = new LZ77Compressor(params, block -> {
  184.             switch (block.getType()) {
  185.             case LITERAL:
  186.                 addLiteralBlock((LZ77Compressor.LiteralBlock) block);
  187.                 break;
  188.             case BACK_REFERENCE:
  189.                 addBackReference((LZ77Compressor.BackReference) block);
  190.                 break;
  191.             case EOD:
  192.                 writeFinalLiteralBlock();
  193.                 break;
  194.             }
  195.         });
  196.     }

  197.     private void addBackReference(final LZ77Compressor.BackReference block) throws IOException {
  198.         final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
  199.         last.setBackReference(block);
  200.         recordBackReference(block);
  201.         clearUnusedBlocksAndPairs();
  202.     }

  203.     private void addLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException {
  204.         final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
  205.         recordLiteral(last.addLiteral(block));
  206.         clearUnusedBlocksAndPairs();
  207.     }

  208.     private void clearUnusedBlocks() {
  209.         int blockLengths = 0;
  210.         int blocksToKeep = 0;
  211.         for (final byte[] b : expandedBlocks) {
  212.             blocksToKeep++;
  213.             blockLengths += b.length;
  214.             if (blockLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
  215.                 break;
  216.             }
  217.         }
  218.         final int size = expandedBlocks.size();
  219.         for (int i = blocksToKeep; i < size; i++) {
  220.             expandedBlocks.removeLast();
  221.         }
  222.     }

  223.     private void clearUnusedBlocksAndPairs() {
  224.         clearUnusedBlocks();
  225.         clearUnusedPairs();
  226.     }

  227.     private void clearUnusedPairs() {
  228.         int pairLengths = 0;
  229.         int pairsToKeep = 0;
  230.         for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) {
  231.             final Pair p = it.next();
  232.             pairsToKeep++;
  233.             pairLengths += p.length();
  234.             if (pairLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
  235.                 break;
  236.             }
  237.         }
  238.         final int size = pairs.size();
  239.         for (int i = pairsToKeep; i < size; i++) {
  240.             final Pair p = pairs.peekFirst();
  241.             if (!p.hasBeenWritten()) {
  242.                 break;
  243.             }
  244.             pairs.removeFirst();
  245.         }
  246.     }

  247.     @Override
  248.     public void close() throws IOException {
  249.         try {
  250.             finish();
  251.         } finally {
  252.             out.close();
  253.         }
  254.     }

  255.     private byte[] expand(final int offset, final int length) {
  256.         final byte[] expanded = new byte[length];
  257.         if (offset == 1) { // surprisingly common special case
  258.             final byte[] block = expandedBlocks.peekFirst();
  259.             final byte b = block[block.length - 1];
  260.             if (b != 0) { // the fresh array contains 0s anyway
  261.                 Arrays.fill(expanded, b);
  262.             }
  263.         } else {
  264.             expandFromList(expanded, offset, length);
  265.         }
  266.         return expanded;
  267.     }

  268.     private void expandFromList(final byte[] expanded, final int offset, final int length) {
  269.         int offsetRemaining = offset;
  270.         int lengthRemaining = length;
  271.         int writeOffset = 0;
  272.         while (lengthRemaining > 0) {
  273.             // find block that contains offsetRemaining
  274.             byte[] block = null;
  275.             final int copyLen;
  276.             final int copyOffset;
  277.             if (offsetRemaining > 0) {
  278.                 int blockOffset = 0;
  279.                 for (final byte[] b : expandedBlocks) {
  280.                     if (b.length + blockOffset >= offsetRemaining) {
  281.                         block = b;
  282.                         break;
  283.                     }
  284.                     blockOffset += b.length;
  285.                 }
  286.                 if (block == null) {
  287.                     // should not be possible
  288.                     throw new IllegalStateException("Failed to find a block containing offset " + offset);
  289.                 }
  290.                 copyOffset = blockOffset + block.length - offsetRemaining;
  291.                 copyLen = Math.min(lengthRemaining, block.length - copyOffset);
  292.             } else {
  293.                 // offsetRemaining is negative or 0 and points into the expanded bytes
  294.                 block = expanded;
  295.                 copyOffset = -offsetRemaining;
  296.                 copyLen = Math.min(lengthRemaining, writeOffset + offsetRemaining);
  297.             }
  298.             System.arraycopy(block, copyOffset, expanded, writeOffset, copyLen);
  299.             offsetRemaining -= copyLen;
  300.             lengthRemaining -= copyLen;
  301.             writeOffset += copyLen;
  302.         }
  303.     }

  304.     /**
  305.      * Compresses all remaining data and writes it to the stream, doesn't close the underlying stream.
  306.      *
  307.      * @throws IOException if an error occurs
  308.      */
  309.     public void finish() throws IOException {
  310.         if (!finished) {
  311.             compressor.finish();
  312.             finished = true;
  313.         }
  314.     }

  315.     /**
  316.      * Adds some initial data to fill the window with.
  317.      *
  318.      * @param data the data to fill the window with.
  319.      * @param off  offset of real data into the array
  320.      * @param len  amount of data
  321.      * @throws IllegalStateException if the stream has already started to write data
  322.      * @see LZ77Compressor#prefill
  323.      */
  324.     public void prefill(final byte[] data, final int off, final int len) {
  325.         if (len > 0) {
  326.             final byte[] b = Arrays.copyOfRange(data, off, off + len);
  327.             compressor.prefill(b);
  328.             recordLiteral(b);
  329.         }
  330.     }

  331.     private void recordBackReference(final LZ77Compressor.BackReference block) {
  332.         expandedBlocks.addFirst(expand(block.getOffset(), block.getLength()));
  333.     }

  334.     private void recordLiteral(final byte[] b) {
  335.         expandedBlocks.addFirst(b);
  336.     }

  337.     private void rewriteLastPairs() {
  338.         final LinkedList<Pair> lastPairs = new LinkedList<>();
  339.         final LinkedList<Integer> pairLength = new LinkedList<>();
  340.         int offset = 0;
  341.         for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) {
  342.             final Pair p = it.next();
  343.             if (p.hasBeenWritten()) {
  344.                 break;
  345.             }
  346.             final int len = p.length();
  347.             pairLength.addFirst(len);
  348.             lastPairs.addFirst(p);
  349.             offset += len;
  350.             if (offset >= MIN_OFFSET_OF_LAST_BACK_REFERENCE) {
  351.                 break;
  352.             }
  353.         }
  354.         lastPairs.forEach(pairs::remove);
  355.         // lastPairs may contain between one and four Pairs:
  356.         // * the last pair may be a one byte literal
  357.         // * all other Pairs contain a back-reference which must be four bytes long at minimum
  358.         // we could merge them all into a single literal block but
  359.         // this may harm compression. For example compressing
  360.         // "bla.tar" from our tests yields a last block containing a
  361.         // back-reference of length > 2k and we'd end up with a last
  362.         // literal of that size rather than a 2k back-reference and a
  363.         // 12 byte literal at the end.

  364.         // Instead we merge all but the first of lastPairs into a new
  365.         // literal-only Pair "replacement" and look at the
  366.         // back-reference in the first of lastPairs and see if we can
  367.         // split it. We can split it if it is longer than 16 -
  368.         // replacement.length (i.e. the minimal length of four is kept
  369.         // while making sure the last literal is at least twelve bytes
  370.         // long). If we can't split it, we expand the first of the pairs
  371.         // as well.

  372.         // this is not optimal, we could get better compression
  373.         // results with more complex approaches as the last literal
  374.         // only needs to be five bytes long if the previous
  375.         // back-reference has an offset big enough

  376.         final int lastPairsSize = lastPairs.size();
  377.         int toExpand = 0;
  378.         for (int i = 1; i < lastPairsSize; i++) {
  379.             toExpand += pairLength.get(i);
  380.         }
  381.         final Pair replacement = new Pair();
  382.         if (toExpand > 0) {
  383.             replacement.prependLiteral(expand(toExpand, toExpand));
  384.         }
  385.         final Pair splitCandidate = lastPairs.get(0);
  386.         final int stillNeeded = MIN_OFFSET_OF_LAST_BACK_REFERENCE - toExpand;
  387.         final int brLen = splitCandidate.hasBackReference() ? splitCandidate.backReferenceLength() : 0;
  388.         if (splitCandidate.hasBackReference() && brLen >= MIN_BACK_REFERENCE_LENGTH + stillNeeded) {
  389.             replacement.prependLiteral(expand(toExpand + stillNeeded, stillNeeded));
  390.             pairs.add(splitCandidate.splitWithNewBackReferenceLengthOf(brLen - stillNeeded));
  391.         } else {
  392.             if (splitCandidate.hasBackReference()) {
  393.                 replacement.prependLiteral(expand(toExpand + brLen, brLen));
  394.             }
  395.             splitCandidate.prependTo(replacement);
  396.         }
  397.         pairs.add(replacement);
  398.     }

  399.     @Override
  400.     public void write(final byte[] data, final int off, final int len) throws IOException {
  401.         compressor.compress(data, off, len);
  402.     }

  403.     @Override
  404.     public void write(final int b) throws IOException {
  405.         oneByte[0] = (byte) (b & 0xff);
  406.         write(oneByte);
  407.     }

  408.     private Pair writeBlocksAndReturnUnfinishedPair(final int length) throws IOException {
  409.         writeWritablePairs(length);
  410.         Pair last = pairs.peekLast();
  411.         if (last == null || last.hasBackReference()) {
  412.             last = new Pair();
  413.             pairs.addLast(last);
  414.         }
  415.         return last;
  416.     }

  417.     private void writeFinalLiteralBlock() throws IOException {
  418.         rewriteLastPairs();
  419.         for (final Pair p : pairs) {
  420.             if (!p.hasBeenWritten()) {
  421.                 p.writeTo(out);
  422.             }
  423.         }
  424.         pairs.clear();
  425.     }

  426.     private void writeWritablePairs(final int lengthOfBlocksAfterLastPair) throws IOException {
  427.         int unwrittenLength = lengthOfBlocksAfterLastPair;
  428.         for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) {
  429.             final Pair p = it.next();
  430.             if (p.hasBeenWritten()) {
  431.                 break;
  432.             }
  433.             unwrittenLength += p.length();
  434.         }
  435.         for (final Pair p : pairs) {
  436.             if (p.hasBeenWritten()) {
  437.                 continue;
  438.             }
  439.             unwrittenLength -= p.length();
  440.             if (!p.canBeWritten(unwrittenLength)) {
  441.                 break;
  442.             }
  443.             p.writeTo(out);
  444.         }
  445.     }
  446. }