001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.lz4;
020
021import java.io.IOException;
022import java.io.OutputStream;
023import java.util.Arrays;
024import java.util.Deque;
025import java.util.Iterator;
026import java.util.LinkedList;
027
028import org.apache.commons.compress.compressors.CompressorOutputStream;
029import org.apache.commons.compress.compressors.lz77support.LZ77Compressor;
030import org.apache.commons.compress.compressors.lz77support.Parameters;
031import org.apache.commons.compress.utils.ByteUtils;
032
033/**
034 * CompressorOutputStream for the LZ4 block format.
035 *
036 * @see <a href="https://lz4.github.io/lz4/lz4_Block_format.html">LZ4 Block Format Description</a>
037 * @since 1.14
038 * @NotThreadSafe
039 */
040public class BlockLZ4CompressorOutputStream extends CompressorOutputStream {
041
042    static final class Pair {
043
044        private static int lengths(final int litLength, final int brLength) {
045            final int l = Math.min(litLength, 15);
046            final int br = brLength < 4 ? 0 : brLength < 19 ? brLength - 4 : 15;
047            return l << BlockLZ4CompressorInputStream.SIZE_BITS | br;
048        }
049
050        private static void writeLength(int length, final OutputStream out) throws IOException {
051            while (length >= 255) {
052                out.write(255);
053                length -= 255;
054            }
055            out.write(length);
056        }
057
058        private final Deque<byte[]> literals = new LinkedList<>();
059
060        private int literalLength;
061
062        private int brOffset, brLength;
063
064        private boolean written;
065
066        byte[] addLiteral(final LZ77Compressor.LiteralBlock block) {
067            final byte[] copy = Arrays.copyOfRange(block.getData(), block.getOffset(), block.getOffset() + block.getLength());
068            literals.add(copy);
069            literalLength += copy.length;
070            return copy;
071        }
072
073        private int backReferenceLength() {
074            return brLength;
075        }
076
077        boolean canBeWritten(final int lengthOfBlocksAfterThisPair) {
078            return hasBackReference() && lengthOfBlocksAfterThisPair >= MIN_OFFSET_OF_LAST_BACK_REFERENCE + MIN_BACK_REFERENCE_LENGTH;
079        }
080
081        boolean hasBackReference() {
082            return brOffset > 0;
083        }
084
085        private boolean hasBeenWritten() {
086            return written;
087        }
088
089        int length() {
090            return literalLength() + brLength;
091        }
092
093        private int literalLength() {
094            // This method is performance sensitive
095            if (literalLength != 0) {
096                return literalLength;
097            }
098            int length = 0;
099            for (final byte[] b : literals) {
100                length += b.length;
101            }
102            return literalLength = length;
103        }
104
105        private void prependLiteral(final byte[] data) {
106            literals.addFirst(data);
107            literalLength += data.length;
108        }
109
110        private void prependTo(final Pair other) {
111            final Iterator<byte[]> listBackwards = literals.descendingIterator();
112            while (listBackwards.hasNext()) {
113                other.prependLiteral(listBackwards.next());
114            }
115        }
116
117        void setBackReference(final LZ77Compressor.BackReference block) {
118            if (hasBackReference()) {
119                throw new IllegalStateException();
120            }
121            brOffset = block.getOffset();
122            brLength = block.getLength();
123        }
124
125        private Pair splitWithNewBackReferenceLengthOf(final int newBackReferenceLength) {
126            final Pair p = new Pair();
127            p.literals.addAll(literals);
128            p.brOffset = brOffset;
129            p.brLength = newBackReferenceLength;
130            return p;
131        }
132
133        void writeTo(final OutputStream out) throws IOException {
134            final int litLength = literalLength();
135            out.write(lengths(litLength, brLength));
136            if (litLength >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
137                writeLength(litLength - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
138            }
139            for (final byte[] b : literals) {
140                out.write(b);
141            }
142            if (hasBackReference()) {
143                ByteUtils.toLittleEndian(out, brOffset, 2);
144                if (brLength - MIN_BACK_REFERENCE_LENGTH >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
145                    writeLength(brLength - MIN_BACK_REFERENCE_LENGTH - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
146                }
147            }
148            written = true;
149        }
150    }
151
152    private static final int MIN_BACK_REFERENCE_LENGTH = 4;
153
154    /*
155     *
156     * The LZ4 block format has a few properties that make it less straight-forward than one would hope:
157     *
158     * literal blocks and back-references must come in pairs (except for the very last literal block), so consecutive literal blocks created by the compressor
159     * must be merged into a single block.
160     *
161     * the start of a literal/back-reference pair contains the length of the back-reference (at least some part of it) so we can't start writing the literal
162     * before we know how long the next back-reference is going to be.
163     *
164     * there are special rules for the final blocks
165     *
166     * > There are specific parsing rules to respect in order to remain > compatible with assumptions made by the decoder : > > 1. The last 5 bytes are always
167     * literals > > 2. The last match must start at least 12 bytes before end of > block. Consequently, a block with less than 13 bytes cannot be > compressed.
168     *
169     * which means any back-reference may need to get rewritten as a literal block unless we know the next block is at least of length 5 and the sum of this
170     * block's length and offset and the next block's length is at least twelve.
171     */
172
173    private static final int MIN_OFFSET_OF_LAST_BACK_REFERENCE = 12;
174
175    /**
176     * Returns a builder correctly configured for the LZ4 algorithm.
177     *
178     * @return a builder correctly configured for the LZ4 algorithm
179     */
180    public static Parameters.Builder createParameterBuilder() {
181        final int maxLen = BlockLZ4CompressorInputStream.WINDOW_SIZE - 1;
182        return Parameters.builder(BlockLZ4CompressorInputStream.WINDOW_SIZE).withMinBackReferenceLength(MIN_BACK_REFERENCE_LENGTH)
183                .withMaxBackReferenceLength(maxLen).withMaxOffset(maxLen).withMaxLiteralLength(maxLen);
184    }
185
186    private final LZ77Compressor compressor;
187
188    private final OutputStream os;
189
190    // used in one-arg write method
191    private final byte[] oneByte = new byte[1];
192    private boolean finished;
193
194    private final Deque<Pair> pairs = new LinkedList<>();
195
196    // keeps track of the last window-size bytes (64k) in order to be
197    // able to expand back-references when needed
198    private final Deque<byte[]> expandedBlocks = new LinkedList<>();
199
200    /**
201     * Creates a new LZ4 output stream.
202     *
203     * @param os An OutputStream to read compressed data from
204     */
205    public BlockLZ4CompressorOutputStream(final OutputStream os) {
206        this(os, createParameterBuilder().build());
207    }
208
209    /**
210     * Creates a new LZ4 output stream.
211     *
212     * @param os     An OutputStream to read compressed data from
213     * @param params The parameters to use for LZ77 compression.
214     */
215    public BlockLZ4CompressorOutputStream(final OutputStream os, final Parameters params) {
216        this.os = os;
217        compressor = new LZ77Compressor(params, block -> {
218            switch (block.getType()) {
219            case LITERAL:
220                addLiteralBlock((LZ77Compressor.LiteralBlock) block);
221                break;
222            case BACK_REFERENCE:
223                addBackReference((LZ77Compressor.BackReference) block);
224                break;
225            case EOD:
226                writeFinalLiteralBlock();
227                break;
228            }
229        });
230    }
231
232    private void addBackReference(final LZ77Compressor.BackReference block) throws IOException {
233        final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
234        last.setBackReference(block);
235        recordBackReference(block);
236        clearUnusedBlocksAndPairs();
237    }
238
239    private void addLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException {
240        final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
241        recordLiteral(last.addLiteral(block));
242        clearUnusedBlocksAndPairs();
243    }
244
245    private void clearUnusedBlocks() {
246        int blockLengths = 0;
247        int blocksToKeep = 0;
248        for (final byte[] b : expandedBlocks) {
249            blocksToKeep++;
250            blockLengths += b.length;
251            if (blockLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
252                break;
253            }
254        }
255        final int size = expandedBlocks.size();
256        for (int i = blocksToKeep; i < size; i++) {
257            expandedBlocks.removeLast();
258        }
259    }
260
261    private void clearUnusedBlocksAndPairs() {
262        clearUnusedBlocks();
263        clearUnusedPairs();
264    }
265
266    private void clearUnusedPairs() {
267        int pairLengths = 0;
268        int pairsToKeep = 0;
269        for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) {
270            final Pair p = it.next();
271            pairsToKeep++;
272            pairLengths += p.length();
273            if (pairLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
274                break;
275            }
276        }
277        final int size = pairs.size();
278        for (int i = pairsToKeep; i < size; i++) {
279            final Pair p = pairs.peekFirst();
280            if (!p.hasBeenWritten()) {
281                break;
282            }
283            pairs.removeFirst();
284        }
285    }
286
287    @Override
288    public void close() throws IOException {
289        try {
290            finish();
291        } finally {
292            os.close();
293        }
294    }
295
296    private byte[] expand(final int offset, final int length) {
297        final byte[] expanded = new byte[length];
298        if (offset == 1) { // surprisingly common special case
299            final byte[] block = expandedBlocks.peekFirst();
300            final byte b = block[block.length - 1];
301            if (b != 0) { // the fresh array contains 0s anyway
302                Arrays.fill(expanded, b);
303            }
304        } else {
305            expandFromList(expanded, offset, length);
306        }
307        return expanded;
308    }
309
310    private void expandFromList(final byte[] expanded, final int offset, final int length) {
311        int offsetRemaining = offset;
312        int lengthRemaining = length;
313        int writeOffset = 0;
314        while (lengthRemaining > 0) {
315            // find block that contains offsetRemaining
316            byte[] block = null;
317            final int copyLen;
318            final int copyOffset;
319            if (offsetRemaining > 0) {
320                int blockOffset = 0;
321                for (final byte[] b : expandedBlocks) {
322                    if (b.length + blockOffset >= offsetRemaining) {
323                        block = b;
324                        break;
325                    }
326                    blockOffset += b.length;
327                }
328                if (block == null) {
329                    // should not be possible
330                    throw new IllegalStateException("Failed to find a block containing offset " + offset);
331                }
332                copyOffset = blockOffset + block.length - offsetRemaining;
333                copyLen = Math.min(lengthRemaining, block.length - copyOffset);
334            } else {
335                // offsetRemaining is negative or 0 and points into the expanded bytes
336                block = expanded;
337                copyOffset = -offsetRemaining;
338                copyLen = Math.min(lengthRemaining, writeOffset + offsetRemaining);
339            }
340            System.arraycopy(block, copyOffset, expanded, writeOffset, copyLen);
341            offsetRemaining -= copyLen;
342            lengthRemaining -= copyLen;
343            writeOffset += copyLen;
344        }
345    }
346
347    /**
348     * Compresses all remaining data and writes it to the stream, doesn't close the underlying stream.
349     *
350     * @throws IOException if an error occurs
351     */
352    public void finish() throws IOException {
353        if (!finished) {
354            compressor.finish();
355            finished = true;
356        }
357    }
358
359    /**
360     * Adds some initial data to fill the window with.
361     *
362     * @param data the data to fill the window with.
363     * @param off  offset of real data into the array
364     * @param len  amount of data
365     * @throws IllegalStateException if the stream has already started to write data
366     * @see LZ77Compressor#prefill
367     */
368    public void prefill(final byte[] data, final int off, final int len) {
369        if (len > 0) {
370            final byte[] b = Arrays.copyOfRange(data, off, off + len);
371            compressor.prefill(b);
372            recordLiteral(b);
373        }
374    }
375
376    private void recordBackReference(final LZ77Compressor.BackReference block) {
377        expandedBlocks.addFirst(expand(block.getOffset(), block.getLength()));
378    }
379
380    private void recordLiteral(final byte[] b) {
381        expandedBlocks.addFirst(b);
382    }
383
384    private void rewriteLastPairs() {
385        final LinkedList<Pair> lastPairs = new LinkedList<>();
386        final LinkedList<Integer> pairLength = new LinkedList<>();
387        int offset = 0;
388        for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) {
389            final Pair p = it.next();
390            if (p.hasBeenWritten()) {
391                break;
392            }
393            final int len = p.length();
394            pairLength.addFirst(len);
395            lastPairs.addFirst(p);
396            offset += len;
397            if (offset >= MIN_OFFSET_OF_LAST_BACK_REFERENCE) {
398                break;
399            }
400        }
401        lastPairs.forEach(pairs::remove);
402        // lastPairs may contain between one and four Pairs:
403        // * the last pair may be a one byte literal
404        // * all other Pairs contain a back-reference which must be four bytes long at minimum
405        // we could merge them all into a single literal block but
406        // this may harm compression. For example compressing
407        // "bla.tar" from our tests yields a last block containing a
408        // back-reference of length > 2k and we'd end up with a last
409        // literal of that size rather than a 2k back-reference and a
410        // 12 byte literal at the end.
411
412        // Instead we merge all but the first of lastPairs into a new
413        // literal-only Pair "replacement" and look at the
414        // back-reference in the first of lastPairs and see if we can
415        // split it. We can split it if it is longer than 16 -
416        // replacement.length (i.e. the minimal length of four is kept
417        // while making sure the last literal is at least twelve bytes
418        // long). If we can't split it, we expand the first of the pairs
419        // as well.
420
421        // this is not optimal, we could get better compression
422        // results with more complex approaches as the last literal
423        // only needs to be five bytes long if the previous
424        // back-reference has an offset big enough
425
426        final int lastPairsSize = lastPairs.size();
427        int toExpand = 0;
428        for (int i = 1; i < lastPairsSize; i++) {
429            toExpand += pairLength.get(i);
430        }
431        final Pair replacement = new Pair();
432        if (toExpand > 0) {
433            replacement.prependLiteral(expand(toExpand, toExpand));
434        }
435        final Pair splitCandidate = lastPairs.get(0);
436        final int stillNeeded = MIN_OFFSET_OF_LAST_BACK_REFERENCE - toExpand;
437        final int brLen = splitCandidate.hasBackReference() ? splitCandidate.backReferenceLength() : 0;
438        if (splitCandidate.hasBackReference() && brLen >= MIN_BACK_REFERENCE_LENGTH + stillNeeded) {
439            replacement.prependLiteral(expand(toExpand + stillNeeded, stillNeeded));
440            pairs.add(splitCandidate.splitWithNewBackReferenceLengthOf(brLen - stillNeeded));
441        } else {
442            if (splitCandidate.hasBackReference()) {
443                replacement.prependLiteral(expand(toExpand + brLen, brLen));
444            }
445            splitCandidate.prependTo(replacement);
446        }
447        pairs.add(replacement);
448    }
449
450    @Override
451    public void write(final byte[] data, final int off, final int len) throws IOException {
452        compressor.compress(data, off, len);
453    }
454
455    @Override
456    public void write(final int b) throws IOException {
457        oneByte[0] = (byte) (b & 0xff);
458        write(oneByte);
459    }
460
461    private Pair writeBlocksAndReturnUnfinishedPair(final int length) throws IOException {
462        writeWritablePairs(length);
463        Pair last = pairs.peekLast();
464        if (last == null || last.hasBackReference()) {
465            last = new Pair();
466            pairs.addLast(last);
467        }
468        return last;
469    }
470
471    private void writeFinalLiteralBlock() throws IOException {
472        rewriteLastPairs();
473        for (final Pair p : pairs) {
474            if (!p.hasBeenWritten()) {
475                p.writeTo(os);
476            }
477        }
478        pairs.clear();
479    }
480
481    private void writeWritablePairs(final int lengthOfBlocksAfterLastPair) throws IOException {
482        int unwrittenLength = lengthOfBlocksAfterLastPair;
483        for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) {
484            final Pair p = it.next();
485            if (p.hasBeenWritten()) {
486                break;
487            }
488            unwrittenLength += p.length();
489        }
490        for (final Pair p : pairs) {
491            if (p.hasBeenWritten()) {
492                continue;
493            }
494            unwrittenLength -= p.length();
495            if (!p.canBeWritten(unwrittenLength)) {
496                break;
497            }
498            p.writeTo(os);
499        }
500    }
501}