001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *   https://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.lz4;
020
021import java.io.IOException;
022import java.io.OutputStream;
023import java.util.Arrays;
024import java.util.Deque;
025import java.util.Iterator;
026import java.util.LinkedList;
027
028import org.apache.commons.compress.compressors.CompressorOutputStream;
029import org.apache.commons.compress.compressors.lz77support.LZ77Compressor;
030import org.apache.commons.compress.compressors.lz77support.Parameters;
031import org.apache.commons.compress.utils.ByteUtils;
032
033/**
034 * CompressorOutputStream for the LZ4 block format.
035 *
036 * @see <a href="https://lz4.github.io/lz4/lz4_Block_format.html">LZ4 Block Format Description</a>
037 * @since 1.14
038 * @NotThreadSafe
039 */
040public class BlockLZ4CompressorOutputStream extends CompressorOutputStream<OutputStream> {
041
042    static final class Pair {
043
044        private static int lengths(final int litLength, final int brLength) {
045            final int l = Math.min(litLength, 15);
046            final int br = brLength < 4 ? 0 : brLength < 19 ? brLength - 4 : 15;
047            return l << BlockLZ4CompressorInputStream.SIZE_BITS | br;
048        }
049
050        private static void writeLength(int length, final OutputStream out) throws IOException {
051            while (length >= 255) {
052                out.write(255);
053                length -= 255;
054            }
055            out.write(length);
056        }
057
058        private final Deque<byte[]> literals = new LinkedList<>();
059
060        private int literalLength;
061
062        private int brOffset;
063
064        private int brLength;
065
066        private boolean written;
067
068        byte[] addLiteral(final LZ77Compressor.LiteralBlock block) {
069            final byte[] copy = Arrays.copyOfRange(block.getData(), block.getOffset(), block.getOffset() + block.getLength());
070            literals.add(copy);
071            literalLength += copy.length;
072            return copy;
073        }
074
075        private int backReferenceLength() {
076            return brLength;
077        }
078
079        boolean canBeWritten(final int lengthOfBlocksAfterThisPair) {
080            return hasBackReference() && lengthOfBlocksAfterThisPair >= MIN_OFFSET_OF_LAST_BACK_REFERENCE + MIN_BACK_REFERENCE_LENGTH;
081        }
082
083        boolean hasBackReference() {
084            return brOffset > 0;
085        }
086
087        private boolean hasBeenWritten() {
088            return written;
089        }
090
091        int length() {
092            return literalLength() + brLength;
093        }
094
095        private int literalLength() {
096            // This method is performance sensitive
097            if (literalLength != 0) {
098                return literalLength;
099            }
100            int length = 0;
101            for (final byte[] b : literals) {
102                length += b.length;
103            }
104            return literalLength = length;
105        }
106
107        private void prependLiteral(final byte[] data) {
108            literals.addFirst(data);
109            literalLength += data.length;
110        }
111
112        private void prependTo(final Pair other) {
113            final Iterator<byte[]> listBackwards = literals.descendingIterator();
114            while (listBackwards.hasNext()) {
115                other.prependLiteral(listBackwards.next());
116            }
117        }
118
119        void setBackReference(final LZ77Compressor.BackReference block) {
120            if (hasBackReference()) {
121                throw new IllegalStateException();
122            }
123            brOffset = block.getOffset();
124            brLength = block.getLength();
125        }
126
127        private Pair splitWithNewBackReferenceLengthOf(final int newBackReferenceLength) {
128            final Pair p = new Pair();
129            p.literals.addAll(literals);
130            p.brOffset = brOffset;
131            p.brLength = newBackReferenceLength;
132            return p;
133        }
134
135        void writeTo(final OutputStream out) throws IOException {
136            final int litLength = literalLength();
137            out.write(lengths(litLength, brLength));
138            if (litLength >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
139                writeLength(litLength - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
140            }
141            for (final byte[] b : literals) {
142                out.write(b);
143            }
144            if (hasBackReference()) {
145                ByteUtils.toLittleEndian(out, brOffset, 2);
146                if (brLength - MIN_BACK_REFERENCE_LENGTH >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
147                    writeLength(brLength - MIN_BACK_REFERENCE_LENGTH - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
148                }
149            }
150            written = true;
151        }
152    }
153
154    private static final int MIN_BACK_REFERENCE_LENGTH = 4;
155
156    /*
157     *
158     * The LZ4 block format has a few properties that make it less straight-forward than one would hope:
159     *
160     * literal blocks and back-references must come in pairs (except for the very last literal block), so consecutive literal blocks created by the compressor
161     * must be merged into a single block.
162     *
163     * the start of a literal/back-reference pair contains the length of the back-reference (at least some part of it) so we can't start writing the literal
164     * before we know how long the next back-reference is going to be.
165     *
166     * there are special rules for the final blocks
167     *
168     * > There are specific parsing rules to respect in order to remain > compatible with assumptions made by the decoder : > > 1. The last 5 bytes are always
169     * literals > > 2. The last match must start at least 12 bytes before end of > block. Consequently, a block with less than 13 bytes cannot be > compressed.
170     *
171     * which means any back-reference may need to get rewritten as a literal block unless we know the next block is at least of length 5 and the sum of this
172     * block's length and offset and the next block's length is at least twelve.
173     */
174
175    private static final int MIN_OFFSET_OF_LAST_BACK_REFERENCE = 12;
176
177    /**
178     * Returns a builder correctly configured for the LZ4 algorithm.
179     *
180     * @return a builder correctly configured for the LZ4 algorithm
181     */
182    public static Parameters.Builder createParameterBuilder() {
183        final int maxLen = BlockLZ4CompressorInputStream.WINDOW_SIZE - 1;
184        return Parameters.builder(BlockLZ4CompressorInputStream.WINDOW_SIZE).withMinBackReferenceLength(MIN_BACK_REFERENCE_LENGTH)
185                .withMaxBackReferenceLength(maxLen).withMaxOffset(maxLen).withMaxLiteralLength(maxLen);
186    }
187
188    private final LZ77Compressor compressor;
189
190    // used in one-arg write method
191    private final byte[] oneByte = new byte[1];
192    private final Deque<Pair> pairs = new LinkedList<>();
193
194    // keeps track of the last window-size bytes (64k) in order to be
195    // able to expand back-references when needed
196    private final Deque<byte[]> expandedBlocks = new LinkedList<>();
197
198    /**
199     * Creates a new LZ4 output stream.
200     *
201     * @param out An OutputStream to read compressed data from
202     */
203    public BlockLZ4CompressorOutputStream(final OutputStream out) {
204        this(out, createParameterBuilder().build());
205    }
206
207    /**
208     * Creates a new LZ4 output stream.
209     *
210     * @param out     An OutputStream to read compressed data from
211     * @param params The parameters to use for LZ77 compression.
212     */
213    public BlockLZ4CompressorOutputStream(final OutputStream out, final Parameters params) {
214        super(out);
215        compressor = new LZ77Compressor(params, block -> {
216            switch (block.getType()) {
217            case LITERAL:
218                addLiteralBlock((LZ77Compressor.LiteralBlock) block);
219                break;
220            case BACK_REFERENCE:
221                addBackReference((LZ77Compressor.BackReference) block);
222                break;
223            case EOD:
224                writeFinalLiteralBlock();
225                break;
226            }
227        });
228    }
229
230    private void addBackReference(final LZ77Compressor.BackReference block) throws IOException {
231        final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
232        last.setBackReference(block);
233        recordBackReference(block);
234        clearUnusedBlocksAndPairs();
235    }
236
237    private void addLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException {
238        final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
239        recordLiteral(last.addLiteral(block));
240        clearUnusedBlocksAndPairs();
241    }
242
243    private void clearUnusedBlocks() {
244        int blockLengths = 0;
245        int blocksToKeep = 0;
246        for (final byte[] b : expandedBlocks) {
247            blocksToKeep++;
248            blockLengths += b.length;
249            if (blockLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
250                break;
251            }
252        }
253        final int size = expandedBlocks.size();
254        for (int i = blocksToKeep; i < size; i++) {
255            expandedBlocks.removeLast();
256        }
257    }
258
259    private void clearUnusedBlocksAndPairs() {
260        clearUnusedBlocks();
261        clearUnusedPairs();
262    }
263
264    private void clearUnusedPairs() {
265        int pairLengths = 0;
266        int pairsToKeep = 0;
267        for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) {
268            final Pair p = it.next();
269            pairsToKeep++;
270            pairLengths += p.length();
271            if (pairLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
272                break;
273            }
274        }
275        final int size = pairs.size();
276        for (int i = pairsToKeep; i < size; i++) {
277            final Pair p = pairs.peekFirst();
278            if (!p.hasBeenWritten()) {
279                break;
280            }
281            pairs.removeFirst();
282        }
283    }
284
285    @Override
286    public void close() throws IOException {
287        try {
288            finish();
289        } finally {
290            super.close();
291        }
292    }
293
294    private byte[] expand(final int offset, final int length) {
295        final byte[] expanded = new byte[length];
296        if (offset == 1) { // surprisingly common special case
297            final byte[] block = expandedBlocks.peekFirst();
298            final byte b = block[block.length - 1];
299            if (b != 0) { // the fresh array contains 0s anyway
300                Arrays.fill(expanded, b);
301            }
302        } else {
303            expandFromList(expanded, offset, length);
304        }
305        return expanded;
306    }
307
308    private void expandFromList(final byte[] expanded, final int offset, final int length) {
309        int offsetRemaining = offset;
310        int lengthRemaining = length;
311        int writeOffset = 0;
312        while (lengthRemaining > 0) {
313            // find block that contains offsetRemaining
314            byte[] block = null;
315            final int copyLen;
316            final int copyOffset;
317            if (offsetRemaining > 0) {
318                int blockOffset = 0;
319                for (final byte[] b : expandedBlocks) {
320                    if (b.length + blockOffset >= offsetRemaining) {
321                        block = b;
322                        break;
323                    }
324                    blockOffset += b.length;
325                }
326                if (block == null) {
327                    // should not be possible
328                    throw new IllegalStateException("Failed to find a block containing offset " + offset);
329                }
330                copyOffset = blockOffset + block.length - offsetRemaining;
331                copyLen = Math.min(lengthRemaining, block.length - copyOffset);
332            } else {
333                // offsetRemaining is negative or 0 and points into the expanded bytes
334                block = expanded;
335                copyOffset = -offsetRemaining;
336                copyLen = Math.min(lengthRemaining, writeOffset + offsetRemaining);
337            }
338            System.arraycopy(block, copyOffset, expanded, writeOffset, copyLen);
339            offsetRemaining -= copyLen;
340            lengthRemaining -= copyLen;
341            writeOffset += copyLen;
342        }
343    }
344
345    /**
346     * Compresses all remaining data and writes it to the stream, doesn't close the underlying stream.
347     *
348     * @throws IOException if an error occurs
349     */
350    @Override
351    public void finish() throws IOException {
352        if (!isFinished()) {
353            compressor.finish();
354            super.finish();
355        }
356    }
357
358    /**
359     * Adds some initial data to fill the window with.
360     *
361     * @param data the data to fill the window with.
362     * @param off  offset of real data into the array
363     * @param len  amount of data
364     * @throws IllegalStateException if the stream has already started to write data
365     * @see LZ77Compressor#prefill
366     */
367    public void prefill(final byte[] data, final int off, final int len) {
368        if (len > 0) {
369            final byte[] b = Arrays.copyOfRange(data, off, off + len);
370            compressor.prefill(b);
371            recordLiteral(b);
372        }
373    }
374
375    private void recordBackReference(final LZ77Compressor.BackReference block) {
376        expandedBlocks.addFirst(expand(block.getOffset(), block.getLength()));
377    }
378
379    private void recordLiteral(final byte[] b) {
380        expandedBlocks.addFirst(b);
381    }
382
383    private void rewriteLastPairs() {
384        final LinkedList<Pair> lastPairs = new LinkedList<>();
385        final LinkedList<Integer> pairLength = new LinkedList<>();
386        int offset = 0;
387        for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) {
388            final Pair p = it.next();
389            if (p.hasBeenWritten()) {
390                break;
391            }
392            final int len = p.length();
393            pairLength.addFirst(len);
394            lastPairs.addFirst(p);
395            offset += len;
396            if (offset >= MIN_OFFSET_OF_LAST_BACK_REFERENCE) {
397                break;
398            }
399        }
400        lastPairs.forEach(pairs::remove);
401        // lastPairs may contain between one and four Pairs:
402        // * the last pair may be a one byte literal
403        // * all other Pairs contain a back-reference which must be four bytes long at minimum
404        // we could merge them all into a single literal block but
405        // this may harm compression. For example compressing
406        // "bla.tar" from our tests yields a last block containing a
407        // back-reference of length > 2k and we'd end up with a last
408        // literal of that size rather than a 2k back-reference and a
409        // 12 byte literal at the end.
410
411        // Instead we merge all but the first of lastPairs into a new
412        // literal-only Pair "replacement" and look at the
413        // back-reference in the first of lastPairs and see if we can
414        // split it. We can split it if it is longer than 16 -
415        // replacement.length (i.e. the minimal length of four is kept
416        // while making sure the last literal is at least twelve bytes
417        // long). If we can't split it, we expand the first of the pairs
418        // as well.
419
420        // this is not optimal, we could get better compression
421        // results with more complex approaches as the last literal
422        // only needs to be five bytes long if the previous
423        // back-reference has an offset big enough
424
425        final int lastPairsSize = lastPairs.size();
426        int toExpand = 0;
427        for (int i = 1; i < lastPairsSize; i++) {
428            toExpand += pairLength.get(i);
429        }
430        final Pair replacement = new Pair();
431        if (toExpand > 0) {
432            replacement.prependLiteral(expand(toExpand, toExpand));
433        }
434        final Pair splitCandidate = lastPairs.get(0);
435        final int stillNeeded = MIN_OFFSET_OF_LAST_BACK_REFERENCE - toExpand;
436        final int brLen = splitCandidate.hasBackReference() ? splitCandidate.backReferenceLength() : 0;
437        if (splitCandidate.hasBackReference() && brLen >= MIN_BACK_REFERENCE_LENGTH + stillNeeded) {
438            replacement.prependLiteral(expand(toExpand + stillNeeded, stillNeeded));
439            pairs.add(splitCandidate.splitWithNewBackReferenceLengthOf(brLen - stillNeeded));
440        } else {
441            if (splitCandidate.hasBackReference()) {
442                replacement.prependLiteral(expand(toExpand + brLen, brLen));
443            }
444            splitCandidate.prependTo(replacement);
445        }
446        pairs.add(replacement);
447    }
448
449    @Override
450    public void write(final byte[] data, final int off, final int len) throws IOException {
451        compressor.compress(data, off, len);
452    }
453
454    @Override
455    public void write(final int b) throws IOException {
456        oneByte[0] = (byte) (b & 0xff);
457        write(oneByte);
458    }
459
460    private Pair writeBlocksAndReturnUnfinishedPair(final int length) throws IOException {
461        writeWritablePairs(length);
462        Pair last = pairs.peekLast();
463        if (last == null || last.hasBackReference()) {
464            last = new Pair();
465            pairs.addLast(last);
466        }
467        return last;
468    }
469
470    private void writeFinalLiteralBlock() throws IOException {
471        rewriteLastPairs();
472        for (final Pair p : pairs) {
473            if (!p.hasBeenWritten()) {
474                p.writeTo(out);
475            }
476        }
477        pairs.clear();
478    }
479
480    private void writeWritablePairs(final int lengthOfBlocksAfterLastPair) throws IOException {
481        int unwrittenLength = lengthOfBlocksAfterLastPair;
482        for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) {
483            final Pair p = it.next();
484            if (p.hasBeenWritten()) {
485                break;
486            }
487            unwrittenLength += p.length();
488        }
489        for (final Pair p : pairs) {
490            if (p.hasBeenWritten()) {
491                continue;
492            }
493            unwrittenLength -= p.length();
494            if (!p.canBeWritten(unwrittenLength)) {
495                break;
496            }
497            p.writeTo(out);
498        }
499    }
500}