001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.lz4;
020
021import java.io.IOException;
022import java.io.OutputStream;
023import java.util.Arrays;
024import java.util.Deque;
025import java.util.Iterator;
026import java.util.LinkedList;
027
028import org.apache.commons.compress.compressors.CompressorOutputStream;
029import org.apache.commons.compress.compressors.lz77support.LZ77Compressor;
030import org.apache.commons.compress.compressors.lz77support.Parameters;
031import org.apache.commons.compress.utils.ByteUtils;
032
033/**
034 * CompressorOutputStream for the LZ4 block format.
035 *
036 * @see <a href="http://lz4.github.io/lz4/lz4_Block_format.html">LZ4 Block Format Description</a>
037 * @since 1.14
038 * @NotThreadSafe
039 */
040public class BlockLZ4CompressorOutputStream extends CompressorOutputStream {
041
042    private static final int MIN_BACK_REFERENCE_LENGTH = 4;
043    private static final int MIN_OFFSET_OF_LAST_BACK_REFERENCE = 12;
044
045    /*
046
047      The LZ4 block format has a few properties that make it less
048      straight-forward than one would hope:
049
050      * literal blocks and back-references must come in pairs (except
051        for the very last literal block), so consecutive literal
052        blocks created by the compressor must be merged into a single
053        block.
054
055      * the start of a literal/back-reference pair contains the length
056        of the back-reference (at least some part of it) so we can't
057        start writing the literal before we know how long the next
058        back-reference is going to be.
059
060      * there are special rules for the final blocks
061
062        > There are specific parsing rules to respect in order to remain
063        > compatible with assumptions made by the decoder :
064        >
065        >     1. The last 5 bytes are always literals
066        >
067        >     2. The last match must start at least 12 bytes before end of
068        >        block. Consequently, a block with less than 13 bytes cannot be
069        >        compressed.
070
071        which means any back-reference may need to get rewritten as a
072        literal block unless we know the next block is at least of
073        length 5 and the sum of this block's length and offset and the
074        next block's length is at least twelve.
075
076    */
077
078    private final LZ77Compressor compressor;
079    private final OutputStream os;
080
081    // used in one-arg write method
082    private final byte[] oneByte = new byte[1];
083
084    private boolean finished;
085
086    private final Deque<Pair> pairs = new LinkedList<>();
087    // keeps track of the last window-size bytes (64k) in order to be
088    // able to expand back-references when needed
089    private final Deque<byte[]> expandedBlocks = new LinkedList<>();
090
091    /**
092     * Creates a new LZ4 output stream.
093     *
094     * @param os
095     *            An OutputStream to read compressed data from
096     *
097     * @throws IOException if reading fails
098     */
099    public BlockLZ4CompressorOutputStream(final OutputStream os) throws IOException {
100        this(os, createParameterBuilder().build());
101    }
102
103    /**
104     * Creates a new LZ4 output stream.
105     *
106     * @param os
107     *            An OutputStream to read compressed data from
108     * @param params
109     *            The parameters to use for LZ77 compression.
110     *
111     * @throws IOException if reading fails
112     */
113    public BlockLZ4CompressorOutputStream(final OutputStream os, final Parameters params) throws IOException {
114        this.os = os;
115        compressor = new LZ77Compressor(params,
116            block -> {
117                switch (block.getType()) {
118                case LITERAL:
119                    addLiteralBlock((LZ77Compressor.LiteralBlock) block);
120                    break;
121                case BACK_REFERENCE:
122                    addBackReference((LZ77Compressor.BackReference) block);
123                    break;
124                case EOD:
125                    writeFinalLiteralBlock();
126                    break;
127                }
128            });
129    }
130
131    @Override
132    public void write(final int b) throws IOException {
133        oneByte[0] = (byte) (b & 0xff);
134        write(oneByte);
135    }
136
137    @Override
138    public void write(final byte[] data, final int off, final int len) throws IOException {
139        compressor.compress(data, off, len);
140    }
141
142    @Override
143    public void close() throws IOException {
144        try {
145            finish();
146        } finally {
147            os.close();
148        }
149    }
150
151    /**
152     * Compresses all remaining data and writes it to the stream,
153     * doesn't close the underlying stream.
154     * @throws IOException if an error occurs
155     */
156    public void finish() throws IOException {
157        if (!finished) {
158            compressor.finish();
159            finished = true;
160        }
161    }
162
163    /**
164     * Adds some initial data to fill the window with.
165     *
166     * @param data the data to fill the window with.
167     * @param off offset of real data into the array
168     * @param len amount of data
169     * @throws IllegalStateException if the stream has already started to write data
170     * @see LZ77Compressor#prefill
171     */
172    public void prefill(final byte[] data, final int off, final int len) {
173        if (len > 0) {
174            final byte[] b = Arrays.copyOfRange(data, off, off + len);
175            compressor.prefill(b);
176            recordLiteral(b);
177        }
178    }
179
180    private void addLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException {
181        final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
182        recordLiteral(last.addLiteral(block));
183        clearUnusedBlocksAndPairs();
184    }
185
186    private void addBackReference(final LZ77Compressor.BackReference block) throws IOException {
187        final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
188        last.setBackReference(block);
189        recordBackReference(block);
190        clearUnusedBlocksAndPairs();
191    }
192
193    private Pair writeBlocksAndReturnUnfinishedPair(final int length) throws IOException {
194        writeWritablePairs(length);
195        Pair last = pairs.peekLast();
196        if (last == null || last.hasBackReference()) {
197            last = new Pair();
198            pairs.addLast(last);
199        }
200        return last;
201    }
202
203    private void recordLiteral(final byte[] b) {
204        expandedBlocks.addFirst(b);
205    }
206
207    private void clearUnusedBlocksAndPairs() {
208        clearUnusedBlocks();
209        clearUnusedPairs();
210    }
211
212    private void clearUnusedBlocks() {
213        int blockLengths = 0;
214        int blocksToKeep = 0;
215        for (final byte[] b : expandedBlocks) {
216            blocksToKeep++;
217            blockLengths += b.length;
218            if (blockLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
219                break;
220            }
221        }
222        final int size = expandedBlocks.size();
223        for (int i = blocksToKeep; i < size; i++) {
224            expandedBlocks.removeLast();
225        }
226    }
227
228    private void recordBackReference(final LZ77Compressor.BackReference block) {
229        expandedBlocks.addFirst(expand(block.getOffset(), block.getLength()));
230    }
231
232    private byte[] expand(final int offset, final int length) {
233        final byte[] expanded = new byte[length];
234        if (offset == 1) { // surprisingly common special case
235            final byte[] block = expandedBlocks.peekFirst();
236            final byte b = block[block.length - 1];
237            if (b != 0) { // the fresh array contains 0s anyway
238                Arrays.fill(expanded, b);
239            }
240        } else {
241            expandFromList(expanded, offset, length);
242        }
243        return expanded;
244    }
245
246    private void expandFromList(final byte[] expanded, final int offset, final int length) {
247        int offsetRemaining = offset;
248        int lengthRemaining = length;
249        int writeOffset = 0;
250        while (lengthRemaining > 0) {
251            // find block that contains offsetRemaining
252            byte[] block = null;
253            final int copyLen;
254            final int copyOffset;
255            if (offsetRemaining > 0) {
256                int blockOffset = 0;
257                for (final byte[] b : expandedBlocks) {
258                    if (b.length + blockOffset >= offsetRemaining) {
259                        block = b;
260                        break;
261                    }
262                    blockOffset += b.length;
263                }
264                if (block == null) {
265                    // should not be possible
266                    throw new IllegalStateException("Failed to find a block containing offset " + offset);
267                }
268                copyOffset = blockOffset + block.length - offsetRemaining;
269                copyLen = Math.min(lengthRemaining, block.length - copyOffset);
270            } else {
271                // offsetRemaining is negative or 0 and points into the expanded bytes
272                block = expanded;
273                copyOffset = -offsetRemaining;
274                copyLen = Math.min(lengthRemaining, writeOffset + offsetRemaining);
275            }
276            System.arraycopy(block, copyOffset, expanded, writeOffset, copyLen);
277            offsetRemaining -= copyLen;
278            lengthRemaining -= copyLen;
279            writeOffset += copyLen;
280        }
281    }
282
283    private void clearUnusedPairs() {
284        int pairLengths = 0;
285        int pairsToKeep = 0;
286        for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) {
287            final Pair p = it.next();
288            pairsToKeep++;
289            pairLengths += p.length();
290            if (pairLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
291                break;
292            }
293        }
294        final int size = pairs.size();
295        for (int i = pairsToKeep; i < size; i++) {
296            final Pair p = pairs.peekFirst();
297            if (!p.hasBeenWritten()) {
298                break;
299            }
300            pairs.removeFirst();
301        }
302    }
303
304    private void writeFinalLiteralBlock() throws IOException {
305        rewriteLastPairs();
306        for (final Pair p : pairs) {
307            if (!p.hasBeenWritten()) {
308                p.writeTo(os);
309            }
310        }
311        pairs.clear();
312    }
313
314    private void writeWritablePairs(final int lengthOfBlocksAfterLastPair) throws IOException {
315        int unwrittenLength = lengthOfBlocksAfterLastPair;
316        for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) {
317            final Pair p = it.next();
318            if (p.hasBeenWritten()) {
319                break;
320            }
321            unwrittenLength += p.length();
322        }
323        for (final Pair p : pairs) {
324            if (p.hasBeenWritten()) {
325                continue;
326            }
327            unwrittenLength -= p.length();
328            if (!p.canBeWritten(unwrittenLength)) {
329                break;
330            }
331            p.writeTo(os);
332        }
333    }
334
335    private void rewriteLastPairs() {
336        final LinkedList<Pair> lastPairs = new LinkedList<>();
337        final LinkedList<Integer> pairLength = new LinkedList<>();
338        int offset = 0;
339        for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) {
340            final Pair p = it.next();
341            if (p.hasBeenWritten()) {
342                break;
343            }
344            final int len = p.length();
345            pairLength.addFirst(len);
346            lastPairs.addFirst(p);
347            offset += len;
348            if (offset >= MIN_OFFSET_OF_LAST_BACK_REFERENCE) {
349                break;
350            }
351        }
352        for (final Pair p : lastPairs) {
353            pairs.remove(p);
354        }
355        // lastPairs may contain between one and four Pairs:
356        // * the last pair may be a one byte literal
357        // * all other Pairs contain a back-reference which must be four bytes long at minimum
358        // we could merge them all into a single literal block but
359        // this may harm compression. For example compressing
360        // "bla.tar" from our tests yields a last block containing a
361        // back-reference of length > 2k and we'd end up with a last
362        // literal of that size rather than a 2k back-reference and a
363        // 12 byte literal at the end.
364
365        // Instead we merge all but the first of lastPairs into a new
366        // literal-only Pair "replacement" and look at the
367        // back-reference in the first of lastPairs and see if we can
368        // split it. We can split it if it is longer than 16 -
369        // replacement.length (i.e. the minimal length of four is kept
370        // while making sure the last literal is at least twelve bytes
371        // long). If we can't split it, we expand the first of the pairs
372        // as well.
373
374        // this is not optimal, we could get better compression
375        // results with more complex approaches as the last literal
376        // only needs to be five bytes long if the previous
377        // back-reference has an offset big enough
378
379        final int lastPairsSize = lastPairs.size();
380        int toExpand = 0;
381        for (int i = 1; i < lastPairsSize; i++) {
382            toExpand += pairLength.get(i);
383        }
384        final Pair replacement = new Pair();
385        if (toExpand > 0) {
386            replacement.prependLiteral(expand(toExpand, toExpand));
387        }
388        final Pair splitCandidate = lastPairs.get(0);
389        final int stillNeeded = MIN_OFFSET_OF_LAST_BACK_REFERENCE - toExpand;
390        final int brLen = splitCandidate.hasBackReference() ? splitCandidate.backReferenceLength() : 0;
391        if (splitCandidate.hasBackReference() && brLen >= MIN_BACK_REFERENCE_LENGTH + stillNeeded) {
392            replacement.prependLiteral(expand(toExpand + stillNeeded, stillNeeded));
393            pairs.add(splitCandidate.splitWithNewBackReferenceLengthOf(brLen - stillNeeded));
394        } else {
395            if (splitCandidate.hasBackReference()) {
396                replacement.prependLiteral(expand(toExpand + brLen, brLen));
397            }
398            splitCandidate.prependTo(replacement);
399        }
400        pairs.add(replacement);
401    }
402
403    /**
404     * Returns a builder correctly configured for the LZ4 algorithm.
405     * @return a builder correctly configured for the LZ4 algorithm
406     */
407    public static Parameters.Builder createParameterBuilder() {
408        final int maxLen = BlockLZ4CompressorInputStream.WINDOW_SIZE - 1;
409        return Parameters.builder(BlockLZ4CompressorInputStream.WINDOW_SIZE)
410            .withMinBackReferenceLength(MIN_BACK_REFERENCE_LENGTH)
411            .withMaxBackReferenceLength(maxLen)
412            .withMaxOffset(maxLen)
413            .withMaxLiteralLength(maxLen);
414    }
415
416    final static class Pair {
417        private final Deque<byte[]> literals = new LinkedList<>();
418        private int brOffset, brLength;
419        private boolean written;
420
421        private void prependLiteral(final byte[] data) {
422            literals.addFirst(data);
423        }
424        byte[] addLiteral(final LZ77Compressor.LiteralBlock block) {
425            final byte[] copy = Arrays.copyOfRange(block.getData(), block.getOffset(),
426                block.getOffset() + block.getLength());
427            literals.add(copy);
428            return copy;
429        }
430        void setBackReference(final LZ77Compressor.BackReference block) {
431            if (hasBackReference()) {
432                throw new IllegalStateException();
433            }
434            brOffset = block.getOffset();
435            brLength = block.getLength();
436        }
437        boolean hasBackReference() {
438            return brOffset > 0;
439        }
440        boolean canBeWritten(final int lengthOfBlocksAfterThisPair) {
441            return hasBackReference()
442                && lengthOfBlocksAfterThisPair >= MIN_OFFSET_OF_LAST_BACK_REFERENCE + MIN_BACK_REFERENCE_LENGTH;
443        }
444        int length() {
445            return literalLength() + brLength;
446        }
447        private boolean hasBeenWritten() {
448            return written;
449        }
450        void writeTo(final OutputStream out) throws IOException {
451            final int litLength = literalLength();
452            out.write(lengths(litLength, brLength));
453            if (litLength >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
454                writeLength(litLength - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
455            }
456            for (final byte[] b : literals) {
457                out.write(b);
458            }
459            if (hasBackReference()) {
460                ByteUtils.toLittleEndian(out, brOffset, 2);
461                if (brLength - MIN_BACK_REFERENCE_LENGTH >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
462                    writeLength(brLength - MIN_BACK_REFERENCE_LENGTH
463                        - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
464                }
465            }
466            written = true;
467        }
468        private int literalLength() {
469            int length = 0;
470            for (final byte[] b : literals) {
471                length += b.length;
472            }
473            return length;
474        }
475        private static int lengths(final int litLength, final int brLength) {
476            final int l = litLength < 15 ? litLength : 15;
477            final int br = brLength < 4 ? 0 : (brLength < 19 ? brLength - 4 : 15);
478            return (l << BlockLZ4CompressorInputStream.SIZE_BITS) | br;
479        }
480        private static void writeLength(int length, final OutputStream out) throws IOException {
481            while (length >= 255) {
482                out.write(255);
483                length -= 255;
484            }
485            out.write(length);
486        }
487        private int backReferenceLength() {
488            return brLength;
489        }
490        private void prependTo(final Pair other) {
491            final Iterator<byte[]> listBackwards = literals.descendingIterator();
492            while (listBackwards.hasNext()) {
493                other.prependLiteral(listBackwards.next());
494            }
495        }
496        private Pair splitWithNewBackReferenceLengthOf(final int newBackReferenceLength) {
497            final Pair p = new Pair();
498            p.literals.addAll(literals);
499            p.brOffset = brOffset;
500            p.brLength = newBackReferenceLength;
501            return p;
502        }
503    }
504}