View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   * http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.commons.compress.compressors.lz4;
20  
21  import java.io.IOException;
22  import java.io.OutputStream;
23  import java.util.Arrays;
24  import java.util.Deque;
25  import java.util.Iterator;
26  import java.util.LinkedList;
27  
28  import org.apache.commons.compress.compressors.CompressorOutputStream;
29  import org.apache.commons.compress.compressors.lz77support.LZ77Compressor;
30  import org.apache.commons.compress.compressors.lz77support.Parameters;
31  import org.apache.commons.compress.utils.ByteUtils;
32  
33  /**
34   * CompressorOutputStream for the LZ4 block format.
35   *
36   * @see <a href="https://lz4.github.io/lz4/lz4_Block_format.html">LZ4 Block Format Description</a>
37   * @since 1.14
38   * @NotThreadSafe
39   */
40  public class BlockLZ4CompressorOutputStream extends CompressorOutputStream {
41  
42      static final class Pair {
43  
44          private static int lengths(final int litLength, final int brLength) {
45              final int l = Math.min(litLength, 15);
46              final int br = brLength < 4 ? 0 : brLength < 19 ? brLength - 4 : 15;
47              return l << BlockLZ4CompressorInputStream.SIZE_BITS | br;
48          }
49  
50          private static void writeLength(int length, final OutputStream out) throws IOException {
51              while (length >= 255) {
52                  out.write(255);
53                  length -= 255;
54              }
55              out.write(length);
56          }
57  
58          private final Deque<byte[]> literals = new LinkedList<>();
59  
60          private int literalLength;
61  
62          private int brOffset, brLength;
63  
64          private boolean written;
65  
66          byte[] addLiteral(final LZ77Compressor.LiteralBlock block) {
67              final byte[] copy = Arrays.copyOfRange(block.getData(), block.getOffset(), block.getOffset() + block.getLength());
68              literals.add(copy);
69              literalLength += copy.length;
70              return copy;
71          }
72  
73          private int backReferenceLength() {
74              return brLength;
75          }
76  
77          boolean canBeWritten(final int lengthOfBlocksAfterThisPair) {
78              return hasBackReference() && lengthOfBlocksAfterThisPair >= MIN_OFFSET_OF_LAST_BACK_REFERENCE + MIN_BACK_REFERENCE_LENGTH;
79          }
80  
81          boolean hasBackReference() {
82              return brOffset > 0;
83          }
84  
85          private boolean hasBeenWritten() {
86              return written;
87          }
88  
89          int length() {
90              return literalLength() + brLength;
91          }
92  
93          private int literalLength() {
94              // This method is performance sensitive
95              if (literalLength != 0) {
96                  return literalLength;
97              }
98              int length = 0;
99              for (final byte[] b : literals) {
100                 length += b.length;
101             }
102             return literalLength = length;
103         }
104 
105         private void prependLiteral(final byte[] data) {
106             literals.addFirst(data);
107             literalLength += data.length;
108         }
109 
110         private void prependTo(final Pair other) {
111             final Iterator<byte[]> listBackwards = literals.descendingIterator();
112             while (listBackwards.hasNext()) {
113                 other.prependLiteral(listBackwards.next());
114             }
115         }
116 
117         void setBackReference(final LZ77Compressor.BackReference block) {
118             if (hasBackReference()) {
119                 throw new IllegalStateException();
120             }
121             brOffset = block.getOffset();
122             brLength = block.getLength();
123         }
124 
125         private Pair splitWithNewBackReferenceLengthOf(final int newBackReferenceLength) {
126             final Pair p = new Pair();
127             p.literals.addAll(literals);
128             p.brOffset = brOffset;
129             p.brLength = newBackReferenceLength;
130             return p;
131         }
132 
133         void writeTo(final OutputStream out) throws IOException {
134             final int litLength = literalLength();
135             out.write(lengths(litLength, brLength));
136             if (litLength >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
137                 writeLength(litLength - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
138             }
139             for (final byte[] b : literals) {
140                 out.write(b);
141             }
142             if (hasBackReference()) {
143                 ByteUtils.toLittleEndian(out, brOffset, 2);
144                 if (brLength - MIN_BACK_REFERENCE_LENGTH >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
145                     writeLength(brLength - MIN_BACK_REFERENCE_LENGTH - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
146                 }
147             }
148             written = true;
149         }
150     }
151 
152     private static final int MIN_BACK_REFERENCE_LENGTH = 4;
153 
154     /*
155      *
156      * The LZ4 block format has a few properties that make it less straight-forward than one would hope:
157      *
158      * literal blocks and back-references must come in pairs (except for the very last literal block), so consecutive literal blocks created by the compressor
159      * must be merged into a single block.
160      *
161      * the start of a literal/back-reference pair contains the length of the back-reference (at least some part of it) so we can't start writing the literal
162      * before we know how long the next back-reference is going to be.
163      *
164      * there are special rules for the final blocks
165      *
166      * > There are specific parsing rules to respect in order to remain > compatible with assumptions made by the decoder : > > 1. The last 5 bytes are always
167      * literals > > 2. The last match must start at least 12 bytes before end of > block. Consequently, a block with less than 13 bytes cannot be > compressed.
168      *
169      * which means any back-reference may need to get rewritten as a literal block unless we know the next block is at least of length 5 and the sum of this
170      * block's length and offset and the next block's length is at least twelve.
171      */
172 
173     private static final int MIN_OFFSET_OF_LAST_BACK_REFERENCE = 12;
174 
175     /**
176      * Returns a builder correctly configured for the LZ4 algorithm.
177      *
178      * @return a builder correctly configured for the LZ4 algorithm
179      */
180     public static Parameters.Builder createParameterBuilder() {
181         final int maxLen = BlockLZ4CompressorInputStream.WINDOW_SIZE - 1;
182         return Parameters.builder(BlockLZ4CompressorInputStream.WINDOW_SIZE).withMinBackReferenceLength(MIN_BACK_REFERENCE_LENGTH)
183                 .withMaxBackReferenceLength(maxLen).withMaxOffset(maxLen).withMaxLiteralLength(maxLen);
184     }
185 
186     private final LZ77Compressor compressor;
187 
188     private final OutputStream os;
189 
190     // used in one-arg write method
191     private final byte[] oneByte = new byte[1];
192     private boolean finished;
193 
194     private final Deque<Pair> pairs = new LinkedList<>();
195 
196     // keeps track of the last window-size bytes (64k) in order to be
197     // able to expand back-references when needed
198     private final Deque<byte[]> expandedBlocks = new LinkedList<>();
199 
200     /**
201      * Creates a new LZ4 output stream.
202      *
203      * @param os An OutputStream to read compressed data from
204      */
205     public BlockLZ4CompressorOutputStream(final OutputStream os) {
206         this(os, createParameterBuilder().build());
207     }
208 
209     /**
210      * Creates a new LZ4 output stream.
211      *
212      * @param os     An OutputStream to read compressed data from
213      * @param params The parameters to use for LZ77 compression.
214      */
215     public BlockLZ4CompressorOutputStream(final OutputStream os, final Parameters params) {
216         this.os = os;
217         compressor = new LZ77Compressor(params, block -> {
218             switch (block.getType()) {
219             case LITERAL:
220                 addLiteralBlock((LZ77Compressor.LiteralBlock) block);
221                 break;
222             case BACK_REFERENCE:
223                 addBackReference((LZ77Compressor.BackReference) block);
224                 break;
225             case EOD:
226                 writeFinalLiteralBlock();
227                 break;
228             }
229         });
230     }
231 
232     private void addBackReference(final LZ77Compressor.BackReference block) throws IOException {
233         final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
234         last.setBackReference(block);
235         recordBackReference(block);
236         clearUnusedBlocksAndPairs();
237     }
238 
239     private void addLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException {
240         final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
241         recordLiteral(last.addLiteral(block));
242         clearUnusedBlocksAndPairs();
243     }
244 
245     private void clearUnusedBlocks() {
246         int blockLengths = 0;
247         int blocksToKeep = 0;
248         for (final byte[] b : expandedBlocks) {
249             blocksToKeep++;
250             blockLengths += b.length;
251             if (blockLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
252                 break;
253             }
254         }
255         final int size = expandedBlocks.size();
256         for (int i = blocksToKeep; i < size; i++) {
257             expandedBlocks.removeLast();
258         }
259     }
260 
261     private void clearUnusedBlocksAndPairs() {
262         clearUnusedBlocks();
263         clearUnusedPairs();
264     }
265 
266     private void clearUnusedPairs() {
267         int pairLengths = 0;
268         int pairsToKeep = 0;
269         for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) {
270             final Pair p = it.next();
271             pairsToKeep++;
272             pairLengths += p.length();
273             if (pairLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
274                 break;
275             }
276         }
277         final int size = pairs.size();
278         for (int i = pairsToKeep; i < size; i++) {
279             final Pair p = pairs.peekFirst();
280             if (!p.hasBeenWritten()) {
281                 break;
282             }
283             pairs.removeFirst();
284         }
285     }
286 
287     @Override
288     public void close() throws IOException {
289         try {
290             finish();
291         } finally {
292             os.close();
293         }
294     }
295 
296     private byte[] expand(final int offset, final int length) {
297         final byte[] expanded = new byte[length];
298         if (offset == 1) { // surprisingly common special case
299             final byte[] block = expandedBlocks.peekFirst();
300             final byte b = block[block.length - 1];
301             if (b != 0) { // the fresh array contains 0s anyway
302                 Arrays.fill(expanded, b);
303             }
304         } else {
305             expandFromList(expanded, offset, length);
306         }
307         return expanded;
308     }
309 
310     private void expandFromList(final byte[] expanded, final int offset, final int length) {
311         int offsetRemaining = offset;
312         int lengthRemaining = length;
313         int writeOffset = 0;
314         while (lengthRemaining > 0) {
315             // find block that contains offsetRemaining
316             byte[] block = null;
317             final int copyLen;
318             final int copyOffset;
319             if (offsetRemaining > 0) {
320                 int blockOffset = 0;
321                 for (final byte[] b : expandedBlocks) {
322                     if (b.length + blockOffset >= offsetRemaining) {
323                         block = b;
324                         break;
325                     }
326                     blockOffset += b.length;
327                 }
328                 if (block == null) {
329                     // should not be possible
330                     throw new IllegalStateException("Failed to find a block containing offset " + offset);
331                 }
332                 copyOffset = blockOffset + block.length - offsetRemaining;
333                 copyLen = Math.min(lengthRemaining, block.length - copyOffset);
334             } else {
335                 // offsetRemaining is negative or 0 and points into the expanded bytes
336                 block = expanded;
337                 copyOffset = -offsetRemaining;
338                 copyLen = Math.min(lengthRemaining, writeOffset + offsetRemaining);
339             }
340             System.arraycopy(block, copyOffset, expanded, writeOffset, copyLen);
341             offsetRemaining -= copyLen;
342             lengthRemaining -= copyLen;
343             writeOffset += copyLen;
344         }
345     }
346 
347     /**
348      * Compresses all remaining data and writes it to the stream, doesn't close the underlying stream.
349      *
350      * @throws IOException if an error occurs
351      */
352     public void finish() throws IOException {
353         if (!finished) {
354             compressor.finish();
355             finished = true;
356         }
357     }
358 
359     /**
360      * Adds some initial data to fill the window with.
361      *
362      * @param data the data to fill the window with.
363      * @param off  offset of real data into the array
364      * @param len  amount of data
365      * @throws IllegalStateException if the stream has already started to write data
366      * @see LZ77Compressor#prefill
367      */
368     public void prefill(final byte[] data, final int off, final int len) {
369         if (len > 0) {
370             final byte[] b = Arrays.copyOfRange(data, off, off + len);
371             compressor.prefill(b);
372             recordLiteral(b);
373         }
374     }
375 
376     private void recordBackReference(final LZ77Compressor.BackReference block) {
377         expandedBlocks.addFirst(expand(block.getOffset(), block.getLength()));
378     }
379 
380     private void recordLiteral(final byte[] b) {
381         expandedBlocks.addFirst(b);
382     }
383 
384     private void rewriteLastPairs() {
385         final LinkedList<Pair> lastPairs = new LinkedList<>();
386         final LinkedList<Integer> pairLength = new LinkedList<>();
387         int offset = 0;
388         for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) {
389             final Pair p = it.next();
390             if (p.hasBeenWritten()) {
391                 break;
392             }
393             final int len = p.length();
394             pairLength.addFirst(len);
395             lastPairs.addFirst(p);
396             offset += len;
397             if (offset >= MIN_OFFSET_OF_LAST_BACK_REFERENCE) {
398                 break;
399             }
400         }
401         lastPairs.forEach(pairs::remove);
402         // lastPairs may contain between one and four Pairs:
403         // * the last pair may be a one byte literal
404         // * all other Pairs contain a back-reference which must be four bytes long at minimum
405         // we could merge them all into a single literal block but
406         // this may harm compression. For example compressing
407         // "bla.tar" from our tests yields a last block containing a
408         // back-reference of length > 2k and we'd end up with a last
409         // literal of that size rather than a 2k back-reference and a
410         // 12 byte literal at the end.
411 
412         // Instead we merge all but the first of lastPairs into a new
413         // literal-only Pair "replacement" and look at the
414         // back-reference in the first of lastPairs and see if we can
415         // split it. We can split it if it is longer than 16 -
416         // replacement.length (i.e. the minimal length of four is kept
417         // while making sure the last literal is at least twelve bytes
418         // long). If we can't split it, we expand the first of the pairs
419         // as well.
420 
421         // this is not optimal, we could get better compression
422         // results with more complex approaches as the last literal
423         // only needs to be five bytes long if the previous
424         // back-reference has an offset big enough
425 
426         final int lastPairsSize = lastPairs.size();
427         int toExpand = 0;
428         for (int i = 1; i < lastPairsSize; i++) {
429             toExpand += pairLength.get(i);
430         }
431         final Pair replacement = new Pair();
432         if (toExpand > 0) {
433             replacement.prependLiteral(expand(toExpand, toExpand));
434         }
435         final Pair splitCandidate = lastPairs.get(0);
436         final int stillNeeded = MIN_OFFSET_OF_LAST_BACK_REFERENCE - toExpand;
437         final int brLen = splitCandidate.hasBackReference() ? splitCandidate.backReferenceLength() : 0;
438         if (splitCandidate.hasBackReference() && brLen >= MIN_BACK_REFERENCE_LENGTH + stillNeeded) {
439             replacement.prependLiteral(expand(toExpand + stillNeeded, stillNeeded));
440             pairs.add(splitCandidate.splitWithNewBackReferenceLengthOf(brLen - stillNeeded));
441         } else {
442             if (splitCandidate.hasBackReference()) {
443                 replacement.prependLiteral(expand(toExpand + brLen, brLen));
444             }
445             splitCandidate.prependTo(replacement);
446         }
447         pairs.add(replacement);
448     }
449 
450     @Override
451     public void write(final byte[] data, final int off, final int len) throws IOException {
452         compressor.compress(data, off, len);
453     }
454 
455     @Override
456     public void write(final int b) throws IOException {
457         oneByte[0] = (byte) (b & 0xff);
458         write(oneByte);
459     }
460 
461     private Pair writeBlocksAndReturnUnfinishedPair(final int length) throws IOException {
462         writeWritablePairs(length);
463         Pair last = pairs.peekLast();
464         if (last == null || last.hasBackReference()) {
465             last = new Pair();
466             pairs.addLast(last);
467         }
468         return last;
469     }
470 
471     private void writeFinalLiteralBlock() throws IOException {
472         rewriteLastPairs();
473         for (final Pair p : pairs) {
474             if (!p.hasBeenWritten()) {
475                 p.writeTo(os);
476             }
477         }
478         pairs.clear();
479     }
480 
481     private void writeWritablePairs(final int lengthOfBlocksAfterLastPair) throws IOException {
482         int unwrittenLength = lengthOfBlocksAfterLastPair;
483         for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) {
484             final Pair p = it.next();
485             if (p.hasBeenWritten()) {
486                 break;
487             }
488             unwrittenLength += p.length();
489         }
490         for (final Pair p : pairs) {
491             if (p.hasBeenWritten()) {
492                 continue;
493             }
494             unwrittenLength -= p.length();
495             if (!p.canBeWritten(unwrittenLength)) {
496                 break;
497             }
498             p.writeTo(os);
499         }
500     }
501 }