View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   https://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.commons.compress.compressors.lz4;
20  
21  import java.io.IOException;
22  import java.io.OutputStream;
23  import java.util.Arrays;
24  import java.util.Deque;
25  import java.util.Iterator;
26  import java.util.LinkedList;
27  
28  import org.apache.commons.compress.compressors.CompressorOutputStream;
29  import org.apache.commons.compress.compressors.lz77support.LZ77Compressor;
30  import org.apache.commons.compress.compressors.lz77support.Parameters;
31  import org.apache.commons.compress.utils.ByteUtils;
32  
33  /**
34   * CompressorOutputStream for the LZ4 block format.
35   *
36   * @see <a href="https://lz4.github.io/lz4/lz4_Block_format.html">LZ4 Block Format Description</a>
37   * @since 1.14
38   * @NotThreadSafe
39   */
40  public class BlockLZ4CompressorOutputStream extends CompressorOutputStream<OutputStream> {
41  
42      static final class Pair {
43  
44          private static int lengths(final int litLength, final int brLength) {
45              final int l = Math.min(litLength, 15);
46              final int br = brLength < 4 ? 0 : brLength < 19 ? brLength - 4 : 15;
47              return l << BlockLZ4CompressorInputStream.SIZE_BITS | br;
48          }
49  
50          private static void writeLength(int length, final OutputStream out) throws IOException {
51              while (length >= 255) {
52                  out.write(255);
53                  length -= 255;
54              }
55              out.write(length);
56          }
57  
58          private final Deque<byte[]> literals = new LinkedList<>();
59  
60          private int literalLength;
61  
62          private int brOffset;
63  
64          private int brLength;
65  
66          private boolean written;
67  
68          byte[] addLiteral(final LZ77Compressor.LiteralBlock block) {
69              final byte[] copy = Arrays.copyOfRange(block.getData(), block.getOffset(), block.getOffset() + block.getLength());
70              literals.add(copy);
71              literalLength += copy.length;
72              return copy;
73          }
74  
75          private int backReferenceLength() {
76              return brLength;
77          }
78  
79          boolean canBeWritten(final int lengthOfBlocksAfterThisPair) {
80              return hasBackReference() && lengthOfBlocksAfterThisPair >= MIN_OFFSET_OF_LAST_BACK_REFERENCE + MIN_BACK_REFERENCE_LENGTH;
81          }
82  
83          boolean hasBackReference() {
84              return brOffset > 0;
85          }
86  
87          private boolean hasBeenWritten() {
88              return written;
89          }
90  
91          int length() {
92              return literalLength() + brLength;
93          }
94  
95          private int literalLength() {
96              // This method is performance sensitive
97              if (literalLength != 0) {
98                  return literalLength;
99              }
100             int length = 0;
101             for (final byte[] b : literals) {
102                 length += b.length;
103             }
104             return literalLength = length;
105         }
106 
107         private void prependLiteral(final byte[] data) {
108             literals.addFirst(data);
109             literalLength += data.length;
110         }
111 
112         private void prependTo(final Pair other) {
113             final Iterator<byte[]> listBackwards = literals.descendingIterator();
114             while (listBackwards.hasNext()) {
115                 other.prependLiteral(listBackwards.next());
116             }
117         }
118 
119         void setBackReference(final LZ77Compressor.BackReference block) {
120             if (hasBackReference()) {
121                 throw new IllegalStateException();
122             }
123             brOffset = block.getOffset();
124             brLength = block.getLength();
125         }
126 
127         private Pair splitWithNewBackReferenceLengthOf(final int newBackReferenceLength) {
128             final Pair p = new Pair();
129             p.literals.addAll(literals);
130             p.brOffset = brOffset;
131             p.brLength = newBackReferenceLength;
132             return p;
133         }
134 
135         void writeTo(final OutputStream out) throws IOException {
136             final int litLength = literalLength();
137             out.write(lengths(litLength, brLength));
138             if (litLength >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
139                 writeLength(litLength - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
140             }
141             for (final byte[] b : literals) {
142                 out.write(b);
143             }
144             if (hasBackReference()) {
145                 ByteUtils.toLittleEndian(out, brOffset, 2);
146                 if (brLength - MIN_BACK_REFERENCE_LENGTH >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
147                     writeLength(brLength - MIN_BACK_REFERENCE_LENGTH - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
148                 }
149             }
150             written = true;
151         }
152     }
153 
154     private static final int MIN_BACK_REFERENCE_LENGTH = 4;
155 
156     /*
157      *
158      * The LZ4 block format has a few properties that make it less straight-forward than one would hope:
159      *
160      * literal blocks and back-references must come in pairs (except for the very last literal block), so consecutive literal blocks created by the compressor
161      * must be merged into a single block.
162      *
163      * the start of a literal/back-reference pair contains the length of the back-reference (at least some part of it) so we can't start writing the literal
164      * before we know how long the next back-reference is going to be.
165      *
166      * there are special rules for the final blocks
167      *
168      * > There are specific parsing rules to respect in order to remain > compatible with assumptions made by the decoder : > > 1. The last 5 bytes are always
169      * literals > > 2. The last match must start at least 12 bytes before end of > block. Consequently, a block with less than 13 bytes cannot be > compressed.
170      *
171      * which means any back-reference may need to get rewritten as a literal block unless we know the next block is at least of length 5 and the sum of this
172      * block's length and offset and the next block's length is at least twelve.
173      */
174 
175     private static final int MIN_OFFSET_OF_LAST_BACK_REFERENCE = 12;
176 
177     /**
178      * Returns a builder correctly configured for the LZ4 algorithm.
179      *
180      * @return a builder correctly configured for the LZ4 algorithm
181      */
182     public static Parameters.Builder createParameterBuilder() {
183         final int maxLen = BlockLZ4CompressorInputStream.WINDOW_SIZE - 1;
184         return Parameters.builder(BlockLZ4CompressorInputStream.WINDOW_SIZE).withMinBackReferenceLength(MIN_BACK_REFERENCE_LENGTH)
185                 .withMaxBackReferenceLength(maxLen).withMaxOffset(maxLen).withMaxLiteralLength(maxLen);
186     }
187 
188     private final LZ77Compressor compressor;
189 
190     // used in one-arg write method
191     private final byte[] oneByte = new byte[1];
192     private final Deque<Pair> pairs = new LinkedList<>();
193 
194     // keeps track of the last window-size bytes (64k) in order to be
195     // able to expand back-references when needed
196     private final Deque<byte[]> expandedBlocks = new LinkedList<>();
197 
198     /**
199      * Creates a new LZ4 output stream.
200      *
201      * @param out An OutputStream to read compressed data from
202      */
203     public BlockLZ4CompressorOutputStream(final OutputStream out) {
204         this(out, createParameterBuilder().build());
205     }
206 
207     /**
208      * Creates a new LZ4 output stream.
209      *
210      * @param out     An OutputStream to read compressed data from
211      * @param params The parameters to use for LZ77 compression.
212      */
213     public BlockLZ4CompressorOutputStream(final OutputStream out, final Parameters params) {
214         super(out);
215         compressor = new LZ77Compressor(params, block -> {
216             switch (block.getType()) {
217             case LITERAL:
218                 addLiteralBlock((LZ77Compressor.LiteralBlock) block);
219                 break;
220             case BACK_REFERENCE:
221                 addBackReference((LZ77Compressor.BackReference) block);
222                 break;
223             case EOD:
224                 writeFinalLiteralBlock();
225                 break;
226             }
227         });
228     }
229 
230     private void addBackReference(final LZ77Compressor.BackReference block) throws IOException {
231         final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
232         last.setBackReference(block);
233         recordBackReference(block);
234         clearUnusedBlocksAndPairs();
235     }
236 
237     private void addLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException {
238         final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
239         recordLiteral(last.addLiteral(block));
240         clearUnusedBlocksAndPairs();
241     }
242 
243     private void clearUnusedBlocks() {
244         int blockLengths = 0;
245         int blocksToKeep = 0;
246         for (final byte[] b : expandedBlocks) {
247             blocksToKeep++;
248             blockLengths += b.length;
249             if (blockLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
250                 break;
251             }
252         }
253         final int size = expandedBlocks.size();
254         for (int i = blocksToKeep; i < size; i++) {
255             expandedBlocks.removeLast();
256         }
257     }
258 
259     private void clearUnusedBlocksAndPairs() {
260         clearUnusedBlocks();
261         clearUnusedPairs();
262     }
263 
264     private void clearUnusedPairs() {
265         int pairLengths = 0;
266         int pairsToKeep = 0;
267         for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) {
268             final Pair p = it.next();
269             pairsToKeep++;
270             pairLengths += p.length();
271             if (pairLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
272                 break;
273             }
274         }
275         final int size = pairs.size();
276         for (int i = pairsToKeep; i < size; i++) {
277             final Pair p = pairs.peekFirst();
278             if (!p.hasBeenWritten()) {
279                 break;
280             }
281             pairs.removeFirst();
282         }
283     }
284 
285     @Override
286     public void close() throws IOException {
287         try {
288             finish();
289         } finally {
290             super.close();
291         }
292     }
293 
294     private byte[] expand(final int offset, final int length) {
295         final byte[] expanded = new byte[length];
296         if (offset == 1) { // surprisingly common special case
297             final byte[] block = expandedBlocks.peekFirst();
298             final byte b = block[block.length - 1];
299             if (b != 0) { // the fresh array contains 0s anyway
300                 Arrays.fill(expanded, b);
301             }
302         } else {
303             expandFromList(expanded, offset, length);
304         }
305         return expanded;
306     }
307 
308     private void expandFromList(final byte[] expanded, final int offset, final int length) {
309         int offsetRemaining = offset;
310         int lengthRemaining = length;
311         int writeOffset = 0;
312         while (lengthRemaining > 0) {
313             // find block that contains offsetRemaining
314             byte[] block = null;
315             final int copyLen;
316             final int copyOffset;
317             if (offsetRemaining > 0) {
318                 int blockOffset = 0;
319                 for (final byte[] b : expandedBlocks) {
320                     if (b.length + blockOffset >= offsetRemaining) {
321                         block = b;
322                         break;
323                     }
324                     blockOffset += b.length;
325                 }
326                 if (block == null) {
327                     // should not be possible
328                     throw new IllegalStateException("Failed to find a block containing offset " + offset);
329                 }
330                 copyOffset = blockOffset + block.length - offsetRemaining;
331                 copyLen = Math.min(lengthRemaining, block.length - copyOffset);
332             } else {
333                 // offsetRemaining is negative or 0 and points into the expanded bytes
334                 block = expanded;
335                 copyOffset = -offsetRemaining;
336                 copyLen = Math.min(lengthRemaining, writeOffset + offsetRemaining);
337             }
338             System.arraycopy(block, copyOffset, expanded, writeOffset, copyLen);
339             offsetRemaining -= copyLen;
340             lengthRemaining -= copyLen;
341             writeOffset += copyLen;
342         }
343     }
344 
345     /**
346      * Compresses all remaining data and writes it to the stream, doesn't close the underlying stream.
347      *
348      * @throws IOException if an error occurs
349      */
350     @Override
351     public void finish() throws IOException {
352         if (!isFinished()) {
353             compressor.finish();
354             super.finish();
355         }
356     }
357 
358     /**
359      * Adds some initial data to fill the window with.
360      *
361      * @param data the data to fill the window with.
362      * @param off  offset of real data into the array
363      * @param len  amount of data
364      * @throws IllegalStateException if the stream has already started to write data
365      * @see LZ77Compressor#prefill
366      */
367     public void prefill(final byte[] data, final int off, final int len) {
368         if (len > 0) {
369             final byte[] b = Arrays.copyOfRange(data, off, off + len);
370             compressor.prefill(b);
371             recordLiteral(b);
372         }
373     }
374 
375     private void recordBackReference(final LZ77Compressor.BackReference block) {
376         expandedBlocks.addFirst(expand(block.getOffset(), block.getLength()));
377     }
378 
379     private void recordLiteral(final byte[] b) {
380         expandedBlocks.addFirst(b);
381     }
382 
383     private void rewriteLastPairs() {
384         final LinkedList<Pair> lastPairs = new LinkedList<>();
385         final LinkedList<Integer> pairLength = new LinkedList<>();
386         int offset = 0;
387         for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) {
388             final Pair p = it.next();
389             if (p.hasBeenWritten()) {
390                 break;
391             }
392             final int len = p.length();
393             pairLength.addFirst(len);
394             lastPairs.addFirst(p);
395             offset += len;
396             if (offset >= MIN_OFFSET_OF_LAST_BACK_REFERENCE) {
397                 break;
398             }
399         }
400         lastPairs.forEach(pairs::remove);
401         // lastPairs may contain between one and four Pairs:
402         // * the last pair may be a one byte literal
403         // * all other Pairs contain a back-reference which must be four bytes long at minimum
404         // we could merge them all into a single literal block but
405         // this may harm compression. For example compressing
406         // "bla.tar" from our tests yields a last block containing a
407         // back-reference of length > 2k and we'd end up with a last
408         // literal of that size rather than a 2k back-reference and a
409         // 12 byte literal at the end.
410 
411         // Instead we merge all but the first of lastPairs into a new
412         // literal-only Pair "replacement" and look at the
413         // back-reference in the first of lastPairs and see if we can
414         // split it. We can split it if it is longer than 16 -
415         // replacement.length (i.e. the minimal length of four is kept
416         // while making sure the last literal is at least twelve bytes
417         // long). If we can't split it, we expand the first of the pairs
418         // as well.
419 
420         // this is not optimal, we could get better compression
421         // results with more complex approaches as the last literal
422         // only needs to be five bytes long if the previous
423         // back-reference has an offset big enough
424 
425         final int lastPairsSize = lastPairs.size();
426         int toExpand = 0;
427         for (int i = 1; i < lastPairsSize; i++) {
428             toExpand += pairLength.get(i);
429         }
430         final Pair replacement = new Pair();
431         if (toExpand > 0) {
432             replacement.prependLiteral(expand(toExpand, toExpand));
433         }
434         final Pair splitCandidate = lastPairs.get(0);
435         final int stillNeeded = MIN_OFFSET_OF_LAST_BACK_REFERENCE - toExpand;
436         final int brLen = splitCandidate.hasBackReference() ? splitCandidate.backReferenceLength() : 0;
437         if (splitCandidate.hasBackReference() && brLen >= MIN_BACK_REFERENCE_LENGTH + stillNeeded) {
438             replacement.prependLiteral(expand(toExpand + stillNeeded, stillNeeded));
439             pairs.add(splitCandidate.splitWithNewBackReferenceLengthOf(brLen - stillNeeded));
440         } else {
441             if (splitCandidate.hasBackReference()) {
442                 replacement.prependLiteral(expand(toExpand + brLen, brLen));
443             }
444             splitCandidate.prependTo(replacement);
445         }
446         pairs.add(replacement);
447     }
448 
449     @Override
450     public void write(final byte[] data, final int off, final int len) throws IOException {
451         compressor.compress(data, off, len);
452     }
453 
454     @Override
455     public void write(final int b) throws IOException {
456         oneByte[0] = (byte) (b & 0xff);
457         write(oneByte);
458     }
459 
460     private Pair writeBlocksAndReturnUnfinishedPair(final int length) throws IOException {
461         writeWritablePairs(length);
462         Pair last = pairs.peekLast();
463         if (last == null || last.hasBackReference()) {
464             last = new Pair();
465             pairs.addLast(last);
466         }
467         return last;
468     }
469 
470     private void writeFinalLiteralBlock() throws IOException {
471         rewriteLastPairs();
472         for (final Pair p : pairs) {
473             if (!p.hasBeenWritten()) {
474                 p.writeTo(out);
475             }
476         }
477         pairs.clear();
478     }
479 
480     private void writeWritablePairs(final int lengthOfBlocksAfterLastPair) throws IOException {
481         int unwrittenLength = lengthOfBlocksAfterLastPair;
482         for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) {
483             final Pair p = it.next();
484             if (p.hasBeenWritten()) {
485                 break;
486             }
487             unwrittenLength += p.length();
488         }
489         for (final Pair p : pairs) {
490             if (p.hasBeenWritten()) {
491                 continue;
492             }
493             unwrittenLength -= p.length();
494             if (!p.canBeWritten(unwrittenLength)) {
495                 break;
496             }
497             p.writeTo(out);
498         }
499     }
500 }