001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.lz4; 020 021import java.io.IOException; 022import java.io.OutputStream; 023import java.util.Arrays; 024import java.util.Deque; 025import java.util.Iterator; 026import java.util.LinkedList; 027 028import org.apache.commons.compress.compressors.CompressorOutputStream; 029import org.apache.commons.compress.compressors.lz77support.LZ77Compressor; 030import org.apache.commons.compress.compressors.lz77support.Parameters; 031import org.apache.commons.compress.utils.ByteUtils; 032 033/** 034 * CompressorOutputStream for the LZ4 block format. 035 * 036 * @see <a href="https://lz4.github.io/lz4/lz4_Block_format.html">LZ4 Block Format Description</a> 037 * @since 1.14 038 * @NotThreadSafe 039 */ 040public class BlockLZ4CompressorOutputStream extends CompressorOutputStream { 041 042 static final class Pair { 043 044 private static int lengths(final int litLength, final int brLength) { 045 final int l = Math.min(litLength, 15); 046 final int br = brLength < 4 ? 0 : brLength < 19 ? brLength - 4 : 15; 047 return l << BlockLZ4CompressorInputStream.SIZE_BITS | br; 048 } 049 050 private static void writeLength(int length, final OutputStream out) throws IOException { 051 while (length >= 255) { 052 out.write(255); 053 length -= 255; 054 } 055 out.write(length); 056 } 057 058 private final Deque<byte[]> literals = new LinkedList<>(); 059 060 private int literalLength; 061 062 private int brOffset, brLength; 063 064 private boolean written; 065 066 byte[] addLiteral(final LZ77Compressor.LiteralBlock block) { 067 final byte[] copy = Arrays.copyOfRange(block.getData(), block.getOffset(), block.getOffset() + block.getLength()); 068 literals.add(copy); 069 literalLength += copy.length; 070 return copy; 071 } 072 073 private int backReferenceLength() { 074 return brLength; 075 } 076 077 boolean canBeWritten(final int lengthOfBlocksAfterThisPair) { 078 return hasBackReference() && lengthOfBlocksAfterThisPair >= MIN_OFFSET_OF_LAST_BACK_REFERENCE + MIN_BACK_REFERENCE_LENGTH; 079 } 080 081 boolean hasBackReference() { 082 return brOffset > 0; 083 } 084 085 private boolean hasBeenWritten() { 086 return written; 087 } 088 089 int length() { 090 return literalLength() + brLength; 091 } 092 093 private int literalLength() { 094 // This method is performance sensitive 095 if (literalLength != 0) { 096 return literalLength; 097 } 098 int length = 0; 099 for (final byte[] b : literals) { 100 length += b.length; 101 } 102 return literalLength = length; 103 } 104 105 private void prependLiteral(final byte[] data) { 106 literals.addFirst(data); 107 literalLength += data.length; 108 } 109 110 private void prependTo(final Pair other) { 111 final Iterator<byte[]> listBackwards = literals.descendingIterator(); 112 while (listBackwards.hasNext()) { 113 other.prependLiteral(listBackwards.next()); 114 } 115 } 116 117 void setBackReference(final LZ77Compressor.BackReference block) { 118 if (hasBackReference()) { 119 throw new IllegalStateException(); 120 } 121 brOffset = block.getOffset(); 122 brLength = block.getLength(); 123 } 124 125 private Pair splitWithNewBackReferenceLengthOf(final int newBackReferenceLength) { 126 final Pair p = new Pair(); 127 p.literals.addAll(literals); 128 p.brOffset = brOffset; 129 p.brLength = newBackReferenceLength; 130 return p; 131 } 132 133 void writeTo(final OutputStream out) throws IOException { 134 final int litLength = literalLength(); 135 out.write(lengths(litLength, brLength)); 136 if (litLength >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) { 137 writeLength(litLength - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out); 138 } 139 for (final byte[] b : literals) { 140 out.write(b); 141 } 142 if (hasBackReference()) { 143 ByteUtils.toLittleEndian(out, brOffset, 2); 144 if (brLength - MIN_BACK_REFERENCE_LENGTH >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) { 145 writeLength(brLength - MIN_BACK_REFERENCE_LENGTH - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out); 146 } 147 } 148 written = true; 149 } 150 } 151 152 private static final int MIN_BACK_REFERENCE_LENGTH = 4; 153 154 /* 155 * 156 * The LZ4 block format has a few properties that make it less straight-forward than one would hope: 157 * 158 * literal blocks and back-references must come in pairs (except for the very last literal block), so consecutive literal blocks created by the compressor 159 * must be merged into a single block. 160 * 161 * the start of a literal/back-reference pair contains the length of the back-reference (at least some part of it) so we can't start writing the literal 162 * before we know how long the next back-reference is going to be. 163 * 164 * there are special rules for the final blocks 165 * 166 * > There are specific parsing rules to respect in order to remain > compatible with assumptions made by the decoder : > > 1. The last 5 bytes are always 167 * literals > > 2. The last match must start at least 12 bytes before end of > block. Consequently, a block with less than 13 bytes cannot be > compressed. 168 * 169 * which means any back-reference may need to get rewritten as a literal block unless we know the next block is at least of length 5 and the sum of this 170 * block's length and offset and the next block's length is at least twelve. 171 */ 172 173 private static final int MIN_OFFSET_OF_LAST_BACK_REFERENCE = 12; 174 175 /** 176 * Returns a builder correctly configured for the LZ4 algorithm. 177 * 178 * @return a builder correctly configured for the LZ4 algorithm 179 */ 180 public static Parameters.Builder createParameterBuilder() { 181 final int maxLen = BlockLZ4CompressorInputStream.WINDOW_SIZE - 1; 182 return Parameters.builder(BlockLZ4CompressorInputStream.WINDOW_SIZE).withMinBackReferenceLength(MIN_BACK_REFERENCE_LENGTH) 183 .withMaxBackReferenceLength(maxLen).withMaxOffset(maxLen).withMaxLiteralLength(maxLen); 184 } 185 186 private final LZ77Compressor compressor; 187 188 private final OutputStream os; 189 190 // used in one-arg write method 191 private final byte[] oneByte = new byte[1]; 192 private boolean finished; 193 194 private final Deque<Pair> pairs = new LinkedList<>(); 195 196 // keeps track of the last window-size bytes (64k) in order to be 197 // able to expand back-references when needed 198 private final Deque<byte[]> expandedBlocks = new LinkedList<>(); 199 200 /** 201 * Creates a new LZ4 output stream. 202 * 203 * @param os An OutputStream to read compressed data from 204 */ 205 public BlockLZ4CompressorOutputStream(final OutputStream os) { 206 this(os, createParameterBuilder().build()); 207 } 208 209 /** 210 * Creates a new LZ4 output stream. 211 * 212 * @param os An OutputStream to read compressed data from 213 * @param params The parameters to use for LZ77 compression. 214 */ 215 public BlockLZ4CompressorOutputStream(final OutputStream os, final Parameters params) { 216 this.os = os; 217 compressor = new LZ77Compressor(params, block -> { 218 switch (block.getType()) { 219 case LITERAL: 220 addLiteralBlock((LZ77Compressor.LiteralBlock) block); 221 break; 222 case BACK_REFERENCE: 223 addBackReference((LZ77Compressor.BackReference) block); 224 break; 225 case EOD: 226 writeFinalLiteralBlock(); 227 break; 228 } 229 }); 230 } 231 232 private void addBackReference(final LZ77Compressor.BackReference block) throws IOException { 233 final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength()); 234 last.setBackReference(block); 235 recordBackReference(block); 236 clearUnusedBlocksAndPairs(); 237 } 238 239 private void addLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException { 240 final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength()); 241 recordLiteral(last.addLiteral(block)); 242 clearUnusedBlocksAndPairs(); 243 } 244 245 private void clearUnusedBlocks() { 246 int blockLengths = 0; 247 int blocksToKeep = 0; 248 for (final byte[] b : expandedBlocks) { 249 blocksToKeep++; 250 blockLengths += b.length; 251 if (blockLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) { 252 break; 253 } 254 } 255 final int size = expandedBlocks.size(); 256 for (int i = blocksToKeep; i < size; i++) { 257 expandedBlocks.removeLast(); 258 } 259 } 260 261 private void clearUnusedBlocksAndPairs() { 262 clearUnusedBlocks(); 263 clearUnusedPairs(); 264 } 265 266 private void clearUnusedPairs() { 267 int pairLengths = 0; 268 int pairsToKeep = 0; 269 for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) { 270 final Pair p = it.next(); 271 pairsToKeep++; 272 pairLengths += p.length(); 273 if (pairLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) { 274 break; 275 } 276 } 277 final int size = pairs.size(); 278 for (int i = pairsToKeep; i < size; i++) { 279 final Pair p = pairs.peekFirst(); 280 if (!p.hasBeenWritten()) { 281 break; 282 } 283 pairs.removeFirst(); 284 } 285 } 286 287 @Override 288 public void close() throws IOException { 289 try { 290 finish(); 291 } finally { 292 os.close(); 293 } 294 } 295 296 private byte[] expand(final int offset, final int length) { 297 final byte[] expanded = new byte[length]; 298 if (offset == 1) { // surprisingly common special case 299 final byte[] block = expandedBlocks.peekFirst(); 300 final byte b = block[block.length - 1]; 301 if (b != 0) { // the fresh array contains 0s anyway 302 Arrays.fill(expanded, b); 303 } 304 } else { 305 expandFromList(expanded, offset, length); 306 } 307 return expanded; 308 } 309 310 private void expandFromList(final byte[] expanded, final int offset, final int length) { 311 int offsetRemaining = offset; 312 int lengthRemaining = length; 313 int writeOffset = 0; 314 while (lengthRemaining > 0) { 315 // find block that contains offsetRemaining 316 byte[] block = null; 317 final int copyLen; 318 final int copyOffset; 319 if (offsetRemaining > 0) { 320 int blockOffset = 0; 321 for (final byte[] b : expandedBlocks) { 322 if (b.length + blockOffset >= offsetRemaining) { 323 block = b; 324 break; 325 } 326 blockOffset += b.length; 327 } 328 if (block == null) { 329 // should not be possible 330 throw new IllegalStateException("Failed to find a block containing offset " + offset); 331 } 332 copyOffset = blockOffset + block.length - offsetRemaining; 333 copyLen = Math.min(lengthRemaining, block.length - copyOffset); 334 } else { 335 // offsetRemaining is negative or 0 and points into the expanded bytes 336 block = expanded; 337 copyOffset = -offsetRemaining; 338 copyLen = Math.min(lengthRemaining, writeOffset + offsetRemaining); 339 } 340 System.arraycopy(block, copyOffset, expanded, writeOffset, copyLen); 341 offsetRemaining -= copyLen; 342 lengthRemaining -= copyLen; 343 writeOffset += copyLen; 344 } 345 } 346 347 /** 348 * Compresses all remaining data and writes it to the stream, doesn't close the underlying stream. 349 * 350 * @throws IOException if an error occurs 351 */ 352 public void finish() throws IOException { 353 if (!finished) { 354 compressor.finish(); 355 finished = true; 356 } 357 } 358 359 /** 360 * Adds some initial data to fill the window with. 361 * 362 * @param data the data to fill the window with. 363 * @param off offset of real data into the array 364 * @param len amount of data 365 * @throws IllegalStateException if the stream has already started to write data 366 * @see LZ77Compressor#prefill 367 */ 368 public void prefill(final byte[] data, final int off, final int len) { 369 if (len > 0) { 370 final byte[] b = Arrays.copyOfRange(data, off, off + len); 371 compressor.prefill(b); 372 recordLiteral(b); 373 } 374 } 375 376 private void recordBackReference(final LZ77Compressor.BackReference block) { 377 expandedBlocks.addFirst(expand(block.getOffset(), block.getLength())); 378 } 379 380 private void recordLiteral(final byte[] b) { 381 expandedBlocks.addFirst(b); 382 } 383 384 private void rewriteLastPairs() { 385 final LinkedList<Pair> lastPairs = new LinkedList<>(); 386 final LinkedList<Integer> pairLength = new LinkedList<>(); 387 int offset = 0; 388 for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) { 389 final Pair p = it.next(); 390 if (p.hasBeenWritten()) { 391 break; 392 } 393 final int len = p.length(); 394 pairLength.addFirst(len); 395 lastPairs.addFirst(p); 396 offset += len; 397 if (offset >= MIN_OFFSET_OF_LAST_BACK_REFERENCE) { 398 break; 399 } 400 } 401 lastPairs.forEach(pairs::remove); 402 // lastPairs may contain between one and four Pairs: 403 // * the last pair may be a one byte literal 404 // * all other Pairs contain a back-reference which must be four bytes long at minimum 405 // we could merge them all into a single literal block but 406 // this may harm compression. For example compressing 407 // "bla.tar" from our tests yields a last block containing a 408 // back-reference of length > 2k and we'd end up with a last 409 // literal of that size rather than a 2k back-reference and a 410 // 12 byte literal at the end. 411 412 // Instead we merge all but the first of lastPairs into a new 413 // literal-only Pair "replacement" and look at the 414 // back-reference in the first of lastPairs and see if we can 415 // split it. We can split it if it is longer than 16 - 416 // replacement.length (i.e. the minimal length of four is kept 417 // while making sure the last literal is at least twelve bytes 418 // long). If we can't split it, we expand the first of the pairs 419 // as well. 420 421 // this is not optimal, we could get better compression 422 // results with more complex approaches as the last literal 423 // only needs to be five bytes long if the previous 424 // back-reference has an offset big enough 425 426 final int lastPairsSize = lastPairs.size(); 427 int toExpand = 0; 428 for (int i = 1; i < lastPairsSize; i++) { 429 toExpand += pairLength.get(i); 430 } 431 final Pair replacement = new Pair(); 432 if (toExpand > 0) { 433 replacement.prependLiteral(expand(toExpand, toExpand)); 434 } 435 final Pair splitCandidate = lastPairs.get(0); 436 final int stillNeeded = MIN_OFFSET_OF_LAST_BACK_REFERENCE - toExpand; 437 final int brLen = splitCandidate.hasBackReference() ? splitCandidate.backReferenceLength() : 0; 438 if (splitCandidate.hasBackReference() && brLen >= MIN_BACK_REFERENCE_LENGTH + stillNeeded) { 439 replacement.prependLiteral(expand(toExpand + stillNeeded, stillNeeded)); 440 pairs.add(splitCandidate.splitWithNewBackReferenceLengthOf(brLen - stillNeeded)); 441 } else { 442 if (splitCandidate.hasBackReference()) { 443 replacement.prependLiteral(expand(toExpand + brLen, brLen)); 444 } 445 splitCandidate.prependTo(replacement); 446 } 447 pairs.add(replacement); 448 } 449 450 @Override 451 public void write(final byte[] data, final int off, final int len) throws IOException { 452 compressor.compress(data, off, len); 453 } 454 455 @Override 456 public void write(final int b) throws IOException { 457 oneByte[0] = (byte) (b & 0xff); 458 write(oneByte); 459 } 460 461 private Pair writeBlocksAndReturnUnfinishedPair(final int length) throws IOException { 462 writeWritablePairs(length); 463 Pair last = pairs.peekLast(); 464 if (last == null || last.hasBackReference()) { 465 last = new Pair(); 466 pairs.addLast(last); 467 } 468 return last; 469 } 470 471 private void writeFinalLiteralBlock() throws IOException { 472 rewriteLastPairs(); 473 for (final Pair p : pairs) { 474 if (!p.hasBeenWritten()) { 475 p.writeTo(os); 476 } 477 } 478 pairs.clear(); 479 } 480 481 private void writeWritablePairs(final int lengthOfBlocksAfterLastPair) throws IOException { 482 int unwrittenLength = lengthOfBlocksAfterLastPair; 483 for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) { 484 final Pair p = it.next(); 485 if (p.hasBeenWritten()) { 486 break; 487 } 488 unwrittenLength += p.length(); 489 } 490 for (final Pair p : pairs) { 491 if (p.hasBeenWritten()) { 492 continue; 493 } 494 unwrittenLength -= p.length(); 495 if (!p.canBeWritten(unwrittenLength)) { 496 break; 497 } 498 p.writeTo(os); 499 } 500 } 501}