001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * https://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.lz4; 020 021import java.io.IOException; 022import java.io.OutputStream; 023import java.util.Arrays; 024import java.util.Deque; 025import java.util.Iterator; 026import java.util.LinkedList; 027 028import org.apache.commons.compress.compressors.CompressorOutputStream; 029import org.apache.commons.compress.compressors.lz77support.LZ77Compressor; 030import org.apache.commons.compress.compressors.lz77support.Parameters; 031import org.apache.commons.compress.utils.ByteUtils; 032 033/** 034 * CompressorOutputStream for the LZ4 block format. 035 * 036 * @see <a href="https://lz4.github.io/lz4/lz4_Block_format.html">LZ4 Block Format Description</a> 037 * @since 1.14 038 * @NotThreadSafe 039 */ 040public class BlockLZ4CompressorOutputStream extends CompressorOutputStream<OutputStream> { 041 042 static final class Pair { 043 044 private static int lengths(final int litLength, final int brLength) { 045 final int l = Math.min(litLength, 15); 046 final int br = brLength < 4 ? 0 : brLength < 19 ? brLength - 4 : 15; 047 return l << BlockLZ4CompressorInputStream.SIZE_BITS | br; 048 } 049 050 private static void writeLength(int length, final OutputStream out) throws IOException { 051 while (length >= 255) { 052 out.write(255); 053 length -= 255; 054 } 055 out.write(length); 056 } 057 058 private final Deque<byte[]> literals = new LinkedList<>(); 059 060 private int literalLength; 061 062 private int brOffset; 063 064 private int brLength; 065 066 private boolean written; 067 068 byte[] addLiteral(final LZ77Compressor.LiteralBlock block) { 069 final byte[] copy = Arrays.copyOfRange(block.getData(), block.getOffset(), block.getOffset() + block.getLength()); 070 literals.add(copy); 071 literalLength += copy.length; 072 return copy; 073 } 074 075 private int backReferenceLength() { 076 return brLength; 077 } 078 079 boolean canBeWritten(final int lengthOfBlocksAfterThisPair) { 080 return hasBackReference() && lengthOfBlocksAfterThisPair >= MIN_OFFSET_OF_LAST_BACK_REFERENCE + MIN_BACK_REFERENCE_LENGTH; 081 } 082 083 boolean hasBackReference() { 084 return brOffset > 0; 085 } 086 087 private boolean hasBeenWritten() { 088 return written; 089 } 090 091 int length() { 092 return literalLength() + brLength; 093 } 094 095 private int literalLength() { 096 // This method is performance sensitive 097 if (literalLength != 0) { 098 return literalLength; 099 } 100 int length = 0; 101 for (final byte[] b : literals) { 102 length += b.length; 103 } 104 return literalLength = length; 105 } 106 107 private void prependLiteral(final byte[] data) { 108 literals.addFirst(data); 109 literalLength += data.length; 110 } 111 112 private void prependTo(final Pair other) { 113 final Iterator<byte[]> listBackwards = literals.descendingIterator(); 114 while (listBackwards.hasNext()) { 115 other.prependLiteral(listBackwards.next()); 116 } 117 } 118 119 void setBackReference(final LZ77Compressor.BackReference block) { 120 if (hasBackReference()) { 121 throw new IllegalStateException(); 122 } 123 brOffset = block.getOffset(); 124 brLength = block.getLength(); 125 } 126 127 private Pair splitWithNewBackReferenceLengthOf(final int newBackReferenceLength) { 128 final Pair p = new Pair(); 129 p.literals.addAll(literals); 130 p.brOffset = brOffset; 131 p.brLength = newBackReferenceLength; 132 return p; 133 } 134 135 void writeTo(final OutputStream out) throws IOException { 136 final int litLength = literalLength(); 137 out.write(lengths(litLength, brLength)); 138 if (litLength >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) { 139 writeLength(litLength - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out); 140 } 141 for (final byte[] b : literals) { 142 out.write(b); 143 } 144 if (hasBackReference()) { 145 ByteUtils.toLittleEndian(out, brOffset, 2); 146 if (brLength - MIN_BACK_REFERENCE_LENGTH >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) { 147 writeLength(brLength - MIN_BACK_REFERENCE_LENGTH - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out); 148 } 149 } 150 written = true; 151 } 152 } 153 154 private static final int MIN_BACK_REFERENCE_LENGTH = 4; 155 156 /* 157 * 158 * The LZ4 block format has a few properties that make it less straight-forward than one would hope: 159 * 160 * literal blocks and back-references must come in pairs (except for the very last literal block), so consecutive literal blocks created by the compressor 161 * must be merged into a single block. 162 * 163 * the start of a literal/back-reference pair contains the length of the back-reference (at least some part of it) so we can't start writing the literal 164 * before we know how long the next back-reference is going to be. 165 * 166 * there are special rules for the final blocks 167 * 168 * > There are specific parsing rules to respect in order to remain > compatible with assumptions made by the decoder : > > 1. The last 5 bytes are always 169 * literals > > 2. The last match must start at least 12 bytes before end of > block. Consequently, a block with less than 13 bytes cannot be > compressed. 170 * 171 * which means any back-reference may need to get rewritten as a literal block unless we know the next block is at least of length 5 and the sum of this 172 * block's length and offset and the next block's length is at least twelve. 173 */ 174 175 private static final int MIN_OFFSET_OF_LAST_BACK_REFERENCE = 12; 176 177 /** 178 * Returns a builder correctly configured for the LZ4 algorithm. 179 * 180 * @return a builder correctly configured for the LZ4 algorithm 181 */ 182 public static Parameters.Builder createParameterBuilder() { 183 final int maxLen = BlockLZ4CompressorInputStream.WINDOW_SIZE - 1; 184 return Parameters.builder(BlockLZ4CompressorInputStream.WINDOW_SIZE).withMinBackReferenceLength(MIN_BACK_REFERENCE_LENGTH) 185 .withMaxBackReferenceLength(maxLen).withMaxOffset(maxLen).withMaxLiteralLength(maxLen); 186 } 187 188 private final LZ77Compressor compressor; 189 190 // used in one-arg write method 191 private final byte[] oneByte = new byte[1]; 192 private final Deque<Pair> pairs = new LinkedList<>(); 193 194 // keeps track of the last window-size bytes (64k) in order to be 195 // able to expand back-references when needed 196 private final Deque<byte[]> expandedBlocks = new LinkedList<>(); 197 198 /** 199 * Creates a new LZ4 output stream. 200 * 201 * @param out An OutputStream to read compressed data from 202 */ 203 public BlockLZ4CompressorOutputStream(final OutputStream out) { 204 this(out, createParameterBuilder().build()); 205 } 206 207 /** 208 * Creates a new LZ4 output stream. 209 * 210 * @param out An OutputStream to read compressed data from 211 * @param params The parameters to use for LZ77 compression. 212 */ 213 public BlockLZ4CompressorOutputStream(final OutputStream out, final Parameters params) { 214 super(out); 215 compressor = new LZ77Compressor(params, block -> { 216 switch (block.getType()) { 217 case LITERAL: 218 addLiteralBlock((LZ77Compressor.LiteralBlock) block); 219 break; 220 case BACK_REFERENCE: 221 addBackReference((LZ77Compressor.BackReference) block); 222 break; 223 case EOD: 224 writeFinalLiteralBlock(); 225 break; 226 } 227 }); 228 } 229 230 private void addBackReference(final LZ77Compressor.BackReference block) throws IOException { 231 final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength()); 232 last.setBackReference(block); 233 recordBackReference(block); 234 clearUnusedBlocksAndPairs(); 235 } 236 237 private void addLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException { 238 final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength()); 239 recordLiteral(last.addLiteral(block)); 240 clearUnusedBlocksAndPairs(); 241 } 242 243 private void clearUnusedBlocks() { 244 int blockLengths = 0; 245 int blocksToKeep = 0; 246 for (final byte[] b : expandedBlocks) { 247 blocksToKeep++; 248 blockLengths += b.length; 249 if (blockLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) { 250 break; 251 } 252 } 253 final int size = expandedBlocks.size(); 254 for (int i = blocksToKeep; i < size; i++) { 255 expandedBlocks.removeLast(); 256 } 257 } 258 259 private void clearUnusedBlocksAndPairs() { 260 clearUnusedBlocks(); 261 clearUnusedPairs(); 262 } 263 264 private void clearUnusedPairs() { 265 int pairLengths = 0; 266 int pairsToKeep = 0; 267 for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) { 268 final Pair p = it.next(); 269 pairsToKeep++; 270 pairLengths += p.length(); 271 if (pairLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) { 272 break; 273 } 274 } 275 final int size = pairs.size(); 276 for (int i = pairsToKeep; i < size; i++) { 277 final Pair p = pairs.peekFirst(); 278 if (!p.hasBeenWritten()) { 279 break; 280 } 281 pairs.removeFirst(); 282 } 283 } 284 285 @Override 286 public void close() throws IOException { 287 try { 288 finish(); 289 } finally { 290 super.close(); 291 } 292 } 293 294 private byte[] expand(final int offset, final int length) { 295 final byte[] expanded = new byte[length]; 296 if (offset == 1) { // surprisingly common special case 297 final byte[] block = expandedBlocks.peekFirst(); 298 final byte b = block[block.length - 1]; 299 if (b != 0) { // the fresh array contains 0s anyway 300 Arrays.fill(expanded, b); 301 } 302 } else { 303 expandFromList(expanded, offset, length); 304 } 305 return expanded; 306 } 307 308 private void expandFromList(final byte[] expanded, final int offset, final int length) { 309 int offsetRemaining = offset; 310 int lengthRemaining = length; 311 int writeOffset = 0; 312 while (lengthRemaining > 0) { 313 // find block that contains offsetRemaining 314 byte[] block = null; 315 final int copyLen; 316 final int copyOffset; 317 if (offsetRemaining > 0) { 318 int blockOffset = 0; 319 for (final byte[] b : expandedBlocks) { 320 if (b.length + blockOffset >= offsetRemaining) { 321 block = b; 322 break; 323 } 324 blockOffset += b.length; 325 } 326 if (block == null) { 327 // should not be possible 328 throw new IllegalStateException("Failed to find a block containing offset " + offset); 329 } 330 copyOffset = blockOffset + block.length - offsetRemaining; 331 copyLen = Math.min(lengthRemaining, block.length - copyOffset); 332 } else { 333 // offsetRemaining is negative or 0 and points into the expanded bytes 334 block = expanded; 335 copyOffset = -offsetRemaining; 336 copyLen = Math.min(lengthRemaining, writeOffset + offsetRemaining); 337 } 338 System.arraycopy(block, copyOffset, expanded, writeOffset, copyLen); 339 offsetRemaining -= copyLen; 340 lengthRemaining -= copyLen; 341 writeOffset += copyLen; 342 } 343 } 344 345 /** 346 * Compresses all remaining data and writes it to the stream, doesn't close the underlying stream. 347 * 348 * @throws IOException if an error occurs 349 */ 350 @Override 351 public void finish() throws IOException { 352 if (!isFinished()) { 353 compressor.finish(); 354 super.finish(); 355 } 356 } 357 358 /** 359 * Adds some initial data to fill the window with. 360 * 361 * @param data the data to fill the window with. 362 * @param off offset of real data into the array 363 * @param len amount of data 364 * @throws IllegalStateException if the stream has already started to write data 365 * @see LZ77Compressor#prefill 366 */ 367 public void prefill(final byte[] data, final int off, final int len) { 368 if (len > 0) { 369 final byte[] b = Arrays.copyOfRange(data, off, off + len); 370 compressor.prefill(b); 371 recordLiteral(b); 372 } 373 } 374 375 private void recordBackReference(final LZ77Compressor.BackReference block) { 376 expandedBlocks.addFirst(expand(block.getOffset(), block.getLength())); 377 } 378 379 private void recordLiteral(final byte[] b) { 380 expandedBlocks.addFirst(b); 381 } 382 383 private void rewriteLastPairs() { 384 final LinkedList<Pair> lastPairs = new LinkedList<>(); 385 final LinkedList<Integer> pairLength = new LinkedList<>(); 386 int offset = 0; 387 for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) { 388 final Pair p = it.next(); 389 if (p.hasBeenWritten()) { 390 break; 391 } 392 final int len = p.length(); 393 pairLength.addFirst(len); 394 lastPairs.addFirst(p); 395 offset += len; 396 if (offset >= MIN_OFFSET_OF_LAST_BACK_REFERENCE) { 397 break; 398 } 399 } 400 lastPairs.forEach(pairs::remove); 401 // lastPairs may contain between one and four Pairs: 402 // * the last pair may be a one byte literal 403 // * all other Pairs contain a back-reference which must be four bytes long at minimum 404 // we could merge them all into a single literal block but 405 // this may harm compression. For example compressing 406 // "bla.tar" from our tests yields a last block containing a 407 // back-reference of length > 2k and we'd end up with a last 408 // literal of that size rather than a 2k back-reference and a 409 // 12 byte literal at the end. 410 411 // Instead we merge all but the first of lastPairs into a new 412 // literal-only Pair "replacement" and look at the 413 // back-reference in the first of lastPairs and see if we can 414 // split it. We can split it if it is longer than 16 - 415 // replacement.length (i.e. the minimal length of four is kept 416 // while making sure the last literal is at least twelve bytes 417 // long). If we can't split it, we expand the first of the pairs 418 // as well. 419 420 // this is not optimal, we could get better compression 421 // results with more complex approaches as the last literal 422 // only needs to be five bytes long if the previous 423 // back-reference has an offset big enough 424 425 final int lastPairsSize = lastPairs.size(); 426 int toExpand = 0; 427 for (int i = 1; i < lastPairsSize; i++) { 428 toExpand += pairLength.get(i); 429 } 430 final Pair replacement = new Pair(); 431 if (toExpand > 0) { 432 replacement.prependLiteral(expand(toExpand, toExpand)); 433 } 434 final Pair splitCandidate = lastPairs.get(0); 435 final int stillNeeded = MIN_OFFSET_OF_LAST_BACK_REFERENCE - toExpand; 436 final int brLen = splitCandidate.hasBackReference() ? splitCandidate.backReferenceLength() : 0; 437 if (splitCandidate.hasBackReference() && brLen >= MIN_BACK_REFERENCE_LENGTH + stillNeeded) { 438 replacement.prependLiteral(expand(toExpand + stillNeeded, stillNeeded)); 439 pairs.add(splitCandidate.splitWithNewBackReferenceLengthOf(brLen - stillNeeded)); 440 } else { 441 if (splitCandidate.hasBackReference()) { 442 replacement.prependLiteral(expand(toExpand + brLen, brLen)); 443 } 444 splitCandidate.prependTo(replacement); 445 } 446 pairs.add(replacement); 447 } 448 449 @Override 450 public void write(final byte[] data, final int off, final int len) throws IOException { 451 compressor.compress(data, off, len); 452 } 453 454 @Override 455 public void write(final int b) throws IOException { 456 oneByte[0] = (byte) (b & 0xff); 457 write(oneByte); 458 } 459 460 private Pair writeBlocksAndReturnUnfinishedPair(final int length) throws IOException { 461 writeWritablePairs(length); 462 Pair last = pairs.peekLast(); 463 if (last == null || last.hasBackReference()) { 464 last = new Pair(); 465 pairs.addLast(last); 466 } 467 return last; 468 } 469 470 private void writeFinalLiteralBlock() throws IOException { 471 rewriteLastPairs(); 472 for (final Pair p : pairs) { 473 if (!p.hasBeenWritten()) { 474 p.writeTo(out); 475 } 476 } 477 pairs.clear(); 478 } 479 480 private void writeWritablePairs(final int lengthOfBlocksAfterLastPair) throws IOException { 481 int unwrittenLength = lengthOfBlocksAfterLastPair; 482 for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) { 483 final Pair p = it.next(); 484 if (p.hasBeenWritten()) { 485 break; 486 } 487 unwrittenLength += p.length(); 488 } 489 for (final Pair p : pairs) { 490 if (p.hasBeenWritten()) { 491 continue; 492 } 493 unwrittenLength -= p.length(); 494 if (!p.canBeWritten(unwrittenLength)) { 495 break; 496 } 497 p.writeTo(out); 498 } 499 } 500}