/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.compressors.lz77support;

import java.io.IOException;
import java.util.Objects;

import org.apache.commons.lang3.ArrayFill;

/**
 * Helper class for compression algorithms that use the ideas of LZ77.
 *
 * <p>
 * Most LZ77 derived algorithms split input data into blocks of uncompressed data (called literal blocks) and back-references (pairs of offsets and lengths)
 * that state "add {@code length} bytes that are the same as those already written starting {@code offset} bytes before the current position". The details of
 * how those blocks and back-references are encoded are quite different between the algorithms and some algorithms perform additional steps (Huffman encoding
 * in the case of DEFLATE for example).
 * </p>
 *
 * <p>
 * This class attempts to extract the core logic - finding back-references - so it can be re-used. It follows the algorithm explained in section 4 of RFC 1951
 * (DEFLATE); the "lazy match" optimization of DEFLATE can be enabled via the parameters. The three-byte hash function used in this class is the same as the
 * one used by zlib and InfoZIP's ZIP implementation of DEFLATE. The whole class is strongly inspired by InfoZIP's implementation.
 * </p>
 *
 * <p>
 * The term LZ77 is used loosely here (as it is in many other places that talk about it :-); LZSS would likely be closer to the truth, but LZ77 has become the
 * synonym for a whole family of algorithms.
 * </p>
 *
 * <p>
 * The API consists of a compressor that is fed {@code byte}s and emits {@link Block}s to a registered callback where the blocks represent either
 * {@link LiteralBlock literal blocks}, {@link BackReference back-references} or {@link EOD end of data markers}. In order to ensure the callback receives all
 * information, the {@link #finish()} method must be used once all data has been fed into the compressor.
 * </p>
 *
 * <p>
 * Several parameters influence the outcome of the "compression":
 * </p>
 * <dl>
 *
 * <dt>{@code windowSize}</dt>
 * <dd>the size of the sliding window, must be a power of two - this determines the maximum offset a back-reference can take. The compressor maintains a
 * buffer of twice the size of {@code windowSize} - real world values are in the area of 32k.</dd>
 *
 * <dt>{@code minBackReferenceLength}</dt>
 * <dd>Minimal length of a back-reference found.
 * A true minimum of 3 is hard-coded inside of this implementation but bigger lengths can be configured.</dd>
 *
 * <dt>{@code maxBackReferenceLength}</dt>
 * <dd>Maximal length of a back-reference found.</dd>
 *
 * <dt>{@code maxOffset}</dt>
 * <dd>Maximal offset of a back-reference.</dd>
 *
 * <dt>{@code maxLiteralLength}</dt>
 * <dd>Maximal length of a literal block.</dd>
 * </dl>
 *
 * @see "https://tools.ietf.org/html/rfc1951#section-4"
 * @since 1.14
 * @NotThreadSafe
 */
public class LZ77Compressor {

    /**
     * Represents a back-reference.
     */
    public static final class BackReference extends Block {
        private final int offset, length;

        /**
         * Constructs a back-reference with the given offset and length.
         *
         * @param offset the offset of the back-reference
         * @param length the length of the back-reference
         */
        public BackReference(final int offset, final int length) {
            this.offset = offset;
            this.length = length;
        }

        /**
         * Provides the length of the back-reference.
         *
         * @return the length
         */
        public int getLength() {
            return length;
        }

        /**
         * Provides the offset of the back-reference.
         *
         * @return the offset
         */
        public int getOffset() {
            return offset;
        }

        @Override
        public BlockType getType() {
            return BlockType.BACK_REFERENCE;
        }

        @Override
        public String toString() {
            return "BackReference with offset " + offset + " and length " + length;
        }
    }

    /**
     * Base class representing blocks the compressor may emit.
     *
     * <p>
     * This class is not supposed to be subclassed by classes outside of Commons Compress, so it is considered internal and changes that would break
     * subclasses may be introduced with future releases.
     * </p>
     */
    public abstract static class Block {
        /** Enumeration of the block types the compressor may emit. */
        public enum BlockType {
            LITERAL, BACK_REFERENCE, EOD
        }

        /**
         * Returns the type of this block.
         *
         * @return the block type
         */
        public abstract BlockType getType();
    }

    /**
     * Callback invoked while the compressor processes data.
     *
     * <p>
     * The callback is invoked on the same thread that receives the bytes to compress and may be invoked multiple times during the execution of
     * {@link LZ77Compressor#compress(byte[]) compress} or {@link LZ77Compressor#finish() finish}.
     * </p>
     */
    public interface Callback {
        /**
         * Consumes a block.
         *
         * @param b the block to consume
         * @throws IOException in case of an error
         */
        void accept(Block b) throws IOException;
    }

    /** A simple "we are done" marker. */
    public static final class EOD extends Block {
        @Override
        public BlockType getType() {
            return BlockType.EOD;
        }
    }

    /**
     * Represents a literal block of data.
     *
     * <p>
     * For performance reasons this encapsulates the real data, not a copy of it. Don't modify the data; process it immediately inside of
     * {@link Callback#accept} as it will be overwritten sooner or later.
     * </p>
     */
    public static final class LiteralBlock extends Block {
        private final byte[] data;
        private final int offset, length;

        /**
         * Constructs a literal block backed by the given array.
         *
         * @param data the backing array holding the literal data
         * @param offset the offset into {@code data} where the block starts
         * @param length the length of the block
         */
        public LiteralBlock(final byte[] data, final int offset, final int length) {
            this.data = data;
            this.offset = offset;
            this.length = length;
        }

        /**
         * The literal data.
         *
         * <p>
         * This returns a live view of the actual data in order to avoid copying; modify the array at your own risk.
         * </p>
         *
         * @return the data
         */
        public byte[] getData() {
            return data;
        }

        /**
         * Length of the literal block.
         *
         * @return the length
         */
        public int getLength() {
            return length;
        }

        /**
         * Offset into data where the literal block starts.
         *
         * @return the offset
         */
        public int getOffset() {
            return offset;
        }

        @Override
        public BlockType getType() {
            return BlockType.LITERAL;
        }

        @Override
        public String toString() {
            return "LiteralBlock starting at " + offset + " with length " + length;
        }
    }

    private static final Block THE_EOD = new EOD();

    static final int NUMBER_OF_BYTES_IN_HASH = 3;
    private static final int NO_MATCH = -1;

    // we use a 15 bit hash code as calculated in nextHash
    private static final int HASH_SIZE = 1 << 15;
    private static final int HASH_MASK = HASH_SIZE - 1;

    private static final int H_SHIFT = 5;
    private final Parameters params;
    private final Callback callback;

    // the sliding window, twice as big as the "windowSize" parameter
    private final byte[] window;

    // the head of the hash-chain - indexed by hash-code, points to the
    // location inside of window of the latest sequence of bytes with
    // the given hash.
    private final int[] head;
    // for each window location, points to the latest earlier location
    // with the same hash. Only stores values for the latest
    // "windowSize" elements, the index is "window location modulo
    // windowSize".
    private final int[] prev;
    // bit mask used when indexing into prev
    private final int wMask;
    private boolean initialized;
    // the position inside of window that shall be encoded right now
    private int currentPosition;
    // the number of bytes available to compress including the one at
    // currentPosition
    private int lookahead;
    // the hash of the three bytes starting at the current position
    private int insertHash;

    // the position inside the window where the current literal
    // block starts (in case we are inside a literal block).
    private int blockStart;

    // position of the current match
    private int matchStart = NO_MATCH;

    // number of missed insertString calls for up to the last three
    // bytes of the last match; they can only be performed once more
    // data has been read
    private int missedInserts;

    /**
     * Initializes a compressor with parameters and a callback.
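     *
     * <p>
     * A minimal usage sketch: it builds a {@link Parameters} instance via the {@code Parameters.builder(int)} factory of this package, registers a callback
     * that distinguishes the three block types and feeds all data in one call. The window size, the handling inside the callback and the {@code data} array
     * are illustrative only; a real encoder would emit its own format in place of the comments.
     * </p>
     *
     * <pre>{@code
     * Parameters params = Parameters.builder(32 * 1024).build();
     * LZ77Compressor compressor = new LZ77Compressor(params, block -> {
     *     switch (block.getType()) {
     *     case LITERAL:
     *         LZ77Compressor.LiteralBlock lit = (LZ77Compressor.LiteralBlock) block;
     *         // the array is a live view of the window, copy the bytes before the compressor overwrites them
     *         byte[] literals = java.util.Arrays.copyOfRange(lit.getData(), lit.getOffset(), lit.getOffset() + lit.getLength());
     *         // encode the literal bytes ...
     *         break;
     *     case BACK_REFERENCE:
     *         LZ77Compressor.BackReference ref = (LZ77Compressor.BackReference) block;
     *         // encode ref.getOffset() and ref.getLength() ...
     *         break;
     *     case EOD:
     *         // flush whatever the encoder has buffered ...
     *         break;
     *     }
     * });
     * compressor.compress(data); // data holds the bytes to compress
     * compressor.finish();
     * }</pre>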
     *
     * @param params the parameters
     * @param callback the callback
     * @throws NullPointerException if either parameter is {@code null}
     */
    public LZ77Compressor(final Parameters params, final Callback callback) {
        Objects.requireNonNull(params, "params");
        Objects.requireNonNull(callback, "callback");

        this.params = params;
        this.callback = callback;

        final int wSize = params.getWindowSize();
        window = new byte[wSize * 2];
        wMask = wSize - 1;
        head = ArrayFill.fill(new int[HASH_SIZE], NO_MATCH);
        prev = new int[wSize];
    }

    private void catchUpMissedInserts() {
        while (missedInserts > 0) {
            insertString(currentPosition - missedInserts--);
        }
    }

    private void compress() throws IOException {
        final int minMatch = params.getMinBackReferenceLength();
        final boolean lazy = params.getLazyMatching();
        final int lazyThreshold = params.getLazyMatchingThreshold();

        while (lookahead >= minMatch) {
            catchUpMissedInserts();
            int matchLength = 0;
            final int hashHead = insertString(currentPosition);
            if (hashHead != NO_MATCH && currentPosition - hashHead <= params.getMaxOffset()) {
                // sets matchStart as a side effect
                matchLength = longestMatch(hashHead);

                if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) {
                    // try to find a longer match using the next position
                    matchLength = longestMatchForNextPosition(matchLength);
                }
            }
            if (matchLength >= minMatch) {
                if (blockStart != currentPosition) {
                    // emit preceding literal block
                    flushLiteralBlock();
                    blockStart = NO_MATCH;
                }
                flushBackReference(matchLength);
                insertStringsInMatch(matchLength);
                lookahead -= matchLength;
                currentPosition += matchLength;
                blockStart = currentPosition;
            } else {
                // no match, append to the current literal block or start a new one
                lookahead--;
                currentPosition++;
                if (currentPosition - blockStart >= params.getMaxLiteralLength()) {
                    flushLiteralBlock();
                    blockStart = currentPosition;
                }
            }
        }
    }

    /**
     * Feeds bytes into the compressor which in turn may emit zero or more blocks to the callback during the execution of this method.
     *
     * @param data the data to compress - must not be null
     * @throws IOException if the callback throws an exception
     */
    public void compress(final byte[] data) throws IOException {
        compress(data, 0, data.length);
    }

    /**
     * Feeds bytes into the compressor which in turn may emit zero or more blocks to the callback during the execution of this method.
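     *
     * <p>
     * A sketch of feeding input in chunks; the {@code java.io.InputStream in}, the {@code compressor} instance and the buffer size are illustrative
     * assumptions:
     * </p>
     *
     * <pre>{@code
     * byte[] buffer = new byte[8192];
     * int n;
     * while ((n = in.read(buffer)) != -1) {
     *     compressor.compress(buffer, 0, n);
     * }
     * compressor.finish(); // emits the trailing literal block (if any) and the EOD marker
     * }</pre>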
     *
     * @param data the data to compress - must not be null
     * @param off the start offset of the data
     * @param len the number of bytes to compress
     * @throws IOException if the callback throws an exception
     */
    public void compress(final byte[] data, int off, int len) throws IOException {
        final int wSize = params.getWindowSize();
        while (len > wSize) { // chop into windowSize sized chunks
            doCompress(data, off, wSize);
            off += wSize;
            len -= wSize;
        }
        if (len > 0) {
            doCompress(data, off, len);
        }
    }

    // performs the actual algorithm with the pre-condition len <= windowSize
    private void doCompress(final byte[] data, final int off, final int len) throws IOException {
        final int spaceLeft = window.length - currentPosition - lookahead;
        if (len > spaceLeft) {
            slide();
        }
        System.arraycopy(data, off, window, currentPosition + lookahead, len);
        lookahead += len;
        if (!initialized && lookahead >= params.getMinBackReferenceLength()) {
            initialize();
        }
        if (initialized) {
            compress();
        }
    }

    /**
     * Tells the compressor to process all remaining data and signal end of data to the callback.
     *
     * <p>
     * The compressor will in turn emit at least one block ({@link EOD}) but potentially multiple blocks to the callback during the execution of this method.
     * </p>
     *
     * @throws IOException if the callback throws an exception
     */
    public void finish() throws IOException {
        if (blockStart != currentPosition || lookahead > 0) {
            currentPosition += lookahead;
            flushLiteralBlock();
        }
        callback.accept(THE_EOD);
    }

    private void flushBackReference(final int matchLength) throws IOException {
        callback.accept(new BackReference(currentPosition - matchStart, matchLength));
    }

    private void flushLiteralBlock() throws IOException {
        callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart));
    }

    private void initialize() {
        for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) {
            insertHash = nextHash(insertHash, window[i]);
        }
        initialized = true;
    }

    /**
     * Inserts the current three-byte sequence into the dictionary and returns the previous head of the hash-chain.
     *
     * <p>
     * Updates {@code insertHash} and {@code prev} as a side effect.
     * </p>
     */
    private int insertString(final int pos) {
        insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]);
        final int hashHead = head[insertHash];
        prev[pos & wMask] = hashHead;
        head[insertHash] = pos;
        return hashHead;
    }

    private void insertStringsInMatch(final int matchLength) {
        // inserts strings contained in the current match
        // insertString inserts the byte 2 bytes after the given position, which may not yet be available -> missedInserts
        final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH);
        // currentPosition has been inserted already
        for (int i = 1; i <= stop; i++) {
            insertString(currentPosition + i);
        }
        missedInserts = matchLength - stop - 1;
    }

    /**
     * Searches the hash chain for real matches and returns the length of the longest match (a value smaller than {@code minBackReferenceLength} if none was
     * found) that isn't too far away (with respect to {@code maxOffset}).
     *
     * <p>
     * Sets matchStart to the index of the start position of the longest match as a side effect.
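     * Candidates are examined newest-first by walking the {@code prev} chain; the search stops early once a match of at least
     * {@code niceBackReferenceLength} bytes has been found or {@code maxCandidates} chain entries have been inspected.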
     * </p>
     */
    private int longestMatch(int matchHead) {
        final int minLength = params.getMinBackReferenceLength();
        int longestMatchLength = minLength - 1;
        final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead);
        final int minIndex = Math.max(0, currentPosition - params.getMaxOffset());
        final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength());
        final int maxCandidates = params.getMaxCandidates();
        for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) {
            int currentLength = 0;
            for (int i = 0; i < maxPossibleLength; i++) {
                if (window[matchHead + i] != window[currentPosition + i]) {
                    break;
                }
                currentLength++;
            }
            if (currentLength > longestMatchLength) {
                longestMatchLength = currentLength;
                matchStart = matchHead;
                if (currentLength >= niceBackReferenceLength) {
                    // no need to search any further
                    break;
                }
            }
            matchHead = prev[matchHead & wMask];
        }
        return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress()
    }

    private int longestMatchForNextPosition(final int prevMatchLength) {
        // save a bunch of values to restore them if the next match isn't better than the current one
        final int prevMatchStart = matchStart;
        final int prevInsertHash = insertHash;

        lookahead--;
        currentPosition++;
        final int hashHead = insertString(currentPosition);
        final int prevHashHead = prev[currentPosition & wMask];
        int matchLength = longestMatch(hashHead);

        if (matchLength <= prevMatchLength) {
            // use the first match, as the next one isn't any better
            matchLength = prevMatchLength;
            matchStart = prevMatchStart;

            // restore modified values
            head[insertHash] = prevHashHead;
            insertHash = prevInsertHash;
            currentPosition--;
            lookahead++;
        }
        return matchLength;
    }

    /**
     * Assumes we are calculating the hash for three consecutive bytes as a rolling hash, i.e. for bytes ABCD, if H is the hash of ABC, the new hash for BCD
     * is nextHash(H, D).
     *
     * <p>
     * The hash is shifted by five bits on each update so all effects of A have been shifted out after the third update.
     * </p>
     */
    private int nextHash(final int oldHash, final byte nextByte) {
        final int nextVal = nextByte & 0xFF;
        return (oldHash << H_SHIFT ^ nextVal) & HASH_MASK;
    }

    /**
     * Adds some initial data to fill the window with.
     *
     * <p>
     * This is used if the stream has been cut into blocks and back-references of one block may refer to data of the previous block(s). One such example is
     * the LZ4 frame format using block dependency.
     * </p>
     *
     * @param data the data to fill the window with.
     * @throws IllegalStateException if the compressor has already started to accept data
     */
    public void prefill(final byte[] data) {
        if (currentPosition != 0 || lookahead != 0) {
            throw new IllegalStateException("The compressor has already started to accept data, can't prefill anymore");
        }

        // don't need more than windowSize for back-references
        final int len = Math.min(params.getWindowSize(), data.length);
        System.arraycopy(data, data.length - len, window, 0, len);

        if (len >= NUMBER_OF_BYTES_IN_HASH) {
            initialize();
            final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1;
            for (int i = 0; i < stop; i++) {
                insertString(i);
            }
            missedInserts = NUMBER_OF_BYTES_IN_HASH - 1;
        } else { // not enough data to hash anything
            missedInserts = len;
        }
        blockStart = currentPosition = len;
    }

    private void slide() throws IOException {
        final int wSize = params.getWindowSize();
        if (blockStart != currentPosition && blockStart < wSize) {
            flushLiteralBlock();
            blockStart = currentPosition;
        }
        System.arraycopy(window, wSize, window, 0, wSize);
        currentPosition -= wSize;
        matchStart -= wSize;
        blockStart -= wSize;
        for (int i = 0; i < HASH_SIZE; i++) {
            final int h = head[i];
            head[i] = h >= wSize ? h - wSize : NO_MATCH;
        }
        for (int i = 0; i < wSize; i++) {
            final int p = prev[i];
            prev[i] = p >= wSize ? p - wSize : NO_MATCH;
        }
    }
}