1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * https://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19 package org.apache.commons.compress.compressors.lz77support;
20
21 import java.io.IOException;
22 import java.util.Objects;
23
24 import org.apache.commons.lang3.ArrayFill;
25
26 /**
27 * Helper class for compression algorithms that use the ideas of LZ77.
28 *
29 * <p>
30 * Most LZ77 derived algorithms split input data into blocks of uncompressed data (called literal blocks) and back-references (pairs of offsets and lengths)
31 * that state "add {@code length} bytes that are the same as those already written starting {@code offset} bytes before the current position. The details of how
32 * those blocks and back-references are encoded are quite different between the algorithms and some algorithms perform additional steps (Huffman encoding in the
33 * case of DEFLATE for example).
34 * </p>
35 *
36 * <p>
37 * This class attempts to extract the core logic - finding back-references - so it can be re-used. It follows the algorithm explained in section 4 of RFC 1951
38 * (DEFLATE) and currently doesn't implement the "lazy match" optimization. The three-byte hash function used in this class is the same as the one used by zlib
39 * and InfoZIP's ZIP implementation of DEFLATE. The whole class is strongly inspired by InfoZIP's implementation.
40 * </p>
41 *
42 * <p>
43 * LZ77 is used vaguely here (as well as many other places that talk about it :-), LZSS would likely be closer to the truth but LZ77 has become the synonym for
44 * a whole family of algorithms.
45 * </p>
46 *
47 * <p>
48 * The API consists of a compressor that is fed {@code byte}s and emits {@link Block}s to a registered callback where the blocks represent either
49 * {@link LiteralBlock literal blocks}, {@link BackReference back-references} or {@link EOD end of data markers}. In order to ensure the callback receives all
50 * information, the {@code #finish} method must be used once all data has been fed into the compressor.
51 * </p>
52 *
53 * <p>
54 * Several parameters influence the outcome of the "compression":
55 * </p>
56 * <dl>
57 *
58 * <dt>{@code windowSize}</dt>
59 * <dd>the size of the sliding window, must be a power of two - this determines the maximum offset a back-reference can take. The compressor maintains a buffer
60 * of twice of {@code windowSize} - real world values are in the area of 32k.</dd>
61 *
62 * <dt>{@code minBackReferenceLength}</dt>
63 * <dd>Minimal length of a back-reference found. A true minimum of 3 is hard-coded inside of this implementation but bigger lengths can be configured.</dd>
64 *
65 * <dt>{@code maxBackReferenceLength}</dt>
66 * <dd>Maximal length of a back-reference found.</dd>
67 *
68 * <dt>{@code maxOffset}</dt>
69 * <dd>Maximal offset of a back-reference.</dd>
70 *
71 * <dt>{@code maxLiteralLength}</dt>
72 * <dd>Maximal length of a literal block.</dd>
73 * </dl>
74 *
75 * @see "https://tools.ietf.org/html/rfc1951#section-4"
76 * @since 1.14
77 * @NotThreadSafe
78 */
79 public class LZ77Compressor {
80
81 /**
82 * Represents a back-reference.
83 */
84 public abstract static class AbstractReference extends Block {
85
86 private final int offset;
87 private final int length;
88
89 /**
90 * Constructs a new instance.
91 *
92 * @param blockType The block type.
93 * @param offset the offset of the reference.
94 * @param length the offset of the reference.
95 */
96 public AbstractReference(final BlockType blockType, final int offset, final int length) {
97 super(blockType);
98 this.offset = offset;
99 this.length = length;
100 }
101
102 /**
103 * Gets the offset of the reference.
104 *
105 * @return the length
106 */
107 public int getLength() {
108 return length;
109 }
110
111 /**
112 * Gets the offset of the reference.
113 *
114 * @return the offset
115 */
116 public int getOffset() {
117 return offset;
118 }
119
120 @Override
121 public String toString() {
122 return super.toString() + " with offset " + offset + " and length " + length;
123 }
124 }
125
126 /**
127 * Represents a back-reference.
128 */
129 public static final class BackReference extends AbstractReference {
130
131 /**
132 * Constructs a new instance.
133 *
134 * @param offset the offset of the back-reference.
135 * @param length the offset of the back-reference.
136 */
137 public BackReference(final int offset, final int length) {
138 super(BlockType.BACK_REFERENCE, offset, length);
139 }
140
141 }
142
143 /**
144 * Base class representing blocks the compressor may emit.
145 *
146 * <p>
147 * This class is not supposed to be subclassed by classes outside of Commons Compress so it is considered internal and changed that would break subclasses
148 * may get introduced with future releases.
149 * </p>
150 */
151 public abstract static class Block {
152
153 /**
154 * Enumerates the block types the compressor emits.
155 */
156 public enum BlockType {
157
158 /**
159 * The literal block type.
160 */
161 LITERAL,
162
163 /**
164 * The back-reference block type.
165 */
166 BACK_REFERENCE,
167
168 /**
169 * The end-of-data block type.
170 */
171 EOD
172 }
173
174 private final BlockType type;
175
176 /**
177 * Constructs a new typeless instance.
178 *
179 * @deprecated Use {@link #Block()}.
180 */
181 @Deprecated
182 public Block() {
183 this.type = null;
184 }
185
186 /**
187 * Constructs a new instance.
188 *
189 * @param type the block type, may not be {@code null}.
190 */
191 protected Block(final BlockType type) {
192 this.type = Objects.requireNonNull(type);
193 }
194
195 /**
196 * Gets the the block type.
197 *
198 * @return the the block type.
199 */
200 public BlockType getType() {
201 return type;
202 }
203
204 @Override
205 public String toString() {
206 return getClass().getSimpleName() + " " + getType();
207 }
208 }
209
210 /**
211 * Callback invoked while the compressor processes data.
212 *
213 * <p>
214 * The callback is invoked on the same thread that receives the bytes to compress and may be invoked multiple times during the execution of
215 * {@link #compress} or {@link #finish}.
216 * </p>
217 */
218 public interface Callback {
219
220 /**
221 * Consumes a block.
222 *
223 * @param b the block to consume
224 * @throws IOException in case of an error
225 */
226 void accept(Block b) throws IOException;
227 }
228
229 /** A simple "we are done" marker. */
230 public static final class EOD extends Block {
231
232 /**
233 * The singleton instance.
234 */
235 private static final EOD INSTANCE = new EOD();
236
237 /**
238 * Constructs a new instance.
239 */
240 public EOD() {
241 super(BlockType.EOD);
242 }
243
244 }
245
246 /**
247 * Represents a literal block of data.
248 *
249 * <p>
250 * For performance reasons this encapsulates the real data, not a copy of it. Don't modify the data and process it inside of {@link Callback#accept}
251 * immediately as it will get overwritten sooner or later.
252 * </p>
253 */
254 public static final class LiteralBlock extends AbstractReference {
255
256 private final byte[] data;
257
258 /**
259 * Constructs a new instance.
260 *
261 * @param data the literal data.
262 * @param offset the length of literal block.
263 * @param length the length of literal block.
264 */
265 public LiteralBlock(final byte[] data, final int offset, final int length) {
266 super(BlockType.LITERAL, offset, length);
267 this.data = data;
268 }
269
270 /**
271 * Gets the literal data.
272 *
273 * <p>
274 * This returns a live view of the actual data in order to avoid copying, modify the array at your own risk.
275 * </p>
276 *
277 * @return the data
278 */
279 public byte[] getData() {
280 return data;
281 }
282
283 }
284
285 static final int NUMBER_OF_BYTES_IN_HASH = 3;
286 private static final int NO_MATCH = -1;
287
288 // we use a 15 bit hash code as calculated in updateHash
289 private static final int HASH_SIZE = 1 << 15;
290 private static final int HASH_MASK = HASH_SIZE - 1;
291
292 private static final int H_SHIFT = 5;
293 private final Parameters params;
294 private final Callback callback;
295
296 // the sliding window, twice as big as "windowSize" parameter
297 private final byte[] window;
298
299 // the head of hash-chain - indexed by hash-code, points to the
300 // location inside of window of the latest sequence of bytes with
301 // the given hash.
302 private final int[] head;
303 // for each window-location points to the latest earlier location
304 // with the same hash. Only stores values for the latest
305 // "windowSize" elements, the index is "window location modulo
306 // windowSize".
307 private final int[] prev;
308 // bit mask used when indexing into prev
309 private final int wMask;
310 private boolean initialized;
311 // the position inside of window that shall be encoded right now
312 private int currentPosition;
313 // the number of bytes available to compress including the one at
314 // currentPosition
315 private int lookahead;
316 // the hash of the three bytes stating at the current position
317 private int insertHash;
318
319 // the position inside the window where the current literal
320 // block starts (in case we are inside a literal block).
321 private int blockStart;
322
323 // position of the current match
324 private int matchStart = NO_MATCH;
325
326 // number of missed insertString calls for the up to three last
327 // bytes of the last match that can only be performed once more
328 // data has been read
329 private int missedInserts;
330
331 /**
332 * Initializes a compressor with parameters and a callback.
333 *
334 * @param params the parameters
335 * @param callback the callback
336 * @throws NullPointerException if either parameter is {@code null}
337 */
338 public LZ77Compressor(final Parameters params, final Callback callback) {
339 Objects.requireNonNull(params, "params");
340 Objects.requireNonNull(callback, "callback");
341
342 this.params = params;
343 this.callback = callback;
344
345 final int wSize = params.getWindowSize();
346 window = new byte[wSize * 2];
347 wMask = wSize - 1;
348 head = ArrayFill.fill(new int[HASH_SIZE], NO_MATCH);
349 prev = new int[wSize];
350 }
351
352 private void catchUpMissedInserts() {
353 while (missedInserts > 0) {
354 insertString(currentPosition - missedInserts--);
355 }
356 }
357
358 private void compress() throws IOException {
359 final int minMatch = params.getMinBackReferenceLength();
360 final boolean lazy = params.getLazyMatching();
361 final int lazyThreshold = params.getLazyMatchingThreshold();
362
363 while (lookahead >= minMatch) {
364 catchUpMissedInserts();
365 int matchLength = 0;
366 final int hashHead = insertString(currentPosition);
367 if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) {
368 // sets matchStart as a side effect
369 matchLength = longestMatch(hashHead);
370
371 if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) {
372 // try to find a longer match using the next position
373 matchLength = longestMatchForNextPosition(matchLength);
374 }
375 }
376 if (matchLength >= minMatch) {
377 if (blockStart != currentPosition) {
378 // emit preceding literal block
379 flushLiteralBlock();
380 blockStart = NO_MATCH;
381 }
382 flushBackReference(matchLength);
383 insertStringsInMatch(matchLength);
384 lookahead -= matchLength;
385 currentPosition += matchLength;
386 blockStart = currentPosition;
387 } else {
388 // no match, append to current or start a new literal
389 lookahead--;
390 currentPosition++;
391 if (currentPosition - blockStart >= params.getMaxLiteralLength()) {
392 flushLiteralBlock();
393 blockStart = currentPosition;
394 }
395 }
396 }
397 }
398
399 /**
400 * Feeds bytes into the compressor which in turn may emit zero or more blocks to the callback during the execution of this method.
401 *
402 * @param data the data to compress - must not be null
403 * @throws IOException if the callback throws an exception
404 */
405 public void compress(final byte[] data) throws IOException {
406 compress(data, 0, data.length);
407 }
408
409 /**
410 * Feeds bytes into the compressor which in turn may emit zero or more blocks to the callback during the execution of this method.
411 *
412 * @param data the data to compress - must not be null
413 * @param off the start offset of the data
414 * @param len the number of bytes to compress
415 * @throws IOException if the callback throws an exception
416 */
417 public void compress(final byte[] data, int off, int len) throws IOException {
418 final int wSize = params.getWindowSize();
419 while (len > wSize) { // chop into windowSize sized chunks
420 doCompress(data, off, wSize);
421 off += wSize;
422 len -= wSize;
423 }
424 if (len > 0) {
425 doCompress(data, off, len);
426 }
427 }
428
429 // performs the actual algorithm with the pre-condition len <= windowSize
430 private void doCompress(final byte[] data, final int off, final int len) throws IOException {
431 final int spaceLeft = window.length - currentPosition - lookahead;
432 if (len > spaceLeft) {
433 slide();
434 }
435 System.arraycopy(data, off, window, currentPosition + lookahead, len);
436 lookahead += len;
437 if (!initialized && lookahead >= params.getMinBackReferenceLength()) {
438 initialize();
439 }
440 if (initialized) {
441 compress();
442 }
443 }
444
445 /**
446 * Tells the compressor to process all remaining data and signal end of data to the callback.
447 *
448 * <p>
449 * The compressor will in turn emit at least one block ({@link EOD}) but potentially multiple blocks to the callback during the execution of this method.
450 * </p>
451 *
452 * @throws IOException if the callback throws an exception
453 */
454 public void finish() throws IOException {
455 if (blockStart != currentPosition || lookahead > 0) {
456 currentPosition += lookahead;
457 flushLiteralBlock();
458 }
459 callback.accept(EOD.INSTANCE);
460 }
461
462 private void flushBackReference(final int matchLength) throws IOException {
463 callback.accept(new BackReference(currentPosition - matchStart, matchLength));
464 }
465
466 private void flushLiteralBlock() throws IOException {
467 callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart));
468 }
469
470 private void initialize() {
471 for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) {
472 insertHash = nextHash(insertHash, window[i]);
473 }
474 initialized = true;
475 }
476
477 /**
478 * Inserts the current three byte sequence into the dictionary and returns the previous head of the hash-chain.
479 *
480 * <p>
481 * Updates {@code insertHash} and {@code prev} as a side effect.
482 * </p>
483 */
484 private int insertString(final int pos) {
485 insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]);
486 final int hashHead = head[insertHash];
487 prev[pos & wMask] = hashHead;
488 head[insertHash] = pos;
489 return hashHead;
490 }
491
492 private void insertStringsInMatch(final int matchLength) {
493 // inserts strings contained in current match
494 // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts
495 final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH);
496 // currentPosition has been inserted already
497 for (int i = 1; i <= stop; i++) {
498 insertString(currentPosition + i);
499 }
500 missedInserts = matchLength - stop - 1;
501 }
502
503 /**
504 * Searches the hash chain for real matches and returns the length of the longest match (0 if none were found) that isn't too far away (WRT maxOffset).
505 *
506 * <p>
507 * Sets matchStart to the index of the start position of the longest match as a side effect.
508 * </p>
509 */
510 private int longestMatch(int matchHead) {
511 final int minLength = params.getMinBackReferenceLength();
512 int longestMatchLength = minLength - 1;
513 final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead);
514 final int minIndex = Math.max(0, currentPosition - params.getMaxOffset());
515 final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength());
516 final int maxCandidates = params.getMaxCandidates();
517 for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) {
518 int currentLength = 0;
519 for (int i = 0; i < maxPossibleLength; i++) {
520 if (window[matchHead + i] != window[currentPosition + i]) {
521 break;
522 }
523 currentLength++;
524 }
525 if (currentLength > longestMatchLength) {
526 longestMatchLength = currentLength;
527 matchStart = matchHead;
528 if (currentLength >= niceBackReferenceLength) {
529 // no need to search any further
530 break;
531 }
532 }
533 matchHead = prev[matchHead & wMask];
534 }
535 return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress()
536 }
537
538 private int longestMatchForNextPosition(final int prevMatchLength) {
539 // save a bunch of values to restore them if the next match isn't better than the current one
540 final int prevMatchStart = matchStart;
541 final int prevInsertHash = insertHash;
542
543 lookahead--;
544 currentPosition++;
545 final int hashHead = insertString(currentPosition);
546 final int prevHashHead = prev[currentPosition & wMask];
547 int matchLength = longestMatch(hashHead);
548
549 if (matchLength <= prevMatchLength) {
550 // use the first match, as the next one isn't any better
551 matchLength = prevMatchLength;
552 matchStart = prevMatchStart;
553
554 // restore modified values
555 head[insertHash] = prevHashHead;
556 insertHash = prevInsertHash;
557 currentPosition--;
558 lookahead++;
559 }
560 return matchLength;
561 }
562
563 /**
564 * Assumes we are calculating the hash for three consecutive bytes as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC the new hash for BCD is
565 * nextHash(H, D).
566 *
567 * <p>
568 * The hash is shifted by five bits on each update so all effects of A have been swapped after the third update.
569 * </p>
570 */
571 private int nextHash(final int oldHash, final byte nextByte) {
572 final int nextVal = nextByte & 0xFF;
573 return (oldHash << H_SHIFT ^ nextVal) & HASH_MASK;
574 }
575
576 /**
577 * Adds some initial data to fill the window with.
578 *
579 * <p>
580 * This is used if the stream has been cut into blocks and back-references of one block may refer to data of the previous block(s). One such example is the
581 * LZ4 frame format using block dependency.
582 * </p>
583 *
584 * @param data the data to fill the window with.
585 * @throws IllegalStateException if the compressor has already started to accept data
586 */
587 public void prefill(final byte[] data) {
588 if (currentPosition != 0 || lookahead != 0) {
589 throw new IllegalStateException("The compressor has already started to accept data, can't prefill anymore");
590 }
591
592 // don't need more than windowSize for back-references
593 final int len = Math.min(params.getWindowSize(), data.length);
594 System.arraycopy(data, data.length - len, window, 0, len);
595
596 if (len >= NUMBER_OF_BYTES_IN_HASH) {
597 initialize();
598 final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1;
599 for (int i = 0; i < stop; i++) {
600 insertString(i);
601 }
602 missedInserts = NUMBER_OF_BYTES_IN_HASH - 1;
603 } else { // not enough data to hash anything
604 missedInserts = len;
605 }
606 blockStart = currentPosition = len;
607 }
608
609 private void slide() throws IOException {
610 final int wSize = params.getWindowSize();
611 if (blockStart != currentPosition && blockStart < wSize) {
612 flushLiteralBlock();
613 blockStart = currentPosition;
614 }
615 System.arraycopy(window, wSize, window, 0, wSize);
616 currentPosition -= wSize;
617 matchStart -= wSize;
618 blockStart -= wSize;
619 for (int i = 0; i < HASH_SIZE; i++) {
620 final int h = head[i];
621 head[i] = h >= wSize ? h - wSize : NO_MATCH;
622 }
623 for (int i = 0; i < wSize; i++) {
624 final int p = prev[i];
625 prev[i] = p >= wSize ? p - wSize : NO_MATCH;
626 }
627 }
628 }