001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.lz77support;
020
021import java.io.IOException;
022import java.util.Arrays;
023import java.util.Objects;
024
025/**
026 * Helper class for compression algorithms that use the ideas of LZ77.
027 *
028 * <p>
029 * Most LZ77 derived algorithms split input data into blocks of uncompressed data (called literal blocks) and back-references (pairs of offsets and lengths)
030 * that state "add {@code length} bytes that are the same as those already written starting {@code offset} bytes before the current position. The details of how
031 * those blocks and back-references are encoded are quite different between the algorithms and some algorithms perform additional steps (Huffman encoding in the
032 * case of DEFLATE for example).
033 * </p>
034 *
035 * <p>
036 * This class attempts to extract the core logic - finding back-references - so it can be re-used. It follows the algorithm explained in section 4 of RFC 1951
037 * (DEFLATE) and currently doesn't implement the "lazy match" optimization. The three-byte hash function used in this class is the same as the one used by zlib
038 * and InfoZIP's ZIP implementation of DEFLATE. The whole class is strongly inspired by InfoZIP's implementation.
039 * </p>
040 *
041 * <p>
042 * LZ77 is used vaguely here (as well as many other places that talk about it :-), LZSS would likely be closer to the truth but LZ77 has become the synonym for
043 * a whole family of algorithms.
044 * </p>
045 *
046 * <p>
047 * The API consists of a compressor that is fed {@code byte}s and emits {@link Block}s to a registered callback where the blocks represent either
048 * {@link LiteralBlock literal blocks}, {@link BackReference back-references} or {@link EOD end of data markers}. In order to ensure the callback receives all
049 * information, the {@code #finish} method must be used once all data has been fed into the compressor.
050 * </p>
051 *
052 * <p>
053 * Several parameters influence the outcome of the "compression":
054 * </p>
055 * <dl>
056 *
057 * <dt>{@code windowSize}</dt>
058 * <dd>the size of the sliding window, must be a power of two - this determines the maximum offset a back-reference can take. The compressor maintains a buffer
059 * of twice of {@code windowSize} - real world values are in the area of 32k.</dd>
060 *
061 * <dt>{@code minBackReferenceLength}</dt>
062 * <dd>Minimal length of a back-reference found. A true minimum of 3 is hard-coded inside of this implementation but bigger lengths can be configured.</dd>
063 *
064 * <dt>{@code maxBackReferenceLength}</dt>
065 * <dd>Maximal length of a back-reference found.</dd>
066 *
067 * <dt>{@code maxOffset}</dt>
068 * <dd>Maximal offset of a back-reference.</dd>
069 *
070 * <dt>{@code maxLiteralLength}</dt>
071 * <dd>Maximal length of a literal block.</dd>
072 * </dl>
073 *
074 * @see "https://tools.ietf.org/html/rfc1951#section-4"
075 * @since 1.14
076 * @NotThreadSafe
077 */
078public class LZ77Compressor {
079
080    /**
081     * Represents a back-reference.
082     */
083    public static final class BackReference extends Block {
084        private final int offset, length;
085
086        public BackReference(final int offset, final int length) {
087            this.offset = offset;
088            this.length = length;
089        }
090
091        /**
092         * Provides the length of the back-reference.
093         *
094         * @return the length
095         */
096        public int getLength() {
097            return length;
098        }
099
100        /**
101         * Provides the offset of the back-reference.
102         *
103         * @return the offset
104         */
105        public int getOffset() {
106            return offset;
107        }
108
109        @Override
110        public BlockType getType() {
111            return BlockType.BACK_REFERENCE;
112        }
113
114        @Override
115        public String toString() {
116            return "BackReference with offset " + offset + " and length " + length;
117        }
118    }
119
120    /**
121     * Base class representing blocks the compressor may emit.
122     *
123     * <p>
124     * This class is not supposed to be subclassed by classes outside of Commons Compress so it is considered internal and changed that would break subclasses
125     * may get introduced with future releases.
126     * </p>
127     */
128    public abstract static class Block {
129        /** Enumeration of the block types the compressor may emit. */
130        public enum BlockType {
131            LITERAL, BACK_REFERENCE, EOD
132        }
133
134        public abstract BlockType getType();
135    }
136
137    /**
138     * Callback invoked while the compressor processes data.
139     *
140     * <p>
141     * The callback is invoked on the same thread that receives the bytes to compress and may be invoked multiple times during the execution of
142     * {@link #compress} or {@link #finish}.
143     * </p>
144     */
145    public interface Callback {
146        /**
147         * Consumes a block.
148         *
149         * @param b the block to consume
150         * @throws IOException in case of an error
151         */
152        void accept(Block b) throws IOException;
153    }
154
155    /** A simple "we are done" marker. */
156    public static final class EOD extends Block {
157        @Override
158        public BlockType getType() {
159            return BlockType.EOD;
160        }
161    }
162
163    /**
164     * Represents a literal block of data.
165     *
166     * <p>
167     * For performance reasons this encapsulates the real data, not a copy of it. Don't modify the data and process it inside of {@link Callback#accept}
168     * immediately as it will get overwritten sooner or later.
169     * </p>
170     */
171    public static final class LiteralBlock extends Block {
172        private final byte[] data;
173        private final int offset, length;
174
175        public LiteralBlock(final byte[] data, final int offset, final int length) {
176            this.data = data;
177            this.offset = offset;
178            this.length = length;
179        }
180
181        /**
182         * The literal data.
183         *
184         * <p>
185         * This returns a life view of the actual data in order to avoid copying, modify the array at your own risk.
186         * </p>
187         *
188         * @return the data
189         */
190        public byte[] getData() {
191            return data;
192        }
193
194        /**
195         * Length of literal block.
196         *
197         * @return the length
198         */
199        public int getLength() {
200            return length;
201        }
202
203        /**
204         * Offset into data where the literal block starts.
205         *
206         * @return the offset
207         */
208        public int getOffset() {
209            return offset;
210        }
211
212        @Override
213        public BlockType getType() {
214            return BlockType.LITERAL;
215        }
216
217        @Override
218        public String toString() {
219            return "LiteralBlock starting at " + offset + " with length " + length;
220        }
221    }
222
223    private static final Block THE_EOD = new EOD();
224
225    static final int NUMBER_OF_BYTES_IN_HASH = 3;
226    private static final int NO_MATCH = -1;
227
228    // we use a 15 bit hash code as calculated in updateHash
229    private static final int HASH_SIZE = 1 << 15;
230    private static final int HASH_MASK = HASH_SIZE - 1;
231
232    private static final int H_SHIFT = 5;
233    private final Parameters params;
234    private final Callback callback;
235
236    // the sliding window, twice as big as "windowSize" parameter
237    private final byte[] window;
238
239    // the head of hash-chain - indexed by hash-code, points to the
240    // location inside of window of the latest sequence of bytes with
241    // the given hash.
242    private final int[] head;
243    // for each window-location points to the latest earlier location
244    // with the same hash. Only stores values for the latest
245    // "windowSize" elements, the index is "window location modulo
246    // windowSize".
247    private final int[] prev;
248    // bit mask used when indexing into prev
249    private final int wMask;
250    private boolean initialized;
251    // the position inside of window that shall be encoded right now
252    private int currentPosition;
253    // the number of bytes available to compress including the one at
254    // currentPosition
255    private int lookahead;
256    // the hash of the three bytes stating at the current position
257    private int insertHash;
258
259    // the position inside the window where the current literal
260    // block starts (in case we are inside a literal block).
261    private int blockStart;
262
263    // position of the current match
264    private int matchStart = NO_MATCH;
265
266    // number of missed insertString calls for the up to three last
267    // bytes of the last match that can only be performed once more
268    // data has been read
269    private int missedInserts;
270
271    /**
272     * Initializes a compressor with parameters and a callback.
273     *
274     * @param params   the parameters
275     * @param callback the callback
276     * @throws NullPointerException if either parameter is {@code null}
277     */
278    public LZ77Compressor(final Parameters params, final Callback callback) {
279        Objects.requireNonNull(params, "params");
280        Objects.requireNonNull(callback, "callback");
281
282        this.params = params;
283        this.callback = callback;
284
285        final int wSize = params.getWindowSize();
286        window = new byte[wSize * 2];
287        wMask = wSize - 1;
288        head = new int[HASH_SIZE];
289        Arrays.fill(head, NO_MATCH);
290        prev = new int[wSize];
291    }
292
293    private void catchUpMissedInserts() {
294        while (missedInserts > 0) {
295            insertString(currentPosition - missedInserts--);
296        }
297    }
298
299    private void compress() throws IOException {
300        final int minMatch = params.getMinBackReferenceLength();
301        final boolean lazy = params.getLazyMatching();
302        final int lazyThreshold = params.getLazyMatchingThreshold();
303
304        while (lookahead >= minMatch) {
305            catchUpMissedInserts();
306            int matchLength = 0;
307            final int hashHead = insertString(currentPosition);
308            if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) {
309                // sets matchStart as a side effect
310                matchLength = longestMatch(hashHead);
311
312                if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) {
313                    // try to find a longer match using the next position
314                    matchLength = longestMatchForNextPosition(matchLength);
315                }
316            }
317            if (matchLength >= minMatch) {
318                if (blockStart != currentPosition) {
319                    // emit preceding literal block
320                    flushLiteralBlock();
321                    blockStart = NO_MATCH;
322                }
323                flushBackReference(matchLength);
324                insertStringsInMatch(matchLength);
325                lookahead -= matchLength;
326                currentPosition += matchLength;
327                blockStart = currentPosition;
328            } else {
329                // no match, append to current or start a new literal
330                lookahead--;
331                currentPosition++;
332                if (currentPosition - blockStart >= params.getMaxLiteralLength()) {
333                    flushLiteralBlock();
334                    blockStart = currentPosition;
335                }
336            }
337        }
338    }
339
340    /**
341     * Feeds bytes into the compressor which in turn may emit zero or more blocks to the callback during the execution of this method.
342     *
343     * @param data the data to compress - must not be null
344     * @throws IOException if the callback throws an exception
345     */
346    public void compress(final byte[] data) throws IOException {
347        compress(data, 0, data.length);
348    }
349
350    /**
351     * Feeds bytes into the compressor which in turn may emit zero or more blocks to the callback during the execution of this method.
352     *
353     * @param data the data to compress - must not be null
354     * @param off  the start offset of the data
355     * @param len  the number of bytes to compress
356     * @throws IOException if the callback throws an exception
357     */
358    public void compress(final byte[] data, int off, int len) throws IOException {
359        final int wSize = params.getWindowSize();
360        while (len > wSize) { // chop into windowSize sized chunks
361            doCompress(data, off, wSize);
362            off += wSize;
363            len -= wSize;
364        }
365        if (len > 0) {
366            doCompress(data, off, len);
367        }
368    }
369
370    // performs the actual algorithm with the pre-condition len <= windowSize
371    private void doCompress(final byte[] data, final int off, final int len) throws IOException {
372        final int spaceLeft = window.length - currentPosition - lookahead;
373        if (len > spaceLeft) {
374            slide();
375        }
376        System.arraycopy(data, off, window, currentPosition + lookahead, len);
377        lookahead += len;
378        if (!initialized && lookahead >= params.getMinBackReferenceLength()) {
379            initialize();
380        }
381        if (initialized) {
382            compress();
383        }
384    }
385
386    /**
387     * Tells the compressor to process all remaining data and signal end of data to the callback.
388     *
389     * <p>
390     * The compressor will in turn emit at least one block ({@link EOD}) but potentially multiple blocks to the callback during the execution of this method.
391     * </p>
392     *
393     * @throws IOException if the callback throws an exception
394     */
395    public void finish() throws IOException {
396        if (blockStart != currentPosition || lookahead > 0) {
397            currentPosition += lookahead;
398            flushLiteralBlock();
399        }
400        callback.accept(THE_EOD);
401    }
402
403    private void flushBackReference(final int matchLength) throws IOException {
404        callback.accept(new BackReference(currentPosition - matchStart, matchLength));
405    }
406
407    private void flushLiteralBlock() throws IOException {
408        callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart));
409    }
410
411    private void initialize() {
412        for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) {
413            insertHash = nextHash(insertHash, window[i]);
414        }
415        initialized = true;
416    }
417
418    /**
419     * Inserts the current three byte sequence into the dictionary and returns the previous head of the hash-chain.
420     *
421     * <p>
422     * Updates {@code insertHash} and {@code prev} as a side effect.
423     * </p>
424     */
425    private int insertString(final int pos) {
426        insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]);
427        final int hashHead = head[insertHash];
428        prev[pos & wMask] = hashHead;
429        head[insertHash] = pos;
430        return hashHead;
431    }
432
433    private void insertStringsInMatch(final int matchLength) {
434        // inserts strings contained in current match
435        // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts
436        final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH);
437        // currentPosition has been inserted already
438        for (int i = 1; i <= stop; i++) {
439            insertString(currentPosition + i);
440        }
441        missedInserts = matchLength - stop - 1;
442    }
443
444    /**
445     * Searches the hash chain for real matches and returns the length of the longest match (0 if none were found) that isn't too far away (WRT maxOffset).
446     *
447     * <p>
448     * Sets matchStart to the index of the start position of the longest match as a side effect.
449     * </p>
450     */
451    private int longestMatch(int matchHead) {
452        final int minLength = params.getMinBackReferenceLength();
453        int longestMatchLength = minLength - 1;
454        final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead);
455        final int minIndex = Math.max(0, currentPosition - params.getMaxOffset());
456        final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength());
457        final int maxCandidates = params.getMaxCandidates();
458        for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) {
459            int currentLength = 0;
460            for (int i = 0; i < maxPossibleLength; i++) {
461                if (window[matchHead + i] != window[currentPosition + i]) {
462                    break;
463                }
464                currentLength++;
465            }
466            if (currentLength > longestMatchLength) {
467                longestMatchLength = currentLength;
468                matchStart = matchHead;
469                if (currentLength >= niceBackReferenceLength) {
470                    // no need to search any further
471                    break;
472                }
473            }
474            matchHead = prev[matchHead & wMask];
475        }
476        return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress()
477    }
478
479    private int longestMatchForNextPosition(final int prevMatchLength) {
480        // save a bunch of values to restore them if the next match isn't better than the current one
481        final int prevMatchStart = matchStart;
482        final int prevInsertHash = insertHash;
483
484        lookahead--;
485        currentPosition++;
486        final int hashHead = insertString(currentPosition);
487        final int prevHashHead = prev[currentPosition & wMask];
488        int matchLength = longestMatch(hashHead);
489
490        if (matchLength <= prevMatchLength) {
491            // use the first match, as the next one isn't any better
492            matchLength = prevMatchLength;
493            matchStart = prevMatchStart;
494
495            // restore modified values
496            head[insertHash] = prevHashHead;
497            insertHash = prevInsertHash;
498            currentPosition--;
499            lookahead++;
500        }
501        return matchLength;
502    }
503
504    /**
505     * Assumes we are calculating the hash for three consecutive bytes as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC the new hash for BCD is
506     * nextHash(H, D).
507     *
508     * <p>
509     * The hash is shifted by five bits on each update so all effects of A have been swapped after the third update.
510     * </p>
511     */
512    private int nextHash(final int oldHash, final byte nextByte) {
513        final int nextVal = nextByte & 0xFF;
514        return (oldHash << H_SHIFT ^ nextVal) & HASH_MASK;
515    }
516
517    /**
518     * Adds some initial data to fill the window with.
519     *
520     * <p>
521     * This is used if the stream has been cut into blocks and back-references of one block may refer to data of the previous block(s). One such example is the
522     * LZ4 frame format using block dependency.
523     * </p>
524     *
525     * @param data the data to fill the window with.
526     * @throws IllegalStateException if the compressor has already started to accept data
527     */
528    public void prefill(final byte[] data) {
529        if (currentPosition != 0 || lookahead != 0) {
530            throw new IllegalStateException("The compressor has already started to accept data, can't prefill anymore");
531        }
532
533        // don't need more than windowSize for back-references
534        final int len = Math.min(params.getWindowSize(), data.length);
535        System.arraycopy(data, data.length - len, window, 0, len);
536
537        if (len >= NUMBER_OF_BYTES_IN_HASH) {
538            initialize();
539            final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1;
540            for (int i = 0; i < stop; i++) {
541                insertString(i);
542            }
543            missedInserts = NUMBER_OF_BYTES_IN_HASH - 1;
544        } else { // not enough data to hash anything
545            missedInserts = len;
546        }
547        blockStart = currentPosition = len;
548    }
549
550    private void slide() throws IOException {
551        final int wSize = params.getWindowSize();
552        if (blockStart != currentPosition && blockStart < wSize) {
553            flushLiteralBlock();
554            blockStart = currentPosition;
555        }
556        System.arraycopy(window, wSize, window, 0, wSize);
557        currentPosition -= wSize;
558        matchStart -= wSize;
559        blockStart -= wSize;
560        for (int i = 0; i < HASH_SIZE; i++) {
561            final int h = head[i];
562            head[i] = h >= wSize ? h - wSize : NO_MATCH;
563        }
564        for (int i = 0; i < wSize; i++) {
565            final int p = prev[i];
566            prev[i] = p >= wSize ? p - wSize : NO_MATCH;
567        }
568    }
569}