001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *   https://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019
020package org.apache.commons.compress.compressors.zstandard;
021
022import java.io.IOException;
023import java.io.OutputStream;
024
025import org.apache.commons.compress.compressors.CompressorOutputStream;
026import org.apache.commons.io.build.AbstractStreamBuilder;
027import org.apache.commons.lang3.ArrayUtils;
028
029import com.github.luben.zstd.ZstdOutputStream;
030
031/**
032 * {@link CompressorOutputStream} implementation to create Zstandard encoded stream.
033 * <p>
034 * This class avoids making the underlying {@code zstd} classes part of the public or protected API. The underlying implementation is provided through the
035 * <a href="https://github.com/luben/zstd-jni/">Zstandard JNI</a> library which is based on <a href="https://github.com/facebook/zstd/">zstd</a>.
036 * </p>
037 *
038 * @see <a href="https://github.com/luben/zstd-jni/">Zstandard JNI</a>
039 * @see <a href="https://github.com/facebook/zstd/">zstd</a>
040 * @since 1.16
041 */
042public class ZstdCompressorOutputStream extends CompressorOutputStream<ZstdOutputStream> {
043
044    // @formatter:off
045    /**
046     * Builds a new {@link ZstdCompressorOutputStream}.
047     *
048     * <p>
049     * For example:
050     * </p>
051     * <pre>{@code
052     * ZstdCompressorOutputStream s = ZstdCompressorOutputStream.builder()
053     *   .setPath(path)
054     *   .setLevel(3)
055     *   .setStrategy(0)
056     *   .setWorkers(0)
057     *   .get();
058     * }
059     * </pre>
060     * <p>
061     * This class avoids making the underlying {@code zstd} classes part of the public or protected API.
062     * </p>
063     * @see #get()
064     * @see ZstdConstants
065     * @since 1.28.0
066     */
067    // @formatter:on
068    public static final class Builder extends AbstractStreamBuilder<ZstdCompressorOutputStream, Builder> {
069
070        private int chainLog;
071        private boolean checksum;
072        private boolean closeFrameOnFlush;
073        private byte[] dict;
074        private int hashLog;
075        private int jobSize;
076        private int level = ZstdConstants.ZSTD_CLEVEL_DEFAULT;
077        private int minMatch;
078        private int overlapLog;
079        private int searchLog;
080        private int strategy;
081        private int targetLength;
082        private int windowLog;
083        private int workers;
084
085        /**
086         * Constructs a new builder of {@link ZstdCompressorOutputStream}.
087         */
088        public Builder() {
089            // empty
090        }
091
092        @Override
093        public ZstdCompressorOutputStream get() throws IOException {
094            return new ZstdCompressorOutputStream(this);
095        }
096
097        /**
098         * Sets the size of the multi-probe search table, as a power of 2.
099         * <p>
100         * The value {@code 0} means use the default chainLog.
101         * </p>
102         * <p>
103         * The resulting memory usage is (in C) {@code (1 << (chainLog + 2))}. The input must be between {@link ZstdConstants#ZSTD_CHAINLOG_MIN} and
104         * {@link ZstdConstants#ZSTD_CHAINLOG_MAX}. A larger tables result in better and slower compression. This parameter is useless for "fast" strategy but
105         * still useful when using "dfast" strategy, in which case it defines a secondary probe table.
106         * </p>
107         *
108         * @param chainLog the size of the multi-probe search table, as a power of 2.
109         * @return this instance.
110         * @see ZstdConstants#ZSTD_CHAINLOG_MIN
111         * @see ZstdConstants#ZSTD_CHAINLOG_MAX
112         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
113         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
114         */
115        public Builder setChainLog(final int chainLog) {
116            this.chainLog = chainLog;
117            return this;
118        }
119
120        /**
121         * Sets whether a 32-bits checksum of content is written at end of frame (defaults to {@code false}).
122         * <p>
123         * The value {@code false} means no checksum.
124         * </p>
125         *
126         * @param checksum Whether a 32-bits checksum of content is written at end of frame.
127         * @return this instance.
128         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
129         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
130         */
131        public Builder setChecksum(final boolean checksum) {
132            this.checksum = checksum;
133            return this;
134        }
135
136        /**
137         * Sets whether to close the frame on flush.
138         * <p>
139         * This will guarantee that it can be ready fully if the process crashes before closing the stream. The downside is that this negatively affects the
140         * compression ratio.
141         * </p>
142         * <p>
143         * The value {@code false} means don't close on flush.
144         * </p>
145         *
146         * @param closeFrameOnFlush whether to close the frame on flush.
147         * @return this instance.
148         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
149         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
150         */
151        public Builder setCloseFrameOnFlush(final boolean closeFrameOnFlush) {
152            this.closeFrameOnFlush = closeFrameOnFlush;
153            return this;
154        }
155
156        /**
157         * Sets an internal {@code CDict} from the given {@code dict} buffer.
158         * <p>
159         * Decompression will have to use same dictionary.
160         * </p>
161         * <strong>Using a dictionary</strong>
162         * <ul>
163         * <li>Loading a null (or 0-length) dictionary invalidates the previous dictionary, returning to no-dictionary mode.</li>
164         * <li>A dictionary is sticky, it will be used for all future compressed frames. To return to the no-dictionary mode, load a null dictionary.</li>
165         * <li>Loading a dictionary builds tables. This is a CPU consuming operation, with non-negligible impact on latency. Tables are dependent on compression
166         * parameters, and for this reason, compression parameters can no longer be changed after loading a dictionary.</li>
167         * <li>The dictionary content will be copied internally.</li>
168         * </ul>
169         *
170         * @param dict The dictionary buffer.
171         * @return this instance.
172         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter12">Zstd manual Chapter12</a>
173         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
174         */
175        public Builder setDict(final byte[] dict) {
176            this.dict = dict;
177            return this;
178        }
179
180        /**
181         * Size of the initial probe table, as a power of 2.
182         * <p>
183         * The value {@code 0} means "use default hashLog".
184         * </p>
185         * <p>
186         * The resulting memory usage is (in C) {@code (1 << (hashLog + 2))}. This value must be between {@link ZstdConstants#ZSTD_HASHLOG_MIN} and
187         * {@link ZstdConstants#ZSTD_HASHLOG_MAX}. Using a larger table improves the compression ratio of strategies &lt;= dFast, and improves speed of
188         * strategies &gt; dFast.
189         * </p>
190         *
191         * @param hashLog Size of the initial probe table, as a power of 2.
192         * @return this instance.
193         * @see ZstdConstants#ZSTD_HASHLOG_MIN
194         * @see ZstdConstants#ZSTD_HASHLOG_MAX
195         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
196         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
197         */
198        public Builder setHashLog(final int hashLog) {
199            this.hashLog = hashLog;
200            return this;
201        }
202
203        /**
204         * Size of a compression job.
205         * <p>
206         * This value is enforced only when {@code workers >= 1}. Each compression job is completed in parallel, so this value can indirectly impact the number
207         * of active threads. A value of 0 uses a default behavior, which is dynamically determined based on compression parameters. Job size must be a minimum
208         * of overlap size, or <a href="https://github.com/facebook/zstd/blob/dev/lib/compress/zstdmt_compress.h">ZSTDMT_JOBSIZE_MIN (= 512 KB)</a>, whichever
209         * is largest. The minimum size is automatically and transparently enforced.
210         * </p>
211         * <p>
212         * This is a multi-threading parameters and is only active if multi-threading is enabled ( if the underlying native library is compiled with the build
213         * macro {@code ZSTD_MULTITHREAD}).
214         * </p>
215         *
216         * @param jobSize Size of a compression job.
217         * @return this instance.
218         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
219         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/compress/zstdmt_compress.h">zstdmt_compress.h</a>
220         */
221        public Builder setJobSize(final int jobSize) {
222            this.jobSize = jobSize;
223            return this;
224        }
225
226        /**
227         * Sets compression parameters according to a pre-defined {@code cLevel} table, from 0 to 9.
228         * <p>
229         * The exact compression parameters are dynamically determined, depending on both compression level and srcSize (when known). The default level is
230         * {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}. The special value 0 means default, which is controlled by {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}.
231         * </p>
232         * <ul>
233         * <li>The value 0 means use the default, which is controlled by {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}</li>
234         * <li>You may pass a negative compression level.</li>
235         * <li>Setting a level does not automatically set all other compression parameters to defaults. Setting this value will eventually dynamically impact
236         * the compression parameters which have not been manually set. The manually set values are used.</li>
237         * </ul>
238         *
239         * @param level The compression level, from 0 to 9, where the default is {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}.
240         * @return this instance
241         * @see ZstdConstants#ZSTD_CLEVEL_DEFAULT
242         * @see ZstdConstants#ZSTD_CLEVEL_MIN
243         * @see ZstdConstants#ZSTD_CLEVEL_MAX
244         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
245         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
246         */
247        public Builder setLevel(final int level) {
248            this.level = level;
249            return this;
250        }
251
252        /**
253         * Sets minimum match size for long distance matcher.
254         * <p>
255         * Zstd can still find matches of smaller size, by updating its search algorithm to look for this size and larger. Using larger values increase
256         * compression and decompression speed, but decrease the ratio. The value must be between {@link ZstdConstants#ZSTD_MINMATCH_MIN} and
257         * {@link ZstdConstants#ZSTD_MINMATCH_MAX}. Note that currently, for all strategies &lt; {@code btopt}, effective minimum is 4. , for all strategies
258         * &gt; {@code fast}, effective maximum is {@code 6}.
259         * </p>
260         * <p>
261         * The value {@code 0} means use the default minMatchLength.
262         * </p>
263         *
264         * @param minMatch minimum match size for long distance matcher.
265         * @return this instance.
266         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
267         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
268         */
269        public Builder setMinMatch(final int minMatch) {
270            this.minMatch = minMatch;
271            return this;
272        }
273
274        /**
275         * Sets the overlap size, as a fraction of window size.
276         * <p>
277         * The overlap size is an amount of data reloaded from previous job at the beginning of a new job. It helps preserve compression ratio, while each job
278         * is compressed in parallel. This value is enforced only when workers &gt;= 1. Larger values increase compression ratio, but decrease speed. Possible
279         * values range from 0 to 9:
280         * </p>
281         * <ul>
282         * <li>0 means "default" : value will be determined by the library, depending on strategy</li>
283         * <li>1 means "no overlap"</li>
284         * <li>9 means "full overlap", using a full window size.</li>
285         * </ul>
286         * <p>
287         * Each intermediate rank increases/decreases the load size by a factor 2:
288         * </p>
289         * <ul>
290         * <li>9: full window</li>
291         * <li>8: w / 2</li>
292         * <li>7: w / 4</li>
293         * <li>6: w / 8</li>
294         * <li>5: w / 16</li>
295         * <li>4: w / 32</li>
296         * <li>3: w / 64</li>
297         * <li>2: w / 128</li>
298         * <li>1: no overlap</li>
299         * <li>0: default
300         * </ul>
301         * <p>
302         * The default value varies between 6 and 9, depending on the strategy.
303         * </p>
304         * <p>
305         * This is a multi-threading parameters and is only active if multi-threading is enabled ( if the underlying native library is compiled with the build
306         * macro {@code ZSTD_MULTITHREAD}).
307         * </p>
308         *
309         * @param overlapLog the overlap size, as a fraction of window size.
310         * @return this instance.
311         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
312         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
313         */
314        public Builder setOverlapLog(final int overlapLog) {
315            this.overlapLog = overlapLog;
316            return this;
317        }
318
319        /**
320         * Sets number of search attempts, as a power of 2.
321         * <p>
322         * More attempts result in better and slower compression. This parameter is useless for "fast" and "dFast" strategies.
323         * </p>
324         * <p>
325         * The value {@code 0} means use the default searchLog.
326         * </p>
327         *
328         * @param searchLog number of search attempts, as a power of 2.
329         * @return this instance.
330         * @see ZstdConstants#ZSTD_SEARCHLOG_MIN
331         * @see ZstdConstants#ZSTD_SEARCHLOG_MAX
332         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
333         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
334         */
335        public Builder setSearchLog(final int searchLog) {
336            this.searchLog = searchLog;
337            return this;
338        }
339
340        /**
341         * Sets the {@code ZSTD_strategy} from the C enum definition.
342         * <p>
343         * The higher the value of selected strategy, the more complex it is, resulting in stronger and slower compression.
344         * </p>
345         * <p>
346         * The value {@code 0} means use the default strategy.
347         * </p>
348         * <ul>
349         * <li>{@code ZSTD_fast = 1}</li>
350         * <li>{@code ZSTD_dfast = 2}</li>
351         * <li>{@code ZSTD_greedy = 3}</li>
352         * <li>{@code ZSTD_lazy = 4}</li>
353         * <li>{@code ZSTD_lazy2 = 5}</li>
354         * <li>{@code ZSTD_btlazy2 = 6}</li>
355         * <li>{@code ZSTD_btopt = 7}</li>
356         * <li>{@code ZSTD_btultra = 8}</li>
357         * <li>{@code ZSTD_btultra2 = 9}</li>
358         * </ul>
359         *
360         * @param strategy the {@code ZSTD_strategy} from the C enum definition.
361         * @return this instance.
362         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
363         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
364         */
365        public Builder setStrategy(final int strategy) {
366            this.strategy = strategy;
367            return this;
368        }
369
370        /**
371         * Sets a value that depends on the strategy, see {@code ZSTD_c_targetLength}.
372         * <p>
373         * For strategies {@code btopt}, {@code btultra} and {@code btultra2}:
374         * </p>
375         * <ul>
376         * <li>Length of Match considered "good enough" to stop search.</li>
377         * <li>Larger values make compression stronger, and slower.</li>
378         * </ul>
379         * <p>
380         * For strategy {@code fast}:
381         * </p>
382         * <ul>
383         * <li>Distance between match sampling.</li>
384         * <li>Larger values make compression faster, and weaker.</li>
385         * </ul>
386         * <p>
387         * The value {@code 0} means use the default targetLength.
388         * </p>
389         *
390         * @param targetLength a value that depends on the strategy, see {@code ZSTD_c_targetLength}.
391         * @return this instance.
392         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
393         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
394         */
395        public Builder setTargetLength(final int targetLength) {
396            this.targetLength = targetLength;
397            return this;
398        }
399
400        /**
401         * Sets maximum allowed back-reference distance, expressed as power of 2.
402         * <p>
403         * This will set a memory budget for streaming decompression, with larger values requiring more memory and typically compressing more. This value be
404         * between {@link ZstdConstants#ZSTD_WINDOWLOG_MIN} and {@link ZstdConstants#ZSTD_WINDOWLOG_MAX}.
405         * </p>
406         * <p>
407         * <strong>Note</strong>: Using a windowLog greater than {@link ZstdConstants#ZSTD_WINDOWLOG_LIMIT_DEFAULT} requires explicitly allowing such size at
408         * streaming decompression stage.
409         * </p>
410         * <p>
411         * The value {@code 0} means use the default windowLog.
412         * </p>
413         *
414         * @param windowLog maximum allowed back-reference distance, expressed as power of 2.
415         * @return this instance.
416         * @see ZstdConstants#ZSTD_WINDOWLOG_MIN
417         * @see ZstdConstants#ZSTD_WINDOWLOG_MAX
418         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
419         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
420         */
421        public Builder setWindowLog(final int windowLog) {
422            this.windowLog = windowLog;
423            return this;
424        }
425
426        /**
427         * Sets how many threads will be spawned to compress in parallel.
428         * <p>
429         * When workers &gt;= 1, this triggers asynchronous mode when compressing which consumes input and flushes output if possible, but immediately gives
430         * back control to the caller, while compression is performed in parallel, within worker threads. More workers improve speed, but also increase memory
431         * usage. Compression is performed from the calling thread, and all invocations are blocking.
432         * </p>
433         * <p>
434         * The value {@code 0} means "single-threaded mode", nothing is spawned.
435         * </p>
436         * <p>
437         * This is a multi-threading parameters and is only active if multi-threading is enabled ( if the underlying native library is compiled with the build
438         * macro {@code ZSTD_MULTITHREAD}).
439         * </p>
440         *
441         * @param workers How many threads will be spawned to compress in parallel.
442         * @return this instance.
443         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
444         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
445         */
446        public Builder setWorkers(final int workers) {
447            this.workers = workers;
448            return this;
449        }
450    }
451
452    /**
453     * Constructs a new builder of {@link ZstdCompressorOutputStream}.
454     *
455     * @return a new builder of {@link ZstdCompressorOutputStream}.
456     * @since 1.28.0
457     */
458    public static Builder builder() {
459        return new Builder();
460    }
461
462    @SuppressWarnings("resource") // Caller closes
463    private static ZstdOutputStream toZstdOutputStream(final Builder builder) throws IOException {
464        final OutputStream outputStream = builder.getOutputStream();
465        if (outputStream instanceof ZstdOutputStream) {
466            // Builder properties are not applied when a ZstdOutputStream is provided.
467            return (ZstdOutputStream) outputStream;
468        }
469        // @formatter:off
470        return new ZstdOutputStream(outputStream)
471            .setChainLog(builder.chainLog)
472            .setChecksum(builder.checksum)
473            .setCloseFrameOnFlush(builder.closeFrameOnFlush)
474            .setDict(builder.dict != null ? builder.dict : ArrayUtils.EMPTY_BYTE_ARRAY)
475            .setHashLog(builder.hashLog)
476            .setJobSize(builder.jobSize)
477            .setLevel(builder.level)
478            .setMinMatch(builder.minMatch)
479            .setOverlapLog(builder.overlapLog)
480            .setSearchLog(builder.searchLog)
481            .setStrategy(builder.strategy)
482            .setTargetLength(builder.targetLength)
483            .setWindowLog(builder.windowLog)
484            .setWorkers(builder.workers);
485        // @formatter:on
486    }
487
488    @SuppressWarnings("resource") // Caller closes
489    private ZstdCompressorOutputStream(final Builder builder) throws IOException {
490        super(toZstdOutputStream(builder));
491    }
492
493    /**
494     * Constructs a new instance using default Zstd parameter values.
495     *
496     * @param outStream the output stream.
497     * @throws IOException if an I/O error occurs.
498     */
499    public ZstdCompressorOutputStream(final OutputStream outStream) throws IOException {
500        this(builder().setOutputStream(outStream));
501    }
502
503    /**
504     * Constructs a new instance using default Zstd parameter values plus a compression level.
505     *
506     * @param outStream the output stream.
507     * @param level     The compression level, from 0 to 9, where the default is {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}.
508     * @throws IOException if an I/O error occurs.
509     * @since 1.18
510     * @deprecated Use {@link #builder()}.
511     */
512    @Deprecated
513    public ZstdCompressorOutputStream(final OutputStream outStream, final int level) throws IOException {
514        this(builder().setOutputStream(outStream).setLevel(level));
515    }
516
517    /**
518     * Constructs a new instance using default Zstd parameter values plus a compression level and checksum setting.
519     *
520     * @param outStream         the output stream.
521     * @param level             The compression level, from 0 to 9, where the default is {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}.
522     * @param closeFrameOnFlush whether to close the frame on flush.
523     * @throws IOException if an I/O error occurs.
524     * @since 1.18
525     * @deprecated Use {@link #builder()}.
526     */
527    @Deprecated
528    public ZstdCompressorOutputStream(final OutputStream outStream, final int level, final boolean closeFrameOnFlush) throws IOException {
529        this(builder().setOutputStream(outStream).setLevel(level).setCloseFrameOnFlush(closeFrameOnFlush));
530    }
531
532    /**
533     * Constructs a new instance using default Zstd parameter values plus a compression level, closeFrameOnFlush and checksum settings.
534     *
535     * @param outStream         the output stream.
536     * @param level             The compression level, from 0 to 9, where the default is {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}.
537     * @param closeFrameOnFlush whether to close the frame on flush.
538     * @param checksum          Whether a 32-bits checksum of content is written at end of frame.
539     * @throws IOException if an I/O error occurs.
540     * @since 1.18
541     * @deprecated Use {@link #builder()}.
542     */
543    @Deprecated
544    public ZstdCompressorOutputStream(final OutputStream outStream, final int level, final boolean closeFrameOnFlush, final boolean checksum)
545            throws IOException {
546        this(builder().setOutputStream(outStream).setLevel(level).setCloseFrameOnFlush(closeFrameOnFlush).setChecksum(checksum));
547    }
548
549    @Override
550    public void write(final byte[] buf, final int off, final int len) throws IOException {
551        out.write(buf, off, len);
552    }
553}