View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   https://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  
20  package org.apache.commons.compress.compressors.zstandard;
21  
22  import java.io.IOException;
23  import java.io.OutputStream;
24  
25  import org.apache.commons.compress.compressors.CompressorOutputStream;
26  import org.apache.commons.io.build.AbstractStreamBuilder;
27  import org.apache.commons.lang3.ArrayUtils;
28  
29  import com.github.luben.zstd.ZstdOutputStream;
30  
/**
 * {@link CompressorOutputStream} implementation that creates a Zstandard encoded stream.
 * <p>
 * This class avoids making the underlying {@code zstd} classes part of the public or protected API. The underlying implementation is provided through the
 * <a href="https://github.com/luben/zstd-jni/">Zstandard JNI</a> library which is based on <a href="https://github.com/facebook/zstd/">zstd</a>.
 * </p>
 *
 * @see <a href="https://github.com/luben/zstd-jni/">Zstandard JNI</a>
 * @see <a href="https://github.com/facebook/zstd/">zstd</a>
 * @since 1.16
 */
42  public class ZstdCompressorOutputStream extends CompressorOutputStream<ZstdOutputStream> {
43  
44      // @formatter:off
45      /**
46       * Builds a new {@link ZstdCompressorOutputStream}.
47       *
48       * <p>
49       * For example:
50       * </p>
51       * <pre>{@code
52       * ZstdCompressorOutputStream s = ZstdCompressorOutputStream.builder()
53       *   .setPath(path)
54       *   .setLevel(3)
55       *   .setStrategy(0)
56       *   .setWorkers(0)
57       *   .get();
58       * }
59       * </pre>
60       * <p>
61       * This class avoids making the underlying {@code zstd} classes part of the public or protected API.
62       * </p>
63       * @see #get()
64       * @see ZstdConstants
65       * @since 1.28.0
66       */
67      // @formatter:on
68      public static final class Builder extends AbstractStreamBuilder<ZstdCompressorOutputStream, Builder> {
69  
70          private int chainLog;
71          private boolean checksum;
72          private boolean closeFrameOnFlush;
73          private byte[] dict;
74          private int hashLog;
75          private int jobSize;
76          private int level = ZstdConstants.ZSTD_CLEVEL_DEFAULT;
77          private int minMatch;
78          private int overlapLog;
79          private int searchLog;
80          private int strategy;
81          private int targetLength;
82          private int windowLog;
83          private int workers;
84  
85          /**
86           * Constructs a new builder of {@link ZstdCompressorOutputStream}.
87           */
88          public Builder() {
89              // empty
90          }
91  
92          @Override
93          public ZstdCompressorOutputStream get() throws IOException {
94              return new ZstdCompressorOutputStream(this);
95          }
96  
97          /**
98           * Sets the size of the multi-probe search table, as a power of 2.
99           * <p>
100          * The value {@code 0} means use the default chainLog.
101          * </p>
102          * <p>
103          * The resulting memory usage is (in C) {@code (1 << (chainLog + 2))}. The input must be between {@link ZstdConstants#ZSTD_CHAINLOG_MIN} and
104          * {@link ZstdConstants#ZSTD_CHAINLOG_MAX}. A larger tables result in better and slower compression. This parameter is useless for "fast" strategy but
105          * still useful when using "dfast" strategy, in which case it defines a secondary probe table.
106          * </p>
107          *
108          * @param chainLog the size of the multi-probe search table, as a power of 2.
109          * @return this instance.
110          * @see ZstdConstants#ZSTD_CHAINLOG_MIN
111          * @see ZstdConstants#ZSTD_CHAINLOG_MAX
112          * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
113          * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
114          */
115         public Builder setChainLog(final int chainLog) {
116             this.chainLog = chainLog;
117             return this;
118         }
119 
120         /**
121          * Sets whether a 32-bits checksum of content is written at end of frame (defaults to {@code false}).
122          * <p>
123          * The value {@code false} means no checksum.
124          * </p>
125          *
126          * @param checksum Whether a 32-bits checksum of content is written at end of frame.
127          * @return this instance.
128          * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
129          * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
130          */
131         public Builder setChecksum(final boolean checksum) {
132             this.checksum = checksum;
133             return this;
134         }
135 
136         /**
137          * Sets whether to close the frame on flush.
138          * <p>
139          * This will guarantee that it can be ready fully if the process crashes before closing the stream. The downside is that this negatively affects the
140          * compression ratio.
141          * </p>
142          * <p>
143          * The value {@code false} means don't close on flush.
144          * </p>
145          *
146          * @param closeFrameOnFlush whether to close the frame on flush.
147          * @return this instance.
148          * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
149          * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
150          */
151         public Builder setCloseFrameOnFlush(final boolean closeFrameOnFlush) {
152             this.closeFrameOnFlush = closeFrameOnFlush;
153             return this;
154         }
155 
156         /**
157          * Sets an internal {@code CDict} from the given {@code dict} buffer.
158          * <p>
159          * Decompression will have to use same dictionary.
160          * </p>
161          * <strong>Using a dictionary</strong>
162          * <ul>
163          * <li>Loading a null (or 0-length) dictionary invalidates the previous dictionary, returning to no-dictionary mode.</li>
164          * <li>A dictionary is sticky, it will be used for all future compressed frames. To return to the no-dictionary mode, load a null dictionary.</li>
165          * <li>Loading a dictionary builds tables. This is a CPU consuming operation, with non-negligible impact on latency. Tables are dependent on compression
166          * parameters, and for this reason, compression parameters can no longer be changed after loading a dictionary.</li>
167          * <li>The dictionary content will be copied internally.</li>
168          * </ul>
169          *
170          * @param dict The dictionary buffer.
171          * @return this instance.
172          * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter12">Zstd manual Chapter12</a>
173          * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
174          */
175         public Builder setDict(final byte[] dict) {
176             this.dict = dict;
177             return this;
178         }
179 
180         /**
181          * Size of the initial probe table, as a power of 2.
182          * <p>
183          * The value {@code 0} means "use default hashLog".
184          * </p>
185          * <p>
186          * The resulting memory usage is (in C) {@code (1 << (hashLog + 2))}. This value must be between {@link ZstdConstants#ZSTD_HASHLOG_MIN} and
187          * {@link ZstdConstants#ZSTD_HASHLOG_MAX}. Using a larger table improves the compression ratio of strategies &lt;= dFast, and improves speed of
188          * strategies &gt; dFast.
189          * </p>
190          *
191          * @param hashLog Size of the initial probe table, as a power of 2.
192          * @return this instance.
193          * @see ZstdConstants#ZSTD_HASHLOG_MIN
194          * @see ZstdConstants#ZSTD_HASHLOG_MAX
195          * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
196          * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
197          */
198         public Builder setHashLog(final int hashLog) {
199             this.hashLog = hashLog;
200             return this;
201         }
202 
203         /**
204          * Size of a compression job.
205          * <p>
206          * This value is enforced only when {@code workers >= 1}. Each compression job is completed in parallel, so this value can indirectly impact the number
207          * of active threads. A value of 0 uses a default behavior, which is dynamically determined based on compression parameters. Job size must be a minimum
208          * of overlap size, or <a href="https://github.com/facebook/zstd/blob/dev/lib/compress/zstdmt_compress.h">ZSTDMT_JOBSIZE_MIN (= 512 KB)</a>, whichever
209          * is largest. The minimum size is automatically and transparently enforced.
210          * </p>
211          * <p>
212          * This is a multi-threading parameters and is only active if multi-threading is enabled ( if the underlying native library is compiled with the build
213          * macro {@code ZSTD_MULTITHREAD}).
214          * </p>
215          *
216          * @param jobSize Size of a compression job.
217          * @return this instance.
218          * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
219          * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/compress/zstdmt_compress.h">zstdmt_compress.h</a>
220          */
221         public Builder setJobSize(final int jobSize) {
222             this.jobSize = jobSize;
223             return this;
224         }
225 
226         /**
227          * Sets compression parameters according to a pre-defined {@code cLevel} table, from 0 to 9.
228          * <p>
229          * The exact compression parameters are dynamically determined, depending on both compression level and srcSize (when known). The default level is
230          * {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}. The special value 0 means default, which is controlled by {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}.
231          * </p>
232          * <ul>
233          * <li>The value 0 means use the default, which is controlled by {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}</li>
234          * <li>You may pass a negative compression level.</li>
235          * <li>Setting a level does not automatically set all other compression parameters to defaults. Setting this value will eventually dynamically impact
236          * the compression parameters which have not been manually set. The manually set values are used.</li>
237          * </ul>
238          *
239          * @param level The compression level, from 0 to 9, where the default is {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}.
240          * @return this instance
241          * @see ZstdConstants#ZSTD_CLEVEL_DEFAULT
242          * @see ZstdConstants#ZSTD_CLEVEL_MIN
243          * @see ZstdConstants#ZSTD_CLEVEL_MAX
244          * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
245          * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
246          */
247         public Builder setLevel(final int level) {
248             this.level = level;
249             return this;
250         }
251 
252         /**
253          * Sets minimum match size for long distance matcher.
254          * <p>
255          * Zstd can still find matches of smaller size, by updating its search algorithm to look for this size and larger. Using larger values increase
256          * compression and decompression speed, but decrease the ratio. The value must be between {@link ZstdConstants#ZSTD_MINMATCH_MIN} and
257          * {@link ZstdConstants#ZSTD_MINMATCH_MAX}. Note that currently, for all strategies &lt; {@code btopt}, effective minimum is 4. , for all strategies
258          * &gt; {@code fast}, effective maximum is {@code 6}.
259          * </p>
260          * <p>
261          * The value {@code 0} means use the default minMatchLength.
262          * </p>
263          *
264          * @param minMatch minimum match size for long distance matcher.
265          * @return this instance.
266          * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
267          * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
268          */
269         public Builder setMinMatch(final int minMatch) {
270             this.minMatch = minMatch;
271             return this;
272         }
273 
274         /**
275          * Sets the overlap size, as a fraction of window size.
276          * <p>
277          * The overlap size is an amount of data reloaded from previous job at the beginning of a new job. It helps preserve compression ratio, while each job
278          * is compressed in parallel. This value is enforced only when workers &gt;= 1. Larger values increase compression ratio, but decrease speed. Possible
279          * values range from 0 to 9:
280          * </p>
281          * <ul>
282          * <li>0 means "default" : value will be determined by the library, depending on strategy</li>
283          * <li>1 means "no overlap"</li>
284          * <li>9 means "full overlap", using a full window size.</li>
285          * </ul>
286          * <p>
287          * Each intermediate rank increases/decreases the load size by a factor 2:
288          * </p>
289          * <ul>
290          * <li>9: full window</li>
291          * <li>8: w / 2</li>
292          * <li>7: w / 4</li>
293          * <li>6: w / 8</li>
294          * <li>5: w / 16</li>
295          * <li>4: w / 32</li>
296          * <li>3: w / 64</li>
297          * <li>2: w / 128</li>
298          * <li>1: no overlap</li>
299          * <li>0: default
300          * </ul>
301          * <p>
302          * The default value varies between 6 and 9, depending on the strategy.
303          * </p>
304          * <p>
305          * This is a multi-threading parameters and is only active if multi-threading is enabled ( if the underlying native library is compiled with the build
306          * macro {@code ZSTD_MULTITHREAD}).
307          * </p>
308          *
309          * @param overlapLog the overlap size, as a fraction of window size.
310          * @return this instance.
311          * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
312          * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
313          */
314         public Builder setOverlapLog(final int overlapLog) {
315             this.overlapLog = overlapLog;
316             return this;
317         }
318 
319         /**
320          * Sets number of search attempts, as a power of 2.
321          * <p>
322          * More attempts result in better and slower compression. This parameter is useless for "fast" and "dFast" strategies.
323          * </p>
324          * <p>
325          * The value {@code 0} means use the default searchLog.
326          * </p>
327          *
328          * @param searchLog number of search attempts, as a power of 2.
329          * @return this instance.
330          * @see ZstdConstants#ZSTD_SEARCHLOG_MIN
331          * @see ZstdConstants#ZSTD_SEARCHLOG_MAX
332          * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
333          * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
334          */
335         public Builder setSearchLog(final int searchLog) {
336             this.searchLog = searchLog;
337             return this;
338         }
339 
340         /**
341          * Sets the {@code ZSTD_strategy} from the C enum definition.
342          * <p>
343          * The higher the value of selected strategy, the more complex it is, resulting in stronger and slower compression.
344          * </p>
345          * <p>
346          * The value {@code 0} means use the default strategy.
347          * </p>
348          * <ul>
349          * <li>{@code ZSTD_fast = 1}</li>
350          * <li>{@code ZSTD_dfast = 2}</li>
351          * <li>{@code ZSTD_greedy = 3}</li>
352          * <li>{@code ZSTD_lazy = 4}</li>
353          * <li>{@code ZSTD_lazy2 = 5}</li>
354          * <li>{@code ZSTD_btlazy2 = 6}</li>
355          * <li>{@code ZSTD_btopt = 7}</li>
356          * <li>{@code ZSTD_btultra = 8}</li>
357          * <li>{@code ZSTD_btultra2 = 9}</li>
358          * </ul>
359          *
360          * @param strategy the {@code ZSTD_strategy} from the C enum definition.
361          * @return this instance.
362          * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
363          * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
364          */
365         public Builder setStrategy(final int strategy) {
366             this.strategy = strategy;
367             return this;
368         }
369 
370         /**
371          * Sets a value that depends on the strategy, see {@code ZSTD_c_targetLength}.
372          * <p>
373          * For strategies {@code btopt}, {@code btultra} and {@code btultra2}:
374          * </p>
375          * <ul>
376          * <li>Length of Match considered "good enough" to stop search.</li>
377          * <li>Larger values make compression stronger, and slower.</li>
378          * </ul>
379          * <p>
380          * For strategy {@code fast}:
381          * </p>
382          * <ul>
383          * <li>Distance between match sampling.</li>
384          * <li>Larger values make compression faster, and weaker.</li>
385          * </ul>
386          * <p>
387          * The value {@code 0} means use the default targetLength.
388          * </p>
389          *
390          * @param targetLength a value that depends on the strategy, see {@code ZSTD_c_targetLength}.
391          * @return this instance.
392          * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
393          * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
394          */
395         public Builder setTargetLength(final int targetLength) {
396             this.targetLength = targetLength;
397             return this;
398         }
399 
400         /**
401          * Sets maximum allowed back-reference distance, expressed as power of 2.
402          * <p>
403          * This will set a memory budget for streaming decompression, with larger values requiring more memory and typically compressing more. This value be
404          * between {@link ZstdConstants#ZSTD_WINDOWLOG_MIN} and {@link ZstdConstants#ZSTD_WINDOWLOG_MAX}.
405          * </p>
406          * <p>
407          * <strong>Note</strong>: Using a windowLog greater than {@link ZstdConstants#ZSTD_WINDOWLOG_LIMIT_DEFAULT} requires explicitly allowing such size at
408          * streaming decompression stage.
409          * </p>
410          * <p>
411          * The value {@code 0} means use the default windowLog.
412          * </p>
413          *
414          * @param windowLog maximum allowed back-reference distance, expressed as power of 2.
415          * @return this instance.
416          * @see ZstdConstants#ZSTD_WINDOWLOG_MIN
417          * @see ZstdConstants#ZSTD_WINDOWLOG_MAX
418          * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
419          * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
420          */
421         public Builder setWindowLog(final int windowLog) {
422             this.windowLog = windowLog;
423             return this;
424         }
425 
426         /**
427          * Sets how many threads will be spawned to compress in parallel.
428          * <p>
429          * When workers &gt;= 1, this triggers asynchronous mode when compressing which consumes input and flushes output if possible, but immediately gives
430          * back control to the caller, while compression is performed in parallel, within worker threads. More workers improve speed, but also increase memory
431          * usage. Compression is performed from the calling thread, and all invocations are blocking.
432          * </p>
433          * <p>
434          * The value {@code 0} means "single-threaded mode", nothing is spawned.
435          * </p>
436          * <p>
437          * This is a multi-threading parameters and is only active if multi-threading is enabled ( if the underlying native library is compiled with the build
438          * macro {@code ZSTD_MULTITHREAD}).
439          * </p>
440          *
441          * @param workers How many threads will be spawned to compress in parallel.
442          * @return this instance.
443          * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
444          * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
445          */
446         public Builder setWorkers(final int workers) {
447             this.workers = workers;
448             return this;
449         }
450     }
451 
452     /**
453      * Constructs a new builder of {@link ZstdCompressorOutputStream}.
454      *
455      * @return a new builder of {@link ZstdCompressorOutputStream}.
456      * @since 1.28.0
457      */
458     public static Builder builder() {
459         return new Builder();
460     }
461 
462     @SuppressWarnings("resource") // Caller closes
463     private static ZstdOutputStream toZstdOutputStream(final Builder builder) throws IOException {
464         final OutputStream outputStream = builder.getOutputStream();
465         if (outputStream instanceof ZstdOutputStream) {
466             // Builder properties are not applied when a ZstdOutputStream is provided.
467             return (ZstdOutputStream) outputStream;
468         }
469         // @formatter:off
470         return new ZstdOutputStream(outputStream)
471             .setChainLog(builder.chainLog)
472             .setChecksum(builder.checksum)
473             .setCloseFrameOnFlush(builder.closeFrameOnFlush)
474             .setDict(builder.dict != null ? builder.dict : ArrayUtils.EMPTY_BYTE_ARRAY)
475             .setHashLog(builder.hashLog)
476             .setJobSize(builder.jobSize)
477             .setLevel(builder.level)
478             .setMinMatch(builder.minMatch)
479             .setOverlapLog(builder.overlapLog)
480             .setSearchLog(builder.searchLog)
481             .setStrategy(builder.strategy)
482             .setTargetLength(builder.targetLength)
483             .setWindowLog(builder.windowLog)
484             .setWorkers(builder.workers);
485         // @formatter:on
486     }
487 
488     @SuppressWarnings("resource") // Caller closes
489     private ZstdCompressorOutputStream(final Builder builder) throws IOException {
490         super(toZstdOutputStream(builder));
491     }
492 
493     /**
494      * Constructs a new instance using default Zstd parameter values.
495      *
496      * @param outStream the output stream.
497      * @throws IOException if an I/O error occurs.
498      */
499     public ZstdCompressorOutputStream(final OutputStream outStream) throws IOException {
500         this(builder().setOutputStream(outStream));
501     }
502 
503     /**
504      * Constructs a new instance using default Zstd parameter values plus a compression level.
505      *
506      * @param outStream the output stream.
507      * @param level     The compression level, from 0 to 9, where the default is {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}.
508      * @throws IOException if an I/O error occurs.
509      * @since 1.18
510      * @deprecated Use {@link #builder()}.
511      */
512     @Deprecated
513     public ZstdCompressorOutputStream(final OutputStream outStream, final int level) throws IOException {
514         this(builder().setOutputStream(outStream).setLevel(level));
515     }
516 
517     /**
518      * Constructs a new instance using default Zstd parameter values plus a compression level and checksum setting.
519      *
520      * @param outStream         the output stream.
521      * @param level             The compression level, from 0 to 9, where the default is {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}.
522      * @param closeFrameOnFlush whether to close the frame on flush.
523      * @throws IOException if an I/O error occurs.
524      * @since 1.18
525      * @deprecated Use {@link #builder()}.
526      */
527     @Deprecated
528     public ZstdCompressorOutputStream(final OutputStream outStream, final int level, final boolean closeFrameOnFlush) throws IOException {
529         this(builder().setOutputStream(outStream).setLevel(level).setCloseFrameOnFlush(closeFrameOnFlush));
530     }
531 
532     /**
533      * Constructs a new instance using default Zstd parameter values plus a compression level, closeFrameOnFlush and checksum settings.
534      *
535      * @param outStream         the output stream.
536      * @param level             The compression level, from 0 to 9, where the default is {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}.
537      * @param closeFrameOnFlush whether to close the frame on flush.
538      * @param checksum          Whether a 32-bits checksum of content is written at end of frame.
539      * @throws IOException if an I/O error occurs.
540      * @since 1.18
541      * @deprecated Use {@link #builder()}.
542      */
543     @Deprecated
544     public ZstdCompressorOutputStream(final OutputStream outStream, final int level, final boolean closeFrameOnFlush, final boolean checksum)
545             throws IOException {
546         this(builder().setOutputStream(outStream).setLevel(level).setCloseFrameOnFlush(closeFrameOnFlush).setChecksum(checksum));
547     }
548 
549     @Override
550     public void write(final byte[] buf, final int off, final int len) throws IOException {
551         out.write(buf, off, len);
552     }
553 }