1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * https://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20 package org.apache.commons.compress.compressors.zstandard;
21
22 import java.io.IOException;
23 import java.io.OutputStream;
24
25 import org.apache.commons.compress.compressors.CompressorOutputStream;
26 import org.apache.commons.io.build.AbstractStreamBuilder;
27 import org.apache.commons.lang3.ArrayUtils;
28
29 import com.github.luben.zstd.ZstdOutputStream;
30
31 /**
32 * {@link CompressorOutputStream} implementation to create Zstandard encoded stream.
33 * <p>
34 * This class avoids making the underlying {@code zstd} classes part of the public or protected API. The underlying implementation is provided through the
35 * <a href="https://github.com/luben/zstd-jni/">Zstandard JNI</a> library which is based on <a href="https://github.com/facebook/zstd/">zstd</a>.
36 * </p>
37 *
38 * @see <a href="https://github.com/luben/zstd-jni/">Zstandard JNI</a>
39 * @see <a href="https://github.com/facebook/zstd/">zstd</a>
40 * @since 1.16
41 */
42 public class ZstdCompressorOutputStream extends CompressorOutputStream<ZstdOutputStream> {
43
44 // @formatter:off
45 /**
46 * Builds a new {@link ZstdCompressorOutputStream}.
47 *
48 * <p>
49 * For example:
50 * </p>
51 * <pre>{@code
52 * ZstdCompressorOutputStream s = ZstdCompressorOutputStream.builder()
53 * .setPath(path)
54 * .setLevel(3)
55 * .setStrategy(0)
56 * .setWorkers(0)
57 * .get();
58 * }
59 * </pre>
60 * <p>
61 * This class avoids making the underlying {@code zstd} classes part of the public or protected API.
62 * </p>
63 * @see #get()
64 * @see ZstdConstants
65 * @since 1.28.0
66 */
67 // @formatter:on
68 public static final class Builder extends AbstractStreamBuilder<ZstdCompressorOutputStream, Builder> {
69
70 private int chainLog;
71 private boolean checksum;
72 private boolean closeFrameOnFlush;
73 private byte[] dict;
74 private int hashLog;
75 private int jobSize;
76 private int level = ZstdConstants.ZSTD_CLEVEL_DEFAULT;
77 private int minMatch;
78 private int overlapLog;
79 private int searchLog;
80 private int strategy;
81 private int targetLength;
82 private int windowLog;
83 private int workers;
84
85 /**
86 * Constructs a new builder of {@link ZstdCompressorOutputStream}.
87 */
88 public Builder() {
89 // empty
90 }
91
92 @Override
93 public ZstdCompressorOutputStream get() throws IOException {
94 return new ZstdCompressorOutputStream(this);
95 }
96
97 /**
98 * Sets the size of the multi-probe search table, as a power of 2.
99 * <p>
100 * The value {@code 0} means use the default chainLog.
101 * </p>
102 * <p>
103 * The resulting memory usage is (in C) {@code (1 << (chainLog + 2))}. The input must be between {@link ZstdConstants#ZSTD_CHAINLOG_MIN} and
104 * {@link ZstdConstants#ZSTD_CHAINLOG_MAX}. A larger tables result in better and slower compression. This parameter is useless for "fast" strategy but
105 * still useful when using "dfast" strategy, in which case it defines a secondary probe table.
106 * </p>
107 *
108 * @param chainLog the size of the multi-probe search table, as a power of 2.
109 * @return this instance.
110 * @see ZstdConstants#ZSTD_CHAINLOG_MIN
111 * @see ZstdConstants#ZSTD_CHAINLOG_MAX
112 * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
113 * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
114 */
115 public Builder setChainLog(final int chainLog) {
116 this.chainLog = chainLog;
117 return this;
118 }
119
120 /**
121 * Sets whether a 32-bits checksum of content is written at end of frame (defaults to {@code false}).
122 * <p>
123 * The value {@code false} means no checksum.
124 * </p>
125 *
126 * @param checksum Whether a 32-bits checksum of content is written at end of frame.
127 * @return this instance.
128 * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
129 * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
130 */
131 public Builder setChecksum(final boolean checksum) {
132 this.checksum = checksum;
133 return this;
134 }
135
136 /**
137 * Sets whether to close the frame on flush.
138 * <p>
139 * This will guarantee that it can be ready fully if the process crashes before closing the stream. The downside is that this negatively affects the
140 * compression ratio.
141 * </p>
142 * <p>
143 * The value {@code false} means don't close on flush.
144 * </p>
145 *
146 * @param closeFrameOnFlush whether to close the frame on flush.
147 * @return this instance.
148 * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
149 * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
150 */
151 public Builder setCloseFrameOnFlush(final boolean closeFrameOnFlush) {
152 this.closeFrameOnFlush = closeFrameOnFlush;
153 return this;
154 }
155
156 /**
157 * Sets an internal {@code CDict} from the given {@code dict} buffer.
158 * <p>
159 * Decompression will have to use same dictionary.
160 * </p>
161 * <strong>Using a dictionary</strong>
162 * <ul>
163 * <li>Loading a null (or 0-length) dictionary invalidates the previous dictionary, returning to no-dictionary mode.</li>
164 * <li>A dictionary is sticky, it will be used for all future compressed frames. To return to the no-dictionary mode, load a null dictionary.</li>
165 * <li>Loading a dictionary builds tables. This is a CPU consuming operation, with non-negligible impact on latency. Tables are dependent on compression
166 * parameters, and for this reason, compression parameters can no longer be changed after loading a dictionary.</li>
167 * <li>The dictionary content will be copied internally.</li>
168 * </ul>
169 *
170 * @param dict The dictionary buffer.
171 * @return this instance.
172 * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter12">Zstd manual Chapter12</a>
173 * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
174 */
175 public Builder setDict(final byte[] dict) {
176 this.dict = dict;
177 return this;
178 }
179
180 /**
181 * Size of the initial probe table, as a power of 2.
182 * <p>
183 * The value {@code 0} means "use default hashLog".
184 * </p>
185 * <p>
186 * The resulting memory usage is (in C) {@code (1 << (hashLog + 2))}. This value must be between {@link ZstdConstants#ZSTD_HASHLOG_MIN} and
187 * {@link ZstdConstants#ZSTD_HASHLOG_MAX}. Using a larger table improves the compression ratio of strategies <= dFast, and improves speed of
188 * strategies > dFast.
189 * </p>
190 *
191 * @param hashLog Size of the initial probe table, as a power of 2.
192 * @return this instance.
193 * @see ZstdConstants#ZSTD_HASHLOG_MIN
194 * @see ZstdConstants#ZSTD_HASHLOG_MAX
195 * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
196 * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
197 */
198 public Builder setHashLog(final int hashLog) {
199 this.hashLog = hashLog;
200 return this;
201 }
202
203 /**
204 * Size of a compression job.
205 * <p>
206 * This value is enforced only when {@code workers >= 1}. Each compression job is completed in parallel, so this value can indirectly impact the number
207 * of active threads. A value of 0 uses a default behavior, which is dynamically determined based on compression parameters. Job size must be a minimum
208 * of overlap size, or <a href="https://github.com/facebook/zstd/blob/dev/lib/compress/zstdmt_compress.h">ZSTDMT_JOBSIZE_MIN (= 512 KB)</a>, whichever
209 * is largest. The minimum size is automatically and transparently enforced.
210 * </p>
211 * <p>
212 * This is a multi-threading parameters and is only active if multi-threading is enabled ( if the underlying native library is compiled with the build
213 * macro {@code ZSTD_MULTITHREAD}).
214 * </p>
215 *
216 * @param jobSize Size of a compression job.
217 * @return this instance.
218 * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
219 * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/compress/zstdmt_compress.h">zstdmt_compress.h</a>
220 */
221 public Builder setJobSize(final int jobSize) {
222 this.jobSize = jobSize;
223 return this;
224 }
225
226 /**
227 * Sets compression parameters according to a pre-defined {@code cLevel} table, from 0 to 9.
228 * <p>
229 * The exact compression parameters are dynamically determined, depending on both compression level and srcSize (when known). The default level is
230 * {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}. The special value 0 means default, which is controlled by {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}.
231 * </p>
232 * <ul>
233 * <li>The value 0 means use the default, which is controlled by {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}</li>
234 * <li>You may pass a negative compression level.</li>
235 * <li>Setting a level does not automatically set all other compression parameters to defaults. Setting this value will eventually dynamically impact
236 * the compression parameters which have not been manually set. The manually set values are used.</li>
237 * </ul>
238 *
239 * @param level The compression level, from 0 to 9, where the default is {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}.
240 * @return this instance
241 * @see ZstdConstants#ZSTD_CLEVEL_DEFAULT
242 * @see ZstdConstants#ZSTD_CLEVEL_MIN
243 * @see ZstdConstants#ZSTD_CLEVEL_MAX
244 * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
245 * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
246 */
247 public Builder setLevel(final int level) {
248 this.level = level;
249 return this;
250 }
251
252 /**
253 * Sets minimum match size for long distance matcher.
254 * <p>
255 * Zstd can still find matches of smaller size, by updating its search algorithm to look for this size and larger. Using larger values increase
256 * compression and decompression speed, but decrease the ratio. The value must be between {@link ZstdConstants#ZSTD_MINMATCH_MIN} and
257 * {@link ZstdConstants#ZSTD_MINMATCH_MAX}. Note that currently, for all strategies < {@code btopt}, effective minimum is 4. , for all strategies
258 * > {@code fast}, effective maximum is {@code 6}.
259 * </p>
260 * <p>
261 * The value {@code 0} means use the default minMatchLength.
262 * </p>
263 *
264 * @param minMatch minimum match size for long distance matcher.
265 * @return this instance.
266 * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
267 * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
268 */
269 public Builder setMinMatch(final int minMatch) {
270 this.minMatch = minMatch;
271 return this;
272 }
273
274 /**
275 * Sets the overlap size, as a fraction of window size.
276 * <p>
277 * The overlap size is an amount of data reloaded from previous job at the beginning of a new job. It helps preserve compression ratio, while each job
278 * is compressed in parallel. This value is enforced only when workers >= 1. Larger values increase compression ratio, but decrease speed. Possible
279 * values range from 0 to 9:
280 * </p>
281 * <ul>
282 * <li>0 means "default" : value will be determined by the library, depending on strategy</li>
283 * <li>1 means "no overlap"</li>
284 * <li>9 means "full overlap", using a full window size.</li>
285 * </ul>
286 * <p>
287 * Each intermediate rank increases/decreases the load size by a factor 2:
288 * </p>
289 * <ul>
290 * <li>9: full window</li>
291 * <li>8: w / 2</li>
292 * <li>7: w / 4</li>
293 * <li>6: w / 8</li>
294 * <li>5: w / 16</li>
295 * <li>4: w / 32</li>
296 * <li>3: w / 64</li>
297 * <li>2: w / 128</li>
298 * <li>1: no overlap</li>
299 * <li>0: default
300 * </ul>
301 * <p>
302 * The default value varies between 6 and 9, depending on the strategy.
303 * </p>
304 * <p>
305 * This is a multi-threading parameters and is only active if multi-threading is enabled ( if the underlying native library is compiled with the build
306 * macro {@code ZSTD_MULTITHREAD}).
307 * </p>
308 *
309 * @param overlapLog the overlap size, as a fraction of window size.
310 * @return this instance.
311 * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
312 * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
313 */
314 public Builder setOverlapLog(final int overlapLog) {
315 this.overlapLog = overlapLog;
316 return this;
317 }
318
319 /**
320 * Sets number of search attempts, as a power of 2.
321 * <p>
322 * More attempts result in better and slower compression. This parameter is useless for "fast" and "dFast" strategies.
323 * </p>
324 * <p>
325 * The value {@code 0} means use the default searchLog.
326 * </p>
327 *
328 * @param searchLog number of search attempts, as a power of 2.
329 * @return this instance.
330 * @see ZstdConstants#ZSTD_SEARCHLOG_MIN
331 * @see ZstdConstants#ZSTD_SEARCHLOG_MAX
332 * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
333 * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
334 */
335 public Builder setSearchLog(final int searchLog) {
336 this.searchLog = searchLog;
337 return this;
338 }
339
340 /**
341 * Sets the {@code ZSTD_strategy} from the C enum definition.
342 * <p>
343 * The higher the value of selected strategy, the more complex it is, resulting in stronger and slower compression.
344 * </p>
345 * <p>
346 * The value {@code 0} means use the default strategy.
347 * </p>
348 * <ul>
349 * <li>{@code ZSTD_fast = 1}</li>
350 * <li>{@code ZSTD_dfast = 2}</li>
351 * <li>{@code ZSTD_greedy = 3}</li>
352 * <li>{@code ZSTD_lazy = 4}</li>
353 * <li>{@code ZSTD_lazy2 = 5}</li>
354 * <li>{@code ZSTD_btlazy2 = 6}</li>
355 * <li>{@code ZSTD_btopt = 7}</li>
356 * <li>{@code ZSTD_btultra = 8}</li>
357 * <li>{@code ZSTD_btultra2 = 9}</li>
358 * </ul>
359 *
360 * @param strategy the {@code ZSTD_strategy} from the C enum definition.
361 * @return this instance.
362 * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
363 * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
364 */
365 public Builder setStrategy(final int strategy) {
366 this.strategy = strategy;
367 return this;
368 }
369
370 /**
371 * Sets a value that depends on the strategy, see {@code ZSTD_c_targetLength}.
372 * <p>
373 * For strategies {@code btopt}, {@code btultra} and {@code btultra2}:
374 * </p>
375 * <ul>
376 * <li>Length of Match considered "good enough" to stop search.</li>
377 * <li>Larger values make compression stronger, and slower.</li>
378 * </ul>
379 * <p>
380 * For strategy {@code fast}:
381 * </p>
382 * <ul>
383 * <li>Distance between match sampling.</li>
384 * <li>Larger values make compression faster, and weaker.</li>
385 * </ul>
386 * <p>
387 * The value {@code 0} means use the default targetLength.
388 * </p>
389 *
390 * @param targetLength a value that depends on the strategy, see {@code ZSTD_c_targetLength}.
391 * @return this instance.
392 * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
393 * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
394 */
395 public Builder setTargetLength(final int targetLength) {
396 this.targetLength = targetLength;
397 return this;
398 }
399
400 /**
401 * Sets maximum allowed back-reference distance, expressed as power of 2.
402 * <p>
403 * This will set a memory budget for streaming decompression, with larger values requiring more memory and typically compressing more. This value be
404 * between {@link ZstdConstants#ZSTD_WINDOWLOG_MIN} and {@link ZstdConstants#ZSTD_WINDOWLOG_MAX}.
405 * </p>
406 * <p>
407 * <strong>Note</strong>: Using a windowLog greater than {@link ZstdConstants#ZSTD_WINDOWLOG_LIMIT_DEFAULT} requires explicitly allowing such size at
408 * streaming decompression stage.
409 * </p>
410 * <p>
411 * The value {@code 0} means use the default windowLog.
412 * </p>
413 *
414 * @param windowLog maximum allowed back-reference distance, expressed as power of 2.
415 * @return this instance.
416 * @see ZstdConstants#ZSTD_WINDOWLOG_MIN
417 * @see ZstdConstants#ZSTD_WINDOWLOG_MAX
418 * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
419 * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
420 */
421 public Builder setWindowLog(final int windowLog) {
422 this.windowLog = windowLog;
423 return this;
424 }
425
426 /**
427 * Sets how many threads will be spawned to compress in parallel.
428 * <p>
429 * When workers >= 1, this triggers asynchronous mode when compressing which consumes input and flushes output if possible, but immediately gives
430 * back control to the caller, while compression is performed in parallel, within worker threads. More workers improve speed, but also increase memory
431 * usage. Compression is performed from the calling thread, and all invocations are blocking.
432 * </p>
433 * <p>
434 * The value {@code 0} means "single-threaded mode", nothing is spawned.
435 * </p>
436 * <p>
437 * This is a multi-threading parameters and is only active if multi-threading is enabled ( if the underlying native library is compiled with the build
438 * macro {@code ZSTD_MULTITHREAD}).
439 * </p>
440 *
441 * @param workers How many threads will be spawned to compress in parallel.
442 * @return this instance.
443 * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
444 * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
445 */
446 public Builder setWorkers(final int workers) {
447 this.workers = workers;
448 return this;
449 }
450 }
451
452 /**
453 * Constructs a new builder of {@link ZstdCompressorOutputStream}.
454 *
455 * @return a new builder of {@link ZstdCompressorOutputStream}.
456 * @since 1.28.0
457 */
458 public static Builder builder() {
459 return new Builder();
460 }
461
462 @SuppressWarnings("resource") // Caller closes
463 private static ZstdOutputStream toZstdOutputStream(final Builder builder) throws IOException {
464 final OutputStream outputStream = builder.getOutputStream();
465 if (outputStream instanceof ZstdOutputStream) {
466 // Builder properties are not applied when a ZstdOutputStream is provided.
467 return (ZstdOutputStream) outputStream;
468 }
469 // @formatter:off
470 return new ZstdOutputStream(outputStream)
471 .setChainLog(builder.chainLog)
472 .setChecksum(builder.checksum)
473 .setCloseFrameOnFlush(builder.closeFrameOnFlush)
474 .setDict(builder.dict != null ? builder.dict : ArrayUtils.EMPTY_BYTE_ARRAY)
475 .setHashLog(builder.hashLog)
476 .setJobSize(builder.jobSize)
477 .setLevel(builder.level)
478 .setMinMatch(builder.minMatch)
479 .setOverlapLog(builder.overlapLog)
480 .setSearchLog(builder.searchLog)
481 .setStrategy(builder.strategy)
482 .setTargetLength(builder.targetLength)
483 .setWindowLog(builder.windowLog)
484 .setWorkers(builder.workers);
485 // @formatter:on
486 }
487
488 @SuppressWarnings("resource") // Caller closes
489 private ZstdCompressorOutputStream(final Builder builder) throws IOException {
490 super(toZstdOutputStream(builder));
491 }
492
493 /**
494 * Constructs a new instance using default Zstd parameter values.
495 *
496 * @param outStream the output stream.
497 * @throws IOException if an I/O error occurs.
498 */
499 public ZstdCompressorOutputStream(final OutputStream outStream) throws IOException {
500 this(builder().setOutputStream(outStream));
501 }
502
503 /**
504 * Constructs a new instance using default Zstd parameter values plus a compression level.
505 *
506 * @param outStream the output stream.
507 * @param level The compression level, from 0 to 9, where the default is {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}.
508 * @throws IOException if an I/O error occurs.
509 * @since 1.18
510 * @deprecated Use {@link #builder()}.
511 */
512 @Deprecated
513 public ZstdCompressorOutputStream(final OutputStream outStream, final int level) throws IOException {
514 this(builder().setOutputStream(outStream).setLevel(level));
515 }
516
517 /**
518 * Constructs a new instance using default Zstd parameter values plus a compression level and checksum setting.
519 *
520 * @param outStream the output stream.
521 * @param level The compression level, from 0 to 9, where the default is {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}.
522 * @param closeFrameOnFlush whether to close the frame on flush.
523 * @throws IOException if an I/O error occurs.
524 * @since 1.18
525 * @deprecated Use {@link #builder()}.
526 */
527 @Deprecated
528 public ZstdCompressorOutputStream(final OutputStream outStream, final int level, final boolean closeFrameOnFlush) throws IOException {
529 this(builder().setOutputStream(outStream).setLevel(level).setCloseFrameOnFlush(closeFrameOnFlush));
530 }
531
532 /**
533 * Constructs a new instance using default Zstd parameter values plus a compression level, closeFrameOnFlush and checksum settings.
534 *
535 * @param outStream the output stream.
536 * @param level The compression level, from 0 to 9, where the default is {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}.
537 * @param closeFrameOnFlush whether to close the frame on flush.
538 * @param checksum Whether a 32-bits checksum of content is written at end of frame.
539 * @throws IOException if an I/O error occurs.
540 * @since 1.18
541 * @deprecated Use {@link #builder()}.
542 */
543 @Deprecated
544 public ZstdCompressorOutputStream(final OutputStream outStream, final int level, final boolean closeFrameOnFlush, final boolean checksum)
545 throws IOException {
546 this(builder().setOutputStream(outStream).setLevel(level).setCloseFrameOnFlush(closeFrameOnFlush).setChecksum(checksum));
547 }
548
549 @Override
550 public void write(final byte[] buf, final int off, final int len) throws IOException {
551 out.write(buf, off, len);
552 }
553 }