001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.snappy; 020 021import java.io.IOException; 022import java.io.OutputStream; 023 024import org.apache.commons.compress.compressors.CompressorOutputStream; 025import org.apache.commons.compress.compressors.lz77support.LZ77Compressor; 026import org.apache.commons.compress.compressors.lz77support.Parameters; 027import org.apache.commons.compress.utils.ByteUtils; 028 029/** 030 * CompressorOutputStream for the raw Snappy format. 031 * 032 * <p> 033 * This implementation uses an internal buffer in order to handle the back-references that are at the heart of the LZ77 algorithm. The size of the buffer must 034 * be at least as big as the biggest offset used in the compressed stream. The current version of the Snappy algorithm as defined by Google works on 32k blocks 035 * and doesn't contain offsets bigger than 32k which is the default block size used by this class. 036 * </p> 037 * 038 * <p> 039 * The raw Snappy format requires the uncompressed size to be written at the beginning of the stream using a varint representation, i.e. the number of bytes 040 * needed to write the information is not known before the uncompressed size is known. We've chosen to make the uncompressedSize a parameter of the constructor 041 * in favor of buffering the whole output until the size is known. When using the {@link FramedSnappyCompressorOutputStream} this limitation is taken care of by 042 * the warpping framing format. 043 * </p> 044 * 045 * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a> 046 * @since 1.14 047 * @NotThreadSafe 048 */ 049public class SnappyCompressorOutputStream extends CompressorOutputStream { 050 // literal length is stored as (len - 1) either inside the tag 051 // (six bits minus four flags) or in 1 to 4 bytes after the tag 052 private static final int MAX_LITERAL_SIZE_WITHOUT_SIZE_BYTES = 60; 053 private static final int MAX_LITERAL_SIZE_WITH_ONE_SIZE_BYTE = 1 << 8; 054 private static final int MAX_LITERAL_SIZE_WITH_TWO_SIZE_BYTES = 1 << 16; 055 056 private static final int MAX_LITERAL_SIZE_WITH_THREE_SIZE_BYTES = 1 << 24; 057 058 private static final int ONE_SIZE_BYTE_MARKER = 60 << 2; 059 060 private static final int TWO_SIZE_BYTE_MARKER = 61 << 2; 061 062 private static final int THREE_SIZE_BYTE_MARKER = 62 << 2; 063 064 private static final int FOUR_SIZE_BYTE_MARKER = 63 << 2; 065 066 // Back-references ("copies") have their offset/size information 067 // in two, three or five bytes. 068 private static final int MIN_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE = 4; 069 070 private static final int MAX_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE = 11; 071 072 private static final int MAX_OFFSET_WITH_ONE_OFFSET_BYTE = 1 << 11 - 1; 073 074 private static final int MAX_OFFSET_WITH_TWO_OFFSET_BYTES = 1 << 16 - 1; 075 076 private static final int ONE_BYTE_COPY_TAG = 1; 077 078 private static final int TWO_BYTE_COPY_TAG = 2; 079 private static final int FOUR_BYTE_COPY_TAG = 3; 080 // technically the format could use shorter matches but with a 081 // length of three the offset would be encoded as at least two 082 // bytes in addition to the tag, so yield no compression at all 083 private static final int MIN_MATCH_LENGTH = 4; 084 // Snappy stores the match length in six bits of the tag 085 private static final int MAX_MATCH_LENGTH = 64; 086 087 /** 088 * Returns a builder correctly configured for the Snappy algorithm using the gven block size. 089 * 090 * @param blockSize the block size. 091 * @return a builder correctly configured for the Snappy algorithm using the gven block size 092 */ 093 public static Parameters.Builder createParameterBuilder(final int blockSize) { 094 // the max offset and max literal length defined by the format 095 // are 2^32 - 1 and 2^32 respectively - with blockSize being 096 // an integer we will never exceed that 097 return Parameters.builder(blockSize).withMinBackReferenceLength(MIN_MATCH_LENGTH).withMaxBackReferenceLength(MAX_MATCH_LENGTH).withMaxOffset(blockSize) 098 .withMaxLiteralLength(blockSize); 099 } 100 101 private final LZ77Compressor compressor; 102 private final OutputStream os; 103 private final ByteUtils.ByteConsumer consumer; 104 105 // used in one-arg write method 106 private final byte[] oneByte = new byte[1]; 107 108 private boolean finished; 109 110 /** 111 * Constructor using the default block size of 32k. 112 * 113 * @param os the outputstream to write compressed data to 114 * @param uncompressedSize the uncompressed size of data 115 * @throws IOException if writing of the size fails 116 */ 117 public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize) throws IOException { 118 this(os, uncompressedSize, SnappyCompressorInputStream.DEFAULT_BLOCK_SIZE); 119 } 120 121 /** 122 * Constructor using a configurable block size. 123 * 124 * @param os the outputstream to write compressed data to 125 * @param uncompressedSize the uncompressed size of data 126 * @param blockSize the block size used - must be a power of two 127 * @throws IOException if writing of the size fails 128 */ 129 public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize, final int blockSize) throws IOException { 130 this(os, uncompressedSize, createParameterBuilder(blockSize).build()); 131 } 132 133 /** 134 * Constructor providing full control over the underlying LZ77 compressor. 135 * 136 * @param os the outputstream to write compressed data to 137 * @param uncompressedSize the uncompressed size of data 138 * @param params the parameters to use by the compressor - note that the format itself imposes some limits like a maximum match length of 64 bytes 139 * @throws IOException if writing of the size fails 140 */ 141 public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize, final Parameters params) throws IOException { 142 this.os = os; 143 consumer = new ByteUtils.OutputStreamByteConsumer(os); 144 compressor = new LZ77Compressor(params, block -> { 145 switch (block.getType()) { 146 case LITERAL: 147 writeLiteralBlock((LZ77Compressor.LiteralBlock) block); 148 break; 149 case BACK_REFERENCE: 150 writeBackReference((LZ77Compressor.BackReference) block); 151 break; 152 case EOD: 153 break; 154 } 155 }); 156 writeUncompressedSize(uncompressedSize); 157 } 158 159 @Override 160 public void close() throws IOException { 161 try { 162 finish(); 163 } finally { 164 os.close(); 165 } 166 } 167 168 /** 169 * Compresses all remaining data and writes it to the stream, doesn't close the underlying stream. 170 * 171 * @throws IOException if an error occurs 172 */ 173 public void finish() throws IOException { 174 if (!finished) { 175 compressor.finish(); 176 finished = true; 177 } 178 } 179 180 @Override 181 public void write(final byte[] data, final int off, final int len) throws IOException { 182 compressor.compress(data, off, len); 183 } 184 185 @Override 186 public void write(final int b) throws IOException { 187 oneByte[0] = (byte) (b & 0xff); 188 write(oneByte); 189 } 190 191 private void writeBackReference(final LZ77Compressor.BackReference block) throws IOException { 192 final int len = block.getLength(); 193 final int offset = block.getOffset(); 194 if (len >= MIN_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE && len <= MAX_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE && offset <= MAX_OFFSET_WITH_ONE_OFFSET_BYTE) { 195 writeBackReferenceWithOneOffsetByte(len, offset); 196 } else if (offset < MAX_OFFSET_WITH_TWO_OFFSET_BYTES) { 197 writeBackReferenceWithTwoOffsetBytes(len, offset); 198 } else { 199 writeBackReferenceWithFourOffsetBytes(len, offset); 200 } 201 } 202 203 private void writeBackReferenceWithFourOffsetBytes(final int len, final int offset) throws IOException { 204 writeBackReferenceWithLittleEndianOffset(FOUR_BYTE_COPY_TAG, 4, len, offset); 205 } 206 207 private void writeBackReferenceWithLittleEndianOffset(final int tag, final int offsetBytes, final int len, final int offset) throws IOException { 208 os.write(tag | len - 1 << 2); 209 writeLittleEndian(offsetBytes, offset); 210 } 211 212 private void writeBackReferenceWithOneOffsetByte(final int len, final int offset) throws IOException { 213 os.write(ONE_BYTE_COPY_TAG | len - 4 << 2 | (offset & 0x700) >> 3); 214 os.write(offset & 0xff); 215 } 216 217 private void writeBackReferenceWithTwoOffsetBytes(final int len, final int offset) throws IOException { 218 writeBackReferenceWithLittleEndianOffset(TWO_BYTE_COPY_TAG, 2, len, offset); 219 } 220 221 private void writeLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException { 222 final int len = block.getLength(); 223 if (len <= MAX_LITERAL_SIZE_WITHOUT_SIZE_BYTES) { 224 writeLiteralBlockNoSizeBytes(block, len); 225 } else if (len <= MAX_LITERAL_SIZE_WITH_ONE_SIZE_BYTE) { 226 writeLiteralBlockOneSizeByte(block, len); 227 } else if (len <= MAX_LITERAL_SIZE_WITH_TWO_SIZE_BYTES) { 228 writeLiteralBlockTwoSizeBytes(block, len); 229 } else if (len <= MAX_LITERAL_SIZE_WITH_THREE_SIZE_BYTES) { 230 writeLiteralBlockThreeSizeBytes(block, len); 231 } else { 232 writeLiteralBlockFourSizeBytes(block, len); 233 } 234 } 235 236 private void writeLiteralBlockFourSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException { 237 writeLiteralBlockWithSize(FOUR_SIZE_BYTE_MARKER, 4, len, block); 238 } 239 240 private void writeLiteralBlockNoSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException { 241 writeLiteralBlockWithSize(len - 1 << 2, 0, len, block); 242 } 243 244 private void writeLiteralBlockOneSizeByte(final LZ77Compressor.LiteralBlock block, final int len) throws IOException { 245 writeLiteralBlockWithSize(ONE_SIZE_BYTE_MARKER, 1, len, block); 246 } 247 248 private void writeLiteralBlockThreeSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException { 249 writeLiteralBlockWithSize(THREE_SIZE_BYTE_MARKER, 3, len, block); 250 } 251 252 private void writeLiteralBlockTwoSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException { 253 writeLiteralBlockWithSize(TWO_SIZE_BYTE_MARKER, 2, len, block); 254 } 255 256 private void writeLiteralBlockWithSize(final int tagByte, final int sizeBytes, final int len, final LZ77Compressor.LiteralBlock block) throws IOException { 257 os.write(tagByte); 258 writeLittleEndian(sizeBytes, len - 1); 259 os.write(block.getData(), block.getOffset(), len); 260 } 261 262 private void writeLittleEndian(final int numBytes, final int num) throws IOException { 263 ByteUtils.toLittleEndian(consumer, num, numBytes); 264 } 265 266 private void writeUncompressedSize(long uncompressedSize) throws IOException { 267 boolean more; 268 do { 269 int currentByte = (int) (uncompressedSize & 0x7F); 270 more = uncompressedSize > currentByte; 271 if (more) { 272 currentByte |= 0x80; 273 } 274 os.write(currentByte); 275 uncompressedSize >>= 7; 276 } while (more); 277 } 278}