001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * https://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.snappy; 020 021import java.io.IOException; 022import java.io.InputStream; 023 024import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream; 025import org.apache.commons.compress.utils.ByteUtils; 026 027/** 028 * CompressorInputStream for the raw Snappy format. 029 * 030 * <p> 031 * This implementation uses an internal buffer in order to handle the back-references that are at the heart of the LZ77 algorithm. The size of the buffer must 032 * be at least as big as the biggest offset used in the compressed stream. The current version of the Snappy algorithm as defined by Google works on 32k blocks 033 * and doesn't contain offsets bigger than 32k which is the default block size used by this class. 034 * </p> 035 * 036 * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a> 037 * @since 1.7 038 */ 039public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream { 040 041 private enum State { 042 NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE 043 } 044 045 /** Mask used to determine the type of "tag" is being processed */ 046 private static final int TAG_MASK = 0x03; 047 048 /** Default block size */ 049 public static final int DEFAULT_BLOCK_SIZE = 32768; 050 051 /** The size of the uncompressed data */ 052 private final int size; 053 054 /** Number of uncompressed bytes still to be read. */ 055 private int uncompressedBytesRemaining; 056 057 /** Current state of the stream */ 058 private State state = State.NO_BLOCK; 059 060 private boolean endReached; 061 062 /** 063 * Constructor using the default buffer size of 32k. 064 * 065 * @param is An InputStream to read compressed data from 066 * @throws IOException if reading fails 067 */ 068 public SnappyCompressorInputStream(final InputStream is) throws IOException { 069 this(is, DEFAULT_BLOCK_SIZE); 070 } 071 072 /** 073 * Constructor using a configurable buffer size. 074 * 075 * @param is An InputStream to read compressed data from 076 * @param blockSize The block size used in compression 077 * @throws IOException if reading fails 078 * @throws IllegalArgumentException if blockSize is not bigger than 0 079 */ 080 public SnappyCompressorInputStream(final InputStream is, final int blockSize) throws IOException { 081 super(is, blockSize); 082 uncompressedBytesRemaining = size = (int) readSize(); 083 } 084 085 /** 086 * Try to fill the buffer with the next block of data. 087 */ 088 private void fill() throws IOException { 089 if (uncompressedBytesRemaining == 0) { 090 endReached = true; 091 return; 092 } 093 094 int b = readOneByte(); 095 if (b == -1) { 096 throw new IOException("Premature end of stream reading block start"); 097 } 098 int length = 0; 099 int offset = 0; 100 101 switch (b & TAG_MASK) { 102 103 case 0x00: 104 105 length = readLiteralLength(b); 106 if (length < 0) { 107 throw new IOException("Illegal block with a negative literal size found"); 108 } 109 uncompressedBytesRemaining -= length; 110 startLiteral(length); 111 state = State.IN_LITERAL; 112 break; 113 114 case 0x01: 115 116 /* 117 * These elements can encode lengths between [4..11] bytes and offsets between [0..2047] bytes. (len-4) occupies three bits and is stored in bits 118 * [2..4] of the tag byte. The offset occupies 11 bits, of which the upper three are stored in the upper three bits ([5..7]) of the tag byte, and 119 * the lower eight are stored in a byte following the tag byte. 120 */ 121 122 length = 4 + (b >> 2 & 0x07); 123 uncompressedBytesRemaining -= length; 124 offset = (b & 0xE0) << 3; 125 b = readOneByte(); 126 if (b == -1) { 127 throw new IOException("Premature end of stream reading back-reference length"); 128 } 129 offset |= b; 130 131 try { 132 startBackReference(offset, length); 133 } catch (final IllegalArgumentException ex) { 134 throw new IOException("Illegal block with bad offset found", ex); 135 } 136 state = State.IN_BACK_REFERENCE; 137 break; 138 139 case 0x02: 140 141 /* 142 * These elements can encode lengths between [1..64] and offsets from [0..65535]. (len-1) occupies six bits and is stored in the upper six bits 143 * ([2..7]) of the tag byte. The offset is stored as a little-endian 16-bit integer in the two bytes following the tag byte. 144 */ 145 146 length = (b >> 2) + 1; 147 if (length < 0) { 148 throw new IOException("Illegal block with a negative match length found"); 149 } 150 uncompressedBytesRemaining -= length; 151 152 offset = (int) ByteUtils.fromLittleEndian(supplier, 2); 153 154 try { 155 startBackReference(offset, length); 156 } catch (final IllegalArgumentException ex) { 157 throw new IOException("Illegal block with bad offset found", ex); 158 } 159 state = State.IN_BACK_REFERENCE; 160 break; 161 162 case 0x03: 163 164 /* 165 * These are like the copies with 2-byte offsets (see previous subsection), except that the offset is stored as a 32-bit integer instead of a 16-bit 166 * integer (and thus will occupy four bytes). 167 */ 168 169 length = (b >> 2) + 1; 170 if (length < 0) { 171 throw new IOException("Illegal block with a negative match length found"); 172 } 173 uncompressedBytesRemaining -= length; 174 175 offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff; 176 177 try { 178 startBackReference(offset, length); 179 } catch (final IllegalArgumentException ex) { 180 throw new IOException("Illegal block with bad offset found", ex); 181 } 182 state = State.IN_BACK_REFERENCE; 183 break; 184 default: 185 // impossible as TAG_MASK is two bits and all four possible cases have been covered 186 break; 187 } 188 } 189 190 /** 191 * Gets the uncompressed size of the stream 192 * 193 * @return the uncompressed size 194 */ 195 @Override 196 public int getSize() { 197 return size; 198 } 199 200 /** 201 * {@inheritDoc} 202 */ 203 @Override 204 public int read(final byte[] b, final int off, final int len) throws IOException { 205 if (len == 0) { 206 return 0; 207 } 208 if (endReached) { 209 return -1; 210 } 211 switch (state) { 212 case NO_BLOCK: 213 fill(); 214 return read(b, off, len); 215 case IN_LITERAL: 216 final int litLen = readLiteral(b, off, len); 217 if (!hasMoreDataInBlock()) { 218 state = State.NO_BLOCK; 219 } 220 return litLen > 0 ? litLen : read(b, off, len); 221 case IN_BACK_REFERENCE: 222 final int backReferenceLen = readBackReference(b, off, len); 223 if (!hasMoreDataInBlock()) { 224 state = State.NO_BLOCK; 225 } 226 return backReferenceLen > 0 ? backReferenceLen : read(b, off, len); 227 default: 228 throw new IOException("Unknown stream state " + state); 229 } 230 } 231 232 /* 233 * For literals up to and including 60 bytes in length, the upper six bits of the tag byte contain (len-1). The literal follows immediately thereafter in 234 * the bytestream. - For longer literals, the (len-1) value is stored after the tag byte, little-endian. The upper six bits of the tag byte describe how 235 * many bytes are used for the length; 60, 61, 62 or 63 for 1-4 bytes, respectively. The literal itself follows after the length. 236 */ 237 private int readLiteralLength(final int b) throws IOException { 238 final int length; 239 switch (b >> 2) { 240 case 60: 241 length = readOneByte(); 242 if (length == -1) { 243 throw new IOException("Premature end of stream reading literal length"); 244 } 245 break; 246 case 61: 247 length = (int) ByteUtils.fromLittleEndian(supplier, 2); 248 break; 249 case 62: 250 length = (int) ByteUtils.fromLittleEndian(supplier, 3); 251 break; 252 case 63: 253 length = (int) ByteUtils.fromLittleEndian(supplier, 4); 254 break; 255 default: 256 length = b >> 2; 257 break; 258 } 259 260 return length + 1; 261 } 262 263 /** 264 * The stream starts with the uncompressed length (up to a maximum of 2^32 - 1), stored as a little-endian varint. Varints consist of a series of bytes, 265 * where the lower 7 bits are data and the upper bit is set iff there are more bytes to be read. In other words, an uncompressed length of 64 would be 266 * stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE) would be stored as 0xFE 0xFF 0x7F. 267 * 268 * @return The size of the uncompressed data 269 * @throws IOException Could not read a byte 270 */ 271 private long readSize() throws IOException { 272 int index = 0; 273 long sz = 0; 274 int b = 0; 275 276 do { 277 b = readOneByte(); 278 if (b == -1) { 279 throw new IOException("Premature end of stream reading size"); 280 } 281 sz |= (b & 0x7f) << index++ * 7; 282 } while (0 != (b & 0x80)); 283 return sz; 284 } 285}