| Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
| GzipCompressorInputStream |
|
| 7.125;7,125 |
| 1 | /* | |
| 2 | * Licensed to the Apache Software Foundation (ASF) under one | |
| 3 | * or more contributor license agreements. See the NOTICE file | |
| 4 | * distributed with this work for additional information | |
| 5 | * regarding copyright ownership. The ASF licenses this file | |
| 6 | * to you under the Apache License, Version 2.0 (the | |
| 7 | * "License"); you may not use this file except in compliance | |
| 8 | * with the License. You may obtain a copy of the License at | |
| 9 | * | |
| 10 | * http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 | * | |
| 12 | * Unless required by applicable law or agreed to in writing, | |
| 13 | * software distributed under the License is distributed on an | |
| 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
| 15 | * KIND, either express or implied. See the License for the | |
| 16 | * specific language governing permissions and limitations | |
| 17 | * under the License. | |
| 18 | */ | |
| 19 | package org.apache.commons.compress.compressors.gzip; | |
| 20 | ||
| 21 | import java.io.IOException; | |
| 22 | import java.io.EOFException; | |
| 23 | import java.io.InputStream; | |
| 24 | import java.io.DataInputStream; | |
| 25 | import java.io.BufferedInputStream; | |
| 26 | import java.util.zip.DataFormatException; | |
| 27 | import java.util.zip.Inflater; | |
| 28 | import java.util.zip.CRC32; | |
| 29 | ||
| 30 | import org.apache.commons.compress.compressors.CompressorInputStream; | |
| 31 | ||
| 32 | /** | |
| 33 | * Input stream that decompresses .gz files. | |
| 34 | * This supports decompressing concatenated .gz files which is important | |
| 35 | * when decompressing standalone .gz files. | |
| 36 | * <p> | |
| 37 | * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz | |
| 38 | * files: it stops after the first member and silently ignores the rest. | |
| 39 | * It doesn't leave the read position to point to the beginning of the next | |
| 40 | * member, which makes it difficult workaround the lack of concatenation | |
| 41 | * support. | |
| 42 | * <p> | |
| 43 | * Instead of using <code>GZIPInputStream</code>, this class has its own .gz | |
| 44 | * container format decoder. The actual decompression is done with | |
| 45 | * {@link java.util.zip.Inflater}. | |
| 46 | */ | |
| 47 | 2 | public class GzipCompressorInputStream extends CompressorInputStream { |
| 48 | // Header flags | |
| 49 | // private static final int FTEXT = 0x01; // Uninteresting for us | |
| 50 | private static final int FHCRC = 0x02; | |
| 51 | private static final int FEXTRA = 0x04; | |
| 52 | private static final int FNAME = 0x08; | |
| 53 | private static final int FCOMMENT = 0x10; | |
| 54 | private static final int FRESERVED = 0xE0; | |
| 55 | ||
| 56 | // Compressed input stream, possibly wrapped in a BufferedInputStream | |
| 57 | private final InputStream in; | |
| 58 | ||
| 59 | // True if decompressing multimember streams. | |
| 60 | private final boolean decompressConcatenated; | |
| 61 | ||
| 62 | // Buffer to hold the input data | |
| 63 | 12 | private final byte[] buf = new byte[8192]; |
| 64 | ||
| 65 | // Amount of data in buf. | |
| 66 | 12 | private int bufUsed = 0; |
| 67 | ||
| 68 | // Decompressor | |
| 69 | 12 | private Inflater inf = new Inflater(true); |
| 70 | ||
| 71 | // CRC32 from uncompressed data | |
| 72 | 12 | private final CRC32 crc = new CRC32(); |
| 73 | ||
| 74 | private int memberSize; | |
| 75 | ||
| 76 | // True once everything has been decompressed | |
| 77 | 12 | private boolean endReached = false; |
| 78 | ||
| 79 | // used in no-arg read method | |
| 80 | 12 | private final byte[] oneByte = new byte[1]; |
| 81 | ||
| 82 | /** | |
| 83 | * Constructs a new input stream that decompresses gzip-compressed data | |
| 84 | * from the specified input stream. | |
| 85 | * <p> | |
| 86 | * This is equivalent to | |
| 87 | * <code>GzipCompressorInputStream(inputStream, false)</code> and thus | |
| 88 | * will not decompress concatenated .gz files. | |
| 89 | * | |
| 90 | * @param inputStream the InputStream from which this object should | |
| 91 | * be created of | |
| 92 | * | |
| 93 | * @throws IOException if the stream could not be created | |
| 94 | */ | |
| 95 | public GzipCompressorInputStream(InputStream inputStream) | |
| 96 | throws IOException { | |
| 97 | 8 | this(inputStream, false); |
| 98 | 8 | } |
| 99 | ||
| 100 | /** | |
| 101 | * Constructs a new input stream that decompresses gzip-compressed data | |
| 102 | * from the specified input stream. | |
| 103 | * <p> | |
| 104 | * If <code>decompressConcatenated</code> is {@code false}: | |
| 105 | * This decompressor might read more input than it will actually use. | |
| 106 | * If <code>inputStream</code> supports <code>mark</code> and | |
| 107 | * <code>reset</code>, then the input position will be adjusted | |
| 108 | * so that it is right after the last byte of the compressed stream. | |
| 109 | * If <code>mark</code> isn't supported, the input position will be | |
| 110 | * undefined. | |
| 111 | * | |
| 112 | * @param inputStream the InputStream from which this object should | |
| 113 | * be created of | |
| 114 | * @param decompressConcatenated | |
| 115 | * if true, decompress until the end of the input; | |
| 116 | * if false, stop after the first .gz member | |
| 117 | * | |
| 118 | * @throws IOException if the stream could not be created | |
| 119 | */ | |
| 120 | public GzipCompressorInputStream(InputStream inputStream, | |
| 121 | boolean decompressConcatenated) | |
| 122 | 12 | throws IOException { |
| 123 | // Mark support is strictly needed for concatenated files only, | |
| 124 | // but it's simpler if it is always available. | |
| 125 | 12 | if (inputStream.markSupported()) { |
| 126 | 4 | in = inputStream; |
| 127 | } else { | |
| 128 | 8 | in = new BufferedInputStream(inputStream); |
| 129 | } | |
| 130 | ||
| 131 | 12 | this.decompressConcatenated = decompressConcatenated; |
| 132 | 12 | init(true); |
| 133 | 12 | } |
| 134 | ||
| 135 | private boolean init(boolean isFirstMember) throws IOException { | |
| 136 | 16 | assert isFirstMember || decompressConcatenated; |
| 137 | ||
| 138 | // Check the magic bytes without a possibility of EOFException. | |
| 139 | 16 | int magic0 = in.read(); |
| 140 | 16 | int magic1 = in.read(); |
| 141 | ||
| 142 | // If end of input was reached after decompressing at least | |
| 143 | // one .gz member, we have reached the end of the file successfully. | |
| 144 | 16 | if (magic0 == -1 && !isFirstMember) { |
| 145 | 2 | return false; |
| 146 | } | |
| 147 | ||
| 148 | 14 | if (magic0 != 31 || magic1 != 139) { |
| 149 | 0 | throw new IOException(isFirstMember |
| 150 | ? "Input is not in the .gz format" | |
| 151 | : "Garbage after a valid .gz stream"); | |
| 152 | } | |
| 153 | ||
| 154 | // Parsing the rest of the header may throw EOFException. | |
| 155 | 14 | DataInputStream inData = new DataInputStream(in); |
| 156 | 14 | int method = inData.readUnsignedByte(); |
| 157 | 14 | if (method != 8) { |
| 158 | 0 | throw new IOException("Unsupported compression method " |
| 159 | + method + " in the .gz header"); | |
| 160 | } | |
| 161 | ||
| 162 | 14 | int flg = inData.readUnsignedByte(); |
| 163 | 14 | if ((flg & FRESERVED) != 0) { |
| 164 | 0 | throw new IOException( |
| 165 | "Reserved flags are set in the .gz header"); | |
| 166 | } | |
| 167 | ||
| 168 | 14 | inData.readInt(); // mtime, ignored |
| 169 | 14 | inData.readUnsignedByte(); // extra flags, ignored |
| 170 | 14 | inData.readUnsignedByte(); // operating system, ignored |
| 171 | ||
| 172 | // Extra field, ignored | |
| 173 | 14 | if ((flg & FEXTRA) != 0) { |
| 174 | 0 | int xlen = inData.readUnsignedByte(); |
| 175 | 0 | xlen |= inData.readUnsignedByte() << 8; |
| 176 | ||
| 177 | // This isn't as efficient as calling in.skip would be, | |
| 178 | // but it's lazier to handle unexpected end of input this way. | |
| 179 | // Most files don't have an extra field anyway. | |
| 180 | 0 | while (xlen-- > 0) { |
| 181 | 0 | inData.readUnsignedByte(); |
| 182 | } | |
| 183 | } | |
| 184 | ||
| 185 | // Original file name, ignored | |
| 186 | 14 | if ((flg & FNAME) != 0) { |
| 187 | 6 | readToNull(inData); |
| 188 | } | |
| 189 | ||
| 190 | // Comment, ignored | |
| 191 | 14 | if ((flg & FCOMMENT) != 0) { |
| 192 | 0 | readToNull(inData); |
| 193 | } | |
| 194 | ||
| 195 | // Header "CRC16" which is actually a truncated CRC32 (which isn't | |
| 196 | // as good as real CRC16). I don't know if any encoder implementation | |
| 197 | // sets this, so it's not worth trying to verify it. GNU gzip 1.4 | |
| 198 | // doesn't support this field, but zlib seems to be able to at least | |
| 199 | // skip over it. | |
| 200 | 14 | if ((flg & FHCRC) != 0) { |
| 201 | 0 | inData.readShort(); |
| 202 | } | |
| 203 | ||
| 204 | // Reset | |
| 205 | 14 | inf.reset(); |
| 206 | 14 | crc.reset(); |
| 207 | 14 | memberSize = 0; |
| 208 | ||
| 209 | 14 | return true; |
| 210 | } | |
| 211 | ||
| 212 | private void readToNull(DataInputStream inData) throws IOException { | |
| 213 | 12 | while (inData.readUnsignedByte() != 0x00) {} |
| 214 | 6 | } |
| 215 | ||
| 216 | /** {@inheritDoc} */ | |
| 217 | @Override | |
| 218 | public int read() throws IOException { | |
| 219 | 10 | return read(oneByte, 0, 1) == -1 ? -1 : (oneByte[0] & 0xFF); |
| 220 | } | |
| 221 | ||
| 222 | /** | |
| 223 | * {@inheritDoc} | |
| 224 | * | |
| 225 | * @since 1.1 | |
| 226 | */ | |
| 227 | @Override | |
| 228 | public int read(byte[] b, int off, int len) throws IOException { | |
| 229 | 22 | if (endReached) { |
| 230 | 6 | return -1; |
| 231 | } | |
| 232 | ||
| 233 | 16 | int size = 0; |
| 234 | ||
| 235 | 22 | while (len > 0) { |
| 236 | 16 | if (inf.needsInput()) { |
| 237 | // Remember the current position because we may need to | |
| 238 | // rewind after reading too much input. | |
| 239 | 12 | in.mark(buf.length); |
| 240 | ||
| 241 | 12 | bufUsed = in.read(buf); |
| 242 | 12 | if (bufUsed == -1) { |
| 243 | 0 | throw new EOFException(); |
| 244 | } | |
| 245 | ||
| 246 | 12 | inf.setInput(buf, 0, bufUsed); |
| 247 | } | |
| 248 | ||
| 249 | int ret; | |
| 250 | try { | |
| 251 | 16 | ret = inf.inflate(b, off, len); |
| 252 | 0 | } catch (DataFormatException e) { |
| 253 | 0 | throw new IOException("Gzip-compressed data is corrupt"); |
| 254 | 16 | } |
| 255 | ||
| 256 | 16 | crc.update(b, off, ret); |
| 257 | 16 | memberSize += ret; |
| 258 | 16 | off += ret; |
| 259 | 16 | len -= ret; |
| 260 | 16 | size += ret; |
| 261 | 16 | count(ret); |
| 262 | ||
| 263 | 16 | if (inf.finished()) { |
| 264 | // We may have read too many bytes. Rewind the read | |
| 265 | // position to match the actual amount used. | |
| 266 | // | |
| 267 | // NOTE: The "if" is there just in case. Since we used | |
| 268 | // in.mark earler, it should always skip enough. | |
| 269 | 12 | in.reset(); |
| 270 | ||
| 271 | 12 | int skipAmount = bufUsed - inf.getRemaining(); |
| 272 | 12 | if (in.skip(skipAmount) != skipAmount) { |
| 273 | 0 | throw new IOException(); |
| 274 | } | |
| 275 | ||
| 276 | 12 | bufUsed = 0; |
| 277 | ||
| 278 | 12 | DataInputStream inData = new DataInputStream(in); |
| 279 | ||
| 280 | // CRC32 | |
| 281 | 12 | long crcStored = 0; |
| 282 | 60 | for (int i = 0; i < 4; ++i) { |
| 283 | 48 | crcStored |= (long)inData.readUnsignedByte() << (i * 8); |
| 284 | } | |
| 285 | ||
| 286 | 12 | if (crcStored != crc.getValue()) { |
| 287 | 0 | throw new IOException("Gzip-compressed data is corrupt " |
| 288 | + "(CRC32 error)"); | |
| 289 | } | |
| 290 | ||
| 291 | // Uncompressed size modulo 2^32 (ISIZE in the spec) | |
| 292 | 12 | int isize = 0; |
| 293 | 58 | for (int i = 0; i < 4; ++i) { |
| 294 | 48 | isize |= inData.readUnsignedByte() << (i * 8); |
| 295 | } | |
| 296 | ||
| 297 | 10 | if (isize != memberSize) { |
| 298 | 0 | throw new IOException("Gzip-compressed data is corrupt" |
| 299 | + "(uncompressed size mismatch)"); | |
| 300 | } | |
| 301 | ||
| 302 | // See if this is the end of the file. | |
| 303 | 10 | if (!decompressConcatenated || !init(false)) { |
| 304 | 8 | inf.end(); |
| 305 | 8 | inf = null; |
| 306 | 8 | endReached = true; |
| 307 | 8 | return size == 0 ? -1 : size; |
| 308 | } | |
| 309 | } | |
| 310 | 6 | } |
| 311 | ||
| 312 | 6 | return size; |
| 313 | } | |
| 314 | ||
| 315 | /** | |
| 316 | * Checks if the signature matches what is expected for a .gz file. | |
| 317 | * | |
| 318 | * @param signature the bytes to check | |
| 319 | * @param length the number of bytes to check | |
| 320 | * @return true if this is a .gz stream, false otherwise | |
| 321 | * | |
| 322 | * @since 1.1 | |
| 323 | */ | |
| 324 | public static boolean matches(byte[] signature, int length) { | |
| 325 | ||
| 326 | 2 | if (length < 2) { |
| 327 | 0 | return false; |
| 328 | } | |
| 329 | ||
| 330 | 2 | if (signature[0] != 31) { |
| 331 | 0 | return false; |
| 332 | } | |
| 333 | ||
| 334 | 2 | if (signature[1] != -117) { |
| 335 | 0 | return false; |
| 336 | } | |
| 337 | ||
| 338 | 2 | return true; |
| 339 | } | |
| 340 | ||
| 341 | /** | |
| 342 | * Closes the input stream (unless it is System.in). | |
| 343 | * | |
| 344 | * @since 1.2 | |
| 345 | */ | |
| 346 | @Override | |
| 347 | public void close() throws IOException { | |
| 348 | 10 | if (inf != null) { |
| 349 | 2 | inf.end(); |
| 350 | 2 | inf = null; |
| 351 | } | |
| 352 | ||
| 353 | 10 | if (this.in != System.in) { |
| 354 | 10 | this.in.close(); |
| 355 | } | |
| 356 | 10 | } |
| 357 | } |