/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

/*
 * This package is based on the work done by Timothy Gerard Endres
 * (time@ice.com) to whom the Ant project is very grateful for his great code.
 */

package org.apache.commons.compress.archivers.tar;

import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipEncoding;
import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.ArchiveUtils;
import org.apache.commons.compress.utils.BoundedInputStream;
import org.apache.commons.compress.utils.IOUtils;

/**
 * The TarArchiveInputStream reads a UNIX tar archive as an InputStream. Methods are provided to position the stream at each successive entry in the archive,
 * and then read each entry as a normal input stream using read().
 *
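 * <p>
 * A minimal usage sketch; the archive file name here is hypothetical:
 * </p>
 * <pre>{@code
 * try (TarArchiveInputStream tarIn = new TarArchiveInputStream(new FileInputStream("archive.tar"))) {
 *     TarArchiveEntry entry;
 *     while ((entry = tarIn.getNextEntry()) != null) {
 *         final byte[] buffer = new byte[4096];
 *         int n;
 *         // read() is bounded by the current entry and returns -1 at the entry's end
 *         while ((n = tarIn.read(buffer)) != -1) {
 *             // process n bytes of the current entry
 *         }
 *     }
 * }
 * }</pre>
 *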
 * @NotThreadSafe
 */
public class TarArchiveInputStream extends ArchiveInputStream<TarArchiveEntry> {

    private static final int SMALL_BUFFER_SIZE = 256;

    /**
     * Checks if the signature matches what is expected for a tar file.
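     * <p>
     * A minimal usage sketch; {@code in} is a hypothetical stream positioned at the start of the candidate archive:
     * </p>
     * <pre>{@code
     * byte[] signature = new byte[512];
     * int read = in.read(signature); // a single read is assumed to suffice for this sketch
     * boolean isTar = TarArchiveInputStream.matches(signature, read);
     * }</pre>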
     *
     * @param signature the bytes to check
     * @param length    the number of bytes to check
     * @return true, if this stream is a tar archive stream, false otherwise
     */
    public static boolean matches(final byte[] signature, final int length) {
        final int versionOffset = TarConstants.VERSION_OFFSET;
        final int versionLen = TarConstants.VERSIONLEN;
        if (length < versionOffset + versionLen) {
            return false;
        }

        final int magicOffset = TarConstants.MAGIC_OFFSET;
        final int magicLen = TarConstants.MAGICLEN;
        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX, signature, magicOffset, magicLen)
                && ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX, signature, versionOffset, versionLen)) {
            return true;
        }
        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU, signature, magicOffset, magicLen)
                && (ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE, signature, versionOffset, versionLen)
                        || ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO, signature, versionOffset, versionLen))) {
            return true;
        }
        // COMPRESS-107 - recognize Ant tar files
        return ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT, signature, magicOffset, magicLen)
                && ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT, signature, versionOffset, versionLen);
    }

    private final byte[] smallBuf = new byte[SMALL_BUFFER_SIZE];

    /** The buffer to store the TAR header. */
    private final byte[] recordBuffer;

    /** The size of a block. */
    private final int blockSize;

    /** True if stream is at EOF. */
    private boolean atEof;

    /** Size of the current entry. */
    private long entrySize;

    /** How far into the entry the stream is. */
    private long entryOffset;

    /** Input streams for reading sparse entries. */
    private List<InputStream> sparseInputStreams;

    /** The index of current input stream being read when reading sparse entries. */
    private int currentSparseInputStreamIndex;

    /** The meta-data about the current entry. */
    private TarArchiveEntry currEntry;

    /** The encoding of the file. */
    private final ZipEncoding zipEncoding;

    /** The global PAX header. */
    private Map<String, String> globalPaxHeaders = new HashMap<>();

    /** The global sparse headers; only used in PAX Format 0.X. */
    private final List<TarArchiveStructSparse> globalSparseHeaders = new ArrayList<>();

    private final boolean lenient;

    /**
     * Constructs a new instance.
     *
     * @param inputStream the input stream to use
     */
    public TarArchiveInputStream(final InputStream inputStream) {
        this(inputStream, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE);
    }

    /**
     * Constructs a new instance.
     *
     * @param inputStream the input stream to use
     * @param lenient     when set to true illegal values for group/userid, mode, device numbers and timestamp will be ignored and the fields set to
     *                    {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an exception instead.
     * @since 1.19
     */
    public TarArchiveInputStream(final InputStream inputStream, final boolean lenient) {
        this(inputStream, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, null, lenient);
    }

    /**
     * Constructs a new instance.
     *
     * @param inputStream the input stream to use
     * @param blockSize   the block size to use
     */
    public TarArchiveInputStream(final InputStream inputStream, final int blockSize) {
        this(inputStream, blockSize, TarConstants.DEFAULT_RCDSIZE);
    }

    /**
     * Constructs a new instance.
     *
     * @param inputStream the input stream to use
     * @param blockSize   the block size to use
     * @param recordSize  the record size to use
     */
    public TarArchiveInputStream(final InputStream inputStream, final int blockSize, final int recordSize) {
        this(inputStream, blockSize, recordSize, null);
    }

    /**
     * Constructs a new instance.
     *
     * @param inputStream the input stream to use
     * @param blockSize   the block size to use
     * @param recordSize  the record size to use
     * @param encoding    name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(final InputStream inputStream, final int blockSize, final int recordSize, final String encoding) {
        this(inputStream, blockSize, recordSize, encoding, false);
    }

    /**
     * Constructs a new instance.
     *
     * @param inputStream the input stream to use
     * @param blockSize   the block size to use
     * @param recordSize  the record size to use
     * @param encoding    name of the encoding to use for file names
     * @param lenient     when set to true illegal values for group/userid, mode, device numbers and timestamp will be ignored and the fields set to
     *                    {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an exception instead.
     * @since 1.19
     */
    public TarArchiveInputStream(final InputStream inputStream, final int blockSize, final int recordSize, final String encoding, final boolean lenient) {
        super(inputStream, encoding);
        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);
        this.recordBuffer = new byte[recordSize];
        this.blockSize = blockSize;
        this.lenient = lenient;
    }

    /**
     * Constructs a new instance.
     *
     * @param inputStream the input stream to use
     * @param blockSize   the block size to use
     * @param encoding    name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(final InputStream inputStream, final int blockSize, final String encoding) {
        this(inputStream, blockSize, TarConstants.DEFAULT_RCDSIZE, encoding);
    }

    /**
     * Constructs a new instance.
     *
     * @param inputStream the input stream to use
     * @param encoding    name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(final InputStream inputStream, final String encoding) {
        this(inputStream, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, encoding);
    }

    private void applyPaxHeadersToCurrentEntry(final Map<String, String> headers, final List<TarArchiveStructSparse> sparseHeaders) throws IOException {
        currEntry.updateEntryFromPaxHeaders(headers);
        currEntry.setSparseHeaders(sparseHeaders);
    }

    /**
     * Gets the available data that can be read from the current entry in the archive. This does not indicate how much data is left in the entire archive, only
     * in the current entry. This value is determined from the entry's size header field and the amount of data already read from the current entry.
     * Integer.MAX_VALUE is returned in case more than Integer.MAX_VALUE bytes are left in the current entry in the archive.
     *
     * @return The number of available bytes for the current entry.
     * @throws IOException if an I/O error occurs
     */
    @Override
    public int available() throws IOException {
        if (isDirectory()) {
            return 0;
        }
        final long available = currEntry.getRealSize() - entryOffset;
        if (available > Integer.MAX_VALUE) {
            return Integer.MAX_VALUE;
        }
        return (int) available;
    }

    /**
     * Builds the input streams consisting of all-zero input streams and non-zero input streams. When reading from the non-zero input streams, the data is
     * actually read from the original input stream. The size of each input stream is determined by the sparse headers.
     * <p>
     * NOTE: Some all-zero and non-zero input streams may have a size of 0. Input streams of size 0 are not stored because they are meaningless.
     * </p>
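     * <p>
     * For example (illustrative values), sparse headers {@code [offset=100, numbytes=50]} and {@code [offset=400, numbytes=20]} produce the stream sequence:
     * 100 zero bytes, 50 data bytes, 250 zero bytes, 20 data bytes, mirroring the loop below.
     * </p>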
     */
    private void buildSparseInputStreams() throws IOException {
        currentSparseInputStreamIndex = -1;
        sparseInputStreams = new ArrayList<>();

        final List<TarArchiveStructSparse> sparseHeaders = currEntry.getOrderedSparseHeaders();

        // Stream doesn't need to be closed at all as it doesn't use any resources
        final InputStream zeroInputStream = new TarArchiveSparseZeroInputStream(); // NOSONAR
        // logical offset into the extracted entry
        long offset = 0;
        for (final TarArchiveStructSparse sparseHeader : sparseHeaders) {
            final long zeroBlockSize = sparseHeader.getOffset() - offset;
            if (zeroBlockSize < 0) {
                // sparse header says to move backwards inside the extracted entry
                throw new IOException("Corrupted struct sparse detected");
            }

            // only store the zero block if it is not empty
            if (zeroBlockSize > 0) {
                sparseInputStreams.add(new BoundedInputStream(zeroInputStream, sparseHeader.getOffset() - offset));
            }

            // only store the input streams with non-zero size
            if (sparseHeader.getNumbytes() > 0) {
                sparseInputStreams.add(new BoundedInputStream(in, sparseHeader.getNumbytes()));
            }

            offset = sparseHeader.getOffset() + sparseHeader.getNumbytes();
        }

        if (!sparseInputStreams.isEmpty()) {
            currentSparseInputStreamIndex = 0;
        }
    }

    /**
     * Whether this class is able to read the given entry.
     *
     * @return The implementation will return true if the {@link ArchiveEntry} is an instance of {@link TarArchiveEntry}
     */
    @Override
    public boolean canReadEntryData(final ArchiveEntry archiveEntry) {
        return archiveEntry instanceof TarArchiveEntry;
    }

    /**
     * Closes this stream and the underlying input stream, along with any input streams used for reading sparse entries.
     *
     * @throws IOException on error
     */
    @Override
    public void close() throws IOException {
        // Close all the input streams in sparseInputStreams
        if (sparseInputStreams != null) {
            for (final InputStream inputStream : sparseInputStreams) {
                inputStream.close();
            }
        }
        in.close();
    }

    /**
     * This method is invoked once the end of the archive is hit; it tries to consume the remaining bytes under the assumption that the tool creating this
     * archive has padded the last block.
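     * <p>
     * For example, with the default block size of 10240 bytes, if 10752 bytes have been read when the end of the archive is hit, the remaining 9728 bytes of
     * the last block are skipped.
     * </p>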
     */
    private void consumeRemainderOfLastBlock() throws IOException {
        final long bytesReadOfLastBlock = getBytesRead() % blockSize;
        if (bytesReadOfLastBlock > 0) {
            count(IOUtils.skip(in, blockSize - bytesReadOfLastBlock));
        }
    }

    /**
     * For a FileInputStream, skip() always returns the requested number of bytes, so we need the available byte count to determine how many bytes were
     * actually skipped.
     *
     * @param available available bytes returned by inputStream.available()
     * @param skipped   skipped bytes returned by inputStream.skip()
     * @param expected  bytes expected to skip
     * @return number of bytes actually skipped
     * @throws IOException if a truncated tar archive is detected
     */
    private long getActuallySkipped(final long available, final long skipped, final long expected) throws IOException {
        long actuallySkipped = skipped;
        if (in instanceof FileInputStream) {
            actuallySkipped = Math.min(skipped, available);
        }
        if (actuallySkipped != expected) {
            throw new IOException("Truncated TAR archive");
        }
        return actuallySkipped;
    }

    /**
     * Gets the current TAR Archive Entry that this input stream is processing.
     *
     * @return The current Archive Entry
     */
    public TarArchiveEntry getCurrentEntry() {
        return currEntry;
    }

    /**
     * Gets the next entry in this tar archive as long name data.
     *
     * @return The next entry in the archive as long name data, or null.
     * @throws IOException on error
     */
    protected byte[] getLongNameData() throws IOException {
        // read in the name
        final ByteArrayOutputStream longName = new ByteArrayOutputStream();
        int length = 0;
        while ((length = read(smallBuf)) >= 0) {
            longName.write(smallBuf, 0, length);
        }
        getNextEntry();
        if (currEntry == null) {
            // Bugzilla: 40334
            // Malformed tar file - long entry name not followed by entry
            return null;
        }
        byte[] longNameData = longName.toByteArray();
        // remove trailing null terminator(s)
        length = longNameData.length;
        while (length > 0 && longNameData[length - 1] == 0) {
            --length;
        }
        if (length != longNameData.length) {
            longNameData = Arrays.copyOf(longNameData, length);
        }
        return longNameData;
    }

    /**
     * Gets the next TarArchiveEntry in this stream.
     *
     * @return the next entry, or {@code null} if there are no more entries
     * @throws IOException if the next entry could not be read
     */
    @Override
    public TarArchiveEntry getNextEntry() throws IOException {
        return getNextTarEntry();
    }

    /**
     * Gets the next entry in this tar archive. This will skip over any remaining data in the current entry, if there is one, place the input stream at the
     * header of the next entry, read the header, instantiate a new TarArchiveEntry from the header bytes, and return that entry. If there are no more entries
     * in the archive, null will be returned to indicate that the end of the archive has been reached.
     *
     * @return The next TarEntry in the archive, or null.
     * @throws IOException on error
     * @deprecated Use {@link #getNextEntry()}.
     */
    @Deprecated
    public TarArchiveEntry getNextTarEntry() throws IOException {
        if (isAtEOF()) {
            return null;
        }

        if (currEntry != null) {
            /* Skip will only go to the end of the current entry */
            IOUtils.skip(this, Long.MAX_VALUE);

            /* skip to the end of the last record */
            skipRecordPadding();
        }

        final byte[] headerBuf = getRecord();

        if (headerBuf == null) {
            /* hit EOF */
            currEntry = null;
            return null;
        }

        try {
            currEntry = new TarArchiveEntry(globalPaxHeaders, headerBuf, zipEncoding, lenient);
        } catch (final IllegalArgumentException e) {
            throw new IOException("Error detected parsing the header", e);
        }

        entryOffset = 0;
        entrySize = currEntry.getSize();

        if (currEntry.isGNULongLinkEntry()) {
            final byte[] longLinkData = getLongNameData();
            if (longLinkData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long link entry name not followed by entry
                return null;
            }
            currEntry.setLinkName(zipEncoding.decode(longLinkData));
        }

        if (currEntry.isGNULongNameEntry()) {
            final byte[] longNameData = getLongNameData();
            if (longNameData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long entry name not followed by entry
                return null;
            }

            // COMPRESS-509 : the name of directories should end with '/'
            final String name = zipEncoding.decode(longNameData);
            currEntry.setName(name);
            if (currEntry.isDirectory() && !name.endsWith("/")) {
                currEntry.setName(name + "/");
            }
        }

        if (currEntry.isGlobalPaxHeader()) { // Process Global Pax headers
            readGlobalPaxHeaders();
        }

        try {
            if (currEntry.isPaxHeader()) { // Process Pax headers
                paxHeaders();
            } else if (!globalPaxHeaders.isEmpty()) {
                applyPaxHeadersToCurrentEntry(globalPaxHeaders, globalSparseHeaders);
            }
        } catch (final NumberFormatException e) {
            throw new IOException("Error detected parsing the pax header", e);
        }

        if (currEntry.isOldGNUSparse()) { // Process sparse files
            readOldGNUSparse();
        }

        // If the size of the next element in the archive has changed
        // due to a new size being reported in the POSIX header
        // information, we update entrySize here so that it contains
        // the correct value.
        entrySize = currEntry.getSize();

        return currEntry;
    }

    /**
     * Gets the next record in this tar archive. This will skip over any remaining data in the current entry, if there is one, and place the input stream at the
     * header of the next entry.
     * <p>
     * If there are no more entries in the archive, null will be returned to indicate that the end of the archive has been reached. At the same time the
     * {@code hasHitEOF} marker will be set to true.
     * </p>
     *
     * @return The next header in the archive, or null.
     * @throws IOException on error
     */
    private byte[] getRecord() throws IOException {
        byte[] headerBuf = readRecord();
        setAtEOF(isEOFRecord(headerBuf));
        if (isAtEOF() && headerBuf != null) {
            tryToConsumeSecondEOFRecord();
            consumeRemainderOfLastBlock();
            headerBuf = null;
        }
        return headerBuf;
    }

    /**
     * Gets the record size being used by this stream's buffer.
     *
     * @return The TarBuffer record size.
     */
    public int getRecordSize() {
        return recordBuffer.length;
    }

    protected final boolean isAtEOF() {
        return atEof;
    }

    private boolean isDirectory() {
        return currEntry != null && currEntry.isDirectory();
    }

    /**
     * Tests if an archive record indicates End of Archive. End of archive is indicated by a record that consists entirely of null bytes.
     *
     * @param record The record data to check.
     * @return true if the record data is an End of Archive
     */
    protected boolean isEOFRecord(final byte[] record) {
        return record == null || ArchiveUtils.isArrayZero(record, getRecordSize());
    }

    /**
     * Since we do not support marking just yet, we do nothing.
     *
     * @param markLimit The limit to mark.
     */
    @Override
    public synchronized void mark(final int markLimit) {
    }

    /**
     * Since we do not support marking just yet, we return false.
     *
     * @return false.
     */
    @Override
    public boolean markSupported() {
        return false;
    }

    /**
     * For PAX Format 0.0, the sparse headers (GNU.sparse.offset and GNU.sparse.numbytes) may appear multiple times, in the following pattern:
     * <pre>
     * GNU.sparse.size=size
     * GNU.sparse.numblocks=numblocks
     * repeat numblocks times
     *   GNU.sparse.offset=offset
     *   GNU.sparse.numbytes=numbytes
     * end repeat
     * </pre>
     * <p>
     * For PAX Format 0.1, the sparse headers are stored in a single variable: GNU.sparse.map
     * </p>
     * <p>
     * GNU.sparse.map Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]"
     * </p>
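     * <p>
     * For example (illustrative values), {@code GNU.sparse.map=0,512,10240,512} describes two 512-byte data chunks at offsets 0 and 10240.
     * </p>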
     * <p>
     * For PAX Format 1.X: The sparse map itself is stored in the file data block, preceding the actual file data. It consists of a series of decimal numbers
     * delimited by newlines. The map is padded with nulls to the nearest block boundary. The first number gives the number of entries in the map. Following are
     * map entries, each one consisting of two numbers giving the offset and size of the data block it describes.
     * </p>
     *
     * @throws IOException on error
     */
    private void paxHeaders() throws IOException {
        List<TarArchiveStructSparse> sparseHeaders = new ArrayList<>();
        final Map<String, String> headers = TarUtils.parsePaxHeaders(this, sparseHeaders, globalPaxHeaders, entrySize);

        // for 0.1 PAX Headers
        if (headers.containsKey(TarGnuSparseKeys.MAP)) {
            sparseHeaders = new ArrayList<>(TarUtils.parseFromPAX01SparseHeaders(headers.get(TarGnuSparseKeys.MAP)));
        }
        getNextEntry(); // Get the actual file entry
        if (currEntry == null) {
            throw new IOException("premature end of tar archive. Didn't find any entry after PAX header.");
        }
        applyPaxHeadersToCurrentEntry(headers, sparseHeaders);

        // for 1.0 PAX Format, the sparse map is stored in the file data block
        if (currEntry.isPaxGNU1XSparse()) {
            sparseHeaders = TarUtils.parsePAX1XSparseHeaders(in, getRecordSize());
            currEntry.setSparseHeaders(sparseHeaders);
        }

        // sparse headers are all done reading, we need to build
        // sparse input streams using these sparse headers
        buildSparseInputStreams();
    }

    /**
     * Reads bytes from the current tar archive entry.
     * <p>
     * This method is aware of the boundaries of the current entry in the archive and will deal with them as if they were this stream's start and EOF.
     * </p>
     *
     * @param buf       The buffer into which to place bytes read.
     * @param offset    The offset at which to place bytes read.
     * @param numToRead The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
     */
    @Override
    public int read(final byte[] buf, final int offset, int numToRead) throws IOException {
        if (numToRead == 0) {
            return 0;
        }
        int totalRead = 0;

        if (isAtEOF() || isDirectory()) {
            return -1;
        }

        if (currEntry == null) {
            throw new IllegalStateException("No current tar entry");
        }

        if (entryOffset >= currEntry.getRealSize()) {
            return -1;
        }

        numToRead = Math.min(numToRead, available());

        if (currEntry.isSparse()) {
            // for sparse entries, we need to read them in another way
            totalRead = readSparse(buf, offset, numToRead);
        } else {
            totalRead = in.read(buf, offset, numToRead);
        }

        if (totalRead == -1) {
            if (numToRead > 0) {
                throw new IOException("Truncated TAR archive");
            }
            setAtEOF(true);
        } else {
            count(totalRead);
            entryOffset += totalRead;
        }

        return totalRead;
    }

    private void readGlobalPaxHeaders() throws IOException {
        globalPaxHeaders = TarUtils.parsePaxHeaders(this, globalSparseHeaders, globalPaxHeaders, entrySize);
        getNextEntry(); // Get the actual file entry

        if (currEntry == null) {
            throw new IOException("Error detected parsing the pax header");
        }
    }

    /**
     * Adds the sparse chunks from the current entry to its list of sparse chunks, including any additional sparse entries following the current entry.
     *
     * @throws IOException on error
     */
    private void readOldGNUSparse() throws IOException {
        if (currEntry.isExtended()) {
            TarArchiveSparseEntry entry;
            do {
                final byte[] headerBuf = getRecord();
                if (headerBuf == null) {
                    throw new IOException("premature end of tar archive. Didn't find extended_header after header with extended flag.");
                }
                entry = new TarArchiveSparseEntry(headerBuf);
                currEntry.getSparseHeaders().addAll(entry.getSparseHeaders());
            } while (entry.isExtended());
        }

        // sparse headers are all done reading, we need to build
        // sparse input streams using these sparse headers
        buildSparseInputStreams();
    }

    /**
     * Read a record from the input stream and return the data.
     *
     * @return The record data or null if EOF has been hit.
     * @throws IOException on error
     */
    protected byte[] readRecord() throws IOException {
        final int readCount = IOUtils.readFully(in, recordBuffer);
        count(readCount);
        if (readCount != getRecordSize()) {
            return null;
        }

        return recordBuffer;
    }

    /**
     * For sparse tar entries, there are many "holes" (consisting of all zeros) in the file. Only the non-zero data is stored in the tar file, and it is stored
     * separately. The structure of the non-zero data is described by the sparse headers: the offset, where a block of non-zero data starts, and numbytes, the
     * length of the non-zero data block. When reading sparse entries, the actual data is read out with the "holes" and the non-zero data combined together
     * according to the sparse headers.
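     * <p>
     * For example (illustrative values), with the stream sequence of 100 zero bytes, 50 data bytes, 250 zero bytes and 20 data bytes described in
     * {@link #buildSparseInputStreams()}, a single read of 200 bytes at the start of the entry returns 100 zero bytes, the 50 data bytes and the first 50
     * bytes of the following hole.
     * </p>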
     *
     * @param buf       The buffer into which to place bytes read.
     * @param offset    The offset at which to place bytes read.
     * @param numToRead The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
     */
    private int readSparse(final byte[] buf, final int offset, final int numToRead) throws IOException {
        // if there are no actual input streams, just read from the original input stream
        if (sparseInputStreams == null || sparseInputStreams.isEmpty()) {
            return in.read(buf, offset, numToRead);
        }
        if (currentSparseInputStreamIndex >= sparseInputStreams.size()) {
            return -1;
        }
        final InputStream currentInputStream = sparseInputStreams.get(currentSparseInputStreamIndex);
        final int readLen = currentInputStream.read(buf, offset, numToRead);
        // if the current input stream is the last input stream,
        // just return the number of bytes read from current input stream
        if (currentSparseInputStreamIndex == sparseInputStreams.size() - 1) {
            return readLen;
        }
        // if EOF of the current input stream has been reached, move to the next input stream and recursively call read
        if (readLen == -1) {
            currentSparseInputStreamIndex++;
            return readSparse(buf, offset, numToRead);
        }
        // if the remaining data of the current input stream is not long enough, move to the next input stream
        // and recursively call read
        if (readLen < numToRead) {
            currentSparseInputStreamIndex++;
            final int readLenOfNext = readSparse(buf, offset + readLen, numToRead - readLen);
            if (readLenOfNext == -1) {
                return readLen;
            }
            return readLen + readLenOfNext;
        }
        // if the remaining data of the current input stream is enough (which means readLen == numToRead), just return readLen
        return readLen;
    }

    /**
     * Since we do not support marking just yet, we do nothing.
     */
    @Override
    public synchronized void reset() {
    }

    protected final void setAtEOF(final boolean atEof) {
        this.atEof = atEof;
    }

    protected final void setCurrentEntry(final TarArchiveEntry currEntry) {
        this.currEntry = currEntry;
    }

    /**
     * Skips over and discards {@code n} bytes of data from this input stream. The {@code skip} method may, for a variety of reasons, end up skipping over some
     * smaller number of bytes, possibly {@code 0}. This may result from any of a number of conditions; reaching end of file or end of entry before {@code n}
     * bytes have been skipped are only two possibilities. The actual number of bytes skipped is returned. If {@code n} is negative, no bytes are skipped.
     *
     * @param n the number of bytes to be skipped.
     * @return the actual number of bytes skipped.
     * @throws IOException if a truncated tar archive is detected or some other I/O error occurs
     */
    @Override
    public long skip(final long n) throws IOException {
        if (n <= 0 || isDirectory()) {
            return 0;
        }

        final long availableOfInputStream = in.available();
        final long available = currEntry.getRealSize() - entryOffset;
        final long numToSkip = Math.min(n, available);
        long skipped;

        if (!currEntry.isSparse()) {
            skipped = IOUtils.skip(in, numToSkip);
            // for a non-sparse entry, determine the number of bytes actually skipped with the help of
            // inputStream.available() when inputStream is an instance of FileInputStream
            skipped = getActuallySkipped(availableOfInputStream, skipped, numToSkip);
        } else {
            skipped = skipSparse(numToSkip);
        }

        count(skipped);
        entryOffset += skipped;
        return skipped;
    }

    /**
     * The last record block should be written at the full size, so skip any additional space used to fill a record after an entry.
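     * <p>
     * For example, with the default record size of 512 bytes, an entry of 500 bytes occupies one full record, so the remaining 12 bytes of padding are
     * skipped here.
     * </p>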
     *
     * @throws IOException if a truncated tar archive is detected
     */
    private void skipRecordPadding() throws IOException {
        if (!isDirectory() && this.entrySize > 0 && this.entrySize % getRecordSize() != 0) {
            final long available = in.available();
            final long numRecords = this.entrySize / getRecordSize() + 1;
            final long padding = numRecords * getRecordSize() - this.entrySize;
            long skipped = IOUtils.skip(in, padding);

            skipped = getActuallySkipped(available, skipped, padding);

            count(skipped);
        }
    }

    /**
     * Skips n bytes from the current input stream. If the current input stream doesn't have enough data to skip, jumps to the next input stream and skips the
     * remaining bytes, repeating until n total bytes are skipped or all the input streams have been exhausted.
     *
     * @param n bytes of data to skip
     * @return actual bytes of data skipped
     * @throws IOException on error
     */
    private long skipSparse(final long n) throws IOException {
        if (sparseInputStreams == null || sparseInputStreams.isEmpty()) {
            return in.skip(n);
        }
        long bytesSkipped = 0;
        while (bytesSkipped < n && currentSparseInputStreamIndex < sparseInputStreams.size()) {
            final InputStream currentInputStream = sparseInputStreams.get(currentSparseInputStreamIndex);
            bytesSkipped += currentInputStream.skip(n - bytesSkipped);
            if (bytesSkipped < n) {
                currentSparseInputStreamIndex++;
            }
        }
        return bytesSkipped;
    }

    /**
     * Tries to read the next record rewinding the stream if it is not an EOF record.
     * <p>
     * This is meant to protect against cases where a tar implementation has written only one EOF record when two are expected. Actually this won't help since a
     * non-conforming implementation likely won't fill full blocks consisting of - by default - twenty records either, so we probably have already read beyond
     * the archive anyway.
     * </p>
     */
    private void tryToConsumeSecondEOFRecord() throws IOException {
        boolean shouldReset = true;
        final boolean marked = in.markSupported();
        if (marked) {
            in.mark(getRecordSize());
        }
        try {
            shouldReset = !isEOFRecord(readRecord());
        } finally {
            if (shouldReset && marked) {
                pushedBackBytes(getRecordSize());
                in.reset();
            }
        }
    }
}