/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

/*
 * This package is based on the work done by Timothy Gerard Endres
 * (time@ice.com) to whom the Ant project is very grateful for his great code.
 */

package org.apache.commons.compress.archivers.tar;

import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipEncoding;
import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.ArchiveUtils;
import org.apache.commons.compress.utils.BoundedInputStream;
import org.apache.commons.compress.utils.IOUtils;

/**
 * The TarArchiveInputStream reads a UNIX tar archive as an InputStream. Methods are provided to position at each successive entry in the archive, and then
 * read each entry as a normal input stream using read().
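 * <p>
 * A minimal usage sketch (the archive name is illustrative):
 * </p>
 * <pre>{@code
 * try (TarArchiveInputStream tarIn = new TarArchiveInputStream(new FileInputStream("archive.tar"))) {
 *     TarArchiveEntry entry;
 *     while ((entry = tarIn.getNextEntry()) != null) {
 *         // entry.getName() and entry.getSize() describe the current entry
 *         byte[] buffer = new byte[4096];
 *         int n;
 *         while ((n = tarIn.read(buffer)) != -1) {
 *             // process n bytes of the current entry
 *         }
 *     }
 * }
 * }</pre>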
 *
 * @NotThreadSafe
 */
public class TarArchiveInputStream extends ArchiveInputStream<TarArchiveEntry> {

    private static final int SMALL_BUFFER_SIZE = 256;

    /**
     * Checks if the signature matches what is expected for a tar file.
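     * <p>
     * A usage sketch, assuming the caller has buffered the start of a stream named {@code inputStream} (a hypothetical source):
     * </p>
     * <pre>{@code
     * byte[] signature = new byte[512];
     * int read = inputStream.read(signature); // a single read may return fewer bytes; shown for brevity
     * boolean isTar = TarArchiveInputStream.matches(signature, read);
     * }</pre>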
     *
     * @param signature the bytes to check
     * @param length    the number of bytes to check
     * @return true, if this stream is a tar archive stream, false otherwise
     */
    public static boolean matches(final byte[] signature, final int length) {
        final int versionOffset = TarConstants.VERSION_OFFSET;
        final int versionLen = TarConstants.VERSIONLEN;
        if (length < versionOffset + versionLen) {
            return false;
        }

        final int magicOffset = TarConstants.MAGIC_OFFSET;
        final int magicLen = TarConstants.MAGICLEN;
        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX, signature, magicOffset, magicLen)
                && ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX, signature, versionOffset, versionLen)) {
            return true;
        }
        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU, signature, magicOffset, magicLen)
                && (ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE, signature, versionOffset, versionLen)
                        || ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO, signature, versionOffset, versionLen))) {
            return true;
        }
        // COMPRESS-107 - recognize Ant tar files
        return ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT, signature, magicOffset, magicLen)
                && ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT, signature, versionOffset, versionLen);
    }

    private final byte[] smallBuf = new byte[SMALL_BUFFER_SIZE];

    /** The buffer to store the TAR header. **/
    private final byte[] recordBuffer;

    /** The size of a block. */
    private final int blockSize;

    /** True if stream is at EOF. */
    private boolean atEof;

    /** Size of the current entry. */
    private long entrySize;

    /** How far into the entry the stream is. */
    private long entryOffset;

    /** Input streams for reading sparse entries. **/
    private List<InputStream> sparseInputStreams;

    /** The index of the current input stream being read when reading sparse entries. */
    private int currentSparseInputStreamIndex;

    /** The meta-data about the current entry. */
    private TarArchiveEntry currEntry;

    /** The encoding of the file. */
    private final ZipEncoding zipEncoding;

    /** The global PAX header. */
    private Map<String, String> globalPaxHeaders = new HashMap<>();

    /** The global sparse headers, only used in PAX Format 0.X. */
    private final List<TarArchiveStructSparse> globalSparseHeaders = new ArrayList<>();

    private final boolean lenient;

    /**
     * Constructs a new instance.
     *
     * @param inputStream the input stream to use
     */
    public TarArchiveInputStream(final InputStream inputStream) {
        this(inputStream, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE);
    }

    /**
     * Constructs a new instance.
     *
     * @param inputStream the input stream to use
     * @param lenient     when set to true illegal values for group/userid, mode, device numbers and timestamp will be ignored and the fields set to
     *                    {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an exception instead.
     * @since 1.19
     */
    public TarArchiveInputStream(final InputStream inputStream, final boolean lenient) {
        this(inputStream, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, null, lenient);
    }

    /**
     * Constructs a new instance.
     *
     * @param inputStream the input stream to use
     * @param blockSize   the block size to use
     */
    public TarArchiveInputStream(final InputStream inputStream, final int blockSize) {
        this(inputStream, blockSize, TarConstants.DEFAULT_RCDSIZE);
    }

    /**
     * Constructs a new instance.
     *
     * @param inputStream the input stream to use
     * @param blockSize   the block size to use
     * @param recordSize  the record size to use
     */
    public TarArchiveInputStream(final InputStream inputStream, final int blockSize, final int recordSize) {
        this(inputStream, blockSize, recordSize, null);
    }

    /**
     * Constructs a new instance.
     *
     * @param inputStream the input stream to use
     * @param blockSize   the block size to use
     * @param recordSize  the record size to use
     * @param encoding    name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(final InputStream inputStream, final int blockSize, final int recordSize, final String encoding) {
        this(inputStream, blockSize, recordSize, encoding, false);
    }

    /**
     * Constructs a new instance.
     *
     * @param inputStream the input stream to use
     * @param blockSize   the block size to use
     * @param recordSize  the record size to use
     * @param encoding    name of the encoding to use for file names
     * @param lenient     when set to true illegal values for group/userid, mode, device numbers and timestamp will be ignored and the fields set to
     *                    {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an exception instead.
     * @since 1.19
     */
    public TarArchiveInputStream(final InputStream inputStream, final int blockSize, final int recordSize, final String encoding, final boolean lenient) {
        super(inputStream, encoding);
        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);
        this.recordBuffer = new byte[recordSize];
        this.blockSize = blockSize;
        this.lenient = lenient;
    }

    /**
     * Constructs a new instance.
     *
     * @param inputStream the input stream to use
     * @param blockSize   the block size to use
     * @param encoding    name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(final InputStream inputStream, final int blockSize, final String encoding) {
        this(inputStream, blockSize, TarConstants.DEFAULT_RCDSIZE, encoding);
    }

    /**
     * Constructs a new instance.
     *
     * @param inputStream the input stream to use
     * @param encoding    name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(final InputStream inputStream, final String encoding) {
        this(inputStream, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, encoding);
    }

    private void applyPaxHeadersToCurrentEntry(final Map<String, String> headers, final List<TarArchiveStructSparse> sparseHeaders) throws IOException {
        currEntry.updateEntryFromPaxHeaders(headers);
        currEntry.setSparseHeaders(sparseHeaders);
    }

    /**
     * Gets the available data that can be read from the current entry in the archive. This does not indicate how much data is left in the entire archive, only
     * in the current entry. This value is determined from the entry's size header field and the amount of data already read from the current entry.
     * Integer.MAX_VALUE is returned in case more than Integer.MAX_VALUE bytes are left in the current entry in the archive.
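     * <p>
     * For example (sizes invented): if the current entry's real size is 1000 bytes and 200 bytes have already been read, this method returns 800.
     * </p>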
     *
     * @return The number of available bytes for the current entry.
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public int available() throws IOException {
        if (isDirectory()) {
            return 0;
        }
        final long available = currEntry.getRealSize() - entryOffset;
        if (available > Integer.MAX_VALUE) {
            return Integer.MAX_VALUE;
        }
        return (int) available;
    }

    /**
     * Builds the input streams consisting of all-zero input streams and non-zero input streams. When reading from the non-zero input streams, the data is
     * actually read from the original input stream. The size of each input stream is given by the sparse headers.
     * <p>
     * Note: some all-zero input streams and non-zero input streams have a size of 0. Zero-size input streams are not stored because they are meaningless.
     * </p>
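     * <p>
     * An illustrative example (values invented): for ordered sparse headers {@code (offset=4096, numbytes=1024)} and {@code (offset=8192, numbytes=0)}, the
     * streams built are a 4096-byte all-zero stream, a 1024-byte bounded stream over the archive data, and a trailing 3072-byte all-zero stream.
     * </p>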
     */
    private void buildSparseInputStreams() throws IOException {
        currentSparseInputStreamIndex = -1;
        sparseInputStreams = new ArrayList<>();

        final List<TarArchiveStructSparse> sparseHeaders = currEntry.getOrderedSparseHeaders();

        // Stream doesn't need to be closed at all as it doesn't use any resources
        final InputStream zeroInputStream = new TarArchiveSparseZeroInputStream(); // NOSONAR
        // logical offset into the extracted entry
        long offset = 0;
        for (final TarArchiveStructSparse sparseHeader : sparseHeaders) {
            final long zeroBlockSize = sparseHeader.getOffset() - offset;
            if (zeroBlockSize < 0) {
                // sparse header says to move backwards inside the extracted entry
                throw new IOException("Corrupted struct sparse detected");
            }

            // only store the zero block if it is not empty
            if (zeroBlockSize > 0) {
                sparseInputStreams.add(new BoundedInputStream(zeroInputStream, sparseHeader.getOffset() - offset));
            }

            // only store the input streams with non-zero size
            if (sparseHeader.getNumbytes() > 0) {
                sparseInputStreams.add(new BoundedInputStream(in, sparseHeader.getNumbytes()));
            }

            offset = sparseHeader.getOffset() + sparseHeader.getNumbytes();
        }

        if (!sparseInputStreams.isEmpty()) {
            currentSparseInputStreamIndex = 0;
        }
    }

    /**
     * Whether this class is able to read the given entry.
     *
     * @return The implementation will return true if the {@link ArchiveEntry} is an instance of {@link TarArchiveEntry}
     */
    @Override
    public boolean canReadEntryData(final ArchiveEntry archiveEntry) {
        return archiveEntry instanceof TarArchiveEntry;
    }

    /**
     * Closes this stream, including any input streams built for sparse entries and the underlying input stream.
     *
     * @throws IOException on error
     */
    @Override
    public void close() throws IOException {
        // Close all the input streams in sparseInputStreams
        if (sparseInputStreams != null) {
            for (final InputStream inputStream : sparseInputStreams) {
                inputStream.close();
            }
        }
        in.close();
    }

    /**
     * This method is invoked once the end of the archive is hit; it tries to consume the remaining bytes under the assumption that the tool creating this
     * archive has padded the last block.
     */
    private void consumeRemainderOfLastBlock() throws IOException {
        final long bytesReadOfLastBlock = getBytesRead() % blockSize;
        if (bytesReadOfLastBlock > 0) {
            count(IOUtils.skip(in, blockSize - bytesReadOfLastBlock));
        }
    }

    /**
     * For a FileInputStream, skip always returns the requested number of bytes, even when the file holds fewer, so we need the available bytes to determine
     * how many bytes were actually skipped.
     *
     * @param available available bytes returned by inputStream.available()
     * @param skipped   skipped bytes returned by inputStream.skip()
     * @param expected  bytes expected to skip
     * @return number of bytes actually skipped
     * @throws IOException if a truncated tar archive is detected
     */
    private long getActuallySkipped(final long available, final long skipped, final long expected) throws IOException {
        long actuallySkipped = skipped;
        if (in instanceof FileInputStream) {
            actuallySkipped = Math.min(skipped, available);
        }
        if (actuallySkipped != expected) {
            throw new IOException("Truncated TAR archive");
        }
        return actuallySkipped;
    }

    /**
     * Gets the current TAR Archive Entry that this input stream is processing.
     *
     * @return The current Archive Entry
     */
    public TarArchiveEntry getCurrentEntry() {
        return currEntry;
    }

    /**
     * Gets the next entry in this tar archive as long name data.
     *
     * @return The next entry in the archive as long name data, or null.
     * @throws IOException on error
     */
    protected byte[] getLongNameData() throws IOException {
        // read in the name
        final ByteArrayOutputStream longName = new ByteArrayOutputStream();
        int length = 0;
        while ((length = read(smallBuf)) >= 0) {
            longName.write(smallBuf, 0, length);
        }
        getNextEntry();
        if (currEntry == null) {
            // Bugzilla: 40334
            // Malformed tar file - long entry name not followed by entry
            return null;
        }
        byte[] longNameData = longName.toByteArray();
        // remove trailing null terminator(s)
        length = longNameData.length;
        while (length > 0 && longNameData[length - 1] == 0) {
            --length;
        }
        if (length != longNameData.length) {
            longNameData = Arrays.copyOf(longNameData, length);
        }
        return longNameData;
    }

    /**
     * Gets the next TarArchiveEntry in this stream.
     *
     * @return the next entry, or {@code null} if there are no more entries
     * @throws IOException if the next entry could not be read
     */
    @Override
    public TarArchiveEntry getNextEntry() throws IOException {
        return getNextTarEntry();
    }

    /**
     * Gets the next entry in this tar archive. This will skip over any remaining data in the current entry, if there is one, position the input stream at the
     * header of the next entry, read the header, instantiate a new TarEntry from the header bytes, and return that entry. If there are no more entries in the
     * archive, null will be returned to indicate that the end of the archive has been reached.
     *
     * @return The next TarEntry in the archive, or null.
     * @throws IOException on error
     * @deprecated Use {@link #getNextEntry()}.
     */
    @Deprecated
    public TarArchiveEntry getNextTarEntry() throws IOException {
        if (isAtEOF()) {
            return null;
        }

        if (currEntry != null) {
            /* Skip will only go to the end of the current entry */
            IOUtils.skip(this, Long.MAX_VALUE);

            /* skip to the end of the last record */
            skipRecordPadding();
        }

        final byte[] headerBuf = getRecord();

        if (headerBuf == null) {
            /* hit EOF */
            currEntry = null;
            return null;
        }

        try {
            currEntry = new TarArchiveEntry(globalPaxHeaders, headerBuf, zipEncoding, lenient);
        } catch (final IllegalArgumentException e) {
            throw new IOException("Error detected parsing the header", e);
        }

        entryOffset = 0;
        entrySize = currEntry.getSize();

        if (currEntry.isGNULongLinkEntry()) {
            final byte[] longLinkData = getLongNameData();
            if (longLinkData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long link entry name not followed by entry
                return null;
            }
            currEntry.setLinkName(zipEncoding.decode(longLinkData));
        }

        if (currEntry.isGNULongNameEntry()) {
            final byte[] longNameData = getLongNameData();
            if (longNameData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long entry name not followed by entry
                return null;
            }

            // COMPRESS-509 : the name of directories should end with '/'
            final String name = zipEncoding.decode(longNameData);
            currEntry.setName(name);
            if (currEntry.isDirectory() && !name.endsWith("/")) {
                currEntry.setName(name + "/");
            }
        }

        if (currEntry.isGlobalPaxHeader()) { // Process Global Pax headers
            readGlobalPaxHeaders();
        }

        try {
            if (currEntry.isPaxHeader()) { // Process Pax headers
                paxHeaders();
            } else if (!globalPaxHeaders.isEmpty()) {
                applyPaxHeadersToCurrentEntry(globalPaxHeaders, globalSparseHeaders);
            }
        } catch (final NumberFormatException e) {
            throw new IOException("Error detected parsing the pax header", e);
        }

        if (currEntry.isOldGNUSparse()) { // Process sparse files
            readOldGNUSparse();
        }

        // If the size of the next element in the archive has changed
        // due to a new size being reported in the POSIX header
        // information, we update entrySize here so that it contains
        // the correct value.
        entrySize = currEntry.getSize();

        return currEntry;
    }

    /**
     * Gets the next record in this tar archive. This will skip over any remaining data in the current entry, if there is one, and place the input stream at the
     * header of the next entry.
     * <p>
     * If there are no more entries in the archive, null will be returned to indicate that the end of the archive has been reached. At the same time the
     * {@code hasHitEOF} marker will be set to true.
     * </p>
     *
     * @return The next header in the archive, or null.
     * @throws IOException on error
     */
    private byte[] getRecord() throws IOException {
        byte[] headerBuf = readRecord();
        setAtEOF(isEOFRecord(headerBuf));
        if (isAtEOF() && headerBuf != null) {
            tryToConsumeSecondEOFRecord();
            consumeRemainderOfLastBlock();
            headerBuf = null;
        }
        return headerBuf;
    }

    /**
     * Gets the record size being used by this stream's buffer.
     *
     * @return The record size.
     */
    public int getRecordSize() {
        return recordBuffer.length;
    }

    protected final boolean isAtEOF() {
        return atEof;
    }

    private boolean isDirectory() {
        return currEntry != null && currEntry.isDirectory();
    }

    /**
     * Tests if an archive record indicates End of Archive. End of archive is indicated by a record that consists entirely of null bytes.
     *
     * @param record The record data to check.
     * @return true if the record data is an End of Archive
     */
    protected boolean isEOFRecord(final byte[] record) {
        return record == null || ArchiveUtils.isArrayZero(record, getRecordSize());
    }

    /**
     * Since we do not support marking just yet, we do nothing.
     *
     * @param markLimit The limit to mark.
     */
    @Override
    public synchronized void mark(final int markLimit) {
    }

    /**
     * Since we do not support marking just yet, we return false.
     *
     * @return false.
     */
    @Override
    public boolean markSupported() {
        return false;
    }

    /**
     * For PAX Format 0.0, the sparse headers (GNU.sparse.offset and GNU.sparse.numbytes) may appear multiple times, and they look like:
     * <pre>
     * GNU.sparse.size=size
     * GNU.sparse.numblocks=numblocks
     * repeat numblocks times
     *   GNU.sparse.offset=offset
     *   GNU.sparse.numbytes=numbytes
     * end repeat
     * </pre>
     * <p>
     * For PAX Format 0.1, the sparse headers are stored in a single variable: GNU.sparse.map
     * </p>
     * <p>
     * GNU.sparse.map: Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]"
     * </p>
     * <p>
     * For PAX Format 1.X: The sparse map itself is stored in the file data block, preceding the actual file data. It consists of a series of decimal numbers
     * delimited by newlines. The map is padded with nulls to the nearest block boundary. The first number gives the number of entries in the map. Following are
     * map entries, each one consisting of two numbers giving the offset and size of the data block it describes.
     * </p>
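     * <p>
     * An invented example: in Format 0.1, {@code GNU.sparse.map=0,512,10240,512} describes two 512-byte data chunks at offsets 0 and 10240. In Format 1.X the
     * same map would be stored in the data block as the newline-delimited numbers 2, 0, 512, 10240, 512.
     * </p>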
     *
     * @throws IOException on error
     */
    private void paxHeaders() throws IOException {
        List<TarArchiveStructSparse> sparseHeaders = new ArrayList<>();
        final Map<String, String> headers = TarUtils.parsePaxHeaders(this, sparseHeaders, globalPaxHeaders, entrySize);

        // for 0.1 PAX Headers
        if (headers.containsKey(TarGnuSparseKeys.MAP)) {
            sparseHeaders = new ArrayList<>(TarUtils.parseFromPAX01SparseHeaders(headers.get(TarGnuSparseKeys.MAP)));
        }
        getNextEntry(); // Get the actual file entry
        if (currEntry == null) {
            throw new IOException("premature end of tar archive. Didn't find any entry after PAX header.");
        }
        applyPaxHeadersToCurrentEntry(headers, sparseHeaders);

        // for 1.0 PAX Format, the sparse map is stored in the file data block
        if (currEntry.isPaxGNU1XSparse()) {
            sparseHeaders = TarUtils.parsePAX1XSparseHeaders(in, getRecordSize());
            currEntry.setSparseHeaders(sparseHeaders);
        }

        // sparse headers are all done reading, we need to build
        // sparse input streams using these sparse headers
        buildSparseInputStreams();
    }

    /**
     * Reads bytes from the current tar archive entry.
     * <p>
     * This method is aware of the boundaries of the current entry in the archive and will deal with them as if they were this stream's start and EOF.
     * </p>
     *
     * @param buf       The buffer into which to place bytes read.
     * @param offset    The offset at which to place bytes read.
     * @param numToRead The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
     */
    @Override
    public int read(final byte[] buf, final int offset, int numToRead) throws IOException {
        if (numToRead == 0) {
            return 0;
        }
        int totalRead = 0;

        if (isAtEOF() || isDirectory()) {
            return -1;
        }

        if (currEntry == null) {
            throw new IllegalStateException("No current tar entry");
        }

        if (entryOffset >= currEntry.getRealSize()) {
            return -1;
        }

        numToRead = Math.min(numToRead, available());

        if (currEntry.isSparse()) {
            // for sparse entries, we need to read them in another way
            totalRead = readSparse(buf, offset, numToRead);
        } else {
            totalRead = in.read(buf, offset, numToRead);
        }

        if (totalRead == -1) {
            if (numToRead > 0) {
                throw new IOException("Truncated TAR archive");
            }
            setAtEOF(true);
        } else {
            count(totalRead);
            entryOffset += totalRead;
        }

        return totalRead;
    }

    private void readGlobalPaxHeaders() throws IOException {
        globalPaxHeaders = TarUtils.parsePaxHeaders(this, globalSparseHeaders, globalPaxHeaders, entrySize);
        getNextEntry(); // Get the actual file entry

        if (currEntry == null) {
            throw new IOException("Error detected parsing the pax header");
        }
    }

    /**
     * Adds the sparse chunks from the current entry to the sparse chunks, including any additional sparse entries following the current entry.
     *
     * @throws IOException on error
     */
    private void readOldGNUSparse() throws IOException {
        if (currEntry.isExtended()) {
            TarArchiveSparseEntry entry;
            do {
                final byte[] headerBuf = getRecord();
                if (headerBuf == null) {
                    throw new IOException("premature end of tar archive. Didn't find extended_header after header with extended flag.");
                }
                entry = new TarArchiveSparseEntry(headerBuf);
                currEntry.getSparseHeaders().addAll(entry.getSparseHeaders());
            } while (entry.isExtended());
        }

        // sparse headers are all done reading, we need to build
        // sparse input streams using these sparse headers
        buildSparseInputStreams();
    }

    /**
     * Reads a record from the input stream and returns the data.
     *
     * @return The record data or null if EOF has been hit.
     * @throws IOException on error
     */
    protected byte[] readRecord() throws IOException {
        final int readCount = IOUtils.readFully(in, recordBuffer);
        count(readCount);
        if (readCount != getRecordSize()) {
            return null;
        }

        return recordBuffer;
    }

    /**
     * For sparse tar entries, there are many "holes" (consisting of all zeros) in the file. Only the non-zero data is stored in the tar file, and it is stored
     * separately. The structure of the non-zero data is described by the sparse headers: the offset, where a block of non-zero data starts, and numbytes, the
     * length of the non-zero data block. When reading sparse entries, the actual data is read out with "holes" and non-zero data combined together according to
     * the sparse headers.
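     * <p>
     * An invented example: for an entry whose only sparse header is {@code (offset=4096, numbytes=1024)}, a reader sees 4096 zero bytes followed by the 1024
     * stored bytes, exactly as if the hole were physically present in the stream.
     * </p>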
     *
     * @param buf       The buffer into which to place bytes read.
     * @param offset    The offset at which to place bytes read.
     * @param numToRead The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
     */
    private int readSparse(final byte[] buf, final int offset, final int numToRead) throws IOException {
        // if there are no actual input streams, just read from the original input stream
        if (sparseInputStreams == null || sparseInputStreams.isEmpty()) {
            return in.read(buf, offset, numToRead);
        }
        if (currentSparseInputStreamIndex >= sparseInputStreams.size()) {
            return -1;
        }
        final InputStream currentInputStream = sparseInputStreams.get(currentSparseInputStreamIndex);
        final int readLen = currentInputStream.read(buf, offset, numToRead);
        // if the current input stream is the last input stream,
        // just return the number of bytes read from current input stream
        if (currentSparseInputStreamIndex == sparseInputStreams.size() - 1) {
            return readLen;
        }
        // if EOF of the current input stream has been reached, open a new input stream and recursively call read
        if (readLen == -1) {
            currentSparseInputStreamIndex++;
            return readSparse(buf, offset, numToRead);
        }
        // if the remaining data of the current input stream is not long enough, open a new input stream
        // and recursively call read
        if (readLen < numToRead) {
            currentSparseInputStreamIndex++;
            final int readLenOfNext = readSparse(buf, offset + readLen, numToRead - readLen);
            if (readLenOfNext == -1) {
                return readLen;
            }
            return readLen + readLenOfNext;
        }
        // if the remaining data of the current input stream is enough (which means readLen == numToRead), just return readLen
        return readLen;
    }

    /**
     * Since we do not support marking just yet, we do nothing.
     */
    @Override
    public synchronized void reset() {
    }

    protected final void setAtEOF(final boolean atEof) {
        this.atEof = atEof;
    }

    protected final void setCurrentEntry(final TarArchiveEntry currEntry) {
        this.currEntry = currEntry;
    }

    /**
     * Skips over and discards {@code n} bytes of data from this input stream. The {@code skip} method may, for a variety of reasons, end up skipping over some
     * smaller number of bytes, possibly {@code 0}; reaching end of file or end of entry before {@code n} bytes have been skipped are only two of the
     * possibilities. The actual number of bytes skipped is returned. If {@code n} is negative, no bytes are skipped.
     *
     * @param n the number of bytes to be skipped.
     * @return the actual number of bytes skipped.
     * @throws IOException if a truncated tar archive is detected or some other I/O error occurs
     */
    @Override
    public long skip(final long n) throws IOException {
        if (n <= 0 || isDirectory()) {
            return 0;
        }

        final long availableOfInputStream = in.available();
        final long available = currEntry.getRealSize() - entryOffset;
        final long numToSkip = Math.min(n, available);
        long skipped;

        if (!currEntry.isSparse()) {
            skipped = IOUtils.skip(in, numToSkip);
            // for a non-sparse entry, determine the bytes actually skipped, using
            // inputStream.available() when inputStream is an instance of FileInputStream
            skipped = getActuallySkipped(availableOfInputStream, skipped, numToSkip);
        } else {
            skipped = skipSparse(numToSkip);
        }

        count(skipped);
        entryOffset += skipped;
        return skipped;
    }

    /**
     * The last record block should be written at the full size, so skip any additional space used to fill a record after an entry.
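     * <p>
     * A worked example with invented sizes: a 600-byte entry with 512-byte records occupies two records (1024 bytes), so 424 bytes of padding are skipped here.
     * </p>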
     *
     * @throws IOException if a truncated tar archive is detected
     */
    private void skipRecordPadding() throws IOException {
        if (!isDirectory() && this.entrySize > 0 && this.entrySize % getRecordSize() != 0) {
            final long available = in.available();
            final long numRecords = this.entrySize / getRecordSize() + 1;
            final long padding = numRecords * getRecordSize() - this.entrySize;
            long skipped = IOUtils.skip(in, padding);

            skipped = getActuallySkipped(available, skipped, padding);

            count(skipped);
        }
    }

    /**
     * Skips n bytes from the current input stream. If the current input stream doesn't have enough data to skip, jumps to the next input stream and skips the
     * remaining bytes, repeating until n bytes in total have been skipped or all the input streams have been exhausted.
     *
     * @param n bytes of data to skip
     * @return actual bytes of data skipped
     * @throws IOException on error
     */
    private long skipSparse(final long n) throws IOException {
        if (sparseInputStreams == null || sparseInputStreams.isEmpty()) {
            return in.skip(n);
        }
        long bytesSkipped = 0;
        while (bytesSkipped < n && currentSparseInputStreamIndex < sparseInputStreams.size()) {
            final InputStream currentInputStream = sparseInputStreams.get(currentSparseInputStreamIndex);
            bytesSkipped += currentInputStream.skip(n - bytesSkipped);
            if (bytesSkipped < n) {
                currentSparseInputStreamIndex++;
            }
        }
        return bytesSkipped;
    }

    /**
     * Tries to read the next record, rewinding the stream if it is not an EOF record.
     * <p>
     * This is meant to protect against cases where a tar implementation has written only one EOF record when two are expected. Actually this won't help since a
     * non-conforming implementation likely won't fill full blocks consisting of (by default) twenty records either, so we probably have already read beyond the
     * archive anyway.
     * </p>
     */
    private void tryToConsumeSecondEOFRecord() throws IOException {
        boolean shouldReset = true;
        final boolean marked = in.markSupported();
        if (marked) {
            in.mark(getRecordSize());
        }
        try {
            shouldReset = !isEOFRecord(readRecord());
        } finally {
            if (shouldReset && marked) {
                pushedBackBytes(getRecordSize());
                in.reset();
            }
        }
    }
}