View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   https://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  
20  /*
21   * This package is based on the work done by Timothy Gerard Endres
22   * (time@ice.com) to whom the Ant project is very grateful for his great code.
23   */
24  
25  package org.apache.commons.compress.archivers.tar;
26  
27  import java.io.ByteArrayOutputStream;
28  import java.io.FileInputStream;
29  import java.io.IOException;
30  import java.io.InputStream;
31  import java.util.ArrayList;
32  import java.util.Arrays;
33  import java.util.HashMap;
34  import java.util.List;
35  import java.util.Map;
36  
37  import org.apache.commons.compress.archivers.ArchiveEntry;
38  import org.apache.commons.compress.archivers.ArchiveInputStream;
39  import org.apache.commons.compress.archivers.zip.ZipEncoding;
40  import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
41  import org.apache.commons.compress.utils.ArchiveUtils;
42  import org.apache.commons.compress.utils.IOUtils;
43  import org.apache.commons.io.input.BoundedInputStream;
44  
45  /**
46   * The TarInputStream reads a Unix tar archive as an InputStream. Methods are provided to position at each successive entry in the archive, and then read each
47   * entry as a normal input stream using read().
48   *
49   * @NotThreadSafe
50   */
51  public class TarArchiveInputStream extends ArchiveInputStream<TarArchiveEntry> {
52  
    /**
     * IBM AIX <a href="https://www.ibm.com/docs/sv/aix/7.2.0?topic=files-tarh-file">tar.h</a>: "This field is terminated with a space only."
     */
    private static final String VERSION_AIX = "0 ";

    /** Size of the scratch buffer used for small reads, for example when draining long-name entries. */
    private static final int SMALL_BUFFER_SIZE = 256;
59  
60      /**
61       * Checks if the signature matches what is expected for a tar file.
62       *
63       * @param signature the bytes to check.
64       * @param length    the number of bytes to check.
65       * @return true, if this stream is a tar archive stream, false otherwise.
66       */
67      public static boolean matches(final byte[] signature, final int length) {
68          final int versionOffset = TarConstants.VERSION_OFFSET;
69          final int versionLen = TarConstants.VERSIONLEN;
70          if (length < versionOffset + versionLen) {
71              return false;
72          }
73          final int magicOffset = TarConstants.MAGIC_OFFSET;
74          final int magicLen = TarConstants.MAGICLEN;
75          if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX, signature, magicOffset, magicLen)
76                  && ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX, signature, versionOffset, versionLen)) {
77              return true;
78          }
79          // IBM AIX tar.h https://www.ibm.com/docs/sv/aix/7.2.0?topic=files-tarh-file : "This field is terminated with a space only."
80          if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX, signature, magicOffset, magicLen)
81                  && ArchiveUtils.matchAsciiBuffer(VERSION_AIX, signature, versionOffset, versionLen)) {
82              return true;
83          }
84          if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU, signature, magicOffset, magicLen)
85                  && (ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE, signature, versionOffset, versionLen)
86                          || ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO, signature, versionOffset, versionLen))) {
87              return true;
88          }
89          // COMPRESS-107 - recognize Ant tar files
90          return ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT, signature, magicOffset, magicLen)
91                  && ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT, signature, versionOffset, versionLen);
92      }
93  
    /** Scratch buffer for small reads such as draining long-name entries. */
    private final byte[] smallBuf = new byte[SMALL_BUFFER_SIZE];

    /** The buffer to store the TAR header. **/
    private final byte[] recordBuffer;

    /** The size of a block in bytes. */
    private final int blockSize;

    /** True if stream is at EOF. */
    private boolean atEof;

    /** Size of the current entry in bytes. */
    private long entrySize;

    /** How far into the entry the stream is at. */
    private long entryOffset;

    /** Input streams for reading sparse entries. **/
    private List<InputStream> sparseInputStreams;

    /** The index of the current input stream being read when reading sparse entries. */
    private int currentSparseInputStreamIndex;

    /** The meta-data about the current entry. */
    private TarArchiveEntry currEntry;

    /** The encoding of the file. */
    private final ZipEncoding zipEncoding;

    /** The global PAX header. */
    private Map<String, String> globalPaxHeaders = new HashMap<>();

    /** The global sparse headers, this is only used in PAX Format 0.X. */
    private final List<TarArchiveStructSparse> globalSparseHeaders = new ArrayList<>();

    /** When true, illegal header values are tolerated and mapped to {@link TarArchiveEntry#UNKNOWN} instead of causing an exception. */
    private final boolean lenient;
130 
    /**
     * Constructs a new instance using the default block size ({@link TarConstants#DEFAULT_BLKSIZE}) and default record size
     * ({@link TarConstants#DEFAULT_RCDSIZE}).
     *
     * @param inputStream the input stream to use
     */
    public TarArchiveInputStream(final InputStream inputStream) {
        this(inputStream, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE);
    }

    /**
     * Constructs a new instance using the default block and record sizes.
     *
     * @param inputStream the input stream to use
     * @param lenient     when set to true illegal values for group/userid, mode, device numbers and timestamp will be ignored and the fields set to
     *                    {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an exception instead.
     * @since 1.19
     */
    public TarArchiveInputStream(final InputStream inputStream, final boolean lenient) {
        this(inputStream, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, null, lenient);
    }

    /**
     * Constructs a new instance using the default record size ({@link TarConstants#DEFAULT_RCDSIZE}).
     *
     * @param inputStream the input stream to use
     * @param blockSize   the block size to use
     */
    public TarArchiveInputStream(final InputStream inputStream, final int blockSize) {
        this(inputStream, blockSize, TarConstants.DEFAULT_RCDSIZE);
    }

    /**
     * Constructs a new instance using the platform's default encoding for file names.
     *
     * @param inputStream the input stream to use
     * @param blockSize   the block size to use
     * @param recordSize  the record size to use
     */
    public TarArchiveInputStream(final InputStream inputStream, final int blockSize, final int recordSize) {
        this(inputStream, blockSize, recordSize, null);
    }

    /**
     * Constructs a new instance in non-lenient mode.
     *
     * @param inputStream the input stream to use
     * @param blockSize   the block size to use
     * @param recordSize  the record size to use
     * @param encoding    name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(final InputStream inputStream, final int blockSize, final int recordSize, final String encoding) {
        this(inputStream, blockSize, recordSize, encoding, false);
    }

    /**
     * Constructs a new instance. All other constructors delegate to this one.
     *
     * @param inputStream the input stream to use
     * @param blockSize   the block size to use
     * @param recordSize  the record size to use
     * @param encoding    name of the encoding to use for file names
     * @param lenient     when set to true illegal values for group/userid, mode, device numbers and timestamp will be ignored and the fields set to
     *                    {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an exception instead.
     * @since 1.19
     */
    public TarArchiveInputStream(final InputStream inputStream, final int blockSize, final int recordSize, final String encoding, final boolean lenient) {
        super(inputStream, encoding);
        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);
        this.recordBuffer = new byte[recordSize];
        this.blockSize = blockSize;
        this.lenient = lenient;
    }

    /**
     * Constructs a new instance using the default record size ({@link TarConstants#DEFAULT_RCDSIZE}).
     *
     * @param inputStream the input stream to use
     * @param blockSize   the block size to use
     * @param encoding    name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(final InputStream inputStream, final int blockSize, final String encoding) {
        this(inputStream, blockSize, TarConstants.DEFAULT_RCDSIZE, encoding);
    }

    /**
     * Constructs a new instance using the default block and record sizes.
     *
     * @param inputStream the input stream to use
     * @param encoding    name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(final InputStream inputStream, final String encoding) {
        this(inputStream, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, encoding);
    }
227 
    /**
     * Applies the given PAX headers and sparse headers to {@code currEntry}, overriding the values parsed from the ustar header.
     *
     * @param headers       PAX key/value pairs to apply
     * @param sparseHeaders sparse block descriptors to attach to the entry
     * @throws IOException if a header value is invalid
     */
    private void applyPaxHeadersToCurrentEntry(final Map<String, String> headers, final List<TarArchiveStructSparse> sparseHeaders) throws IOException {
        currEntry.updateEntryFromPaxHeaders(headers);
        currEntry.setSparseHeaders(sparseHeaders);
    }
232 
233     /**
234      * Gets the available data that can be read from the current entry in the archive. This does not indicate how much data is left in the entire archive, only
235      * in the current entry. This value is determined from the entry's size header field and the amount of data already read from the current entry.
236      * Integer.MAX_VALUE is returned in case more than Integer.MAX_VALUE bytes are left in the current entry in the archive.
237      *
238      * @return The number of available bytes for the current entry.
239      * @throws IOException for signature
240      */
241     @Override
242     public int available() throws IOException {
243         if (isDirectory()) {
244             return 0;
245         }
246         final long available = currEntry.getRealSize() - entryOffset;
247         if (available > Integer.MAX_VALUE) {
248             return Integer.MAX_VALUE;
249         }
250         return (int) available;
251     }
252 
253     /**
254      * Build the input streams consisting of all-zero input streams and non-zero input streams. When reading from the non-zero input streams, the data is
255      * actually read from the original input stream. The size of each input stream is introduced by the sparse headers.
256      * <p>
257      * NOTE : Some all-zero input streams and non-zero input streams have the size of 0. We DO NOT store the 0 size input streams because they are meaningless.
258      * </p>
259      */
260     private void buildSparseInputStreams() throws IOException {
261         currentSparseInputStreamIndex = -1;
262         sparseInputStreams = new ArrayList<>();
263 
264         final List<TarArchiveStructSparse> sparseHeaders = currEntry.getOrderedSparseHeaders();
265 
266         // Stream doesn't need to be closed at all as it doesn't use any resources
267         final InputStream zeroInputStream = new TarArchiveSparseZeroInputStream(); // NOSONAR
268         // logical offset into the extracted entry
269         long offset = 0;
270         for (final TarArchiveStructSparse sparseHeader : sparseHeaders) {
271             final long zeroBlockSize = sparseHeader.getOffset() - offset;
272             if (zeroBlockSize < 0) {
273                 // sparse header says to move backwards inside the extracted entry
274                 throw new IOException("Corrupted struct sparse detected");
275             }
276             // only store the zero block if it is not empty
277             if (zeroBlockSize > 0) {
278                 // @formatter:off
279                 sparseInputStreams.add(BoundedInputStream.builder()
280                         .setInputStream(zeroInputStream)
281                         .setMaxCount(sparseHeader.getOffset() - offset)
282                         .get());
283                 // @formatter:on
284             }
285             // only store the input streams with non-zero size
286             if (sparseHeader.getNumbytes() > 0) {
287                 // @formatter:off
288                 sparseInputStreams.add(BoundedInputStream.builder()
289                         .setInputStream(in)
290                         .setMaxCount(sparseHeader.getNumbytes())
291                         .get());
292                 // @formatter:on
293             }
294             offset = sparseHeader.getOffset() + sparseHeader.getNumbytes();
295         }
296         if (!sparseInputStreams.isEmpty()) {
297             currentSparseInputStreamIndex = 0;
298         }
299     }
300 
301     /**
302      * Tests whether this class is able to read the given entry.
303      *
304      * @return The implementation will return true if the {@link ArchiveEntry} is an instance of {@link TarArchiveEntry}
305      */
306     @Override
307     public boolean canReadEntryData(final ArchiveEntry archiveEntry) {
308         return archiveEntry instanceof TarArchiveEntry;
309     }
310 
311     /**
312      * Closes this stream. Calls the TarBuffer's close() method.
313      *
314      * @throws IOException on error
315      */
316     @Override
317     public void close() throws IOException {
318         // Close all the input streams in sparseInputStreams
319         if (sparseInputStreams != null) {
320             for (final InputStream inputStream : sparseInputStreams) {
321                 inputStream.close();
322             }
323         }
324         in.close();
325     }
326 
327     /**
328      * This method is invoked once the end of the archive is hit, it tries to consume the remaining bytes under the assumption that the tool creating this
329      * archive has padded the last block.
330      */
331     private void consumeRemainderOfLastBlock() throws IOException {
332         final long bytesReadOfLastBlock = getBytesRead() % blockSize;
333         if (bytesReadOfLastBlock > 0) {
334             count(IOUtils.skip(in, blockSize - bytesReadOfLastBlock));
335         }
336     }
337 
338     /**
339      * For FileInputStream, the skip always return the number you input, so we need the available bytes to determine how many bytes are actually skipped
340      *
341      * @param available available bytes returned by inputStream.available()
342      * @param skipped   skipped bytes returned by inputStream.skip()
343      * @param expected  bytes expected to skip
344      * @return number of bytes actually skipped
345      * @throws IOException if a truncated tar archive is detected
346      */
347     private long getActuallySkipped(final long available, final long skipped, final long expected) throws IOException {
348         long actuallySkipped = skipped;
349         if (in instanceof FileInputStream) {
350             actuallySkipped = Math.min(skipped, available);
351         }
352         if (actuallySkipped != expected) {
353             throw new IOException("Truncated TAR archive");
354         }
355         return actuallySkipped;
356     }
357 
    /**
     * Gets the current TAR Archive Entry that this input stream is processing.
     *
     * @return The current Archive Entry, or {@code null} if no entry has been read yet or EOF was hit.
     */
    public TarArchiveEntry getCurrentEntry() {
        return currEntry;
    }
366 
367     /**
368      * Gets the next entry in this tar archive as long name data.
369      *
370      * @return The next entry in the archive as long name data, or null.
371      * @throws IOException on error
372      */
373     protected byte[] getLongNameData() throws IOException {
374         // read in the name
375         final ByteArrayOutputStream longName = new ByteArrayOutputStream();
376         int length = 0;
377         while ((length = read(smallBuf)) >= 0) {
378             longName.write(smallBuf, 0, length);
379         }
380         getNextEntry();
381         if (currEntry == null) {
382             // Bugzilla: 40334
383             // Malformed tar file - long entry name not followed by entry
384             return null;
385         }
386         byte[] longNameData = longName.toByteArray();
387         // remove trailing null terminator(s)
388         length = longNameData.length;
389         while (length > 0 && longNameData[length - 1] == 0) {
390             --length;
391         }
392         if (length != longNameData.length) {
393             longNameData = Arrays.copyOf(longNameData, length);
394         }
395         return longNameData;
396     }
397 
    /**
     * Gets the next TarArchiveEntry in this stream.
     *
     * @return the next entry, or {@code null} if there are no more entries
     * @throws IOException if the next entry could not be read
     */
    @Override
    public TarArchiveEntry getNextEntry() throws IOException {
        // Delegates to the (deprecated) type-specific implementation.
        return getNextTarEntry();
    }
408 
    /**
     * Gets the next entry in this tar archive. This will skip over any remaining data in the current entry, if there is one, and place the input stream at the
     * header of the next entry, and read the header and instantiate a new TarEntry from the header bytes and return that entry. If there are no more entries in
     * the archive, null will be returned to indicate that the end of the archive has been reached.
     *
     * @return The next TarEntry in the archive, or null.
     * @throws IOException on error
     * @deprecated Use {@link #getNextEntry()}.
     */
    @Deprecated
    public TarArchiveEntry getNextTarEntry() throws IOException {
        if (isAtEOF()) {
            return null;
        }
        if (currEntry != null) {
            /* Skip will only go to the end of the current entry */
            IOUtils.skip(this, Long.MAX_VALUE);
            /* skip to the end of the last record */
            skipRecordPadding();
        }
        final byte[] headerBuf = getRecord();
        if (headerBuf == null) {
            /* hit EOF */
            currEntry = null;
            return null;
        }
        try {
            currEntry = new TarArchiveEntry(globalPaxHeaders, headerBuf, zipEncoding, lenient);
        } catch (final IllegalArgumentException e) {
            throw new IOException("Error detected parsing the header", e);
        }
        // Reset per-entry read position and size before any meta-entry handling below.
        entryOffset = 0;
        entrySize = currEntry.getSize();
        if (currEntry.isGNULongLinkEntry()) {
            // GNU 'K' entry: the real link name is stored as this entry's data.
            final byte[] longLinkData = getLongNameData();
            if (longLinkData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long link entry name not followed by entry
                return null;
            }
            currEntry.setLinkName(zipEncoding.decode(longLinkData));
        }
        if (currEntry.isGNULongNameEntry()) {
            // GNU 'L' entry: the real file name is stored as this entry's data.
            final byte[] longNameData = getLongNameData();
            if (longNameData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long entry name not followed by entry
                return null;
            }
            // COMPRESS-509 : the name of directories should end with '/'
            final String name = zipEncoding.decode(longNameData);
            currEntry.setName(name);
            if (currEntry.isDirectory() && !name.endsWith("/")) {
                currEntry.setName(name + "/");
            }
        }
        if (currEntry.isGlobalPaxHeader()) { // Process Global Pax headers
            readGlobalPaxHeaders();
        }
        try {
            if (currEntry.isPaxHeader()) { // Process Pax headers
                paxHeaders();
            } else if (!globalPaxHeaders.isEmpty()) {
                // No local PAX header, but global PAX values still apply to this entry.
                applyPaxHeadersToCurrentEntry(globalPaxHeaders, globalSparseHeaders);
            }
        } catch (final NumberFormatException e) {
            throw new IOException("Error detected parsing the pax header", e);
        }
        if (currEntry.isOldGNUSparse()) { // Process sparse files
            readOldGNUSparse();
        }
        // If the size of the next element in the archive has changed
        // due to a new size being reported in the POSIX header
        // information, we update entrySize here so that it contains
        // the correct value.
        entrySize = currEntry.getSize();
        return currEntry;
    }
487 
488     /**
489      * Gets the next record in this tar archive. This will skip over any remaining data in the current entry, if there is one, and place the input stream at the
490      * header of the next entry.
491      * <p>
492      * If there are no more entries in the archive, null will be returned to indicate that the end of the archive has been reached. At the same time the
493      * {@code hasHitEOF} marker will be set to true.
494      * </p>
495      *
496      * @return The next header in the archive, or null.
497      * @throws IOException on error
498      */
499     private byte[] getRecord() throws IOException {
500         byte[] headerBuf = readRecord();
501         setAtEOF(isEOFRecord(headerBuf));
502         if (isAtEOF() && headerBuf != null) {
503             tryToConsumeSecondEOFRecord();
504             consumeRemainderOfLastBlock();
505             headerBuf = null;
506         }
507         return headerBuf;
508     }
509 
    /**
     * Gets the record size being used by this stream's buffer.
     *
     * @return The TarBuffer record size.
     */
    public int getRecordSize() {
        // The record size was fixed at construction time via the record buffer's length.
        return recordBuffer.length;
    }
518 
    /**
     * Tests whether we are at the end-of-file.
     *
     * @return whether we are at the end-of-file.
     */
    protected final boolean isAtEOF() {
        return atEof;
    }
527 
    /**
     * Tests whether the current entry exists and is a directory.
     *
     * @return true if there is a current entry and it is a directory.
     */
    private boolean isDirectory() {
        return currEntry != null && currEntry.isDirectory();
    }
531 
532     /**
533      * Tests if an archive record indicate End of Archive. End of archive is indicated by a record that consists entirely of null bytes.
534      *
535      * @param record The record data to check.
536      * @return true if the record data is an End of Archive
537      */
538     protected boolean isEOFRecord(final byte[] record) {
539         return record == null || ArchiveUtils.isArrayZero(record, getRecordSize());
540     }
541 
    /**
     * Since we do not support marking just yet, we do nothing.
     *
     * @param markLimit The limit to mark.
     */
    @Override
    public synchronized void mark(final int markLimit) {
        // Intentionally a no-op: mark/reset is not supported (see markSupported()).
    }
550 
    /**
     * Since we do not support marking just yet, we return false.
     *
     * @return false.
     */
    @Override
    public boolean markSupported() {
        return false;
    }
560 
    /**
     * For PAX Format 0.0, the sparse headers(GNU.sparse.offset and GNU.sparse.numbytes) may appear multi times, and they look like:
     * <p>
     * GNU.sparse.size=size GNU.sparse.numblocks=numblocks repeat numblocks times GNU.sparse.offset=offset GNU.sparse.numbytes=numbytes end repeat
     * </p>
     * <p>
     * For PAX Format 0.1, the sparse headers are stored in a single variable : GNU.sparse.map
     * </p>
     * <p>
     * GNU.sparse.map Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]"
     * </p>
     * <p>
     * For PAX Format 1.X: The sparse map itself is stored in the file data block, preceding the actual file data. It consists of a series of decimal numbers
     * delimited by newlines. The map is padded with nulls to the nearest block boundary. The first number gives the number of entries in the map. Following are
     * map entries, each one consisting of two numbers giving the offset and size of the data block it describes.
     * </p>
     *
     * @throws IOException if an I/O error occurs.
     */
    private void paxHeaders() throws IOException {
        List<TarArchiveStructSparse> sparseHeaders = new ArrayList<>();
        // Parse key/value pairs (and any 0.0-format sparse headers) from this PAX entry's data.
        final Map<String, String> headers = TarUtils.parsePaxHeaders(this, sparseHeaders, globalPaxHeaders, entrySize);
        // for 0.1 PAX Headers: the whole sparse map is a single comma-separated value
        if (headers.containsKey(TarGnuSparseKeys.MAP)) {
            sparseHeaders = new ArrayList<>(TarUtils.parseFromPAX01SparseHeaders(headers.get(TarGnuSparseKeys.MAP)));
        }
        getNextEntry(); // Get the actual file entry
        if (currEntry == null) {
            throw new IOException("premature end of tar archive. Didn't find any entry after PAX header.");
        }
        applyPaxHeadersToCurrentEntry(headers, sparseHeaders);
        // for 1.0 PAX Format, the sparse map is stored in the file data block
        if (currEntry.isPaxGNU1XSparse()) {
            sparseHeaders = TarUtils.parsePAX1XSparseHeaders(in, getRecordSize());
            currEntry.setSparseHeaders(sparseHeaders);
        }
        // sparse headers are all done reading, we need to build
        // sparse input streams using these sparse headers
        buildSparseInputStreams();
    }
601 
    /**
     * Reads bytes from the current tar archive entry.
     * <p>
     * This method is aware of the boundaries of the current entry in the archive and will deal with them as if they were this stream's start and EOF.
     * </p>
     *
     * @param buf       The buffer into which to place bytes read.
     * @param offset    The offset at which to place bytes read.
     * @param numToRead The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
     */
    @Override
    public int read(final byte[] buf, final int offset, int numToRead) throws IOException {
        if (numToRead == 0) {
            return 0;
        }
        int totalRead = 0;
        if (isAtEOF() || isDirectory()) {
            return -1;
        }
        if (currEntry == null) {
            throw new IllegalStateException("No current tar entry");
        }
        // End of the current entry's data acts as this stream's EOF.
        if (entryOffset >= currEntry.getRealSize()) {
            return -1;
        }
        // Never read past the end of the current entry.
        numToRead = Math.min(numToRead, available());
        if (currEntry.isSparse()) {
            // for sparse entries, we need to read them in another way
            totalRead = readSparse(buf, offset, numToRead);
        } else {
            totalRead = in.read(buf, offset, numToRead);
        }
        if (totalRead == -1) {
            // Underlying stream ended while entry data was still expected -> truncated archive.
            if (numToRead > 0) {
                throw new IOException("Truncated TAR archive");
            }
            setAtEOF(true);
        } else {
            count(totalRead);
            entryOffset += totalRead;
        }
        return totalRead;
    }
647 
    /**
     * Reads a global PAX header entry, updating {@code globalPaxHeaders} and {@code globalSparseHeaders}, and advances to the entry that follows it.
     *
     * @throws IOException if the global PAX header is not followed by another entry, or on read error
     */
    private void readGlobalPaxHeaders() throws IOException {
        globalPaxHeaders = TarUtils.parsePaxHeaders(this, globalSparseHeaders, globalPaxHeaders, entrySize);
        getNextEntry(); // Get the actual file entry
        if (currEntry == null) {
            throw new IOException("Error detected parsing the pax header");
        }
    }
655 
    /**
     * Adds the sparse chunks from the current entry to the sparse chunks, including any additional sparse entries following the current entry.
     *
     * @throws IOException on error
     */
    private void readOldGNUSparse() throws IOException {
        if (currEntry.isExtended()) {
            // Old GNU format: extra sparse maps live in follow-on extended header records,
            // chained until one without the "extended" flag is seen.
            TarArchiveSparseEntry entry;
            do {
                final byte[] headerBuf = getRecord();
                if (headerBuf == null) {
                    throw new IOException("premature end of tar archive. Didn't find extended_header after header with extended flag.");
                }
                entry = new TarArchiveSparseEntry(headerBuf);
                currEntry.getSparseHeaders().addAll(entry.getSparseHeaders());
            } while (entry.isExtended());
        }
        // sparse headers are all done reading, we need to build
        // sparse input streams using these sparse headers
        buildSparseInputStreams();
    }
677 
678     /**
679      * Reads a record from the input stream and return the data.
680      *
681      * @return The record data or null if EOF has been hit.
682      * @throws IOException on error
683      */
684     protected byte[] readRecord() throws IOException {
685         final int readCount = IOUtils.readFully(in, recordBuffer);
686         count(readCount);
687         if (readCount != getRecordSize()) {
688             return null;
689         }
690         return recordBuffer;
691     }
692 
    /**
     * For sparse tar entries, there are many "holes"(consisting of all 0) in the file. Only the non-zero data is stored in tar files, and they are stored
     * separately. The structure of non-zero data is introduced by the sparse headers using the offset, where a block of non-zero data starts, and numbytes, the
     * length of the non-zero data block. When reading sparse entries, the actual data is read out with "holes" and non-zero data combined together according to
     * the sparse headers.
     *
     * @param buf       The buffer into which to place bytes read.
     * @param offset    The offset at which to place bytes read.
     * @param numToRead The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
     */
    private int readSparse(final byte[] buf, final int offset, final int numToRead) throws IOException {
        // if there are no actual input streams, just read from the original input stream
        if (sparseInputStreams == null || sparseInputStreams.isEmpty()) {
            return in.read(buf, offset, numToRead);
        }
        // all sparse sub-streams exhausted -> logical EOF for this entry
        if (currentSparseInputStreamIndex >= sparseInputStreams.size()) {
            return -1;
        }
        final InputStream currentInputStream = sparseInputStreams.get(currentSparseInputStreamIndex);
        final int readLen = currentInputStream.read(buf, offset, numToRead);
        // if the current input stream is the last input stream,
        // just return the number of bytes read from current input stream
        if (currentSparseInputStreamIndex == sparseInputStreams.size() - 1) {
            return readLen;
        }
        // if EOF of current input stream is meet, open a new input stream and recursively call read
        if (readLen == -1) {
            currentSparseInputStreamIndex++;
            return readSparse(buf, offset, numToRead);
        }
        // if the rest data of current input stream is not long enough, open a new input stream
        // and recursively call read
        if (readLen < numToRead) {
            currentSparseInputStreamIndex++;
            final int readLenOfNext = readSparse(buf, offset + readLen, numToRead - readLen);
            if (readLenOfNext == -1) {
                return readLen;
            }
            return readLen + readLenOfNext;
        }
        // if the rest data of current input stream is enough(which means readLen == len), just return readLen
        return readLen;
    }
738 
739     /**
740      * Since we do not support marking just yet, we do nothing.
741      */
742     @Override
743     public synchronized void reset() {
744         // empty
745     }
746 
747     /**
748      * Sets whether we are at the end-of-file.
749      *
750      * @param atEof whether we are at the end-of-file.
751      */
752     protected final void setAtEOF(final boolean atEof) {
753         this.atEof = atEof;
754     }
755 
756     /**
757      * Sets the current entry.
758      *
759      * @param currEntry the current entry.
760      */
761     protected final void setCurrentEntry(final TarArchiveEntry currEntry) {
762         this.currEntry = currEntry;
763     }
764 
765     /**
766      * Skips over and discards {@code n} bytes of data from this input stream. The {@code skip} method may, for a variety of reasons, end up skipping over some
767      * smaller number of bytes, possibly {@code 0}. This may result from any of a number of conditions; reaching end of file or end of entry before {@code n}
768      * bytes have been skipped; are only two possibilities. The actual number of bytes skipped is returned. If {@code n} is negative, no bytes are skipped.
769      *
770      * @param n the number of bytes to be skipped.
771      * @return the actual number of bytes skipped.
772      * @throws IOException if a truncated tar archive is detected or some other I/O error occurs
773      */
774     @Override
775     public long skip(final long n) throws IOException {
776         if (n <= 0 || isDirectory()) {
777             return 0;
778         }
779         final long availableOfInputStream = in.available();
780         final long available = currEntry.getRealSize() - entryOffset;
781         final long numToSkip = Math.min(n, available);
782         long skipped;
783         if (!currEntry.isSparse()) {
784             skipped = IOUtils.skip(in, numToSkip);
785             // for non-sparse entry, we should get the bytes actually skipped bytes along with
786             // inputStream.available() if inputStream is instance of FileInputStream
787             skipped = getActuallySkipped(availableOfInputStream, skipped, numToSkip);
788         } else {
789             skipped = skipSparse(numToSkip);
790         }
791         count(skipped);
792         entryOffset += skipped;
793         return skipped;
794     }
795 
796     /**
797      * The last record block should be written at the full size, so skip any additional space used to fill a record after an entry.
798      *
799      * @throws IOException if a truncated tar archive is detected
800      */
801     private void skipRecordPadding() throws IOException {
802         if (!isDirectory() && this.entrySize > 0 && this.entrySize % getRecordSize() != 0) {
803             final long available = in.available();
804             final long numRecords = this.entrySize / getRecordSize() + 1;
805             final long padding = numRecords * getRecordSize() - this.entrySize;
806             long skipped = IOUtils.skip(in, padding);
807             skipped = getActuallySkipped(available, skipped, padding);
808             count(skipped);
809         }
810     }
811 
812     /**
813      * Skip n bytes from current input stream, if the current input stream doesn't have enough data to skip, jump to the next input stream and skip the rest
814      * bytes, keep doing this until total n bytes are skipped or the input streams are all skipped
815      *
816      * @param n bytes of data to skip
817      * @return actual bytes of data skipped
818      * @throws IOException if an I/O error occurs.
819      */
820     private long skipSparse(final long n) throws IOException {
821         if (sparseInputStreams == null || sparseInputStreams.isEmpty()) {
822             return in.skip(n);
823         }
824         long bytesSkipped = 0;
825         while (bytesSkipped < n && currentSparseInputStreamIndex < sparseInputStreams.size()) {
826             final InputStream currentInputStream = sparseInputStreams.get(currentSparseInputStreamIndex);
827             bytesSkipped += currentInputStream.skip(n - bytesSkipped);
828             if (bytesSkipped < n) {
829                 currentSparseInputStreamIndex++;
830             }
831         }
832         return bytesSkipped;
833     }
834 
835     /**
836      * Tries to read the next record rewinding the stream if it is not an EOF record.
837      * <p>
838      * This is meant to protect against cases where a tar implementation has written only one EOF record when two are expected. Actually this won't help since a
839      * non-conforming implementation likely won't fill full blocks consisting of - by default - ten records either so we probably have already read beyond the
840      * archive anyway.
841      * </p>
842      */
843     private void tryToConsumeSecondEOFRecord() throws IOException {
844         boolean shouldReset = true;
845         final boolean marked = in.markSupported();
846         if (marked) {
847             in.mark(getRecordSize());
848         }
849         try {
850             shouldReset = !isEOFRecord(readRecord());
851         } finally {
852             if (shouldReset && marked) {
853                 pushedBackBytes(getRecordSize());
854                 in.reset();
855             }
856         }
857     }
858 }