001/*
002 *  Licensed to the Apache Software Foundation (ASF) under one or more
003 *  contributor license agreements.  See the NOTICE file distributed with
004 *  this work for additional information regarding copyright ownership.
005 *  The ASF licenses this file to You under the Apache License, Version 2.0
006 *  (the "License"); you may not use this file except in compliance with
007 *  the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 *  Unless required by applicable law or agreed to in writing, software
012 *  distributed under the License is distributed on an "AS IS" BASIS,
013 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 *  See the License for the specific language governing permissions and
015 *  limitations under the License.
016 *
017 */
018
019/*
020 * This package is based on the work done by Timothy Gerard Endres
021 * (time@ice.com) to whom the Ant project is very grateful for his great code.
022 */
023
024package org.apache.commons.compress.archivers.tar;
025
026import java.io.ByteArrayOutputStream;
027import java.io.IOException;
028import java.io.InputStream;
029import java.util.HashMap;
030import java.util.Map;
031
032import org.apache.commons.compress.archivers.ArchiveEntry;
033import org.apache.commons.compress.archivers.ArchiveInputStream;
034import org.apache.commons.compress.archivers.zip.ZipEncoding;
035import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
036import org.apache.commons.compress.utils.ArchiveUtils;
037import org.apache.commons.compress.utils.CharsetNames;
038import org.apache.commons.compress.utils.IOUtils;
039
040/**
041 * The TarInputStream reads a UNIX tar archive as an InputStream.
 * Methods are provided to position at each successive entry in
 * the archive, and then read each entry as a normal input stream
 * using read().
045 * @NotThreadSafe
046 */
047public class TarArchiveInputStream extends ArchiveInputStream {
048
049    private static final int SMALL_BUFFER_SIZE = 256;
050
051    private final byte[] smallBuf = new byte[SMALL_BUFFER_SIZE];
052
    /** The size of the TAR header */
054    private final int recordSize;
055
056    /** The size of a block */
057    private final int blockSize;
058
059    /** True if file has hit EOF */
060    private boolean hasHitEOF;
061
062    /** Size of the current entry */
063    private long entrySize;
064
065    /** How far into the entry the stream is at */
066    private long entryOffset;
067
068    /** An input stream to read from */
069    private final InputStream is;
070
071    /** The meta-data about the current entry */
072    private TarArchiveEntry currEntry;
073
074    /** The encoding of the file */
075    private final ZipEncoding zipEncoding;
076
077    // the provided encoding (for unit tests)
078    final String encoding;
079
080    // the global PAX header
081    private Map<String, String> globalPaxHeaders = new HashMap<>();
082
083    private final boolean lenient;
084
085    /**
086     * Constructor for TarInputStream.
087     * @param is the input stream to use
088     */
089    public TarArchiveInputStream(final InputStream is) {
090        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE);
091    }
092
093    /**
094     * Constructor for TarInputStream.
095     * @param is the input stream to use
096     * @param lenient when set to true illegal values for group/userid, mode, device numbers and timestamp will be
097     * ignored and the fields set to {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an
098     * exception instead.
099     * @since 1.19
100     */
101    public TarArchiveInputStream(final InputStream is, boolean lenient) {
102        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, null, lenient);
103    }
104
105    /**
106     * Constructor for TarInputStream.
107     * @param is the input stream to use
108     * @param encoding name of the encoding to use for file names
109     * @since 1.4
110     */
111    public TarArchiveInputStream(final InputStream is, final String encoding) {
112        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE,
113             encoding);
114    }
115
116    /**
117     * Constructor for TarInputStream.
118     * @param is the input stream to use
119     * @param blockSize the block size to use
120     */
121    public TarArchiveInputStream(final InputStream is, final int blockSize) {
122        this(is, blockSize, TarConstants.DEFAULT_RCDSIZE);
123    }
124
125    /**
126     * Constructor for TarInputStream.
127     * @param is the input stream to use
128     * @param blockSize the block size to use
129     * @param encoding name of the encoding to use for file names
130     * @since 1.4
131     */
132    public TarArchiveInputStream(final InputStream is, final int blockSize,
133                                 final String encoding) {
134        this(is, blockSize, TarConstants.DEFAULT_RCDSIZE, encoding);
135    }
136
137    /**
138     * Constructor for TarInputStream.
139     * @param is the input stream to use
140     * @param blockSize the block size to use
141     * @param recordSize the record size to use
142     */
143    public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize) {
144        this(is, blockSize, recordSize, null);
145    }
146
147    /**
148     * Constructor for TarInputStream.
149     * @param is the input stream to use
150     * @param blockSize the block size to use
151     * @param recordSize the record size to use
152     * @param encoding name of the encoding to use for file names
153     * @since 1.4
154     */
155    public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize,
156                                 final String encoding) {
157        this(is, blockSize, recordSize, encoding, false);
158    }
159
160    /**
161     * Constructor for TarInputStream.
162     * @param is the input stream to use
163     * @param blockSize the block size to use
164     * @param recordSize the record size to use
165     * @param encoding name of the encoding to use for file names
166     * @param lenient when set to true illegal values for group/userid, mode, device numbers and timestamp will be
167     * ignored and the fields set to {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an
168     * exception instead.
169     * @since 1.19
170     */
171    public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize,
172                                 final String encoding, boolean lenient) {
173        this.is = is;
174        this.hasHitEOF = false;
175        this.encoding = encoding;
176        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);
177        this.recordSize = recordSize;
178        this.blockSize = blockSize;
179        this.lenient = lenient;
180    }
181
182    /**
183     * Closes this stream. Calls the TarBuffer's close() method.
184     * @throws IOException on error
185     */
186    @Override
187    public void close() throws IOException {
188        is.close();
189    }
190
191    /**
192     * Get the record size being used by this stream's buffer.
193     *
194     * @return The TarBuffer record size.
195     */
196    public int getRecordSize() {
197        return recordSize;
198    }
199
200    /**
201     * Get the available data that can be read from the current
202     * entry in the archive. This does not indicate how much data
203     * is left in the entire archive, only in the current entry.
204     * This value is determined from the entry's size header field
205     * and the amount of data already read from the current entry.
206     * Integer.MAX_VALUE is returned in case more than Integer.MAX_VALUE
207     * bytes are left in the current entry in the archive.
208     *
209     * @return The number of available bytes for the current entry.
210     * @throws IOException for signature
211     */
212    @Override
213    public int available() throws IOException {
214        if (isDirectory()) {
215            return 0;
216        }
217        if (entrySize - entryOffset > Integer.MAX_VALUE) {
218            return Integer.MAX_VALUE;
219        }
220        return (int) (entrySize - entryOffset);
221    }
222
223
224    /**
225     * Skips over and discards <code>n</code> bytes of data from this input
226     * stream. The <code>skip</code> method may, for a variety of reasons, end
227     * up skipping over some smaller number of bytes, possibly <code>0</code>.
228     * This may result from any of a number of conditions; reaching end of file
229     * or end of entry before <code>n</code> bytes have been skipped; are only
230     * two possibilities. The actual number of bytes skipped is returned. If
231     * <code>n</code> is negative, no bytes are skipped.
232     *
233     *
234     * @param n
235     *            the number of bytes to be skipped.
236     * @return the actual number of bytes skipped.
237     * @throws IOException
238     *                if some other I/O error occurs.
239     */
240    @Override
241    public long skip(final long n) throws IOException {
242        if (n <= 0 || isDirectory()) {
243            return 0;
244        }
245
246        final long available = entrySize - entryOffset;
247        final long skipped = IOUtils.skip(is, Math.min(n, available));
248        count(skipped);
249        entryOffset += skipped;
250        return skipped;
251    }
252
253    /**
254     * Since we do not support marking just yet, we return false.
255     *
256     * @return False.
257     */
258    @Override
259    public boolean markSupported() {
260        return false;
261    }
262
263    /**
264     * Since we do not support marking just yet, we do nothing.
265     *
266     * @param markLimit The limit to mark.
267     */
268    @Override
269    public void mark(final int markLimit) {
270    }
271
272    /**
273     * Since we do not support marking just yet, we do nothing.
274     */
275    @Override
276    public synchronized void reset() {
277    }
278
279    /**
280     * Get the next entry in this tar archive. This will skip
281     * over any remaining data in the current entry, if there
282     * is one, and place the input stream at the header of the
283     * next entry, and read the header and instantiate a new
284     * TarEntry from the header bytes and return that entry.
285     * If there are no more entries in the archive, null will
286     * be returned to indicate that the end of the archive has
287     * been reached.
288     *
289     * @return The next TarEntry in the archive, or null.
290     * @throws IOException on error
291     */
292    public TarArchiveEntry getNextTarEntry() throws IOException {
293        if (isAtEOF()) {
294            return null;
295        }
296
297        if (currEntry != null) {
298            /* Skip will only go to the end of the current entry */
299            IOUtils.skip(this, Long.MAX_VALUE);
300
301            /* skip to the end of the last record */
302            skipRecordPadding();
303        }
304
305        final byte[] headerBuf = getRecord();
306
307        if (headerBuf == null) {
308            /* hit EOF */
309            currEntry = null;
310            return null;
311        }
312
313        try {
314            currEntry = new TarArchiveEntry(headerBuf, zipEncoding, lenient);
315        } catch (final IllegalArgumentException e) {
316            throw new IOException("Error detected parsing the header", e);
317        }
318
319        entryOffset = 0;
320        entrySize = currEntry.getSize();
321
322        if (currEntry.isGNULongLinkEntry()) {
323            final byte[] longLinkData = getLongNameData();
324            if (longLinkData == null) {
325                // Bugzilla: 40334
326                // Malformed tar file - long link entry name not followed by
327                // entry
328                return null;
329            }
330            currEntry.setLinkName(zipEncoding.decode(longLinkData));
331        }
332
333        if (currEntry.isGNULongNameEntry()) {
334            final byte[] longNameData = getLongNameData();
335            if (longNameData == null) {
336                // Bugzilla: 40334
337                // Malformed tar file - long entry name not followed by
338                // entry
339                return null;
340            }
341            currEntry.setName(zipEncoding.decode(longNameData));
342        }
343
344        if (currEntry.isGlobalPaxHeader()){ // Process Global Pax headers
345            readGlobalPaxHeaders();
346        }
347
348        if (currEntry.isPaxHeader()){ // Process Pax headers
349            paxHeaders();
350        } else if (!globalPaxHeaders.isEmpty()) {
351            applyPaxHeadersToCurrentEntry(globalPaxHeaders);
352        }
353
354        if (currEntry.isOldGNUSparse()){ // Process sparse files
355            readOldGNUSparse();
356        }
357
358        // If the size of the next element in the archive has changed
359        // due to a new size being reported in the posix header
360        // information, we update entrySize here so that it contains
361        // the correct value.
362        entrySize = currEntry.getSize();
363
364        return currEntry;
365    }
366
367    /**
368     * The last record block should be written at the full size, so skip any
369     * additional space used to fill a record after an entry
370     */
371    private void skipRecordPadding() throws IOException {
372        if (!isDirectory() && this.entrySize > 0 && this.entrySize % this.recordSize != 0) {
373            final long numRecords = (this.entrySize / this.recordSize) + 1;
374            final long padding = (numRecords * this.recordSize) - this.entrySize;
375            final long skipped = IOUtils.skip(is, padding);
376            count(skipped);
377        }
378    }
379
380    /**
381     * Get the next entry in this tar archive as longname data.
382     *
383     * @return The next entry in the archive as longname data, or null.
384     * @throws IOException on error
385     */
386    protected byte[] getLongNameData() throws IOException {
387        // read in the name
388        final ByteArrayOutputStream longName = new ByteArrayOutputStream();
389        int length = 0;
390        while ((length = read(smallBuf)) >= 0) {
391            longName.write(smallBuf, 0, length);
392        }
393        getNextEntry();
394        if (currEntry == null) {
395            // Bugzilla: 40334
396            // Malformed tar file - long entry name not followed by entry
397            return null;
398        }
399        byte[] longNameData = longName.toByteArray();
400        // remove trailing null terminator(s)
401        length = longNameData.length;
402        while (length > 0 && longNameData[length - 1] == 0) {
403            --length;
404        }
405        if (length != longNameData.length) {
406            final byte[] l = new byte[length];
407            System.arraycopy(longNameData, 0, l, 0, length);
408            longNameData = l;
409        }
410        return longNameData;
411    }
412
413    /**
414     * Get the next record in this tar archive. This will skip
415     * over any remaining data in the current entry, if there
416     * is one, and place the input stream at the header of the
417     * next entry.
418     *
419     * <p>If there are no more entries in the archive, null will be
420     * returned to indicate that the end of the archive has been
421     * reached.  At the same time the {@code hasHitEOF} marker will be
422     * set to true.</p>
423     *
424     * @return The next header in the archive, or null.
425     * @throws IOException on error
426     */
427    private byte[] getRecord() throws IOException {
428        byte[] headerBuf = readRecord();
429        setAtEOF(isEOFRecord(headerBuf));
430        if (isAtEOF() && headerBuf != null) {
431            tryToConsumeSecondEOFRecord();
432            consumeRemainderOfLastBlock();
433            headerBuf = null;
434        }
435        return headerBuf;
436    }
437
438    /**
439     * Determine if an archive record indicate End of Archive. End of
440     * archive is indicated by a record that consists entirely of null bytes.
441     *
442     * @param record The record data to check.
443     * @return true if the record data is an End of Archive
444     */
445    protected boolean isEOFRecord(final byte[] record) {
446        return record == null || ArchiveUtils.isArrayZero(record, recordSize);
447    }
448
449    /**
450     * Read a record from the input stream and return the data.
451     *
452     * @return The record data or null if EOF has been hit.
453     * @throws IOException on error
454     */
455    protected byte[] readRecord() throws IOException {
456
457        final byte[] record = new byte[recordSize];
458
459        final int readNow = IOUtils.readFully(is, record);
460        count(readNow);
461        if (readNow != recordSize) {
462            return null;
463        }
464
465        return record;
466    }
467
468    private void readGlobalPaxHeaders() throws IOException {
469        globalPaxHeaders = parsePaxHeaders(this);
470        getNextEntry(); // Get the actual file entry
471    }
472
473    private void paxHeaders() throws IOException{
474        final Map<String, String> headers = parsePaxHeaders(this);
475        getNextEntry(); // Get the actual file entry
476        applyPaxHeadersToCurrentEntry(headers);
477    }
478
479    // NOTE, using a Map here makes it impossible to ever support GNU
480    // sparse files using the PAX Format 0.0, see
481    // https://www.gnu.org/software/tar/manual/html_section/tar_92.html#SEC188
482    Map<String, String> parsePaxHeaders(final InputStream i)
483        throws IOException {
484        final Map<String, String> headers = new HashMap<>(globalPaxHeaders);
485        // Format is "length keyword=value\n";
486        while(true){ // get length
487            int ch;
488            int len = 0;
489            int read = 0;
490            while((ch = i.read()) != -1) {
491                read++;
492                if (ch == '\n') { // blank line in header
493                    break;
494                } else if (ch == ' '){ // End of length string
495                    // Get keyword
496                    final ByteArrayOutputStream coll = new ByteArrayOutputStream();
497                    while((ch = i.read()) != -1) {
498                        read++;
499                        if (ch == '='){ // end of keyword
500                            final String keyword = coll.toString(CharsetNames.UTF_8);
501                            // Get rest of entry
502                            final int restLen = len - read;
503                            if (restLen == 1) { // only NL
504                                headers.remove(keyword);
505                            } else {
506                                final byte[] rest = new byte[restLen];
507                                final int got = IOUtils.readFully(i, rest);
508                                if (got != restLen) {
509                                    throw new IOException("Failed to read "
510                                                          + "Paxheader. Expected "
511                                                          + restLen
512                                                          + " bytes, read "
513                                                          + got);
514                                }
515                                // Drop trailing NL
516                                final String value = new String(rest, 0,
517                                                          restLen - 1, CharsetNames.UTF_8);
518                                headers.put(keyword, value);
519                            }
520                            break;
521                        }
522                        coll.write((byte) ch);
523                    }
524                    break; // Processed single header
525                }
526                len *= 10;
527                len += ch - '0';
528            }
529            if (ch == -1){ // EOF
530                break;
531            }
532        }
533        return headers;
534    }
535
536    private void applyPaxHeadersToCurrentEntry(final Map<String, String> headers) {
537        currEntry.updateEntryFromPaxHeaders(headers);
538
539    }
540
541    /**
542     * Adds the sparse chunks from the current entry to the sparse chunks,
543     * including any additional sparse entries following the current entry.
544     *
545     * @throws IOException on error
546     *
547     * @todo Sparse files get not yet really processed.
548     */
549    private void readOldGNUSparse() throws IOException {
550        /* we do not really process sparse files yet
551        sparses = new ArrayList();
552        sparses.addAll(currEntry.getSparses());
553        */
554        if (currEntry.isExtended()) {
555            TarArchiveSparseEntry entry;
556            do {
557                final byte[] headerBuf = getRecord();
558                if (headerBuf == null) {
559                    currEntry = null;
560                    break;
561                }
562                entry = new TarArchiveSparseEntry(headerBuf);
563                /* we do not really process sparse files yet
564                sparses.addAll(entry.getSparses());
565                */
566            } while (entry.isExtended());
567        }
568    }
569
570    private boolean isDirectory() {
571        return currEntry != null && currEntry.isDirectory();
572    }
573
574    /**
575     * Returns the next Archive Entry in this Stream.
576     *
577     * @return the next entry,
578     *         or {@code null} if there are no more entries
579     * @throws IOException if the next entry could not be read
580     */
581    @Override
582    public ArchiveEntry getNextEntry() throws IOException {
583        return getNextTarEntry();
584    }
585
586    /**
587     * Tries to read the next record rewinding the stream if it is not a EOF record.
588     *
589     * <p>This is meant to protect against cases where a tar
590     * implementation has written only one EOF record when two are
591     * expected.  Actually this won't help since a non-conforming
592     * implementation likely won't fill full blocks consisting of - by
593     * default - ten records either so we probably have already read
594     * beyond the archive anyway.</p>
595     */
596    private void tryToConsumeSecondEOFRecord() throws IOException {
597        boolean shouldReset = true;
598        final boolean marked = is.markSupported();
599        if (marked) {
600            is.mark(recordSize);
601        }
602        try {
603            shouldReset = !isEOFRecord(readRecord());
604        } finally {
605            if (shouldReset && marked) {
606                pushedBackBytes(recordSize);
607                is.reset();
608            }
609        }
610    }
611
612    /**
613     * Reads bytes from the current tar archive entry.
614     *
615     * This method is aware of the boundaries of the current
616     * entry in the archive and will deal with them as if they
617     * were this stream's start and EOF.
618     *
619     * @param buf The buffer into which to place bytes read.
620     * @param offset The offset at which to place bytes read.
621     * @param numToRead The number of bytes to read.
622     * @return The number of bytes read, or -1 at EOF.
623     * @throws IOException on error
624     */
625    @Override
626    public int read(final byte[] buf, final int offset, int numToRead) throws IOException {
627        int totalRead = 0;
628
629        if (isAtEOF() || isDirectory() || entryOffset >= entrySize) {
630            return -1;
631        }
632
633        if (currEntry == null) {
634            throw new IllegalStateException("No current tar entry");
635        }
636
637        numToRead = Math.min(numToRead, available());
638
639        totalRead = is.read(buf, offset, numToRead);
640
641        if (totalRead == -1) {
642            if (numToRead > 0) {
643                throw new IOException("Truncated TAR archive");
644            }
645            setAtEOF(true);
646        } else {
647            count(totalRead);
648            entryOffset += totalRead;
649        }
650
651        return totalRead;
652    }
653
654    /**
655     * Whether this class is able to read the given entry.
656     *
657     * <p>May return false if the current entry is a sparse file.</p>
658     */
659    @Override
660    public boolean canReadEntryData(final ArchiveEntry ae) {
661        if (ae instanceof TarArchiveEntry) {
662            final TarArchiveEntry te = (TarArchiveEntry) ae;
663            return !te.isSparse();
664        }
665        return false;
666    }
667
668    /**
669     * Get the current TAR Archive Entry that this input stream is processing
670     *
671     * @return The current Archive Entry
672     */
673    public TarArchiveEntry getCurrentEntry() {
674        return currEntry;
675    }
676
677    protected final void setCurrentEntry(final TarArchiveEntry e) {
678        currEntry = e;
679    }
680
681    protected final boolean isAtEOF() {
682        return hasHitEOF;
683    }
684
685    protected final void setAtEOF(final boolean b) {
686        hasHitEOF = b;
687    }
688
689    /**
690     * This method is invoked once the end of the archive is hit, it
691     * tries to consume the remaining bytes under the assumption that
692     * the tool creating this archive has padded the last block.
693     */
694    private void consumeRemainderOfLastBlock() throws IOException {
695        final long bytesReadOfLastBlock = getBytesRead() % blockSize;
696        if (bytesReadOfLastBlock > 0) {
697            final long skipped = IOUtils.skip(is, blockSize - bytesReadOfLastBlock);
698            count(skipped);
699        }
700    }
701
702    /**
703     * Checks if the signature matches what is expected for a tar file.
704     *
705     * @param signature
706     *            the bytes to check
707     * @param length
708     *            the number of bytes to check
709     * @return true, if this stream is a tar archive stream, false otherwise
710     */
711    public static boolean matches(final byte[] signature, final int length) {
712        if (length < TarConstants.VERSION_OFFSET+TarConstants.VERSIONLEN) {
713            return false;
714        }
715
716        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX,
717                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
718            &&
719            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX,
720                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
721                ){
722            return true;
723        }
724        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU,
725                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
726            &&
727            (
728             ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE,
729                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
730            ||
731            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO,
732                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
733            )
734                ){
735            return true;
736        }
737        // COMPRESS-107 - recognise Ant tar files
738        return ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT,
739                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
740                &&
741                ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT,
742                        signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN);
743    }
744
745}