/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.archivers.dump;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Stack;

import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipEncoding;
import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.IOUtils;
/**
 * The DumpArchiveInputStream reads a Unix dump archive as an InputStream. Methods are provided to position at each successive entry in the archive, and then
 * read each entry as a normal input stream using read().
 * <p>
 * There does not seem to be any documentation hinting at the encoding of string values. Given that the main purpose of dump/restore is backing up a system,
 * it seems very likely that the format uses the system's current default encoding.
 * </p>
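 * <p>
 * A minimal usage sketch (the {@code archive.dump} file name is illustrative; imports and error handling are omitted):
 * </p>
 * <pre>{@code
 * try (DumpArchiveInputStream in = new DumpArchiveInputStream(Files.newInputStream(Paths.get("archive.dump")))) {
 *     DumpArchiveEntry entry;
 *     while ((entry = in.getNextEntry()) != null) {
 *         // consume or skip each entry's data via in.read(...)
 *     }
 * }
 * }</pre>
 *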
 * @NotThreadSafe
 * @since 1.3
 */
public class DumpArchiveInputStream extends ArchiveInputStream<DumpArchiveEntry> {

    private static final String CURRENT_PATH_SEGMENT = ".";
    private static final String PARENT_PATH_SEGMENT = "..";

    /**
     * Look at the first few bytes of the file to decide if it's a dump archive. With 32 bytes we can look at the magic value, with a full 1k we can verify the
     * checksum.
     *
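     * <p>
     * A minimal probing sketch, assuming {@code in} is an InputStream positioned at the start of the data:
     * </p>
     * <pre>{@code
     * byte[] signature = new byte[DumpArchiveConstants.TP_SIZE];
     * int read = IOUtils.readFully(in, signature);
     * boolean isDump = DumpArchiveInputStream.matches(signature, read);
     * }</pre>
     *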
     * @param buffer data to match
     * @param length length of data
     * @return whether the buffer seems to contain dump data
     */
    public static boolean matches(final byte[] buffer, final int length) {
        // do we have enough of the header?
        if (length < 32) {
            return false;
        }

        // a full record lets us verify the checksum, which is the strongest test.
        if (length >= DumpArchiveConstants.TP_SIZE) {
            return DumpArchiveUtil.verify(buffer);
        }

        // otherwise checking the magic number alone will work in a pinch.
        return DumpArchiveConstants.NFS_MAGIC == DumpArchiveUtil.convert32(buffer, 24);
    }

    private final DumpArchiveSummary summary;
    private DumpArchiveEntry active;
    private boolean isClosed;
    private boolean hasHitEOF;
    private long entrySize;
    private long entryOffset;
    private int readIdx;
    private final byte[] readBuf = new byte[DumpArchiveConstants.TP_SIZE];
    private byte[] blockBuffer;
    private int recordOffset;
    private long filepos;

    /**
     * TapeInputStream is the raw input.
     */
    protected TapeInputStream raw;

    /** Map of ino -> dirent entry. We can use this to reconstruct full paths. */
    private final Map<Integer, Dirent> names = new HashMap<>();

    /** Map of ino -> (directory) entry when we're missing one or more elements in the path. */
    private final Map<Integer, DumpArchiveEntry> pending = new HashMap<>();

    /** Queue of (directory) entries where we now have the full path. */
    private final Queue<DumpArchiveEntry> queue;

    /**
     * The encoding to use for file names and labels.
     */
    private final ZipEncoding zipEncoding;

    /**
     * Constructor using the platform's default encoding for file names.
     *
     * @param is stream to read from
     * @throws ArchiveException on error
     */
    public DumpArchiveInputStream(final InputStream is) throws ArchiveException {
        this(is, null);
    }

    /**
     * Constructs a new instance.
     *
     * @param is       stream to read from
     * @param encoding the encoding to use for file names, use null for the platform's default encoding
     * @throws ArchiveException on error
     * @since 1.6
     */
    public DumpArchiveInputStream(final InputStream is, final String encoding) throws ArchiveException {
        super(is, encoding);
        this.raw = new TapeInputStream(is);
        this.hasHitEOF = false;
        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);

        try {
            // read header, verify it's a dump archive.
            final byte[] headerBytes = raw.readRecord();

            if (!DumpArchiveUtil.verify(headerBytes)) {
                throw new UnrecognizedFormatException();
            }

            // get summary information
            summary = new DumpArchiveSummary(headerBytes, this.zipEncoding);

            // reset buffer with actual block size.
            raw.resetBlockSize(summary.getNTRec(), summary.isCompressed());

            // allocate our read buffer.
            blockBuffer = new byte[4 * DumpArchiveConstants.TP_SIZE];

            // skip past CLRI and BITS segments since we don't handle them yet.
            readCLRI();
            readBITS();
        } catch (final IOException e) {
            throw new ArchiveException(e.getMessage(), (Throwable) e);
        }

        // put in a dummy record for the root node.
        final Dirent root = new Dirent(2, 2, 4, CURRENT_PATH_SEGMENT);
        names.put(2, root);

        // use a priority queue to ensure parent directories are
        // released first.
        queue = new PriorityQueue<>(10, (p, q) -> {
            if (p.getOriginalName() == null || q.getOriginalName() == null) {
                return Integer.MAX_VALUE;
            }

            return p.getOriginalName().compareTo(q.getOriginalName());
        });
    }

    /**
     * Closes this stream.
     */
    @Override
    public void close() throws IOException {
        if (!isClosed) {
            isClosed = true;
            raw.close();
        }
    }

    @Override
    public long getBytesRead() {
        return raw.getBytesRead();
    }

    @Deprecated
    @Override
    public int getCount() {
        return (int) getBytesRead();
    }

    /**
     * Reads the next entry.
     *
     * @return the next entry
     * @throws IOException on error
     * @deprecated Use {@link #getNextEntry()}.
     */
    @Deprecated
    public DumpArchiveEntry getNextDumpEntry() throws IOException {
        return getNextEntry();
    }

    @Override
    public DumpArchiveEntry getNextEntry() throws IOException {
        DumpArchiveEntry entry = null;
        String path = null;

        // is there anything in the queue?
        if (!queue.isEmpty()) {
            return queue.remove();
        }

        while (entry == null) {
            if (hasHitEOF) {
                return null;
            }

            // skip any remaining records in this segment for the prior file.
            // We might still have holes, so it's easiest to do it
            // block by block; we may want to revisit this if
            // the unnecessary decompression time adds up.
            while (readIdx < active.getHeaderCount()) {
                if (!active.isSparseRecord(readIdx++) && raw.skip(DumpArchiveConstants.TP_SIZE) == -1) {
                    throw new EOFException();
                }
            }

            readIdx = 0;
            filepos = raw.getBytesRead();

            byte[] headerBytes = raw.readRecord();

            if (!DumpArchiveUtil.verify(headerBytes)) {
                throw new InvalidFormatException();
            }

            active = DumpArchiveEntry.parse(headerBytes);

            // skip any remaining segments for prior file.
            while (DumpArchiveConstants.SEGMENT_TYPE.ADDR == active.getHeaderType()) {
                if (raw.skip((long) DumpArchiveConstants.TP_SIZE * (active.getHeaderCount() - active.getHeaderHoles())) == -1) {
                    throw new EOFException();
                }

                filepos = raw.getBytesRead();
                headerBytes = raw.readRecord();

                if (!DumpArchiveUtil.verify(headerBytes)) {
                    throw new InvalidFormatException();
                }

                active = DumpArchiveEntry.parse(headerBytes);
            }

            // check if this is an end-of-volume marker.
            if (DumpArchiveConstants.SEGMENT_TYPE.END == active.getHeaderType()) {
                hasHitEOF = true;

                return null;
            }

            entry = active;

            if (entry.isDirectory()) {
                readDirectoryEntry(active);

                // a directory entry carries no file data, so present it as a zero-length entry.
                entryOffset = 0;
                entrySize = 0;
                readIdx = active.getHeaderCount();
            } else {
                entryOffset = 0;
                entrySize = active.getEntrySize();
                readIdx = 0;
            }

            recordOffset = readBuf.length;

            path = getPath(entry);

            if (path == null) {
                entry = null;
            }
        }

        entry.setName(path);
        entry.setSimpleName(names.get(entry.getIno()).getName());
        entry.setOffset(filepos);

        return entry;
    }

    /**
     * Gets the full path for the specified archive entry, or null if a path element is still missing.
     *
     * @param entry the archive entry to resolve
     * @return the full path for the specified archive entry, or null if a path element is still missing.
     * @throws DumpArchiveException if an infinite loop is detected in the directory entries.
     */
    private String getPath(final DumpArchiveEntry entry) throws DumpArchiveException {
        // build the stack of elements. It's possible that we're
        // still missing an intermediate value, and if so we
        // defer the entry until the missing element arrives.
        final Stack<String> elements = new Stack<>();
        final BitSet visited = new BitSet();
        Dirent dirent = null;
        for (int i = entry.getIno();; i = dirent.getParentIno()) {
            if (!names.containsKey(i)) {
                elements.clear();
                break;
            }
            if (visited.get(i)) {
                throw new DumpArchiveException("Duplicate node " + i);
            }
            dirent = names.get(i);
            visited.set(i);
            elements.push(dirent.getName());
            if (dirent.getIno() == dirent.getParentIno()) {
                break;
            }
        }
        // if an element is missing, defer the work and read the next entry.
        if (elements.isEmpty()) {
            pending.put(entry.getIno(), entry);
            return null;
        }
        // generate the full path from the stack of elements.
        final StringBuilder sb = new StringBuilder(elements.pop());
        while (!elements.isEmpty()) {
            sb.append('/');
            sb.append(elements.pop());
        }
        return sb.toString();
    }

    /**
     * Gets the archive summary information.
     *
     * @return the summary
     */
    public DumpArchiveSummary getSummary() {
        return summary;
    }

    /**
     * Reads bytes from the current dump archive entry.
     * <p>
     * This method is aware of the boundaries of the current entry in the archive and will deal with them as if they were this stream's start and EOF.
     * </p>
     *
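     * <p>
     * For example, the current entry can be drained as follows; this is a sketch, assuming {@code in} is this stream and {@code out} is any
     * caller-supplied OutputStream:
     * </p>
     * <pre>{@code
     * final byte[] buffer = new byte[1024];
     * int n;
     * while ((n = in.read(buffer, 0, buffer.length)) != -1) {
     *     out.write(buffer, 0, n);
     * }
     * }</pre>
     *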
     * @param buf The buffer into which to place bytes read.
     * @param off The offset at which to place bytes read.
     * @param len The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
     */
    @Override
    public int read(final byte[] buf, int off, int len) throws IOException {
        if (len == 0) {
            return 0;
        }
        int totalRead = 0;

        if (hasHitEOF || isClosed || entryOffset >= entrySize) {
            return -1;
        }

        if (active == null) {
            throw new IllegalStateException("No current dump entry");
        }

        if (len + entryOffset > entrySize) {
            len = (int) (entrySize - entryOffset);
        }

        while (len > 0) {
            final int sz = Math.min(len, readBuf.length - recordOffset);

            // copy any data we have
            if (recordOffset + sz <= readBuf.length) {
                System.arraycopy(readBuf, recordOffset, buf, off, sz);
                totalRead += sz;
                recordOffset += sz;
                len -= sz;
                off += sz;
            }

            // load next block if necessary.
            if (len > 0) {
                // the current segment header describes at most 512 records;
                // once they are exhausted, read the next segment header.
                if (readIdx >= 512) {
                    final byte[] headerBytes = raw.readRecord();

                    if (!DumpArchiveUtil.verify(headerBytes)) {
                        throw new InvalidFormatException();
                    }

                    active = DumpArchiveEntry.parse(headerBytes);
                    readIdx = 0;
                }

                if (!active.isSparseRecord(readIdx++)) {
                    final int r = raw.read(readBuf, 0, readBuf.length);
                    if (r != readBuf.length) {
                        throw new EOFException();
                    }
                } else {
                    Arrays.fill(readBuf, (byte) 0);
                }

                recordOffset = 0;
            }
        }

        entryOffset += totalRead;

        return totalRead;
    }

    /**
     * Reads the BITS segment.
     */
    private void readBITS() throws IOException {
        final byte[] buffer = raw.readRecord();

        if (!DumpArchiveUtil.verify(buffer)) {
            throw new InvalidFormatException();
        }

        active = DumpArchiveEntry.parse(buffer);

        if (DumpArchiveConstants.SEGMENT_TYPE.BITS != active.getHeaderType()) {
            throw new InvalidFormatException();
        }

        // we don't do anything with this yet.
        if (raw.skip((long) DumpArchiveConstants.TP_SIZE * active.getHeaderCount()) == -1) {
            throw new EOFException();
        }
        readIdx = active.getHeaderCount();
    }

    /**
     * Reads the CLRI (deleted inode) segment.
     */
    private void readCLRI() throws IOException {
        final byte[] buffer = raw.readRecord();

        if (!DumpArchiveUtil.verify(buffer)) {
            throw new InvalidFormatException();
        }

        active = DumpArchiveEntry.parse(buffer);

        if (DumpArchiveConstants.SEGMENT_TYPE.CLRI != active.getHeaderType()) {
            throw new InvalidFormatException();
        }

        // we don't do anything with this yet.
        if (raw.skip((long) DumpArchiveConstants.TP_SIZE * active.getHeaderCount()) == -1) {
            throw new EOFException();
        }
        readIdx = active.getHeaderCount();
    }

    /**
     * Reads a directory entry.
     */
    private void readDirectoryEntry(DumpArchiveEntry entry) throws IOException {
        long size = entry.getEntrySize();
        boolean first = true;

        while (first || DumpArchiveConstants.SEGMENT_TYPE.ADDR == entry.getHeaderType()) {
            // read the header that we just peeked at.
            if (!first) {
                raw.readRecord();
            }

            if (!names.containsKey(entry.getIno()) && DumpArchiveConstants.SEGMENT_TYPE.INODE == entry.getHeaderType()) {
                pending.put(entry.getIno(), entry);
            }

            final int datalen = DumpArchiveConstants.TP_SIZE * entry.getHeaderCount();

            if (blockBuffer.length < datalen) {
                blockBuffer = IOUtils.readRange(raw, datalen);
                if (blockBuffer.length != datalen) {
                    throw new EOFException();
                }
            } else if (raw.read(blockBuffer, 0, datalen) != datalen) {
                throw new EOFException();
            }

            int reclen = 0;

            for (int i = 0; i < datalen - 8 && i < size - 8; i += reclen) {
                final int ino = DumpArchiveUtil.convert32(blockBuffer, i);
                reclen = DumpArchiveUtil.convert16(blockBuffer, i + 4);
                if (reclen == 0) {
                    throw new DumpArchiveException("reclen cannot be 0");
                }

                final byte type = blockBuffer[i + 6];

                final String name = DumpArchiveUtil.decode(zipEncoding, blockBuffer, i + 8, blockBuffer[i + 7]);

                if (CURRENT_PATH_SEGMENT.equals(name) || PARENT_PATH_SEGMENT.equals(name)) {
                    // do nothing...
                    continue;
                }

                final Dirent d = new Dirent(ino, entry.getIno(), type, name);

                /*
                 * if ((type == 4) && names.containsKey(ino)) { System.out.println("we already have ino: " + names.get(ino)); }
                 */

                names.put(ino, d);

                // check whether this allows us to fill anything in the pending list.
                for (final Map.Entry<Integer, DumpArchiveEntry> mapEntry : pending.entrySet()) {
                    final DumpArchiveEntry v = mapEntry.getValue();
                    final String path = getPath(v);
                    if (path != null) {
                        v.setName(path);
                        v.setSimpleName(names.get(mapEntry.getKey()).getName());
                        queue.add(v);
                    }
                }

                // remove anything that we found. (We can't do it earlier
                // because of concurrent modification exceptions.)
                queue.forEach(e -> pending.remove(e.getIno()));
            }

            final byte[] peekBytes = raw.peek();

            if (!DumpArchiveUtil.verify(peekBytes)) {
                throw new InvalidFormatException();
            }

            entry = DumpArchiveEntry.parse(peekBytes);
            first = false;
            size -= DumpArchiveConstants.TP_SIZE;
        }
    }

}