/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.archivers.dump;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Stack;

import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipEncoding;
import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.IOUtils;

/**
 * The DumpArchiveInputStream reads a UNIX dump archive as an InputStream. Methods are provided to position at each successive entry in the archive, and then
 * read each entry as a normal input stream using read().
 * <p>
 * There doesn't seem to be a hint on the encoding of string values in any piece of documentation. Since the main purpose of dump/restore is backing up a
 * system, it seems very likely that the format uses the current default encoding of the system.
 * </p>
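 * <p>
 * A minimal usage sketch; the file name and the processing loop are illustrative:
 * </p>
 * <pre>{@code
 * try (InputStream fin = Files.newInputStream(Paths.get("archive.dump"));
 *         DumpArchiveInputStream dump = new DumpArchiveInputStream(fin)) {
 *     DumpArchiveEntry entry;
 *     while ((entry = dump.getNextEntry()) != null) {
 *         if (entry.isDirectory()) {
 *             continue;
 *         }
 *         // read() is bounded by the current entry's data.
 *         final byte[] buf = new byte[1024];
 *         int n;
 *         while ((n = dump.read(buf, 0, buf.length)) != -1) {
 *             // process n bytes of entry data
 *         }
 *     }
 * }
 * }</pre>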
 * @NotThreadSafe
 * @since 1.3
 */
public class DumpArchiveInputStream extends ArchiveInputStream<DumpArchiveEntry> {

    private static final String CURRENT_PATH_SEGMENT = ".";
    private static final String PARENT_PATH_SEGMENT = "..";

    /**
     * Looks at the first few bytes of the file to decide if it's a dump archive. With 32 bytes we can check the magic value; with a full 1k record we can
     * verify the checksum.
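     * <p>
     * A probing sketch; {@code in} is an assumed {@code java.io.InputStream}:
     * </p>
     * <pre>{@code
     * final byte[] probe = new byte[1024]; // one full record
     * final int n = in.read(probe, 0, probe.length); // may return fewer bytes than requested
     * if (n > 0 && DumpArchiveInputStream.matches(probe, n)) {
     *     // the stream looks like a dump archive
     * }
     * }</pre>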
     *
     * @param buffer data to match
     * @param length length of data
     * @return whether the buffer seems to contain dump data
     */
    public static boolean matches(final byte[] buffer, final int length) {
        // do we have enough of the header?
        if (length < 32) {
            return false;
        }

        // this is the best test
        if (length >= DumpArchiveConstants.TP_SIZE) {
            return DumpArchiveUtil.verify(buffer);
        }

        // this will work in a pinch.
        return DumpArchiveConstants.NFS_MAGIC == DumpArchiveUtil.convert32(buffer, 24);
    }

    private final DumpArchiveSummary summary;
    private DumpArchiveEntry active;
    private boolean isClosed;
    private boolean hasHitEOF;
    private long entrySize;
    private long entryOffset;
    private int readIdx;
    private final byte[] readBuf = new byte[DumpArchiveConstants.TP_SIZE];
    private byte[] blockBuffer;
    private int recordOffset;
    private long filepos;

    protected TapeInputStream raw;

    /** Map of ino -> dirent entry. We can use this to reconstruct full paths. */
    private final Map<Integer, Dirent> names = new HashMap<>();

    /** Map of ino -> (directory) entry when we're missing one or more elements in the path. */
    private final Map<Integer, DumpArchiveEntry> pending = new HashMap<>();

    /** Queue of (directory) entries where we now have the full path. */
    private final Queue<DumpArchiveEntry> queue;

    /**
     * The encoding to use for file names and labels.
     */
    private final ZipEncoding zipEncoding;

    /**
     * Constructor using the platform's default encoding for file names.
     *
     * @param is stream to read from
     * @throws ArchiveException on error
     */
    public DumpArchiveInputStream(final InputStream is) throws ArchiveException {
        this(is, null);
    }

    /**
     * Constructs a new instance.
     *
     * @param is       stream to read from
     * @param encoding the encoding to use for file names, use null for the platform's default encoding
     * @since 1.6
     * @throws ArchiveException on error
     */
    public DumpArchiveInputStream(final InputStream is, final String encoding) throws ArchiveException {
        super(is, encoding);
        this.raw = new TapeInputStream(is);
        this.hasHitEOF = false;
        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);

        try {
            // read header, verify it's a dump archive.
            final byte[] headerBytes = raw.readRecord();

            if (!DumpArchiveUtil.verify(headerBytes)) {
                throw new UnrecognizedFormatException();
            }

            // get summary information
            summary = new DumpArchiveSummary(headerBytes, this.zipEncoding);

            // reset buffer with actual block size.
            raw.resetBlockSize(summary.getNTRec(), summary.isCompressed());

            // allocate our read buffer.
            blockBuffer = new byte[4 * DumpArchiveConstants.TP_SIZE];

            // skip past CLRI and BITS segments since we don't handle them yet.
            readCLRI();
            readBITS();
        } catch (final IOException ex) {
            throw new ArchiveException(ex.getMessage(), ex);
        }

        // put in a dummy record for the root node.
        final Dirent root = new Dirent(2, 2, 4, CURRENT_PATH_SEGMENT);
        names.put(2, root);

        // use a priority queue to ensure parent directories are
        // released first.
        queue = new PriorityQueue<>(10, (p, q) -> {
            if (p.getOriginalName() == null || q.getOriginalName() == null) {
                return Integer.MAX_VALUE;
            }

            return p.getOriginalName().compareTo(q.getOriginalName());
        });
    }

    /**
     * Closes the stream for this entry.
     */
    @Override
    public void close() throws IOException {
        if (!isClosed) {
            isClosed = true;
            raw.close();
        }
    }

    @Override
    public long getBytesRead() {
        return raw.getBytesRead();
    }

    @Deprecated
    @Override
    public int getCount() {
        return (int) getBytesRead();
    }

    /**
     * Reads the next entry.
     *
     * @return the next entry
     * @throws IOException on error
     * @deprecated Use {@link #getNextEntry()}.
     */
    @Deprecated
    public DumpArchiveEntry getNextDumpEntry() throws IOException {
        return getNextEntry();
    }

    @Override
    public DumpArchiveEntry getNextEntry() throws IOException {
        DumpArchiveEntry entry = null;
        String path = null;

        // is there anything in the queue?
        if (!queue.isEmpty()) {
            return queue.remove();
        }

        while (entry == null) {
            if (hasHitEOF) {
                return null;
            }

            // skip any remaining records in this segment for the prior file.
            // we might still have holes... it's easiest to do it
            // block by block. We may want to revisit this if
            // the unnecessary decompression time adds up.
            while (readIdx < active.getHeaderCount()) {
                if (!active.isSparseRecord(readIdx++) && raw.skip(DumpArchiveConstants.TP_SIZE) == -1) {
                    throw new EOFException();
                }
            }

            readIdx = 0;
            filepos = raw.getBytesRead();

            byte[] headerBytes = raw.readRecord();

            if (!DumpArchiveUtil.verify(headerBytes)) {
                throw new InvalidFormatException();
            }

            active = DumpArchiveEntry.parse(headerBytes);

            // skip any remaining segments for the prior file.
            while (DumpArchiveConstants.SEGMENT_TYPE.ADDR == active.getHeaderType()) {
                if (raw.skip((long) DumpArchiveConstants.TP_SIZE * (active.getHeaderCount() - active.getHeaderHoles())) == -1) {
                    throw new EOFException();
                }

                filepos = raw.getBytesRead();
                headerBytes = raw.readRecord();

                if (!DumpArchiveUtil.verify(headerBytes)) {
                    throw new InvalidFormatException();
                }

                active = DumpArchiveEntry.parse(headerBytes);
            }

            // check if this is an end-of-volume marker.
            if (DumpArchiveConstants.SEGMENT_TYPE.END == active.getHeaderType()) {
                hasHitEOF = true;

                return null;
            }

            entry = active;

            if (entry.isDirectory()) {
                readDirectoryEntry(active);

                // now we create an empty InputStream.
                entryOffset = 0;
                entrySize = 0;
                readIdx = active.getHeaderCount();
            } else {
                entryOffset = 0;
                entrySize = active.getEntrySize();
                readIdx = 0;
            }

            recordOffset = readBuf.length;

            path = getPath(entry);

            if (path == null) {
                entry = null;
            }
        }

        entry.setName(path);
        entry.setSimpleName(names.get(entry.getIno()).getName());
        entry.setOffset(filepos);

        return entry;
    }

    /**
     * Gets the full path for the specified archive entry, or null if there's a gap.
     *
     * @param entry the archive entry whose path should be reconstructed
     * @return full path for the specified archive entry, or null if there's a gap.
     * @throws DumpArchiveException if an infinite loop is detected in the directory entries.
     */
    private String getPath(final DumpArchiveEntry entry) throws DumpArchiveException {
        // build the stack of elements. It's possible that we're
        // still missing an intermediate value; if so we defer this
        // entry and pick it up again once the gap has been filled.
        final Stack<String> elements = new Stack<>();
        final BitSet visited = new BitSet();
        Dirent dirent = null;
        for (int i = entry.getIno();; i = dirent.getParentIno()) {
            if (!names.containsKey(i)) {
                elements.clear();
                break;
            }
            if (visited.get(i)) {
                throw new DumpArchiveException("Duplicate node " + i);
            }
            dirent = names.get(i);
            visited.set(i);
            elements.push(dirent.getName());
            if (dirent.getIno() == dirent.getParentIno()) {
                break;
            }
        }
        // if an element is missing, defer the work and read the next entry.
        if (elements.isEmpty()) {
            pending.put(entry.getIno(), entry);
            return null;
        }
        // generate the full path from the stack of elements.
        final StringBuilder sb = new StringBuilder(elements.pop());
        while (!elements.isEmpty()) {
            sb.append('/');
            sb.append(elements.pop());
        }
        return sb.toString();
    }

    /**
     * Gets the archive summary information.
     *
     * @return the summary
     */
    public DumpArchiveSummary getSummary() {
        return summary;
    }

    /**
     * Reads bytes from the current dump archive entry.
     * <p>
     * This method is aware of the boundaries of the current entry in the archive and will deal with them as if they were this stream's start and EOF.
     * </p>
     *
     * @param buf The buffer into which to place bytes read.
     * @param off The offset at which to place bytes read.
     * @param len The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
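     *
     * <p>
     * A draining sketch; {@code dump} is an assumed stream already positioned at an entry via {@link #getNextEntry()}:
     * </p>
     * <pre>{@code
     * final byte[] chunk = new byte[512];
     * int n;
     * while ((n = dump.read(chunk, 0, chunk.length)) != -1) {
     *     // consume chunk[0..n)
     * }
     * }</pre>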
     */
    @Override
    public int read(final byte[] buf, int off, int len) throws IOException {
        if (len == 0) {
            return 0;
        }
        int totalRead = 0;

        if (hasHitEOF || isClosed || entryOffset >= entrySize) {
            return -1;
        }

        if (active == null) {
            throw new IllegalStateException("No current dump entry");
        }

        if (len + entryOffset > entrySize) {
            len = (int) (entrySize - entryOffset);
        }

        while (len > 0) {
            final int sz = Math.min(len, readBuf.length - recordOffset);

            // copy any data we have
            if (recordOffset + sz <= readBuf.length) {
                System.arraycopy(readBuf, recordOffset, buf, off, sz);
                totalRead += sz;
                recordOffset += sz;
                len -= sz;
                off += sz;
            }

            // load next block if necessary.
            if (len > 0) {
                // a segment header describes at most 512 records (TP_NINDIR in the native dump format).
                if (readIdx >= 512) {
                    final byte[] headerBytes = raw.readRecord();

                    if (!DumpArchiveUtil.verify(headerBytes)) {
                        throw new InvalidFormatException();
                    }

                    active = DumpArchiveEntry.parse(headerBytes);
                    readIdx = 0;
                }

                if (!active.isSparseRecord(readIdx++)) {
                    final int r = raw.read(readBuf, 0, readBuf.length);
                    if (r != readBuf.length) {
                        throw new EOFException();
                    }
                } else {
                    Arrays.fill(readBuf, (byte) 0);
                }

                recordOffset = 0;
            }
        }

        entryOffset += totalRead;

        return totalRead;
    }

    /**
     * Reads the BITS segment.
     */
    private void readBITS() throws IOException {
        final byte[] buffer = raw.readRecord();

        if (!DumpArchiveUtil.verify(buffer)) {
            throw new InvalidFormatException();
        }

        active = DumpArchiveEntry.parse(buffer);

        if (DumpArchiveConstants.SEGMENT_TYPE.BITS != active.getHeaderType()) {
            throw new InvalidFormatException();
        }

        // we don't do anything with this yet.
        if (raw.skip((long) DumpArchiveConstants.TP_SIZE * active.getHeaderCount()) == -1) {
            throw new EOFException();
        }
        readIdx = active.getHeaderCount();
    }

    /**
     * Reads the CLRI (deleted inode) segment.
     */
    private void readCLRI() throws IOException {
        final byte[] buffer = raw.readRecord();

        if (!DumpArchiveUtil.verify(buffer)) {
            throw new InvalidFormatException();
        }

        active = DumpArchiveEntry.parse(buffer);

        if (DumpArchiveConstants.SEGMENT_TYPE.CLRI != active.getHeaderType()) {
            throw new InvalidFormatException();
        }

        // we don't do anything with this yet.
        if (raw.skip((long) DumpArchiveConstants.TP_SIZE * active.getHeaderCount()) == -1) {
            throw new EOFException();
        }
        readIdx = active.getHeaderCount();
    }

    /**
     * Reads a directory entry.
     */
    private void readDirectoryEntry(DumpArchiveEntry entry) throws IOException {
        long size = entry.getEntrySize();
        boolean first = true;

        while (first || DumpArchiveConstants.SEGMENT_TYPE.ADDR == entry.getHeaderType()) {
            // read the header that we just peeked at.
            if (!first) {
                raw.readRecord();
            }

            if (!names.containsKey(entry.getIno()) && DumpArchiveConstants.SEGMENT_TYPE.INODE == entry.getHeaderType()) {
                pending.put(entry.getIno(), entry);
            }

            final int datalen = DumpArchiveConstants.TP_SIZE * entry.getHeaderCount();

            if (blockBuffer.length < datalen) {
                blockBuffer = IOUtils.readRange(raw, datalen);
                if (blockBuffer.length != datalen) {
                    throw new EOFException();
                }
            } else if (raw.read(blockBuffer, 0, datalen) != datalen) {
                throw new EOFException();
            }

            int reclen = 0;

            for (int i = 0; i < datalen - 8 && i < size - 8; i += reclen) {
                final int ino = DumpArchiveUtil.convert32(blockBuffer, i);
                reclen = DumpArchiveUtil.convert16(blockBuffer, i + 4);
                if (reclen == 0) {
                    throw new DumpArchiveException("reclen cannot be 0");
                }

                final byte type = blockBuffer[i + 6];

                final String name = DumpArchiveUtil.decode(zipEncoding, blockBuffer, i + 8, blockBuffer[i + 7]);

                if (CURRENT_PATH_SEGMENT.equals(name) || PARENT_PATH_SEGMENT.equals(name)) {
                    // do nothing...
                    continue;
                }

                final Dirent d = new Dirent(ino, entry.getIno(), type, name);

                names.put(ino, d);

                // check whether this allows us to fill anything in the pending list.
                for (final Map.Entry<Integer, DumpArchiveEntry> mapEntry : pending.entrySet()) {
                    final DumpArchiveEntry v = mapEntry.getValue();
                    final String path = getPath(v);
                    if (path != null) {
                        v.setName(path);
                        v.setSimpleName(names.get(mapEntry.getKey()).getName());
                        queue.add(v);
                    }
                }

                // remove anything that we found. (We can't do it earlier
                // because of concurrent modification exceptions.)
                queue.forEach(e -> pending.remove(e.getIno()));
            }

            final byte[] peekBytes = raw.peek();

            if (!DumpArchiveUtil.verify(peekBytes)) {
                throw new InvalidFormatException();
            }

            entry = DumpArchiveEntry.parse(peekBytes);
            first = false;
            size -= DumpArchiveConstants.TP_SIZE;
        }
    }

}