DumpArchiveInputStream.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one
  3.  * or more contributor license agreements.  See the NOTICE file
  4.  * distributed with this work for additional information
  5.  * regarding copyright ownership.  The ASF licenses this file
  6.  * to you under the Apache License, Version 2.0 (the
  7.  * "License"); you may not use this file except in compliance
  8.  * with the License.  You may obtain a copy of the License at
  9.  *
  10.  * http://www.apache.org/licenses/LICENSE-2.0
  11.  *
  12.  * Unless required by applicable law or agreed to in writing,
  13.  * software distributed under the License is distributed on an
  14.  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  15.  * KIND, either express or implied.  See the License for the
  16.  * specific language governing permissions and limitations
  17.  * under the License.
  18.  */
  19. package org.apache.commons.compress.archivers.dump;

  20. import java.io.EOFException;
  21. import java.io.IOException;
  22. import java.io.InputStream;
  23. import java.util.Arrays;
  24. import java.util.BitSet;
  25. import java.util.HashMap;
  26. import java.util.Map;
  27. import java.util.PriorityQueue;
  28. import java.util.Queue;
  29. import java.util.Stack;

  30. import org.apache.commons.compress.archivers.ArchiveException;
  31. import org.apache.commons.compress.archivers.ArchiveInputStream;
  32. import org.apache.commons.compress.archivers.zip.ZipEncoding;
  33. import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
  34. import org.apache.commons.compress.utils.IOUtils;

  35. /**
  36.  * The DumpArchiveInputStream reads a UNIX dump archive as an InputStream. Methods are provided to position at each successive entry in the archive, and the
  37.  * read each entry as a normal input stream using read().
  38.  * <p>
  39.  * There doesn't seem to exist a hint on the encoding of string values in any piece documentation. Given the main purpose of dump/restore is backing up a system
  40.  * it seems very likely the format uses the current default encoding of the system.
  41.  * </p>
  42.  * @NotThreadSafe
  43.  * @since 1.3
  44.  */
  45. public class DumpArchiveInputStream extends ArchiveInputStream<DumpArchiveEntry> {

    /** Dirent names that refer to the directory itself or its parent; skipped when building paths. */
    private static final String CURRENT_PATH_SEGMENT = ".";
    private static final String PARENT_PATH_SEGMENT = "..";

  48.     /**
  49.      * Look at the first few bytes of the file to decide if it's a dump archive. With 32 bytes we can look at the magic value, with a full 1k we can verify the
  50.      * checksum.
  51.      *
  52.      * @param buffer data to match
  53.      * @param length length of data
  54.      * @return whether the buffer seems to contain dump data
  55.      */
  56.     public static boolean matches(final byte[] buffer, final int length) {
  57.         // do we have enough of the header?
  58.         if (length < 32) {
  59.             return false;
  60.         }

  61.         // this is the best test
  62.         if (length >= DumpArchiveConstants.TP_SIZE) {
  63.             return DumpArchiveUtil.verify(buffer);
  64.         }

  65.         // this will work in a pinch.
  66.         return DumpArchiveConstants.NFS_MAGIC == DumpArchiveUtil.convert32(buffer, 24);
  67.     }

    /** Summary information parsed from the archive's first record in the constructor. */
    private final DumpArchiveSummary summary;
    /** Most recently parsed segment header; advanced as records are consumed. */
    private DumpArchiveEntry active;
    /** Set by close(); read() returns -1 once set. */
    private boolean isClosed;
    /** Set when the END segment is seen; getNextEntry() returns null once set. */
    private boolean hasHitEOF;
    /** Size in bytes of the current entry's data (0 for directories). */
    private long entrySize;
    /** Number of bytes of the current entry already returned by read(). */
    private long entryOffset;
    /** Index of the next record within the current segment. */
    private int readIdx;
    /** Holds one tape record's worth of the current entry's data. */
    private final byte[] readBuf = new byte[DumpArchiveConstants.TP_SIZE];
    /** Scratch buffer for directory segments; regrown on demand in readDirectoryEntry(). */
    private byte[] blockBuffer;
    /** Read position within readBuf; readBuf.length means "buffer exhausted, load next record". */
    private int recordOffset;
    /** Byte offset of the current entry's header within the stream; stored via entry.setOffset(). */
    private long filepos;

    /** The underlying tape stream, handling record/block framing. */
    protected TapeInputStream raw;

    /** Map of ino -> dirent entry. We can use this to reconstruct full paths. */
    private final Map<Integer, Dirent> names = new HashMap<>();

    /** Map of ino -> (directory) entry when we're missing one or more elements in the path. */
    private final Map<Integer, DumpArchiveEntry> pending = new HashMap<>();

    /** Queue of (directory) entries where we now have the full path. */
    private final Queue<DumpArchiveEntry> queue;

    /**
     * The encoding to use for file names and labels.
     */
    private final ZipEncoding zipEncoding;

    /**
     * Constructor using the platform's default encoding for file names.
     * Equivalent to {@code new DumpArchiveInputStream(is, null)}.
     *
     * @param is stream to read from
     * @throws ArchiveException on error
     */
    public DumpArchiveInputStream(final InputStream is) throws ArchiveException {
        this(is, null);
    }

    /**
     * Constructs a new instance: reads and verifies the summary record, configures the
     * tape block size from it, then skips the CLRI and BITS segments.
     *
     * @param is       stream to read from
     * @param encoding the encoding to use for file names, use null for the platform's default encoding
     * @since 1.6
     * @throws ArchiveException on error
     */
    public DumpArchiveInputStream(final InputStream is, final String encoding) throws ArchiveException {
        super(is, encoding);
        this.raw = new TapeInputStream(is);
        this.hasHitEOF = false;
        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);

        try {
            // read header, verify it's a dump archive.
            final byte[] headerBytes = raw.readRecord();

            if (!DumpArchiveUtil.verify(headerBytes)) {
                throw new UnrecognizedFormatException();
            }

            // get summary information
            summary = new DumpArchiveSummary(headerBytes, this.zipEncoding);

            // reset buffer with actual block size.
            raw.resetBlockSize(summary.getNTRec(), summary.isCompressed());

            // allocate our read buffer.
            blockBuffer = new byte[4 * DumpArchiveConstants.TP_SIZE];

            // skip past CLRI and BITS segments since we don't handle them yet.
            readCLRI();
            readBITS();
        } catch (final IOException ex) {
            throw new ArchiveException(ex.getMessage(), ex);
        }

        // put in a dummy record for the root node. ino 2 is its own parent,
        // which is how getPath() detects that it has reached the root.
        final Dirent root = new Dirent(2, 2, 4, CURRENT_PATH_SEGMENT);
        names.put(2, root);

        // use priority based on queue to ensure parent directories are
        // released first.
        // NOTE(review): returning Integer.MAX_VALUE when either name is null
        // violates the Comparator contract (not antisymmetric); kept as-is for
        // compatibility, but confirm no ordering guarantee relies on it.
        queue = new PriorityQueue<>(10, (p, q) -> {
            if (p.getOriginalName() == null || q.getOriginalName() == null) {
                return Integer.MAX_VALUE;
            }

            return p.getOriginalName().compareTo(q.getOriginalName());
        });
    }

  142.     /**
  143.      * Closes the stream for this entry.
  144.      */
  145.     @Override
  146.     public void close() throws IOException {
  147.         if (!isClosed) {
  148.             isClosed = true;
  149.             raw.close();
  150.         }
  151.     }

    /**
     * Gets the number of bytes consumed from the underlying tape stream.
     */
    @Override
    public long getBytesRead() {
        return raw.getBytesRead();
    }

    /**
     * @deprecated this narrows the byte count to an int and overflows past 2 GiB; use {@link #getBytesRead()} instead.
     */
    @Deprecated
    @Override
    public int getCount() {
        return (int) getBytesRead();
    }

    /**
     * Reads the next entry. Simply delegates to {@link #getNextEntry()}.
     *
     * @return the next entry
     * @throws IOException on error
     * @deprecated Use {@link #getNextEntry()}.
     */
    @Deprecated
    public DumpArchiveEntry getNextDumpEntry() throws IOException {
        return getNextEntry();
    }

    /**
     * Reads the next archive entry header, draining any unread records of the previous
     * entry first. Entries whose full path cannot be resolved yet are parked in
     * {@code pending} and skipped; entries resolved later by {@code readDirectoryEntry}
     * are handed out from {@code queue} ahead of new headers.
     *
     * @return the next entry, or null once the END segment has been reached
     * @throws IOException on error
     */
    @Override
    public DumpArchiveEntry getNextEntry() throws IOException {
        DumpArchiveEntry entry = null;
        String path = null;

        // is there anything in the queue?
        if (!queue.isEmpty()) {
            return queue.remove();
        }

        while (entry == null) {
            if (hasHitEOF) {
                return null;
            }

            // skip any remaining records in this segment for prior file.
            // we might still have holes... easiest to do it
            // block by block. We may want to revisit this if
            // the unnecessary decompression time adds up.
            while (readIdx < active.getHeaderCount()) {
                // sparse records occupy no space on tape, so only skip real ones.
                if (!active.isSparseRecord(readIdx++) && raw.skip(DumpArchiveConstants.TP_SIZE) == -1) {
                    throw new EOFException();
                }
            }

            readIdx = 0;
            // remember where this entry's header starts (reported via setOffset below).
            filepos = raw.getBytesRead();

            byte[] headerBytes = raw.readRecord();

            if (!DumpArchiveUtil.verify(headerBytes)) {
                throw new InvalidFormatException();
            }

            active = DumpArchiveEntry.parse(headerBytes);

            // skip any remaining segments for prior file.
            while (DumpArchiveConstants.SEGMENT_TYPE.ADDR == active.getHeaderType()) {
                // ADDR segments only carry the non-hole records on tape.
                if (raw.skip((long) DumpArchiveConstants.TP_SIZE * (active.getHeaderCount() - active.getHeaderHoles())) == -1) {
                    throw new EOFException();
                }

                filepos = raw.getBytesRead();
                headerBytes = raw.readRecord();

                if (!DumpArchiveUtil.verify(headerBytes)) {
                    throw new InvalidFormatException();
                }

                active = DumpArchiveEntry.parse(headerBytes);
            }

            // check if this is an end-of-volume marker.
            if (DumpArchiveConstants.SEGMENT_TYPE.END == active.getHeaderType()) {
                hasHitEOF = true;

                return null;
            }

            entry = active;

            if (entry.isDirectory()) {
                // consume the directory data now to build the ino -> name map.
                readDirectoryEntry(active);

                // now we create an empty InputStream.
                entryOffset = 0;
                entrySize = 0;
                readIdx = active.getHeaderCount();
            } else {
                entryOffset = 0;
                entrySize = active.getEntrySize();
                readIdx = 0;
            }

            // mark the read buffer as exhausted so read() loads the first record.
            recordOffset = readBuf.length;

            path = getPath(entry);

            if (path == null) {
                // a parent directory has not been seen yet; getPath() parked the
                // entry in pending, so move on to the next header.
                entry = null;
            }
        }

        entry.setName(path);
        entry.setSimpleName(names.get(entry.getIno()).getName());
        entry.setOffset(filepos);

        return entry;
    }

  240.     /**
  241.      * Gets full path for specified archive entry, or null if there's a gap.
  242.      *
  243.      * @param entry
  244.      * @return full path for specified archive entry, or null if there's a gap.
  245.      * @throws DumpArchiveException Infinite loop detected in directory entries.
  246.      */
  247.     private String getPath(final DumpArchiveEntry entry) throws DumpArchiveException {
  248.         // build the stack of elements. It's possible that we're
  249.         // still missing an intermediate value and if so we
  250.         final Stack<String> elements = new Stack<>();
  251.         final BitSet visited = new BitSet();
  252.         Dirent dirent = null;
  253.         for (int i = entry.getIno();; i = dirent.getParentIno()) {
  254.             if (!names.containsKey(i)) {
  255.                 elements.clear();
  256.                 break;
  257.             }
  258.             if (visited.get(i)) {
  259.                 throw new DumpArchiveException("Duplicate node " + i);
  260.             }
  261.             dirent = names.get(i);
  262.             visited.set(i);
  263.             elements.push(dirent.getName());
  264.             if (dirent.getIno() == dirent.getParentIno()) {
  265.                 break;
  266.             }
  267.         }
  268.         // if an element is missing defer the work and read next entry.
  269.         if (elements.isEmpty()) {
  270.             pending.put(entry.getIno(), entry);
  271.             return null;
  272.         }
  273.         // generate full path from stack of elements.
  274.         final StringBuilder sb = new StringBuilder(elements.pop());
  275.         while (!elements.isEmpty()) {
  276.             sb.append('/');
  277.             sb.append(elements.pop());
  278.         }
  279.         return sb.toString();
  280.     }

    /**
     * Gets the archive summary information parsed from the first record.
     *
     * @return the summary
     */
    public DumpArchiveSummary getSummary() {
        return summary;
    }

  289.     /**
  290.      * Reads bytes from the current dump archive entry.
  291.      *
  292.      * This method is aware of the boundaries of the current entry in the archive and will deal with them as if they were this stream's start and EOF.
  293.      *
  294.      * @param buf The buffer into which to place bytes read.
  295.      * @param off The offset at which to place bytes read.
  296.      * @param len The number of bytes to read.
  297.      * @return The number of bytes read, or -1 at EOF.
  298.      * @throws IOException on error
  299.      */
  300.     @Override
  301.     public int read(final byte[] buf, int off, int len) throws IOException {
  302.         if (len == 0) {
  303.             return 0;
  304.         }
  305.         int totalRead = 0;

  306.         if (hasHitEOF || isClosed || entryOffset >= entrySize) {
  307.             return -1;
  308.         }

  309.         if (active == null) {
  310.             throw new IllegalStateException("No current dump entry");
  311.         }

  312.         if (len + entryOffset > entrySize) {
  313.             len = (int) (entrySize - entryOffset);
  314.         }

  315.         while (len > 0) {
  316.             final int sz = Math.min(len, readBuf.length - recordOffset);

  317.             // copy any data we have
  318.             if (recordOffset + sz <= readBuf.length) {
  319.                 System.arraycopy(readBuf, recordOffset, buf, off, sz);
  320.                 totalRead += sz;
  321.                 recordOffset += sz;
  322.                 len -= sz;
  323.                 off += sz;
  324.             }

  325.             // load next block if necessary.
  326.             if (len > 0) {
  327.                 if (readIdx >= 512) {
  328.                     final byte[] headerBytes = raw.readRecord();

  329.                     if (!DumpArchiveUtil.verify(headerBytes)) {
  330.                         throw new InvalidFormatException();
  331.                     }

  332.                     active = DumpArchiveEntry.parse(headerBytes);
  333.                     readIdx = 0;
  334.                 }

  335.                 if (!active.isSparseRecord(readIdx++)) {
  336.                     final int r = raw.read(readBuf, 0, readBuf.length);
  337.                     if (r != readBuf.length) {
  338.                         throw new EOFException();
  339.                     }
  340.                 } else {
  341.                     Arrays.fill(readBuf, (byte) 0);
  342.                 }

  343.                 recordOffset = 0;
  344.             }
  345.         }

  346.         entryOffset += totalRead;

  347.         return totalRead;
  348.     }

  349.     /**
  350.      * Read BITS segment.
  351.      */
  352.     private void readBITS() throws IOException {
  353.         final byte[] buffer = raw.readRecord();

  354.         if (!DumpArchiveUtil.verify(buffer)) {
  355.             throw new InvalidFormatException();
  356.         }

  357.         active = DumpArchiveEntry.parse(buffer);

  358.         if (DumpArchiveConstants.SEGMENT_TYPE.BITS != active.getHeaderType()) {
  359.             throw new InvalidFormatException();
  360.         }

  361.         // we don't do anything with this yet.
  362.         if (raw.skip((long) DumpArchiveConstants.TP_SIZE * active.getHeaderCount()) == -1) {
  363.             throw new EOFException();
  364.         }
  365.         readIdx = active.getHeaderCount();
  366.     }

  367.     /**
  368.      * Read CLRI (deleted inode) segment.
  369.      */
  370.     private void readCLRI() throws IOException {
  371.         final byte[] buffer = raw.readRecord();

  372.         if (!DumpArchiveUtil.verify(buffer)) {
  373.             throw new InvalidFormatException();
  374.         }

  375.         active = DumpArchiveEntry.parse(buffer);

  376.         if (DumpArchiveConstants.SEGMENT_TYPE.CLRI != active.getHeaderType()) {
  377.             throw new InvalidFormatException();
  378.         }

  379.         // we don't do anything with this yet.
  380.         if (raw.skip((long) DumpArchiveConstants.TP_SIZE * active.getHeaderCount()) == -1) {
  381.             throw new EOFException();
  382.         }
  383.         readIdx = active.getHeaderCount();
  384.     }

    /**
     * Read directory entry. Consumes the directory's data segments and registers each
     * dirent found in {@code names} so later entries can be resolved to full paths;
     * any entry in {@code pending} that becomes resolvable is moved to {@code queue}.
     */
    private void readDirectoryEntry(DumpArchiveEntry entry) throws IOException {
        long size = entry.getEntrySize();
        boolean first = true;

        while (first || DumpArchiveConstants.SEGMENT_TYPE.ADDR == entry.getHeaderType()) {
            // read the header that we just peeked at.
            if (!first) {
                raw.readRecord();
            }

            if (!names.containsKey(entry.getIno()) && DumpArchiveConstants.SEGMENT_TYPE.INODE == entry.getHeaderType()) {
                pending.put(entry.getIno(), entry);
            }

            final int datalen = DumpArchiveConstants.TP_SIZE * entry.getHeaderCount();

            if (blockBuffer.length < datalen) {
                // grow the scratch buffer to hold the whole segment at once.
                blockBuffer = IOUtils.readRange(raw, datalen);
                if (blockBuffer.length != datalen) {
                    throw new EOFException();
                }
            } else if (raw.read(blockBuffer, 0, datalen) != datalen) {
                throw new EOFException();
            }

            int reclen = 0;

            // walk the dirent records: ino (4 bytes), reclen (2 bytes at +4),
            // type (+6), name length (+7), name bytes (from +8).
            for (int i = 0; i < datalen - 8 && i < size - 8; i += reclen) {
                final int ino = DumpArchiveUtil.convert32(blockBuffer, i);
                reclen = DumpArchiveUtil.convert16(blockBuffer, i + 4);
                if (reclen == 0) {
                    // a zero reclen would make this loop spin forever.
                    throw new DumpArchiveException("reclen cannot be 0");
                }

                final byte type = blockBuffer[i + 6];

                final String name = DumpArchiveUtil.decode(zipEncoding, blockBuffer, i + 8, blockBuffer[i + 7]);

                if (CURRENT_PATH_SEGMENT.equals(name) || PARENT_PATH_SEGMENT.equals(name)) {
                    // do nothing...
                    continue;
                }

                final Dirent d = new Dirent(ino, entry.getIno(), type, name);

                /*
                 * if ((type == 4) && names.containsKey(ino)) { System.out.println("we already have ino: " + names.get(ino)); }
                 */

                names.put(ino, d);

                // check whether this allows us to fill anything in the pending list.
                for (final Map.Entry<Integer, DumpArchiveEntry> mapEntry : pending.entrySet()) {
                    final DumpArchiveEntry v = mapEntry.getValue();
                    final String path = getPath(v);
                    if (path != null) {
                        v.setName(path);
                        v.setSimpleName(names.get(mapEntry.getKey()).getName());
                        queue.add(v);
                    }
                }

                // remove anything that we found. (We can't do it earlier
                // because of concurrent modification exceptions.)
                queue.forEach(e -> pending.remove(e.getIno()));
            }

            // peek at the next segment header without consuming it, so the
            // caller (getNextEntry) still sees it after this loop exits.
            final byte[] peekBytes = raw.peek();

            if (!DumpArchiveUtil.verify(peekBytes)) {
                throw new InvalidFormatException();
            }

            entry = DumpArchiveEntry.parse(peekBytes);
            first = false;
            size -= DumpArchiveConstants.TP_SIZE;
        }
    }

  449. }