/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.archivers.dump;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Stack;

import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipEncoding;
import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.IOUtils;

/**
 * The DumpArchiveInputStream reads a UNIX dump archive as an InputStream.
 * Methods are provided to position at each successive entry in
 * the archive, and then read each entry as a normal input stream
 * using read().
 *
 * There does not seem to be any documentation on the encoding of
 * string values in this format. Given that the main purpose of
 * dump/restore is backing up a system, it seems very likely that the
 * format uses the system's current default encoding.
 *
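 * A minimal usage sketch (the variable and file names are
 * illustrative, not part of this API):
 *
 * <pre>{@code
 * try (DumpArchiveInputStream dump =
 *         new DumpArchiveInputStream(Files.newInputStream(Paths.get("backup.dump")))) {
 *     DumpArchiveEntry entry;
 *     while ((entry = dump.getNextDumpEntry()) != null) {
 *         // the stream is now positioned at this entry's data;
 *         // read() reports EOF at the end of the entry.
 *         System.out.println(entry.getName() + " " + entry.getEntrySize());
 *     }
 * }
 * }</pre>
 *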
 * @NotThreadSafe
 */
public class DumpArchiveInputStream extends ArchiveInputStream {
    /**
     * Look at the first few bytes of the file to decide if it's a dump
     * archive. With 32 bytes we can check the magic value; with a full
     * 1 KiB record we can also verify the checksum.
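     *
     * A minimal sketch of the intended use (buffering and error
     * handling are up to the caller):
     *
     * <pre>{@code
     * byte[] signature = new byte[32];
     * int n = in.read(signature);   // some buffered InputStream
     * if (DumpArchiveInputStream.matches(signature, n)) {
     *     // treat the data as a dump archive
     * }
     * }</pre>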
     * @param buffer data to match
     * @param length length of data
     * @return whether the buffer seems to contain dump data
     */
    public static boolean matches(final byte[] buffer, final int length) {
        // do we have enough of the header?
        if (length < 32) {
            return false;
        }

        // this is the best test
        if (length >= DumpArchiveConstants.TP_SIZE) {
            return DumpArchiveUtil.verify(buffer);
        }

        // this will work in a pinch.
        return DumpArchiveConstants.NFS_MAGIC == DumpArchiveUtil.convert32(buffer, 24);
    }

    private final DumpArchiveSummary summary;
    private DumpArchiveEntry active;
    private boolean isClosed;
    private boolean hasHitEOF;
    private long entrySize;
    private long entryOffset;
    private int readIdx;
    private final byte[] readBuf = new byte[DumpArchiveConstants.TP_SIZE];
    private byte[] blockBuffer;
    private int recordOffset;
    private long filepos;

    protected TapeInputStream raw;

    // map of ino -> dirent entry. We can use this to reconstruct full paths.
    private final Map<Integer, Dirent> names = new HashMap<>();

    // map of ino -> (directory) entry when we're missing one or more elements in the path.
    private final Map<Integer, DumpArchiveEntry> pending = new HashMap<>();

    // queue of (directory) entries where we now have the full path.
    private final Queue<DumpArchiveEntry> queue;

    /**
     * The encoding to use for file names and labels.
     */
    private final ZipEncoding zipEncoding;

    // the provided encoding (for unit tests)
    final String encoding;

    /**
     * Constructor using the platform's default encoding for file
     * names.
     *
     * @param is stream to read from
     * @throws ArchiveException on error
     */
    public DumpArchiveInputStream(final InputStream is) throws ArchiveException {
        this(is, null);
    }

    /**
     * Constructor.
     *
     * @param is stream to read from
     * @param encoding the encoding to use for file names, use null
     * for the platform's default encoding
     * @since 1.6
     * @throws ArchiveException on error
     */
    public DumpArchiveInputStream(final InputStream is, final String encoding)
        throws ArchiveException {
        this.raw = new TapeInputStream(is);
        this.hasHitEOF = false;
        this.encoding = encoding;
        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);

        try {
            // read header, verify it's a dump archive.
            final byte[] headerBytes = raw.readRecord();

            if (!DumpArchiveUtil.verify(headerBytes)) {
                throw new UnrecognizedFormatException();
            }

            // get summary information
            summary = new DumpArchiveSummary(headerBytes, this.zipEncoding);

            // reset buffer with actual block size.
            raw.resetBlockSize(summary.getNTRec(), summary.isCompressed());

            // allocate our read buffer.
            blockBuffer = new byte[4 * DumpArchiveConstants.TP_SIZE];

            // skip past CLRI and BITS segments since we don't handle them yet.
            readCLRI();
            readBITS();
        } catch (final IOException ex) {
            throw new ArchiveException(ex.getMessage(), ex);
        }

        // put in a dummy record for the root node (ino 2 is always the
        // root directory).
        final Dirent root = new Dirent(2, 2, 4, ".");
        names.put(2, root);

        // use a priority queue to ensure parent directories are
        // released first.
        queue = new PriorityQueue<>(10,
                (p, q) -> {
                    if (p.getOriginalName() == null || q.getOriginalName() == null) {
                        // entries without a resolved name cannot be ordered yet.
                        return Integer.MAX_VALUE;
                    }

                    return p.getOriginalName().compareTo(q.getOriginalName());
                });
    }

    /**
     * Closes the stream for this entry.
     */
    @Override
    public void close() throws IOException {
        if (!isClosed) {
            isClosed = true;
            raw.close();
        }
    }

    @Override
    public long getBytesRead() {
        return raw.getBytesRead();
    }

    @Deprecated
    @Override
    public int getCount() {
        return (int) getBytesRead();
    }

    /**
     * Read the next entry.
     * @return the next entry
     * @throws IOException on error
     */
    public DumpArchiveEntry getNextDumpEntry() throws IOException {
        return getNextEntry();
    }

    @Override
    public DumpArchiveEntry getNextEntry() throws IOException {
        DumpArchiveEntry entry = null;
        String path = null;

        // is there anything in the queue?
        if (!queue.isEmpty()) {
            return queue.remove();
        }

        while (entry == null) {
            if (hasHitEOF) {
                return null;
            }

            // skip any remaining records in this segment for the prior file.
            // we might still have holes... easiest to do it
            // block by block. We may want to revisit this if
            // the unnecessary decompression time adds up.
            while (readIdx < active.getHeaderCount()) {
                if (!active.isSparseRecord(readIdx++)
                    && raw.skip(DumpArchiveConstants.TP_SIZE) == -1) {
                    throw new EOFException();
                }
            }

            readIdx = 0;
            filepos = raw.getBytesRead();

            byte[] headerBytes = raw.readRecord();

            if (!DumpArchiveUtil.verify(headerBytes)) {
                throw new InvalidFormatException();
            }

            active = DumpArchiveEntry.parse(headerBytes);

            // skip any remaining segments for the prior file.
            while (DumpArchiveConstants.SEGMENT_TYPE.ADDR == active.getHeaderType()) {
                if (raw.skip((long) DumpArchiveConstants.TP_SIZE
                             * (active.getHeaderCount()
                                - active.getHeaderHoles())) == -1) {
                    throw new EOFException();
                }

                filepos = raw.getBytesRead();
                headerBytes = raw.readRecord();

                if (!DumpArchiveUtil.verify(headerBytes)) {
                    throw new InvalidFormatException();
                }

                active = DumpArchiveEntry.parse(headerBytes);
            }

            // check if this is an end-of-volume marker.
            if (DumpArchiveConstants.SEGMENT_TYPE.END == active.getHeaderType()) {
                hasHitEOF = true;

                return null;
            }

            entry = active;

            if (entry.isDirectory()) {
                readDirectoryEntry(active);

                // directories carry no file data, so expose an empty stream.
                entryOffset = 0;
                entrySize = 0;
                readIdx = active.getHeaderCount();
            } else {
                entryOffset = 0;
                entrySize = active.getEntrySize();
                readIdx = 0;
            }

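            // mark the read buffer as exhausted so read() fetches a fresh record first.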
            recordOffset = readBuf.length;

            path = getPath(entry);

            if (path == null) {
                entry = null;
            }
        }

        entry.setName(path);
        entry.setSimpleName(names.get(entry.getIno()).getName());
        entry.setOffset(filepos);

        return entry;
    }

    /**
     * Get full path for specified archive entry, or null if there's a gap.
     *
     * @param entry the entry to resolve
     * @return full path for specified archive entry, or null if there's a gap.
     */
    private String getPath(final DumpArchiveEntry entry) {
        // build the stack of elements. It's possible that we're
        // still missing an intermediate value; if so we defer this entry.
        final Stack<String> elements = new Stack<>();
        Dirent dirent = null;

        for (int i = entry.getIno();; i = dirent.getParentIno()) {
            if (!names.containsKey(i)) {
                elements.clear();
                break;
            }

            dirent = names.get(i);
            elements.push(dirent.getName());

            if (dirent.getIno() == dirent.getParentIno()) {
                break;
            }
        }

        // if an element is missing, defer the work and read the next entry.
        if (elements.isEmpty()) {
            pending.put(entry.getIno(), entry);

            return null;
        }

        // generate full path from stack of elements.
        final StringBuilder sb = new StringBuilder(elements.pop());

        while (!elements.isEmpty()) {
            sb.append('/');
            sb.append(elements.pop());
        }

        return sb.toString();
    }

    /**
     * Return the archive summary information.
     * @return the summary
     */
    public DumpArchiveSummary getSummary() {
        return summary;
    }

    /**
     * Reads bytes from the current dump archive entry.
     *
     * This method is aware of the boundaries of the current
     * entry in the archive and will deal with them as if they
     * were this stream's start and EOF.
     *
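     * A typical drain loop looks like the sketch below (buffer size and
     * stream names are illustrative):
     *
     * <pre>{@code
     * byte[] data = new byte[1024];
     * int n;
     * while ((n = dump.read(data, 0, data.length)) != -1) {
     *     out.write(data, 0, n);   // copy the current entry somewhere
     * }
     * }</pre>
     *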
     * @param buf The buffer into which to place bytes read.
     * @param off The offset at which to place bytes read.
     * @param len The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
     */
    @Override
    public int read(final byte[] buf, int off, int len) throws IOException {
        if (len == 0) {
            return 0;
        }
        int totalRead = 0;

        if (hasHitEOF || isClosed || entryOffset >= entrySize) {
            return -1;
        }

        if (active == null) {
            throw new IllegalStateException("No current dump entry");
        }

        if (len + entryOffset > entrySize) {
            len = (int) (entrySize - entryOffset);
        }

        while (len > 0) {
            final int sz = Math.min(len, readBuf.length - recordOffset);

            // copy any data we have
            if (recordOffset + sz <= readBuf.length) {
                System.arraycopy(readBuf, recordOffset, buf, off, sz);
                totalRead += sz;
                recordOffset += sz;
                len -= sz;
                off += sz;
            }

            // load next block if necessary.
            if (len > 0) {
                // a segment header describes at most 512 records (TP_NINDIR),
                // so past that point the next record must be a fresh header.
                if (readIdx >= 512) {
                    final byte[] headerBytes = raw.readRecord();

                    if (!DumpArchiveUtil.verify(headerBytes)) {
                        throw new InvalidFormatException();
                    }

                    active = DumpArchiveEntry.parse(headerBytes);
                    readIdx = 0;
                }

                if (!active.isSparseRecord(readIdx++)) {
                    final int r = raw.read(readBuf, 0, readBuf.length);
                    if (r != readBuf.length) {
                        throw new EOFException();
                    }
                } else {
                    Arrays.fill(readBuf, (byte) 0);
                }

                recordOffset = 0;
            }
        }

        entryOffset += totalRead;

        return totalRead;
    }

    /**
     * Read BITS segment.
     */
    private void readBITS() throws IOException {
        final byte[] buffer = raw.readRecord();

        if (!DumpArchiveUtil.verify(buffer)) {
            throw new InvalidFormatException();
        }

        active = DumpArchiveEntry.parse(buffer);

        if (DumpArchiveConstants.SEGMENT_TYPE.BITS != active.getHeaderType()) {
            throw new InvalidFormatException();
        }

        // we don't do anything with this yet.
        if (raw.skip((long) DumpArchiveConstants.TP_SIZE * active.getHeaderCount())
            == -1) {
            throw new EOFException();
        }
        readIdx = active.getHeaderCount();
    }

    /**
     * Read CLRI (deleted inode) segment.
     */
    private void readCLRI() throws IOException {
        final byte[] buffer = raw.readRecord();

        if (!DumpArchiveUtil.verify(buffer)) {
            throw new InvalidFormatException();
        }

        active = DumpArchiveEntry.parse(buffer);

        if (DumpArchiveConstants.SEGMENT_TYPE.CLRI != active.getHeaderType()) {
            throw new InvalidFormatException();
        }

        // we don't do anything with this yet.
        if (raw.skip((long) DumpArchiveConstants.TP_SIZE * active.getHeaderCount())
            == -1) {
            throw new EOFException();
        }
        readIdx = active.getHeaderCount();
    }

    /**
     * Read directory entry.
     */
    private void readDirectoryEntry(DumpArchiveEntry entry)
        throws IOException {
        long size = entry.getEntrySize();
        boolean first = true;

        while (first ||
                DumpArchiveConstants.SEGMENT_TYPE.ADDR == entry.getHeaderType()) {
            // read the header that we just peeked at.
            if (!first) {
                raw.readRecord();
            }

            if (!names.containsKey(entry.getIno()) &&
                    DumpArchiveConstants.SEGMENT_TYPE.INODE == entry.getHeaderType()) {
                pending.put(entry.getIno(), entry);
            }

            final int datalen = DumpArchiveConstants.TP_SIZE * entry.getHeaderCount();

            if (blockBuffer.length < datalen) {
                blockBuffer = IOUtils.readRange(raw, datalen);
                if (blockBuffer.length != datalen) {
                    throw new EOFException();
                }
            } else if (raw.read(blockBuffer, 0, datalen) != datalen) {
                throw new EOFException();
            }

            int reclen = 0;

            for (int i = 0; i < datalen - 8 && i < size - 8;
                    i += reclen) {
                final int ino = DumpArchiveUtil.convert32(blockBuffer, i);
                reclen = DumpArchiveUtil.convert16(blockBuffer, i + 4);
                if (reclen == 0) {
                    // a zero record length would never advance the loop;
                    // treat it as a corrupt archive rather than spin forever.
                    throw new InvalidFormatException();
                }

                final byte type = blockBuffer[i + 6];

                final String name = DumpArchiveUtil.decode(zipEncoding, blockBuffer, i + 8, blockBuffer[i + 7]);

                if (".".equals(name) || "..".equals(name)) {
                    // do nothing...
                    continue;
                }

                final Dirent d = new Dirent(ino, entry.getIno(), type, name);

                /*
                if ((type == 4) && names.containsKey(ino)) {
                    System.out.println("we already have ino: " +
                                       names.get(ino));
                }
                */

                names.put(ino, d);

                // check whether this allows us to fill anything in the pending list.
                pending.forEach((k, v) -> {
                    final String path = getPath(v);

                    if (path != null) {
                        v.setName(path);
                        v.setSimpleName(names.get(k).getName());
                        queue.add(v);
                    }
                });

                // remove anything that we found. (We can't do it earlier
                // because of concurrent modification exceptions.)
                queue.forEach(e -> pending.remove(e.getIno()));
            }


            final byte[] peekBytes = raw.peek();

            if (!DumpArchiveUtil.verify(peekBytes)) {
                throw new InvalidFormatException();
            }

            entry = DumpArchiveEntry.parse(peekBytes);
            first = false;
            size -= DumpArchiveConstants.TP_SIZE;
        }
    }

}