001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.snappy;
020
021import java.io.IOException;
022import java.io.InputStream;
023
024import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream;
025import org.apache.commons.compress.utils.ByteUtils;
026
027/**
028 * CompressorInputStream for the raw Snappy format.
029 *
030 * <p>This implementation uses an internal buffer in order to handle
031 * the back-references that are at the heart of the LZ77 algorithm.
032 * The size of the buffer must be at least as big as the biggest
033 * offset used in the compressed stream.  The current version of the
034 * Snappy algorithm as defined by Google works on 32k blocks and
035 * doesn't contain offsets bigger than 32k which is the default block
036 * size used by this class.</p>
037 *
038 * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a>
039 * @since 1.7
040 */
041public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream {
042
043    private enum State {
044        NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE
045    }
046
047    /** Mask used to determine the type of "tag" is being processed */
048    private static final int TAG_MASK = 0x03;
049
050    /** Default block size */
051    public static final int DEFAULT_BLOCK_SIZE = 32768;
052
053    /** The size of the uncompressed data */
054    private final int size;
055
056    /** Number of uncompressed bytes still to be read. */
057    private int uncompressedBytesRemaining;
058
059    /** Current state of the stream */
060    private State state = State.NO_BLOCK;
061
062    private boolean endReached;
063
064    /**
065     * Constructor using the default buffer size of 32k.
066     *
067     * @param is
068     *            An InputStream to read compressed data from
069     *
070     * @throws IOException if reading fails
071     */
072    public SnappyCompressorInputStream(final InputStream is) throws IOException {
073        this(is, DEFAULT_BLOCK_SIZE);
074    }
075
076    /**
077     * Constructor using a configurable buffer size.
078     *
079     * @param is
080     *            An InputStream to read compressed data from
081     * @param blockSize
082     *            The block size used in compression
083     *
084     * @throws IOException if reading fails
085     * @throws IllegalArgumentException if blockSize is not bigger than 0
086     */
087    public SnappyCompressorInputStream(final InputStream is, final int blockSize)
088            throws IOException {
089        super(is, blockSize);
090        uncompressedBytesRemaining = size = (int) readSize();
091    }
092
093    /**
094     * Try to fill the buffer with the next block of data.
095     */
096    private void fill() throws IOException {
097        if (uncompressedBytesRemaining == 0) {
098            endReached = true;
099            return;
100        }
101
102        int b = readOneByte();
103        if (b == -1) {
104            throw new IOException("Premature end of stream reading block start");
105        }
106        int length = 0;
107        int offset = 0;
108
109        switch (b & TAG_MASK) {
110
111        case 0x00:
112
113            length = readLiteralLength(b);
114            if (length < 0) {
115                throw new IOException("Illegal block with a negative literal size found");
116            }
117            uncompressedBytesRemaining -= length;
118            startLiteral(length);
119            state = State.IN_LITERAL;
120            break;
121
122        case 0x01:
123
124            /*
125             * These elements can encode lengths between [4..11] bytes and
126             * offsets between [0..2047] bytes. (len-4) occupies three bits
127             * and is stored in bits [2..4] of the tag byte. The offset
128             * occupies 11 bits, of which the upper three are stored in the
129             * upper three bits ([5..7]) of the tag byte, and the lower
130             * eight are stored in a byte following the tag byte.
131             */
132
133            length = 4 + ((b >> 2) & 0x07);
134            uncompressedBytesRemaining -= length;
135            offset = (b & 0xE0) << 3;
136            b = readOneByte();
137            if (b == -1) {
138                throw new IOException("Premature end of stream reading back-reference length");
139            }
140            offset |= b;
141
142            try {
143                startBackReference(offset, length);
144            } catch (final IllegalArgumentException ex) {
145                throw new IOException("Illegal block with bad offset found", ex);
146            }
147            state = State.IN_BACK_REFERENCE;
148            break;
149
150        case 0x02:
151
152            /*
153             * These elements can encode lengths between [1..64] and offsets
154             * from [0..65535]. (len-1) occupies six bits and is stored in
155             * the upper six bits ([2..7]) of the tag byte. The offset is
156             * stored as a little-endian 16-bit integer in the two bytes
157             * following the tag byte.
158             */
159
160            length = (b >> 2) + 1;
161            if (length < 0) {
162                throw new IOException("Illegal block with a negative match length found");
163            }
164            uncompressedBytesRemaining -= length;
165
166            offset = (int) ByteUtils.fromLittleEndian(supplier, 2);
167
168            try {
169                startBackReference(offset, length);
170            } catch (final IllegalArgumentException ex) {
171                throw new IOException("Illegal block with bad offset found", ex);
172            }
173            state = State.IN_BACK_REFERENCE;
174            break;
175
176        case 0x03:
177
178            /*
179             * These are like the copies with 2-byte offsets (see previous
180             * subsection), except that the offset is stored as a 32-bit
181             * integer instead of a 16-bit integer (and thus will occupy
182             * four bytes).
183             */
184
185            length = (b >> 2) + 1;
186            if (length < 0) {
187                throw new IOException("Illegal block with a negative match length found");
188            }
189            uncompressedBytesRemaining -= length;
190
191            offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff;
192
193            try {
194                startBackReference(offset, length);
195            } catch (final IllegalArgumentException ex) {
196                throw new IOException("Illegal block with bad offset found", ex);
197            }
198            state = State.IN_BACK_REFERENCE;
199            break;
200        default:
201            // impossible as TAG_MASK is two bits and all four possible cases have been covered
202            break;
203        }
204    }
205
206    /**
207     * Get the uncompressed size of the stream
208     *
209     * @return the uncompressed size
210     */
211    @Override
212    public int getSize() {
213        return size;
214    }
215
216    /**
217     * {@inheritDoc}
218     */
219    @Override
220    public int read(final byte[] b, final int off, final int len) throws IOException {
221        if (len == 0) {
222            return 0;
223        }
224        if (endReached) {
225            return -1;
226        }
227        switch (state) {
228        case NO_BLOCK:
229            fill();
230            return read(b, off, len);
231        case IN_LITERAL:
232            final int litLen = readLiteral(b, off, len);
233            if (!hasMoreDataInBlock()) {
234                state = State.NO_BLOCK;
235            }
236            return litLen > 0 ? litLen : read(b, off, len);
237        case IN_BACK_REFERENCE:
238            final int backReferenceLen = readBackReference(b, off, len);
239            if (!hasMoreDataInBlock()) {
240                state = State.NO_BLOCK;
241            }
242            return backReferenceLen > 0 ? backReferenceLen : read(b, off, len);
243        default:
244            throw new IOException("Unknown stream state " + state);
245        }
246    }
247
248    /*
249     * For literals up to and including 60 bytes in length, the
250     * upper six bits of the tag byte contain (len-1). The literal
251     * follows immediately thereafter in the bytestream. - For
252     * longer literals, the (len-1) value is stored after the tag
253     * byte, little-endian. The upper six bits of the tag byte
254     * describe how many bytes are used for the length; 60, 61, 62
255     * or 63 for 1-4 bytes, respectively. The literal itself follows
256     * after the length.
257     */
258    private int readLiteralLength(final int b) throws IOException {
259        final int length;
260        switch (b >> 2) {
261        case 60:
262            length = readOneByte();
263            if (length == -1) {
264                throw new IOException("Premature end of stream reading literal length");
265            }
266            break;
267        case 61:
268            length = (int) ByteUtils.fromLittleEndian(supplier, 2);
269            break;
270        case 62:
271            length = (int) ByteUtils.fromLittleEndian(supplier, 3);
272            break;
273        case 63:
274            length = (int) ByteUtils.fromLittleEndian(supplier, 4);
275            break;
276        default:
277            length = b >> 2;
278            break;
279        }
280
281        return length + 1;
282    }
283
284    /**
285     * The stream starts with the uncompressed length (up to a maximum of 2^32 -
286     * 1), stored as a little-endian varint. Varints consist of a series of
287     * bytes, where the lower 7 bits are data and the upper bit is set iff there
288     * are more bytes to be read. In other words, an uncompressed length of 64
289     * would be stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE)
290     * would be stored as 0xFE 0xFF 0x7F.
291     *
292     * @return The size of the uncompressed data
293     *
294     * @throws IOException
295     *             Could not read a byte
296     */
297    private long readSize() throws IOException {
298        int index = 0;
299        long sz = 0;
300        int b = 0;
301
302        do {
303            b = readOneByte();
304            if (b == -1) {
305                throw new IOException("Premature end of stream reading size");
306            }
307            sz |= (b & 0x7f) << (index++ * 7);
308        } while (0 != (b & 0x80));
309        return sz;
310    }
311}