/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.compressors.gzip;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.CRC32;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;

import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.utils.ByteUtils;
import org.apache.commons.compress.utils.CountingInputStream;
import org.apache.commons.compress.utils.IOUtils;
import org.apache.commons.compress.utils.InputStreamStatistics;

/**
 * Input stream that decompresses .gz files.
 *
 * <p>This supports decompressing concatenated .gz files which is important
 * when decompressing standalone .gz files.</p>
 *
 * <p>
 * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
 * files: it stops after the first member and silently ignores the rest.
 * It doesn't leave the read position to point to the beginning of the next
 * member, which makes it difficult to work around the lack of concatenation
 * support.
 * </p>
 *
 * <p>
 * Instead of using {@code GZIPInputStream}, this class has its own .gz
 * container format decoder. The actual decompression is done with
 * {@link java.util.zip.Inflater}.
 * </p>
 *
 * <p>If you use the constructor {@code GzipCompressorInputStream(in)}
 * or {@code GzipCompressorInputStream(in, false)} with some {@code
 * InputStream} {@code in} then {@link #read} will return -1 as soon
 * as the first internal member has been read completely. The stream
 * {@code in} will be positioned at the start of the second gzip
 * member if there is one.</p>
 *
 * <p>If you use the constructor {@code GzipCompressorInputStream(in,
 * true)} with some {@code InputStream} {@code in} then {@link #read}
 * will return -1 once the stream {@code in} has been exhausted. The
 * data read from a stream constructed this way will consist of the
 * concatenated data of all gzip members contained inside {@code
 * in}.</p>
 *
 * @see "https://tools.ietf.org/html/rfc1952"
 */
public class GzipCompressorInputStream extends CompressorInputStream
    implements InputStreamStatistics {

    // Header flags
    // private static final int FTEXT = 0x01; // Uninteresting for us
    private static final int FHCRC = 0x02;
    private static final int FEXTRA = 0x04;
    private static final int FNAME = 0x08;
    private static final int FCOMMENT = 0x10;
    private static final int FRESERVED = 0xE0;

    /**
     * Checks if the signature matches what is expected for a .gz file.
     *
     * @param signature the bytes to check
     * @param length    the number of bytes to check
     * @return true if this is a .gz stream, false otherwise
     *
     * @since 1.1
     */
    public static boolean matches(final byte[] signature, final int length) {
        // 31 and -117 are the two gzip magic bytes 0x1f and 0x8b as signed bytes.
        return length >= 2 && signature[0] == 31 && signature[1] == -117;
    }

    /**
     * Reads bytes up to (and consuming, but not returning) the next zero byte.
     *
     * @param inData the source to read from
     * @return the bytes read before the terminating zero byte
     * @throws IOException if reading fails; {@code readUnsignedByte} signals
     *                     a premature end of input with an {@link EOFException}
     */
    private static byte[] readToNull(final DataInput inData) throws IOException {
        try (final ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
            int b;
            while ((b = inData.readUnsignedByte()) != 0x00) { // NOPMD NOSONAR
                bos.write(b);
            }
            return bos.toByteArray();
        }
    }

    private final CountingInputStream countingStream;

    // Compressed input stream, possibly wrapped in a
    // BufferedInputStream, always wrapped in countingStream above
    private final InputStream in;

    // True if decompressing multi member streams.
    private final boolean decompressConcatenated;

    // Buffer to hold the input data
    private final byte[] buf = new byte[8192];

    // Amount of data in buf.
    private int bufUsed;

    // Decompressor; set to null once the stream has ended or been closed
    private Inflater inf = new Inflater(true);

    // CRC32 from uncompressed data
    private final CRC32 crc = new CRC32();

    // True once everything has been decompressed
    private boolean endReached;

    // used in no-arg read method
    private final byte[] oneByte = new byte[1];

    private final GzipParameters parameters = new GzipParameters();

    /**
     * Constructs a new input stream that decompresses gzip-compressed data
     * from the specified input stream.
     * <p>
     * This is equivalent to
     * {@code GzipCompressorInputStream(inputStream, false)} and thus
     * will not decompress concatenated .gz files.
     *
     * @param inputStream  the InputStream from which this object should
     *                     be created of
     *
     * @throws IOException if the stream could not be created
     */
    public GzipCompressorInputStream(final InputStream inputStream)
            throws IOException {
        this(inputStream, false);
    }

    /**
     * Constructs a new input stream that decompresses gzip-compressed data
     * from the specified input stream.
     * <p>
     * If {@code decompressConcatenated} is {@code false}:
     * This decompressor might read more input than it will actually use.
     * If {@code inputStream} supports {@code mark} and
     * {@code reset}, then the input position will be adjusted
     * so that it is right after the last byte of the compressed stream.
     * If {@code mark} isn't supported, the input position will be
     * undefined.
     *
     * @param inputStream  the InputStream from which this object should
     *                     be created of
     * @param decompressConcatenated
     *                     if true, decompress until the end of the input;
     *                     if false, stop after the first .gz member
     *
     * @throws IOException if the stream could not be created
     */
    public GzipCompressorInputStream(final InputStream inputStream,
                                     final boolean decompressConcatenated)
            throws IOException {
        countingStream = new CountingInputStream(inputStream);
        // Mark support is strictly needed for concatenated files only,
        // but it's simpler if it is always available.
        if (countingStream.markSupported()) {
            in = countingStream;
        } else {
            in = new BufferedInputStream(countingStream);
        }

        this.decompressConcatenated = decompressConcatenated;
        init(true);
    }

    /**
     * Closes the input stream (unless it is System.in).
     *
     * <p>Also releases the {@link Inflater}; calling this method more than
     * once does not release it twice.</p>
     *
     * @since 1.2
     */
    @Override
    public void close() throws IOException {
        if (inf != null) {
            inf.end();
            inf = null;
        }

        // NOTE(review): "in" is always the CountingInputStream created in the
        // constructor or a BufferedInputStream around it, so this comparison
        // can never be false and a wrapped System.in is closed as well.
        // Behavior intentionally left unchanged here - confirm what is wanted.
        if (this.in != System.in) {
            this.in.close();
        }
    }

    /**
     * Returns the number of bytes read from the underlying stream so far,
     * as tracked by the counting wrapper around it.
     *
     * @return the amount of raw (compressed) input consumed
     * @since 1.17
     */
    @Override
    public long getCompressedCount() {
        return countingStream.getBytesRead();
    }

    /**
     * Provides the stream's meta data - may change with each stream
     * when decompressing concatenated streams.
     * @return the stream's meta data
     * @since 1.8
     */
    public GzipParameters getMetaData() {
        return parameters;
    }

    /**
     * Parses one .gz member header, stores its fields in {@link #parameters}
     * and resets the inflater and the CRC for the member's payload.
     *
     * @param isFirstMember whether this is the first member of the stream;
     *                      affects how a missing or wrong magic is reported
     * @return {@code false} if the input ended cleanly before a further
     *         member (only possible when {@code isFirstMember} is false),
     *         {@code true} if a header has been parsed
     * @throws IOException if the header is malformed or unsupported, or if
     *                     the input ends in the middle of the header
     */
    private boolean init(final boolean isFirstMember) throws IOException {
        assert isFirstMember || decompressConcatenated;

        // Check the magic bytes without a possibility of EOFException.
        final int magic0 = in.read();

        // If end of input was reached after decompressing at least
        // one .gz member, we have reached the end of the file successfully.
        if (magic0 == -1 && !isFirstMember) {
            return false;
        }

        if (magic0 != 31 || in.read() != 139) {
            throw new IOException(isFirstMember
                                  ? "Input is not in the .gz format"
                                  : "Garbage after a valid .gz stream");
        }

        // Parsing the rest of the header may throw EOFException.
        final DataInput inData = new DataInputStream(in);
        final int method = inData.readUnsignedByte();
        if (method != Deflater.DEFLATED) {
            throw new IOException("Unsupported compression method "
                                  + method + " in the .gz header");
        }

        final int flg = inData.readUnsignedByte();
        if ((flg & FRESERVED) != 0) {
            throw new IOException(
                "Reserved flags are set in the .gz header");
        }

        // The header stores the time as a 4 byte little endian value in
        // seconds; it is scaled by 1000 before being handed to the parameters.
        parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4) * 1000);
        switch (inData.readUnsignedByte()) { // extra flags
        case 2:
            parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
            break;
        case 4:
            parameters.setCompressionLevel(Deflater.BEST_SPEED);
            break;
        default:
            // ignored for now
            break;
        }
        parameters.setOperatingSystem(inData.readUnsignedByte());

        // Extra field, ignored
        if ((flg & FEXTRA) != 0) {
            // 16 bit little endian length of the extra field
            int xlen = inData.readUnsignedByte();
            xlen |= inData.readUnsignedByte() << 8;

            // This isn't as efficient as calling in.skip would be,
            // but it's lazier to handle unexpected end of input this way.
            // Most files don't have an extra field anyway.
            while (xlen-- > 0) {
                inData.readUnsignedByte();
            }
        }

        // Original file name
        if ((flg & FNAME) != 0) {
            parameters.setFilename(new String(readToNull(inData), GzipUtils.GZIP_ENCODING));
        }

        // Comment
        if ((flg & FCOMMENT) != 0) {
            parameters.setComment(new String(readToNull(inData), GzipUtils.GZIP_ENCODING));
        }

        // Header "CRC16" which is actually a truncated CRC32 (which isn't
        // as good as real CRC16). I don't know if any encoder implementation
        // sets this, so it's not worth trying to verify it. GNU gzip 1.4
        // doesn't support this field, but zlib seems to be able to at least
        // skip over it.
        if ((flg & FHCRC) != 0) {
            inData.readShort();
        }

        // Reset
        inf.reset();
        crc.reset();

        return true;
    }

    /**
     * Reads a single decompressed byte.
     *
     * @return the byte as a value between 0 and 255, or -1 at the end of
     *         the stream
     * @throws IOException if reading or decompressing fails
     */
    @Override
    public int read() throws IOException {
        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
    }

    /**
     * {@inheritDoc}
     *
     * <p>When the end of a .gz member is reached, its CRC32 and
     * uncompressed size trailer are verified before this method either
     * continues with the next member (concatenated mode) or reports the
     * end of the stream.</p>
     *
     * @throws IOException if the stream has been closed, the input ends
     *                     prematurely, or the compressed data is corrupt
     *
     * @since 1.1
     */
    @Override
    public int read(final byte[] b, int off, int len) throws IOException {
        if (len == 0) {
            return 0;
        }
        if (endReached) {
            return -1;
        }
        if (inf == null) {
            // close() has released the inflater; fail with a checked
            // exception rather than a NullPointerException below.
            throw new IOException("Stream closed");
        }

        int size = 0;

        while (len > 0) {
            if (inf.needsInput()) {
                // Remember the current position because we may need to
                // rewind after reading too much input.
                in.mark(buf.length);

                bufUsed = in.read(buf);
                if (bufUsed == -1) {
                    throw new EOFException();
                }

                inf.setInput(buf, 0, bufUsed);
            }

            final int ret;
            try {
                ret = inf.inflate(b, off, len);
            } catch (final DataFormatException e) {
                // Keep the inflater's diagnostic available as the cause.
                throw new IOException("Gzip-compressed data is corrupt", e);
            }

            crc.update(b, off, ret);
            off += ret;
            len -= ret;
            size += ret;
            count(ret);

            if (inf.finished()) {
                // We may have read too many bytes. Rewind the read
                // position to match the actual amount used.
                in.reset();

                final int skipAmount = bufUsed - inf.getRemaining();
                if (IOUtils.skip(in, skipAmount) != skipAmount) {
                    throw new IOException("Unable to skip to the end of the"
                                          + " compressed data of the .gz member");
                }

                bufUsed = 0;

                final DataInput inData = new DataInputStream(in);

                // CRC32
                final long crcStored = ByteUtils.fromLittleEndian(inData, 4);

                if (crcStored != crc.getValue()) {
                    throw new IOException("Gzip-compressed data is corrupt "
                                          + "(CRC32 error)");
                }

                // Uncompressed size modulo 2^32 (ISIZE in the spec)
                final long isize = ByteUtils.fromLittleEndian(inData, 4);

                if (isize != (inf.getBytesWritten() & 0xffffffffL)) {
                    throw new IOException("Gzip-compressed data is corrupt "
                                          + "(uncompressed size mismatch)");
                }

                // See if this is the end of the file.
                if (!decompressConcatenated || !init(false)) {
                    inf.end();
                    inf = null;
                    endReached = true;
                    return size == 0 ? -1 : size;
                }
            }
        }

        return size;
    }
}