001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.gzip;
020
021import java.io.BufferedInputStream;
022import java.io.ByteArrayOutputStream;
023import java.io.DataInput;
024import java.io.DataInputStream;
025import java.io.EOFException;
026import java.io.IOException;
027import java.io.InputStream;
028import java.util.zip.CRC32;
029import java.util.zip.DataFormatException;
030import java.util.zip.Deflater;
031import java.util.zip.Inflater;
032
033import org.apache.commons.compress.compressors.CompressorInputStream;
034import org.apache.commons.compress.utils.ByteUtils;
035import org.apache.commons.compress.utils.CountingInputStream;
036import org.apache.commons.compress.utils.IOUtils;
037import org.apache.commons.compress.utils.InputStreamStatistics;
038
039/**
040 * Input stream that decompresses .gz files.
041 *
042 * <p>This supports decompressing concatenated .gz files which is important
043 * when decompressing standalone .gz files.</p>
044 *
045 * <p>
046 * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
047 * files: it stops after the first member and silently ignores the rest.
048 * It doesn't leave the read position to point to the beginning of the next
049 * member, which makes it difficult workaround the lack of concatenation
050 * support.
051 * </p>
052 *
053 * <p>
054 * Instead of using {@code GZIPInputStream}, this class has its own .gz
055 * container format decoder. The actual decompression is done with
056 * {@link java.util.zip.Inflater}.
057 * </p>
058 *
059 * <p>If you use the constructor {@code GzipCompressorInputStream(in)}
060 * or {@code GzipCompressorInputStream(in, false)} with some {@code
061 * InputStream} {@code in} then {@link #read} will return -1 as soon
062 * as the first internal member has been read completely. The stream
063 * {@code in} will be positioned at the start of the second gzip
064 * member if there is one.</p>
065 *
066 * <p>If you use the constructor {@code GzipCompressorInputStream(in,
067 * true)} with some {@code InputStream} {@code in} then {@link #read}
068 * will return -1 once the stream {@code in} has been exhausted. The
069 * data read from a stream constructed this way will consist of the
070 * concatenated data of all gzip members contained inside {@code
071 * in}.</p>
072 *
073 * @see "https://tools.ietf.org/html/rfc1952"
074 */
075public class GzipCompressorInputStream extends CompressorInputStream
076    implements InputStreamStatistics {
077
078    // Header flags
079    // private static final int FTEXT = 0x01; // Uninteresting for us
080    private static final int FHCRC = 0x02;
081    private static final int FEXTRA = 0x04;
082    private static final int FNAME = 0x08;
083    private static final int FCOMMENT = 0x10;
084    private static final int FRESERVED = 0xE0;
085
086    /**
087     * Checks if the signature matches what is expected for a .gz file.
088     *
089     * @param signature the bytes to check
090     * @param length    the number of bytes to check
091     * @return          true if this is a .gz stream, false otherwise
092     *
093     * @since 1.1
094     */
095    public static boolean matches(final byte[] signature, final int length) {
096        return length >= 2 && signature[0] == 31 && signature[1] == -117;
097    }
098
099    private static byte[] readToNull(final DataInput inData) throws IOException {
100        try (final ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
101            int b;
102            while ((b = inData.readUnsignedByte()) != 0x00) { // NOPMD NOSONAR
103                bos.write(b);
104            }
105            return bos.toByteArray();
106        }
107    }
108
109    private final CountingInputStream countingStream;
110
111    // Compressed input stream, possibly wrapped in a
112    // BufferedInputStream, always wrapped in countingStream above
113    private final InputStream in;
114
115    // True if decompressing multi member streams.
116    private final boolean decompressConcatenated;
117
118    // Buffer to hold the input data
119    private final byte[] buf = new byte[8192];
120
121    // Amount of data in buf.
122    private int bufUsed;
123
124    // Decompressor
125    private Inflater inf = new Inflater(true);
126
127    // CRC32 from uncompressed data
128    private final CRC32 crc = new CRC32();
129
130    // True once everything has been decompressed
131    private boolean endReached;
132
133    // used in no-arg read method
134    private final byte[] oneByte = new byte[1];
135
136    private final GzipParameters parameters = new GzipParameters();
137
138    /**
139     * Constructs a new input stream that decompresses gzip-compressed data
140     * from the specified input stream.
141     * <p>
142     * This is equivalent to
143     * {@code GzipCompressorInputStream(inputStream, false)} and thus
144     * will not decompress concatenated .gz files.
145     *
146     * @param inputStream  the InputStream from which this object should
147     *                     be created of
148     *
149     * @throws IOException if the stream could not be created
150     */
151    public GzipCompressorInputStream(final InputStream inputStream)
152            throws IOException {
153        this(inputStream, false);
154    }
155
156    /**
157     * Constructs a new input stream that decompresses gzip-compressed data
158     * from the specified input stream.
159     * <p>
160     * If {@code decompressConcatenated} is {@code false}:
161     * This decompressor might read more input than it will actually use.
162     * If {@code inputStream} supports {@code mark} and
163     * {@code reset}, then the input position will be adjusted
164     * so that it is right after the last byte of the compressed stream.
165     * If {@code mark} isn't supported, the input position will be
166     * undefined.
167     *
168     * @param inputStream  the InputStream from which this object should
169     *                     be created of
170     * @param decompressConcatenated
171     *                     if true, decompress until the end of the input;
172     *                     if false, stop after the first .gz member
173     *
174     * @throws IOException if the stream could not be created
175     */
176    public GzipCompressorInputStream(final InputStream inputStream,
177                                     final boolean decompressConcatenated)
178            throws IOException {
179        countingStream = new CountingInputStream(inputStream);
180        // Mark support is strictly needed for concatenated files only,
181        // but it's simpler if it is always available.
182        if (countingStream.markSupported()) {
183            in = countingStream;
184        } else {
185            in = new BufferedInputStream(countingStream);
186        }
187
188        this.decompressConcatenated = decompressConcatenated;
189        init(true);
190    }
191
192    /**
193     * Closes the input stream (unless it is System.in).
194     *
195     * @since 1.2
196     */
197    @Override
198    public void close() throws IOException {
199        if (inf != null) {
200            inf.end();
201            inf = null;
202        }
203
204        if (this.in != System.in) {
205            this.in.close();
206        }
207    }
208
209    /**
210     * @since 1.17
211     */
212    @Override
213    public long getCompressedCount() {
214        return countingStream.getBytesRead();
215    }
216
217    /**
218     * Provides the stream's meta data - may change with each stream
219     * when decompressing concatenated streams.
220     * @return the stream's meta data
221     * @since 1.8
222     */
223    public GzipParameters getMetaData() {
224        return parameters;
225    }
226
227    private boolean init(final boolean isFirstMember) throws IOException {
228        assert isFirstMember || decompressConcatenated;
229
230        // Check the magic bytes without a possibility of EOFException.
231        final int magic0 = in.read();
232
233        // If end of input was reached after decompressing at least
234        // one .gz member, we have reached the end of the file successfully.
235        if (magic0 == -1 && !isFirstMember) {
236            return false;
237        }
238
239        if (magic0 != 31 || in.read() != 139) {
240            throw new IOException(isFirstMember
241                                  ? "Input is not in the .gz format"
242                                  : "Garbage after a valid .gz stream");
243        }
244
245        // Parsing the rest of the header may throw EOFException.
246        final DataInput inData = new DataInputStream(in);
247        final int method = inData.readUnsignedByte();
248        if (method != Deflater.DEFLATED) {
249            throw new IOException("Unsupported compression method "
250                                  + method + " in the .gz header");
251        }
252
253        final int flg = inData.readUnsignedByte();
254        if ((flg & FRESERVED) != 0) {
255            throw new IOException(
256                    "Reserved flags are set in the .gz header");
257        }
258
259        parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4) * 1000);
260        switch (inData.readUnsignedByte()) { // extra flags
261        case 2:
262            parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
263            break;
264        case 4:
265            parameters.setCompressionLevel(Deflater.BEST_SPEED);
266            break;
267        default:
268            // ignored for now
269            break;
270        }
271        parameters.setOperatingSystem(inData.readUnsignedByte());
272
273        // Extra field, ignored
274        if ((flg & FEXTRA) != 0) {
275            int xlen = inData.readUnsignedByte();
276            xlen |= inData.readUnsignedByte() << 8;
277
278            // This isn't as efficient as calling in.skip would be,
279            // but it's lazier to handle unexpected end of input this way.
280            // Most files don't have an extra field anyway.
281            while (xlen-- > 0) {
282                inData.readUnsignedByte();
283            }
284        }
285
286        // Original file name
287        if ((flg & FNAME) != 0) {
288            parameters.setFileName(new String(readToNull(inData), GzipUtils.GZIP_ENCODING));
289        }
290
291        // Comment
292        if ((flg & FCOMMENT) != 0) {
293            parameters.setComment(new String(readToNull(inData), GzipUtils.GZIP_ENCODING));
294        }
295
296        // Header "CRC16" which is actually a truncated CRC32 (which isn't
297        // as good as real CRC16). I don't know if any encoder implementation
298        // sets this, so it's not worth trying to verify it. GNU gzip 1.4
299        // doesn't support this field, but zlib seems to be able to at least
300        // skip over it.
301        if ((flg & FHCRC) != 0) {
302            inData.readShort();
303        }
304
305        // Reset
306        inf.reset();
307        crc.reset();
308
309        return true;
310    }
311
312    @Override
313    public int read() throws IOException {
314        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
315    }
316
317    /**
318     * {@inheritDoc}
319     *
320     * @since 1.1
321     */
322    @Override
323    public int read(final byte[] b, int off, int len) throws IOException {
324        if (len == 0) {
325            return 0;
326        }
327        if (endReached) {
328            return -1;
329        }
330
331        int size = 0;
332
333        while (len > 0) {
334            if (inf.needsInput()) {
335                // Remember the current position because we may need to
336                // rewind after reading too much input.
337                in.mark(buf.length);
338
339                bufUsed = in.read(buf);
340                if (bufUsed == -1) {
341                    throw new EOFException();
342                }
343
344                inf.setInput(buf, 0, bufUsed);
345            }
346
347            final int ret;
348            try {
349                ret = inf.inflate(b, off, len);
350            } catch (final DataFormatException e) { // NOSONAR
351                throw new IOException("Gzip-compressed data is corrupt");
352            }
353
354            crc.update(b, off, ret);
355            off += ret;
356            len -= ret;
357            size += ret;
358            count(ret);
359
360            if (inf.finished()) {
361                // We may have read too many bytes. Rewind the read
362                // position to match the actual amount used.
363                in.reset();
364
365                final int skipAmount = bufUsed - inf.getRemaining();
366                if (IOUtils.skip(in, skipAmount) != skipAmount) {
367                    throw new IOException();
368                }
369
370                bufUsed = 0;
371
372                final DataInput inData = new DataInputStream(in);
373
374                // CRC32
375                final long crcStored = ByteUtils.fromLittleEndian(inData, 4);
376
377                if (crcStored != crc.getValue()) {
378                    throw new IOException("Gzip-compressed data is corrupt "
379                                          + "(CRC32 error)");
380                }
381
382                // Uncompressed size modulo 2^32 (ISIZE in the spec)
383                final long isize = ByteUtils.fromLittleEndian(inData, 4);
384
385                if (isize != (inf.getBytesWritten() & 0xffffffffL)) {
386                    throw new IOException("Gzip-compressed data is corrupt"
387                                          + "(uncompressed size mismatch)");
388                }
389
390                // See if this is the end of the file.
391                if (!decompressConcatenated || !init(false)) {
392                    inf.end();
393                    inf = null;
394                    endReached = true;
395                    return size == 0 ? -1 : size;
396                }
397            }
398        }
399
400        return size;
401    }
402}