001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.snappy; 020 021import java.io.IOException; 022import java.io.InputStream; 023 024import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream; 025import org.apache.commons.compress.utils.ByteUtils; 026 027/** 028 * CompressorInputStream for the raw Snappy format. 029 * 030 * <p>This implementation uses an internal buffer in order to handle 031 * the back-references that are at the heart of the LZ77 algorithm. 032 * The size of the buffer must be at least as big as the biggest 033 * offset used in the compressed stream. The current version of the 034 * Snappy algorithm as defined by Google works on 32k blocks and 035 * doesn't contain offsets bigger than 32k which is the default block 036 * size used by this class.</p> 037 * 038 * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a> 039 * @since 1.7 040 */ 041public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream { 042 043 private enum State { 044 NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE 045 } 046 047 /** Mask used to determine the type of "tag" is being processed */ 048 private static final int TAG_MASK = 0x03; 049 050 /** Default block size */ 051 public static final int DEFAULT_BLOCK_SIZE = 32768; 052 053 /** The size of the uncompressed data */ 054 private final int size; 055 056 /** Number of uncompressed bytes still to be read. */ 057 private int uncompressedBytesRemaining; 058 059 /** Current state of the stream */ 060 private State state = State.NO_BLOCK; 061 062 private boolean endReached; 063 064 /** 065 * Constructor using the default buffer size of 32k. 066 * 067 * @param is 068 * An InputStream to read compressed data from 069 * 070 * @throws IOException if reading fails 071 */ 072 public SnappyCompressorInputStream(final InputStream is) throws IOException { 073 this(is, DEFAULT_BLOCK_SIZE); 074 } 075 076 /** 077 * Constructor using a configurable buffer size. 078 * 079 * @param is 080 * An InputStream to read compressed data from 081 * @param blockSize 082 * The block size used in compression 083 * 084 * @throws IOException if reading fails 085 * @throws IllegalArgumentException if blockSize is not bigger than 0 086 */ 087 public SnappyCompressorInputStream(final InputStream is, final int blockSize) 088 throws IOException { 089 super(is, blockSize); 090 uncompressedBytesRemaining = size = (int) readSize(); 091 } 092 093 /** 094 * Try to fill the buffer with the next block of data. 095 */ 096 private void fill() throws IOException { 097 if (uncompressedBytesRemaining == 0) { 098 endReached = true; 099 return; 100 } 101 102 int b = readOneByte(); 103 if (b == -1) { 104 throw new IOException("Premature end of stream reading block start"); 105 } 106 int length = 0; 107 int offset = 0; 108 109 switch (b & TAG_MASK) { 110 111 case 0x00: 112 113 length = readLiteralLength(b); 114 if (length < 0) { 115 throw new IOException("Illegal block with a negative literal size found"); 116 } 117 uncompressedBytesRemaining -= length; 118 startLiteral(length); 119 state = State.IN_LITERAL; 120 break; 121 122 case 0x01: 123 124 /* 125 * These elements can encode lengths between [4..11] bytes and 126 * offsets between [0..2047] bytes. (len-4) occupies three bits 127 * and is stored in bits [2..4] of the tag byte. The offset 128 * occupies 11 bits, of which the upper three are stored in the 129 * upper three bits ([5..7]) of the tag byte, and the lower 130 * eight are stored in a byte following the tag byte. 131 */ 132 133 length = 4 + (b >> 2 & 0x07); 134 uncompressedBytesRemaining -= length; 135 offset = (b & 0xE0) << 3; 136 b = readOneByte(); 137 if (b == -1) { 138 throw new IOException("Premature end of stream reading back-reference length"); 139 } 140 offset |= b; 141 142 try { 143 startBackReference(offset, length); 144 } catch (final IllegalArgumentException ex) { 145 throw new IOException("Illegal block with bad offset found", ex); 146 } 147 state = State.IN_BACK_REFERENCE; 148 break; 149 150 case 0x02: 151 152 /* 153 * These elements can encode lengths between [1..64] and offsets 154 * from [0..65535]. (len-1) occupies six bits and is stored in 155 * the upper six bits ([2..7]) of the tag byte. The offset is 156 * stored as a little-endian 16-bit integer in the two bytes 157 * following the tag byte. 158 */ 159 160 length = (b >> 2) + 1; 161 if (length < 0) { 162 throw new IOException("Illegal block with a negative match length found"); 163 } 164 uncompressedBytesRemaining -= length; 165 166 offset = (int) ByteUtils.fromLittleEndian(supplier, 2); 167 168 try { 169 startBackReference(offset, length); 170 } catch (final IllegalArgumentException ex) { 171 throw new IOException("Illegal block with bad offset found", ex); 172 } 173 state = State.IN_BACK_REFERENCE; 174 break; 175 176 case 0x03: 177 178 /* 179 * These are like the copies with 2-byte offsets (see previous 180 * subsection), except that the offset is stored as a 32-bit 181 * integer instead of a 16-bit integer (and thus will occupy 182 * four bytes). 183 */ 184 185 length = (b >> 2) + 1; 186 if (length < 0) { 187 throw new IOException("Illegal block with a negative match length found"); 188 } 189 uncompressedBytesRemaining -= length; 190 191 offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff; 192 193 try { 194 startBackReference(offset, length); 195 } catch (final IllegalArgumentException ex) { 196 throw new IOException("Illegal block with bad offset found", ex); 197 } 198 state = State.IN_BACK_REFERENCE; 199 break; 200 default: 201 // impossible as TAG_MASK is two bits and all four possible cases have been covered 202 break; 203 } 204 } 205 206 /** 207 * Gets the uncompressed size of the stream 208 * 209 * @return the uncompressed size 210 */ 211 @Override 212 public int getSize() { 213 return size; 214 } 215 216 /** 217 * {@inheritDoc} 218 */ 219 @Override 220 public int read(final byte[] b, final int off, final int len) throws IOException { 221 if (len == 0) { 222 return 0; 223 } 224 if (endReached) { 225 return -1; 226 } 227 switch (state) { 228 case NO_BLOCK: 229 fill(); 230 return read(b, off, len); 231 case IN_LITERAL: 232 final int litLen = readLiteral(b, off, len); 233 if (!hasMoreDataInBlock()) { 234 state = State.NO_BLOCK; 235 } 236 return litLen > 0 ? litLen : read(b, off, len); 237 case IN_BACK_REFERENCE: 238 final int backReferenceLen = readBackReference(b, off, len); 239 if (!hasMoreDataInBlock()) { 240 state = State.NO_BLOCK; 241 } 242 return backReferenceLen > 0 ? backReferenceLen : read(b, off, len); 243 default: 244 throw new IOException("Unknown stream state " + state); 245 } 246 } 247 248 /* 249 * For literals up to and including 60 bytes in length, the 250 * upper six bits of the tag byte contain (len-1). The literal 251 * follows immediately thereafter in the bytestream. - For 252 * longer literals, the (len-1) value is stored after the tag 253 * byte, little-endian. The upper six bits of the tag byte 254 * describe how many bytes are used for the length; 60, 61, 62 255 * or 63 for 1-4 bytes, respectively. The literal itself follows 256 * after the length. 257 */ 258 private int readLiteralLength(final int b) throws IOException { 259 final int length; 260 switch (b >> 2) { 261 case 60: 262 length = readOneByte(); 263 if (length == -1) { 264 throw new IOException("Premature end of stream reading literal length"); 265 } 266 break; 267 case 61: 268 length = (int) ByteUtils.fromLittleEndian(supplier, 2); 269 break; 270 case 62: 271 length = (int) ByteUtils.fromLittleEndian(supplier, 3); 272 break; 273 case 63: 274 length = (int) ByteUtils.fromLittleEndian(supplier, 4); 275 break; 276 default: 277 length = b >> 2; 278 break; 279 } 280 281 return length + 1; 282 } 283 284 /** 285 * The stream starts with the uncompressed length (up to a maximum of 2^32 - 286 * 1), stored as a little-endian varint. Varints consist of a series of 287 * bytes, where the lower 7 bits are data and the upper bit is set iff there 288 * are more bytes to be read. In other words, an uncompressed length of 64 289 * would be stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE) 290 * would be stored as 0xFE 0xFF 0x7F. 291 * 292 * @return The size of the uncompressed data 293 * 294 * @throws IOException 295 * Could not read a byte 296 */ 297 private long readSize() throws IOException { 298 int index = 0; 299 long sz = 0; 300 int b = 0; 301 302 do { 303 b = readOneByte(); 304 if (b == -1) { 305 throw new IOException("Premature end of stream reading size"); 306 } 307 sz |= (b & 0x7f) << index++ * 7; 308 } while (0 != (b & 0x80)); 309 return sz; 310 } 311}