/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.compressors.lz77support;

import java.io.IOException;
import java.util.Arrays;
import java.util.Objects;

/**
 * Helper class for compression algorithms that use the ideas of LZ77.
 *
 * <p>Most LZ77 derived algorithms split input data into blocks of
 * uncompressed data (called literal blocks) and back-references
 * (pairs of offsets and lengths) that state "add {@code length}
 * bytes that are the same as those already written starting
 * {@code offset} bytes before the current position". The details
 * of how those blocks and back-references are encoded are quite
 * different between the algorithms and some algorithms perform
 * additional steps (Huffman encoding in the case of DEFLATE for
 * example).</p>
 *
 * <p>This class attempts to extract the core logic - finding
 * back-references - so it can be re-used. It follows the algorithm
 * explained in section 4 of RFC 1951 (DEFLATE) and supports the
 * "lazy match" optimization described there when it is enabled via
 * {@link Parameters}. The three-byte hash function used in this
 * class is the same as the one used by zlib and InfoZIP's ZIP
 * implementation of DEFLATE. The whole class is strongly inspired
 * by InfoZIP's implementation.</p>
 *
 * <p>LZ77 is used vaguely here (as well as in many other places
 * that talk about it :-), LZSS would likely be closer to the truth
 * but LZ77 has become the synonym for a whole family of
 * algorithms.</p>
 *
 * <p>The API consists of a compressor that is fed {@code byte}s
 * and emits {@link Block}s to a registered callback where the
 * blocks represent either {@link LiteralBlock literal blocks},
 * {@link BackReference back-references} or {@link EOD end of data
 * markers}. In order to ensure the callback receives all
 * information, the {@link #finish} method must be used once all
 * data has been fed into the compressor.</p>
 *
 * <p>Several parameters influence the outcome of the "compression":</p>
 * <dl>
 *
 *  <dt>{@code windowSize}</dt>
 *  <dd>the size of the sliding window, must be a power of two -
 *  this determines the maximum offset a back-reference can take.
 *  The compressor maintains a buffer of twice the size of
 *  {@code windowSize} - real world values are in the area of
 *  32k.</dd>
 *
 *  <dt>{@code minBackReferenceLength}</dt>
 *  <dd>Minimal length of a back-reference found. A true minimum of
 *  3 is hard-coded inside of this implementation but bigger lengths
 *  can be configured.</dd>
 *
 *  <dt>{@code maxBackReferenceLength}</dt>
 *  <dd>Maximal length of a back-reference found.</dd>
 *
 *  <dt>{@code maxOffset}</dt>
 *  <dd>Maximal offset of a back-reference.</dd>
 *
 *  <dt>{@code maxLiteralLength}</dt>
 *  <dd>Maximal length of a literal block.</dd>
 * </dl>
 *
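 * <p>A minimal usage sketch - the callback shown here only prints the
 * blocks it receives. The {@code Parameters.builder} call assumes the
 * builder offered by the companion {@link Parameters} class of this
 * package; any window size that is a power of two works:</p>
 *
 * <pre>{@code
 * LZ77Compressor compressor = new LZ77Compressor(
 *     Parameters.builder(32 * 1024).build(),
 *     block -> System.out.println(block));
 * compressor.compress("blah blah blah".getBytes(StandardCharsets.US_ASCII));
 * compressor.finish(); // emits any pending literal data and the EOD marker
 * }</pre>
 *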
 * @see "https://tools.ietf.org/html/rfc1951#section-4"
 * @since 1.14
 * @NotThreadSafe
 */
public class LZ77Compressor {

    /**
     * Represents a back-reference.
     */
    public static final class BackReference extends Block {
        private final int offset, length;

        /**
         * Constructs a back-reference with the given offset and length.
         * @param offset the offset of the back-reference
         * @param length the length of the back-reference
         */
        public BackReference(final int offset, final int length) {
            this.offset = offset;
            this.length = length;
        }

        /**
         * Provides the length of the back-reference.
         * @return the length
         */
        public int getLength() {
            return length;
        }

        /**
         * Provides the offset of the back-reference.
         * @return the offset
         */
        public int getOffset() {
            return offset;
        }

        @Override
        public BlockType getType() {
            return BlockType.BACK_REFERENCE;
        }

        @Override
        public String toString() {
            return "BackReference with offset " + offset + " and length " + length;
        }
    }

    /**
     * Base class representing blocks the compressor may emit.
     *
     * <p>This class is not supposed to be subclassed by classes
     * outside of Commons Compress so it is considered internal and
     * changes that would break subclasses may get introduced with
     * future releases.</p>
     */
    public abstract static class Block {
        /** Enumeration of the block types the compressor may emit. */
        public enum BlockType {
            LITERAL, BACK_REFERENCE, EOD
        }

        /**
         * Returns the type of the block.
         * @return the block's type
         */
        public abstract BlockType getType();
    }

    /**
     * Callback invoked while the compressor processes data.
     *
     * <p>The callback is invoked on the same thread that receives the
     * bytes to compress and may be invoked multiple times during the
     * execution of {@link #compress} or {@link #finish}.</p>
     */
    public interface Callback {
        /**
         * Consumes a block.
         * @param b the block to consume
         * @throws IOException in case of an error
         */
        void accept(Block b) throws IOException;
    }

    /** A simple "we are done" marker. */
    public static final class EOD extends Block {
        @Override
        public BlockType getType() {
            return BlockType.EOD;
        }
    }

    /**
     * Represents a literal block of data.
     *
     * <p>For performance reasons this encapsulates the real data, not
     * a copy of it. Don't modify the data and process it inside of
     * {@link Callback#accept} immediately as it will get overwritten
     * sooner or later.</p>
     *
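     * <p>A sketch of a callback that copies the literal bytes before
     * they can be overwritten ({@code literals} is an illustrative
     * name only):</p>
     *
     * <pre>{@code
     * ByteArrayOutputStream literals = new ByteArrayOutputStream();
     * Callback cb = block -> {
     *     if (block.getType() == Block.BlockType.LITERAL) {
     *         LiteralBlock lit = (LiteralBlock) block;
     *         // copy the slice right away, the backing array is reused
     *         literals.write(lit.getData(), lit.getOffset(), lit.getLength());
     *     }
     * };
     * }</pre>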
     */
    public static final class LiteralBlock extends Block {
        private final byte[] data;
        private final int offset, length;

        /**
         * Constructs a literal block.
         * @param data the data backing the block
         * @param offset the offset into data where the block starts
         * @param length the length of the block
         */
        public LiteralBlock(final byte[] data, final int offset, final int length) {
            this.data = data;
            this.offset = offset;
            this.length = length;
        }

        /**
         * The literal data.
         *
         * <p>This returns a live view of the actual data in order to
         * avoid copying, modify the array at your own risk.</p>
         * @return the data
         */
        public byte[] getData() {
            return data;
        }

        /**
         * Length of the literal block.
         * @return the length
         */
        public int getLength() {
            return length;
        }

        /**
         * Offset into data where the literal block starts.
         * @return the offset
         */
        public int getOffset() {
            return offset;
        }

        @Override
        public BlockType getType() {
            return BlockType.LITERAL;
        }

        @Override
        public String toString() {
            return "LiteralBlock starting at " + offset + " with length " + length;
        }
    }

    private static final Block THE_EOD = new EOD();

    static final int NUMBER_OF_BYTES_IN_HASH = 3;
    private static final int NO_MATCH = -1;

    // we use a 15 bit hash code as calculated in nextHash
    private static final int HASH_SIZE = 1 << 15;
    private static final int HASH_MASK = HASH_SIZE - 1;

    private static final int H_SHIFT = 5;
    private final Parameters params;
    private final Callback callback;

    // the sliding window, twice as big as the "windowSize" parameter
    private final byte[] window;

    // the head of hash-chain - indexed by hash-code, points to the
    // location inside of window of the latest sequence of bytes with
    // the given hash.
    private final int[] head;
    // for each window-location points to the latest earlier location
    // with the same hash. Only stores values for the latest
    // "windowSize" elements, the index is "window location modulo
    // windowSize".
    private final int[] prev;
    // bit mask used when indexing into prev
    private final int wMask;
    private boolean initialized;
    // the position inside of window that shall be encoded right now
    private int currentPosition;
    // the number of bytes available to compress including the one at
    // currentPosition
    private int lookahead;
    // the hash of the three bytes starting at the current position
    private int insertHash;

    // the position inside the window where the current literal
    // block starts (in case we are inside a literal block).
    private int blockStart;

    // position of the current match
    private int matchStart = NO_MATCH;

    // number of missed insertString calls for the up to three last
    // bytes of the last match that can only be performed once more
    // data has been read
    private int missedInserts;

    /**
     * Initializes a compressor with parameters and a callback.
     * @param params the parameters
     * @param callback the callback
     * @throws NullPointerException if either parameter is {@code null}
     */
    public LZ77Compressor(final Parameters params, final Callback callback) {
        Objects.requireNonNull(params, "params");
        Objects.requireNonNull(callback, "callback");

        this.params = params;
        this.callback = callback;

        final int wSize = params.getWindowSize();
        window = new byte[wSize * 2];
        wMask = wSize - 1;
        head = new int[HASH_SIZE];
        Arrays.fill(head, NO_MATCH);
        prev = new int[wSize];
    }

    private void catchUpMissedInserts() {
        while (missedInserts > 0) {
            insertString(currentPosition - missedInserts--);
        }
    }

    private void compress() throws IOException {
        final int minMatch = params.getMinBackReferenceLength();
        final boolean lazy = params.getLazyMatching();
        final int lazyThreshold = params.getLazyMatchingThreshold();

        while (lookahead >= minMatch) {
            catchUpMissedInserts();
            int matchLength = 0;
            final int hashHead = insertString(currentPosition);
            if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) {
                // sets matchStart as a side effect
                matchLength = longestMatch(hashHead);

                if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) {
                    // try to find a longer match using the next position
                    matchLength = longestMatchForNextPosition(matchLength);
                }
            }
            if (matchLength >= minMatch) {
                if (blockStart != currentPosition) {
                    // emit preceding literal block
                    flushLiteralBlock();
                    blockStart = NO_MATCH;
                }
                flushBackReference(matchLength);
                insertStringsInMatch(matchLength);
                lookahead -= matchLength;
                currentPosition += matchLength;
                blockStart = currentPosition;
            } else {
                // no match, append to current or start a new literal
                lookahead--;
                currentPosition++;
                if (currentPosition - blockStart >= params.getMaxLiteralLength()) {
                    flushLiteralBlock();
                    blockStart = currentPosition;
                }
            }
        }
    }

    /**
     * Feeds bytes into the compressor which in turn may emit zero or
     * more blocks to the callback during the execution of this
     * method.
     * @param data the data to compress - must not be null
     * @throws IOException if the callback throws an exception
     */
    public void compress(final byte[] data) throws IOException {
        compress(data, 0, data.length);
    }

    /**
     * Feeds bytes into the compressor which in turn may emit zero or
     * more blocks to the callback during the execution of this
     * method.
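     *
     * <p>A sketch of feeding an input stream to an already configured
     * compressor in chunks - the variable {@code in} is assumed to be
     * a {@code java.io.InputStream} opened elsewhere:</p>
     *
     * <pre>{@code
     * byte[] buffer = new byte[8192];
     * int n;
     * while ((n = in.read(buffer)) != -1) {
     *     compressor.compress(buffer, 0, n);
     * }
     * compressor.finish();
     * }</pre>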
     * @param data the data to compress - must not be null
     * @param off the start offset of the data
     * @param len the number of bytes to compress
     * @throws IOException if the callback throws an exception
     */
    public void compress(final byte[] data, int off, int len) throws IOException {
        final int wSize = params.getWindowSize();
        while (len > wSize) { // chop into windowSize sized chunks
            doCompress(data, off, wSize);
            off += wSize;
            len -= wSize;
        }
        if (len > 0) {
            doCompress(data, off, len);
        }
    }

    // performs the actual algorithm with the pre-condition len <= windowSize
    private void doCompress(final byte[] data, final int off, final int len) throws IOException {
        final int spaceLeft = window.length - currentPosition - lookahead;
        if (len > spaceLeft) {
            slide();
        }
        System.arraycopy(data, off, window, currentPosition + lookahead, len);
        lookahead += len;
        if (!initialized && lookahead >= params.getMinBackReferenceLength()) {
            initialize();
        }
        if (initialized) {
            compress();
        }
    }

    /**
     * Tells the compressor to process all remaining data and signal
     * end of data to the callback.
     *
     * <p>The compressor will in turn emit at least one block ({@link
     * EOD}) but potentially multiple blocks to the callback during
     * the execution of this method.</p>
     * @throws IOException if the callback throws an exception
     */
    public void finish() throws IOException {
        if (blockStart != currentPosition || lookahead > 0) {
            currentPosition += lookahead;
            flushLiteralBlock();
        }
        callback.accept(THE_EOD);
    }

    private void flushBackReference(final int matchLength) throws IOException {
        callback.accept(new BackReference(currentPosition - matchStart, matchLength));
    }

    private void flushLiteralBlock() throws IOException {
        callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart));
    }

    private void initialize() {
        for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) {
            insertHash = nextHash(insertHash, window[i]);
        }
        initialized = true;
    }

    /**
     * Inserts the current three byte sequence into the dictionary and
     * returns the previous head of the hash-chain.
     *
     * <p>Updates {@code insertHash} and {@code prev} as a
     * side effect.</p>
     */
    private int insertString(final int pos) {
        insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]);
        final int hashHead = head[insertHash];
        prev[pos & wMask] = hashHead;
        head[insertHash] = pos;
        return hashHead;
    }

    private void insertStringsInMatch(final int matchLength) {
        // inserts strings contained in current match
        // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts
        final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH);
        // currentPosition has been inserted already
        for (int i = 1; i <= stop; i++) {
            insertString(currentPosition + i);
        }
        missedInserts = matchLength - stop - 1;
    }

    /**
     * Searches the hash chain for real matches and returns the length
     * of the longest match (a value smaller than
     * {@code minBackReferenceLength} if none was found) that isn't too
     * far away (with regard to maxOffset).
     *
     * <p>Sets matchStart to the index of the start position of the
     * longest match as a side effect.</p>
     */
    private int longestMatch(int matchHead) {
        final int minLength = params.getMinBackReferenceLength();
        int longestMatchLength = minLength - 1;
        final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead);
        final int minIndex = Math.max(0, currentPosition - params.getMaxOffset());
        final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength());
        final int maxCandidates = params.getMaxCandidates();
        for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) {
            int currentLength = 0;
            for (int i = 0; i < maxPossibleLength; i++) {
                if (window[matchHead + i] != window[currentPosition + i]) {
                    break;
                }
                currentLength++;
            }
            if (currentLength > longestMatchLength) {
                longestMatchLength = currentLength;
                matchStart = matchHead;
                if (currentLength >= niceBackReferenceLength) {
                    // no need to search any further
                    break;
                }
            }
            matchHead = prev[matchHead & wMask];
        }
        return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress()
    }

    private int longestMatchForNextPosition(final int prevMatchLength) {
        // save a bunch of values to restore them if the next match isn't better than the current one
        final int prevMatchStart = matchStart;
        final int prevInsertHash = insertHash;

        lookahead--;
        currentPosition++;
        final int hashHead = insertString(currentPosition);
        final int prevHashHead = prev[currentPosition & wMask];
        int matchLength = longestMatch(hashHead);

        if (matchLength <= prevMatchLength) {
            // use the first match, as the next one isn't any better
            matchLength = prevMatchLength;
            matchStart = prevMatchStart;

            // restore modified values
            head[insertHash] = prevHashHead;
            insertHash = prevInsertHash;
            currentPosition--;
            lookahead++;
        }
        return matchLength;
    }

    /**
     * Assumes we are calculating the hash for three consecutive bytes
     * as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC
     * the new hash for BCD is nextHash(H, D).
     *
     * <p>The hash is shifted by five bits on each update so all
     * effects of A have been swapped out after the third update.</p>
     */
    private int nextHash(final int oldHash, final byte nextByte) {
        final int nextVal = nextByte & 0xFF;
        return ((oldHash << H_SHIFT) ^ nextVal) & HASH_MASK;
    }

    /**
     * Adds some initial data to fill the window with.
     *
     * <p>This is used if the stream has been cut into blocks and
     * back-references of one block may refer to data of the previous
     * block(s). One such example is the LZ4 frame format using block
     * dependency.</p>
     *
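     * <p>A sketch of compressing a block that may reference data of
     * the previously processed block - {@code previousBlock} and
     * {@code currentBlock} are illustrative names only, and
     * {@code params}/{@code callback} are assumed to be configured
     * elsewhere:</p>
     *
     * <pre>{@code
     * LZ77Compressor compressor = new LZ77Compressor(params, callback);
     * compressor.prefill(previousBlock); // must happen before any compress call
     * compressor.compress(currentBlock);
     * compressor.finish();
     * }</pre>
     *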
     * @param data the data to fill the window with.
     * @throws IllegalStateException if the compressor has already started to accept data
     */
    public void prefill(final byte[] data) {
        if (currentPosition != 0 || lookahead != 0) {
            throw new IllegalStateException("The compressor has already started to accept data, can't prefill anymore");
        }

        // don't need more than windowSize for back-references
        final int len = Math.min(params.getWindowSize(), data.length);
        System.arraycopy(data, data.length - len, window, 0, len);

        if (len >= NUMBER_OF_BYTES_IN_HASH) {
            initialize();
            final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1;
            for (int i = 0; i < stop; i++) {
                insertString(i);
            }
            missedInserts = NUMBER_OF_BYTES_IN_HASH - 1;
        } else { // not enough data to hash anything
            missedInserts = len;
        }
        blockStart = currentPosition = len;
    }

    private void slide() throws IOException {
        final int wSize = params.getWindowSize();
        if (blockStart != currentPosition && blockStart < wSize) {
            flushLiteralBlock();
            blockStart = currentPosition;
        }
        System.arraycopy(window, wSize, window, 0, wSize);
        currentPosition -= wSize;
        matchStart -= wSize;
        blockStart -= wSize;
        for (int i = 0; i < HASH_SIZE; i++) {
            final int h = head[i];
            head[i] = h >= wSize ? h - wSize : NO_MATCH;
        }
        for (int i = 0; i < wSize; i++) {
            final int p = prev[i];
            prev[i] = p >= wSize ? p - wSize : NO_MATCH;
        }
    }
}