001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.lz77support;
020
021import java.io.IOException;
022import java.util.Arrays;
023import java.util.Objects;
024
025/**
026 * Helper class for compression algorithms that use the ideas of LZ77.
027 *
028 * <p>Most LZ77 derived algorithms split input data into blocks of
029 * uncompressed data (called literal blocks) and back-references
030 * (pairs of offsets and lengths) that state "add {@code length}
031 * bytes that are the same as those already written starting
032 * {@code offset} bytes before the current position. The details
033 * of how those blocks and back-references are encoded are quite
034 * different between the algorithms and some algorithms perform
035 * additional steps (Huffman encoding in the case of DEFLATE for
036 * example).</p>
037 *
038 * <p>This class attempts to extract the core logic - finding
039 * back-references - so it can be re-used. It follows the algorithm
040 * explained in section 4 of RFC 1951 (DEFLATE) and currently doesn't
041 * implement the "lazy match" optimization. The three-byte hash
042 * function used in this class is the same as the one used by zlib and
043 * InfoZIP's ZIP implementation of DEFLATE. The whole class is
044 * strongly inspired by InfoZIP's implementation.</p>
045 *
046 * <p>LZ77 is used vaguely here (as well as many other places that
047 * talk about it :-), LZSS would likely be closer to the truth but
048 * LZ77 has become the synonym for a whole family of algorithms.</p>
049 *
050 * <p>The API consists of a compressor that is fed {@code byte}s
051 * and emits {@link Block}s to a registered callback where the blocks
052 * represent either {@link LiteralBlock literal blocks}, {@link
053 * BackReference back-references} or {@link EOD end of data
054 * markers}. In order to ensure the callback receives all information,
055 * the {@code #finish} method must be used once all data has been fed
056 * into the compressor.</p>
057 *
058 * <p>Several parameters influence the outcome of the "compression":</p>
059 * <dl>
060 *
061 *  <dt>{@code windowSize}</dt> <dd>the size of the sliding
062 *  window, must be a power of two - this determines the maximum
063 *  offset a back-reference can take. The compressor maintains a
064 *  buffer of twice of {@code windowSize} - real world values are
065 *  in the area of 32k.</dd>
066 *
067 *  <dt>{@code minBackReferenceLength}</dt>
068 *  <dd>Minimal length of a back-reference found. A true minimum of 3 is
069 *  hard-coded inside of this implementation but bigger lengths can be
070 *  configured.</dd>
071 *
072 *  <dt>{@code maxBackReferenceLength}</dt>
073 *  <dd>Maximal length of a back-reference found.</dd>
074 *
075 *  <dt>{@code maxOffset}</dt>
076 *  <dd>Maximal offset of a back-reference.</dd>
077 *
078 *  <dt>{@code maxLiteralLength}</dt>
079 *  <dd>Maximal length of a literal block.</dd>
080 * </dl>
081 *
082 * @see "https://tools.ietf.org/html/rfc1951#section-4"
083 * @since 1.14
084 * @NotThreadSafe
085 */
086public class LZ77Compressor {
087
088    /**
089     * Represents a back-reference.
090     */
091    public static final class BackReference extends Block {
092        private final int offset, length;
093        public BackReference(final int offset, final int length) {
094            this.offset = offset;
095            this.length = length;
096        }
097        /**
098         * Provides the length of the back-reference.
099         * @return the length
100         */
101        public int getLength() {
102            return length;
103        }
104        /**
105         * Provides the offset of the back-reference.
106         * @return the offset
107         */
108        public int getOffset() {
109            return offset;
110        }
111        @Override
112        public BlockType getType() {
113            return BlockType.BACK_REFERENCE;
114        }
115        @Override
116        public String toString() {
117            return "BackReference with offset " + offset + " and length " + length;
118        }
119    }
120
121    /**
122     * Base class representing blocks the compressor may emit.
123     *
124     * <p>This class is not supposed to be subclassed by classes
125     * outside of Commons Compress so it is considered internal and
126     * changed that would break subclasses may get introduced with
127     * future releases.</p>
128     */
129    public static abstract class Block {
130        /** Enumeration of the block types the compressor may emit. */
131        public enum BlockType {
132            LITERAL, BACK_REFERENCE, EOD
133        }
134        public abstract BlockType getType();
135    }
136
137    /**
138     * Callback invoked while the compressor processes data.
139     *
140     * <p>The callback is invoked on the same thread that receives the
141     * bytes to compress and may be invoked multiple times during the
142     * execution of {@link #compress} or {@link #finish}.</p>
143     */
144    public interface Callback {
145        /**
146         * Consumes a block.
147         * @param b the block to consume
148         * @throws IOException in case of an error
149         */
150        void accept(Block b) throws IOException;
151    }
152
153    /** A simple "we are done" marker. */
154    public static final class EOD extends Block {
155        @Override
156        public BlockType getType() {
157            return BlockType.EOD;
158        }
159    }
160
161    /**
162     * Represents a literal block of data.
163     *
164     * <p>For performance reasons this encapsulates the real data, not
165     * a copy of it. Don't modify the data and process it inside of
166     * {@link Callback#accept} immediately as it will get overwritten
167     * sooner or later.</p>
168     */
169    public static final class LiteralBlock extends Block {
170        private final byte[] data;
171        private final int offset, length;
172        public LiteralBlock(final byte[] data, final int offset, final int length) {
173            this.data = data;
174            this.offset = offset;
175            this.length = length;
176        }
177        /**
178         * The literal data.
179         *
180         * <p>This returns a life view of the actual data in order to
181         * avoid copying, modify the array at your own risk.</p>
182         * @return the data
183         */
184        public byte[] getData() {
185            return data;
186        }
187        /**
188         * Length of literal block.
189         * @return the length
190         */
191        public int getLength() {
192            return length;
193        }
194        /**
195         * Offset into data where the literal block starts.
196         * @return the offset
197         */
198        public int getOffset() {
199            return offset;
200        }
201        @Override
202        public BlockType getType() {
203            return BlockType.LITERAL;
204        }
205        @Override
206        public String toString() {
207            return "LiteralBlock starting at " + offset + " with length " + length;
208        }
209    }
210
211    private static final Block THE_EOD = new EOD();
212
213    static final int NUMBER_OF_BYTES_IN_HASH = 3;
214    private static final int NO_MATCH = -1;
215
216    // we use a 15 bit hash code as calculated in updateHash
217    private static final int HASH_SIZE = 1 << 15;
218    private static final int HASH_MASK = HASH_SIZE - 1;
219
220    private static final int H_SHIFT = 5;
221    private final Parameters params;
222    private final Callback callback;
223
224    // the sliding window, twice as big as "windowSize" parameter
225    private final byte[] window;
226
227    // the head of hash-chain - indexed by hash-code, points to the
228    // location inside of window of the latest sequence of bytes with
229    // the given hash.
230    private final int[] head;
231    // for each window-location points to the latest earlier location
232    // with the same hash. Only stores values for the latest
233    // "windowSize" elements, the index is "window location modulo
234    // windowSize".
235    private final int[] prev;
236    // bit mask used when indexing into prev
237    private final int wMask;
238    private boolean initialized;
239    // the position inside of window that shall be encoded right now
240    private int currentPosition;
241    // the number of bytes available to compress including the one at
242    // currentPosition
243    private int lookahead;
244    // the hash of the three bytes stating at the current position
245    private int insertHash;
246
247    // the position inside the window where the current literal
248    // block starts (in case we are inside a literal block).
249    private int blockStart;
250
251    // position of the current match
252    private int matchStart = NO_MATCH;
253
254    // number of missed insertString calls for the up to three last
255    // bytes of the last match that can only be performed once more
256    // data has been read
257    private int missedInserts;
258
259    /**
260     * Initializes a compressor with parameters and a callback.
261     * @param params the parameters
262     * @param callback the callback
263     * @throws NullPointerException if either parameter is {@code null}
264     */
265    public LZ77Compressor(final Parameters params, final Callback callback) {
266        Objects.requireNonNull(params, "params");
267        Objects.requireNonNull(callback, "callback");
268
269        this.params = params;
270        this.callback = callback;
271
272        final int wSize = params.getWindowSize();
273        window = new byte[wSize * 2];
274        wMask = wSize - 1;
275        head = new int[HASH_SIZE];
276        Arrays.fill(head, NO_MATCH);
277        prev = new int[wSize];
278    }
279
280    private void catchUpMissedInserts() {
281        while (missedInserts > 0) {
282            insertString(currentPosition - missedInserts--);
283        }
284    }
285
286    private void compress() throws IOException {
287        final int minMatch = params.getMinBackReferenceLength();
288        final boolean lazy = params.getLazyMatching();
289        final int lazyThreshold = params.getLazyMatchingThreshold();
290
291        while (lookahead >= minMatch) {
292            catchUpMissedInserts();
293            int matchLength = 0;
294            final int hashHead = insertString(currentPosition);
295            if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) {
296                // sets matchStart as a side effect
297                matchLength = longestMatch(hashHead);
298
299                if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) {
300                    // try to find a longer match using the next position
301                    matchLength = longestMatchForNextPosition(matchLength);
302                }
303            }
304            if (matchLength >= minMatch) {
305                if (blockStart != currentPosition) {
306                    // emit preceding literal block
307                    flushLiteralBlock();
308                    blockStart = NO_MATCH;
309                }
310                flushBackReference(matchLength);
311                insertStringsInMatch(matchLength);
312                lookahead -= matchLength;
313                currentPosition += matchLength;
314                blockStart = currentPosition;
315            } else {
316                // no match, append to current or start a new literal
317                lookahead--;
318                currentPosition++;
319                if (currentPosition - blockStart >= params.getMaxLiteralLength()) {
320                    flushLiteralBlock();
321                    blockStart = currentPosition;
322                }
323            }
324        }
325    }
326    /**
327     * Feeds bytes into the compressor which in turn may emit zero or
328     * more blocks to the callback during the execution of this
329     * method.
330     * @param data the data to compress - must not be null
331     * @throws IOException if the callback throws an exception
332     */
333    public void compress(final byte[] data) throws IOException {
334        compress(data, 0, data.length);
335    }
336    /**
337     * Feeds bytes into the compressor which in turn may emit zero or
338     * more blocks to the callback during the execution of this
339     * method.
340     * @param data the data to compress - must not be null
341     * @param off the start offset of the data
342     * @param len the number of bytes to compress
343     * @throws IOException if the callback throws an exception
344     */
345    public void compress(final byte[] data, int off, int len) throws IOException {
346        final int wSize = params.getWindowSize();
347        while (len > wSize) { // chop into windowSize sized chunks
348            doCompress(data, off, wSize);
349            off += wSize;
350            len -= wSize;
351        }
352        if (len > 0) {
353            doCompress(data, off, len);
354        }
355    }
356
357    // performs the actual algorithm with the pre-condition len <= windowSize
358    private void doCompress(final byte[] data, final int off, final int len) throws IOException {
359        final int spaceLeft = window.length - currentPosition - lookahead;
360        if (len > spaceLeft) {
361            slide();
362        }
363        System.arraycopy(data, off, window, currentPosition + lookahead, len);
364        lookahead += len;
365        if (!initialized && lookahead >= params.getMinBackReferenceLength()) {
366            initialize();
367        }
368        if (initialized) {
369            compress();
370        }
371    }
372
373    /**
374     * Tells the compressor to process all remaining data and signal
375     * end of data to the callback.
376     *
377     * <p>The compressor will in turn emit at least one block ({@link
378     * EOD}) but potentially multiple blocks to the callback during
379     * the execution of this method.</p>
380     * @throws IOException if the callback throws an exception
381     */
382    public void finish() throws IOException {
383        if (blockStart != currentPosition || lookahead > 0) {
384            currentPosition += lookahead;
385            flushLiteralBlock();
386        }
387        callback.accept(THE_EOD);
388    }
389
390    private void flushBackReference(final int matchLength) throws IOException {
391        callback.accept(new BackReference(currentPosition - matchStart, matchLength));
392    }
393
394    private void flushLiteralBlock() throws IOException {
395        callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart));
396    }
397
398    private void initialize() {
399        for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) {
400            insertHash = nextHash(insertHash, window[i]);
401        }
402        initialized = true;
403    }
404
405    /**
406     * Inserts the current three byte sequence into the dictionary and
407     * returns the previous head of the hash-chain.
408     *
409     * <p>Updates {@code insertHash} and {@code prev} as a
410     * side effect.</p>
411     */
412    private int insertString(final int pos) {
413        insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]);
414        final int hashHead = head[insertHash];
415        prev[pos & wMask] = hashHead;
416        head[insertHash] = pos;
417        return hashHead;
418    }
419
420    private void insertStringsInMatch(final int matchLength) {
421        // inserts strings contained in current match
422        // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts
423        final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH);
424        // currentPosition has been inserted already
425        for (int i = 1; i <= stop; i++) {
426            insertString(currentPosition + i);
427        }
428        missedInserts = matchLength - stop - 1;
429    }
430
431    /**
432     * Searches the hash chain for real matches and returns the length
433     * of the longest match (0 if none were found) that isn't too far
434     * away (WRT maxOffset).
435     *
436     * <p>Sets matchStart to the index of the start position of the
437     * longest match as a side effect.</p>
438     */
439    private int longestMatch(int matchHead) {
440        final int minLength = params.getMinBackReferenceLength();
441        int longestMatchLength = minLength - 1;
442        final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead);
443        final int minIndex = Math.max(0, currentPosition - params.getMaxOffset());
444        final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength());
445        final int maxCandidates = params.getMaxCandidates();
446        for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) {
447            int currentLength = 0;
448            for (int i = 0; i < maxPossibleLength; i++) {
449                if (window[matchHead + i] != window[currentPosition + i]) {
450                    break;
451                }
452                currentLength++;
453            }
454            if (currentLength > longestMatchLength) {
455                longestMatchLength = currentLength;
456                matchStart = matchHead;
457                if (currentLength >= niceBackReferenceLength) {
458                    // no need to search any further
459                    break;
460                }
461            }
462            matchHead = prev[matchHead & wMask];
463        }
464        return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress()
465    }
466
467    private int longestMatchForNextPosition(final int prevMatchLength) {
468        // save a bunch of values to restore them if the next match isn't better than the current one
469        final int prevMatchStart = matchStart;
470        final int prevInsertHash = insertHash;
471
472        lookahead--;
473        currentPosition++;
474        final int hashHead = insertString(currentPosition);
475        final int prevHashHead = prev[currentPosition & wMask];
476        int matchLength = longestMatch(hashHead);
477
478        if (matchLength <= prevMatchLength) {
479            // use the first match, as the next one isn't any better
480            matchLength = prevMatchLength;
481            matchStart = prevMatchStart;
482
483            // restore modified values
484            head[insertHash] = prevHashHead;
485            insertHash = prevInsertHash;
486            currentPosition--;
487            lookahead++;
488        }
489        return matchLength;
490    }
491
492    /**
493     * Assumes we are calculating the hash for three consecutive bytes
494     * as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC
495     * the new hash for BCD is nextHash(H, D).
496     *
497     * <p>The hash is shifted by five bits on each update so all
498     * effects of A have been swapped after the third update.</p>
499     */
500    private int nextHash(final int oldHash, final byte nextByte) {
501        final int nextVal = nextByte & 0xFF;
502        return (oldHash << H_SHIFT ^ nextVal) & HASH_MASK;
503    }
504
505    /**
506     * Adds some initial data to fill the window with.
507     *
508     * <p>This is used if the stream has been cut into blocks and
509     * back-references of one block may refer to data of the previous
510     * block(s). One such example is the LZ4 frame format using block
511     * dependency.</p>
512     *
513     * @param data the data to fill the window with.
514     * @throws IllegalStateException if the compressor has already started to accept data
515     */
516    public void prefill(final byte[] data) {
517        if (currentPosition != 0 || lookahead != 0) {
518            throw new IllegalStateException("The compressor has already started to accept data, can't prefill anymore");
519        }
520
521        // don't need more than windowSize for back-references
522        final int len = Math.min(params.getWindowSize(), data.length);
523        System.arraycopy(data, data.length - len, window, 0, len);
524
525        if (len >= NUMBER_OF_BYTES_IN_HASH) {
526            initialize();
527            final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1;
528            for (int i = 0; i < stop; i++) {
529                insertString(i);
530            }
531            missedInserts = NUMBER_OF_BYTES_IN_HASH - 1;
532        } else { // not enough data to hash anything
533            missedInserts = len;
534        }
535        blockStart = currentPosition = len;
536    }
537
538    private void slide() throws IOException {
539        final int wSize = params.getWindowSize();
540        if (blockStart != currentPosition && blockStart < wSize) {
541            flushLiteralBlock();
542            blockStart = currentPosition;
543        }
544        System.arraycopy(window, wSize, window, 0, wSize);
545        currentPosition -= wSize;
546        matchStart -= wSize;
547        blockStart -= wSize;
548        for (int i = 0; i < HASH_SIZE; i++) {
549            final int h = head[i];
550            head[i] = h >= wSize ? h - wSize : NO_MATCH;
551        }
552        for (int i = 0; i < wSize; i++) {
553            final int p = prev[i];
554            prev[i] = p >= wSize ? p - wSize : NO_MATCH;
555        }
556    }
557}