001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.lz4;
020
021import java.io.IOException;
022import java.io.OutputStream;
023import java.util.Arrays;
024import java.util.Deque;
025import java.util.Iterator;
026import java.util.LinkedList;
027
028import org.apache.commons.compress.compressors.CompressorOutputStream;
029import org.apache.commons.compress.compressors.lz77support.LZ77Compressor;
030import org.apache.commons.compress.compressors.lz77support.Parameters;
031import org.apache.commons.compress.utils.ByteUtils;
032
033/**
034 * CompressorOutputStream for the LZ4 block format.
035 *
036 * @see <a href="http://lz4.github.io/lz4/lz4_Block_format.html">LZ4 Block Format Description</a>
037 * @since 1.14
038 * @NotThreadSafe
039 */
040public class BlockLZ4CompressorOutputStream extends CompressorOutputStream {
041
042    final static class Pair {
043        private static int lengths(final int litLength, final int brLength) {
044            final int l = Math.min(litLength, 15);
045            final int br = brLength < 4 ? 0 : (brLength < 19 ? brLength - 4 : 15);
046            return (l << BlockLZ4CompressorInputStream.SIZE_BITS) | br;
047        }
048        private static void writeLength(int length, final OutputStream out) throws IOException {
049            while (length >= 255) {
050                out.write(255);
051                length -= 255;
052            }
053            out.write(length);
054        }
055        private final Deque<byte[]> literals = new LinkedList<>();
056
057        private int brOffset, brLength;
058
059        private boolean written;
060
061        byte[] addLiteral(final LZ77Compressor.LiteralBlock block) {
062            final byte[] copy = Arrays.copyOfRange(block.getData(), block.getOffset(),
063                block.getOffset() + block.getLength());
064            literals.add(copy);
065            return copy;
066        }
067
068        private int backReferenceLength() {
069            return brLength;
070        }
071
072        boolean canBeWritten(final int lengthOfBlocksAfterThisPair) {
073            return hasBackReference()
074                && lengthOfBlocksAfterThisPair >= MIN_OFFSET_OF_LAST_BACK_REFERENCE + MIN_BACK_REFERENCE_LENGTH;
075        }
076
077        boolean hasBackReference() {
078            return brOffset > 0;
079        }
080
081        private boolean hasBeenWritten() {
082            return written;
083        }
084
085        int length() {
086            return literalLength() + brLength;
087        }
088
089        private int literalLength() {
090            return literals.stream().mapToInt(b -> b.length).sum();
091        }
092
093        private void prependLiteral(final byte[] data) {
094            literals.addFirst(data);
095        }
096
097        private void prependTo(final Pair other) {
098            final Iterator<byte[]> listBackwards = literals.descendingIterator();
099            while (listBackwards.hasNext()) {
100                other.prependLiteral(listBackwards.next());
101            }
102        }
103
104        void setBackReference(final LZ77Compressor.BackReference block) {
105            if (hasBackReference()) {
106                throw new IllegalStateException();
107            }
108            brOffset = block.getOffset();
109            brLength = block.getLength();
110        }
111
112        private Pair splitWithNewBackReferenceLengthOf(final int newBackReferenceLength) {
113            final Pair p = new Pair();
114            p.literals.addAll(literals);
115            p.brOffset = brOffset;
116            p.brLength = newBackReferenceLength;
117            return p;
118        }
119
120        void writeTo(final OutputStream out) throws IOException {
121            final int litLength = literalLength();
122            out.write(lengths(litLength, brLength));
123            if (litLength >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
124                writeLength(litLength - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
125            }
126            for (final byte[] b : literals) {
127                out.write(b);
128            }
129            if (hasBackReference()) {
130                ByteUtils.toLittleEndian(out, brOffset, 2);
131                if (brLength - MIN_BACK_REFERENCE_LENGTH >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
132                    writeLength(brLength - MIN_BACK_REFERENCE_LENGTH
133                        - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
134                }
135            }
136            written = true;
137        }
138    }
139    private static final int MIN_BACK_REFERENCE_LENGTH = 4;
140
141    /*
142
143      The LZ4 block format has a few properties that make it less
144      straight-forward than one would hope:
145
146      * literal blocks and back-references must come in pairs (except
147        for the very last literal block), so consecutive literal
148        blocks created by the compressor must be merged into a single
149        block.
150
151      * the start of a literal/back-reference pair contains the length
152        of the back-reference (at least some part of it) so we can't
153        start writing the literal before we know how long the next
154        back-reference is going to be.
155
156      * there are special rules for the final blocks
157
158        > There are specific parsing rules to respect in order to remain
159        > compatible with assumptions made by the decoder :
160        >
161        >     1. The last 5 bytes are always literals
162        >
163        >     2. The last match must start at least 12 bytes before end of
164        >        block. Consequently, a block with less than 13 bytes cannot be
165        >        compressed.
166
167        which means any back-reference may need to get rewritten as a
168        literal block unless we know the next block is at least of
169        length 5 and the sum of this block's length and offset and the
170        next block's length is at least twelve.
171
172    */
173
174    private static final int MIN_OFFSET_OF_LAST_BACK_REFERENCE = 12;
175    /**
176     * Returns a builder correctly configured for the LZ4 algorithm.
177     * @return a builder correctly configured for the LZ4 algorithm
178     */
179    public static Parameters.Builder createParameterBuilder() {
180        final int maxLen = BlockLZ4CompressorInputStream.WINDOW_SIZE - 1;
181        return Parameters.builder(BlockLZ4CompressorInputStream.WINDOW_SIZE)
182            .withMinBackReferenceLength(MIN_BACK_REFERENCE_LENGTH)
183            .withMaxBackReferenceLength(maxLen)
184            .withMaxOffset(maxLen)
185            .withMaxLiteralLength(maxLen);
186    }
187
188    private final LZ77Compressor compressor;
189
190    private final OutputStream os;
191
192    // used in one-arg write method
193    private final byte[] oneByte = new byte[1];
194    private boolean finished;
195
196    private final Deque<Pair> pairs = new LinkedList<>();
197
198    // keeps track of the last window-size bytes (64k) in order to be
199    // able to expand back-references when needed
200    private final Deque<byte[]> expandedBlocks = new LinkedList<>();
201
202    /**
203     * Creates a new LZ4 output stream.
204     *
205     * @param os
206     *            An OutputStream to read compressed data from
207     */
208    public BlockLZ4CompressorOutputStream(final OutputStream os) {
209        this(os, createParameterBuilder().build());
210    }
211
212    /**
213     * Creates a new LZ4 output stream.
214     *
215     * @param os
216     *            An OutputStream to read compressed data from
217     * @param params
218     *            The parameters to use for LZ77 compression.
219     */
220    public BlockLZ4CompressorOutputStream(final OutputStream os, final Parameters params) {
221        this.os = os;
222        compressor = new LZ77Compressor(params,
223            block -> {
224                switch (block.getType()) {
225                case LITERAL:
226                    addLiteralBlock((LZ77Compressor.LiteralBlock) block);
227                    break;
228                case BACK_REFERENCE:
229                    addBackReference((LZ77Compressor.BackReference) block);
230                    break;
231                case EOD:
232                    writeFinalLiteralBlock();
233                    break;
234                }
235            });
236    }
237
238    private void addBackReference(final LZ77Compressor.BackReference block) throws IOException {
239        final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
240        last.setBackReference(block);
241        recordBackReference(block);
242        clearUnusedBlocksAndPairs();
243    }
244
245    private void addLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException {
246        final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
247        recordLiteral(last.addLiteral(block));
248        clearUnusedBlocksAndPairs();
249    }
250
251    private void clearUnusedBlocks() {
252        int blockLengths = 0;
253        int blocksToKeep = 0;
254        for (final byte[] b : expandedBlocks) {
255            blocksToKeep++;
256            blockLengths += b.length;
257            if (blockLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
258                break;
259            }
260        }
261        final int size = expandedBlocks.size();
262        for (int i = blocksToKeep; i < size; i++) {
263            expandedBlocks.removeLast();
264        }
265    }
266
267    private void clearUnusedBlocksAndPairs() {
268        clearUnusedBlocks();
269        clearUnusedPairs();
270    }
271
272    private void clearUnusedPairs() {
273        int pairLengths = 0;
274        int pairsToKeep = 0;
275        for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) {
276            final Pair p = it.next();
277            pairsToKeep++;
278            pairLengths += p.length();
279            if (pairLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
280                break;
281            }
282        }
283        final int size = pairs.size();
284        for (int i = pairsToKeep; i < size; i++) {
285            final Pair p = pairs.peekFirst();
286            if (!p.hasBeenWritten()) {
287                break;
288            }
289            pairs.removeFirst();
290        }
291    }
292
293    @Override
294    public void close() throws IOException {
295        try {
296            finish();
297        } finally {
298            os.close();
299        }
300    }
301
302    private byte[] expand(final int offset, final int length) {
303        final byte[] expanded = new byte[length];
304        if (offset == 1) { // surprisingly common special case
305            final byte[] block = expandedBlocks.peekFirst();
306            final byte b = block[block.length - 1];
307            if (b != 0) { // the fresh array contains 0s anyway
308                Arrays.fill(expanded, b);
309            }
310        } else {
311            expandFromList(expanded, offset, length);
312        }
313        return expanded;
314    }
315
316    private void expandFromList(final byte[] expanded, final int offset, final int length) {
317        int offsetRemaining = offset;
318        int lengthRemaining = length;
319        int writeOffset = 0;
320        while (lengthRemaining > 0) {
321            // find block that contains offsetRemaining
322            byte[] block = null;
323            final int copyLen;
324            final int copyOffset;
325            if (offsetRemaining > 0) {
326                int blockOffset = 0;
327                for (final byte[] b : expandedBlocks) {
328                    if (b.length + blockOffset >= offsetRemaining) {
329                        block = b;
330                        break;
331                    }
332                    blockOffset += b.length;
333                }
334                if (block == null) {
335                    // should not be possible
336                    throw new IllegalStateException("Failed to find a block containing offset " + offset);
337                }
338                copyOffset = blockOffset + block.length - offsetRemaining;
339                copyLen = Math.min(lengthRemaining, block.length - copyOffset);
340            } else {
341                // offsetRemaining is negative or 0 and points into the expanded bytes
342                block = expanded;
343                copyOffset = -offsetRemaining;
344                copyLen = Math.min(lengthRemaining, writeOffset + offsetRemaining);
345            }
346            System.arraycopy(block, copyOffset, expanded, writeOffset, copyLen);
347            offsetRemaining -= copyLen;
348            lengthRemaining -= copyLen;
349            writeOffset += copyLen;
350        }
351    }
352
353    /**
354     * Compresses all remaining data and writes it to the stream,
355     * doesn't close the underlying stream.
356     * @throws IOException if an error occurs
357     */
358    public void finish() throws IOException {
359        if (!finished) {
360            compressor.finish();
361            finished = true;
362        }
363    }
364
365    /**
366     * Adds some initial data to fill the window with.
367     *
368     * @param data the data to fill the window with.
369     * @param off offset of real data into the array
370     * @param len amount of data
371     * @throws IllegalStateException if the stream has already started to write data
372     * @see LZ77Compressor#prefill
373     */
374    public void prefill(final byte[] data, final int off, final int len) {
375        if (len > 0) {
376            final byte[] b = Arrays.copyOfRange(data, off, off + len);
377            compressor.prefill(b);
378            recordLiteral(b);
379        }
380    }
381
382    private void recordBackReference(final LZ77Compressor.BackReference block) {
383        expandedBlocks.addFirst(expand(block.getOffset(), block.getLength()));
384    }
385
386    private void recordLiteral(final byte[] b) {
387        expandedBlocks.addFirst(b);
388    }
389
390    private void rewriteLastPairs() {
391        final LinkedList<Pair> lastPairs = new LinkedList<>();
392        final LinkedList<Integer> pairLength = new LinkedList<>();
393        int offset = 0;
394        for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) {
395            final Pair p = it.next();
396            if (p.hasBeenWritten()) {
397                break;
398            }
399            final int len = p.length();
400            pairLength.addFirst(len);
401            lastPairs.addFirst(p);
402            offset += len;
403            if (offset >= MIN_OFFSET_OF_LAST_BACK_REFERENCE) {
404                break;
405            }
406        }
407        lastPairs.forEach(pairs::remove);
408        // lastPairs may contain between one and four Pairs:
409        // * the last pair may be a one byte literal
410        // * all other Pairs contain a back-reference which must be four bytes long at minimum
411        // we could merge them all into a single literal block but
412        // this may harm compression. For example compressing
413        // "bla.tar" from our tests yields a last block containing a
414        // back-reference of length > 2k and we'd end up with a last
415        // literal of that size rather than a 2k back-reference and a
416        // 12 byte literal at the end.
417
418        // Instead we merge all but the first of lastPairs into a new
419        // literal-only Pair "replacement" and look at the
420        // back-reference in the first of lastPairs and see if we can
421        // split it. We can split it if it is longer than 16 -
422        // replacement.length (i.e. the minimal length of four is kept
423        // while making sure the last literal is at least twelve bytes
424        // long). If we can't split it, we expand the first of the pairs
425        // as well.
426
427        // this is not optimal, we could get better compression
428        // results with more complex approaches as the last literal
429        // only needs to be five bytes long if the previous
430        // back-reference has an offset big enough
431
432        final int lastPairsSize = lastPairs.size();
433        int toExpand = 0;
434        for (int i = 1; i < lastPairsSize; i++) {
435            toExpand += pairLength.get(i);
436        }
437        final Pair replacement = new Pair();
438        if (toExpand > 0) {
439            replacement.prependLiteral(expand(toExpand, toExpand));
440        }
441        final Pair splitCandidate = lastPairs.get(0);
442        final int stillNeeded = MIN_OFFSET_OF_LAST_BACK_REFERENCE - toExpand;
443        final int brLen = splitCandidate.hasBackReference() ? splitCandidate.backReferenceLength() : 0;
444        if (splitCandidate.hasBackReference() && brLen >= MIN_BACK_REFERENCE_LENGTH + stillNeeded) {
445            replacement.prependLiteral(expand(toExpand + stillNeeded, stillNeeded));
446            pairs.add(splitCandidate.splitWithNewBackReferenceLengthOf(brLen - stillNeeded));
447        } else {
448            if (splitCandidate.hasBackReference()) {
449                replacement.prependLiteral(expand(toExpand + brLen, brLen));
450            }
451            splitCandidate.prependTo(replacement);
452        }
453        pairs.add(replacement);
454    }
455
456    @Override
457    public void write(final byte[] data, final int off, final int len) throws IOException {
458        compressor.compress(data, off, len);
459    }
460
461    @Override
462    public void write(final int b) throws IOException {
463        oneByte[0] = (byte) (b & 0xff);
464        write(oneByte);
465    }
466
467    private Pair writeBlocksAndReturnUnfinishedPair(final int length) throws IOException {
468        writeWritablePairs(length);
469        Pair last = pairs.peekLast();
470        if (last == null || last.hasBackReference()) {
471            last = new Pair();
472            pairs.addLast(last);
473        }
474        return last;
475    }
476
477    private void writeFinalLiteralBlock() throws IOException {
478        rewriteLastPairs();
479        for (final Pair p : pairs) {
480            if (!p.hasBeenWritten()) {
481                p.writeTo(os);
482            }
483        }
484        pairs.clear();
485    }
486
487    private void writeWritablePairs(final int lengthOfBlocksAfterLastPair) throws IOException {
488        int unwrittenLength = lengthOfBlocksAfterLastPair;
489        for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) {
490            final Pair p = it.next();
491            if (p.hasBeenWritten()) {
492                break;
493            }
494            unwrittenLength += p.length();
495        }
496        for (final Pair p : pairs) {
497            if (p.hasBeenWritten()) {
498                continue;
499            }
500            unwrittenLength -= p.length();
501            if (!p.canBeWritten(unwrittenLength)) {
502                break;
503            }
504            p.writeTo(os);
505        }
506    }
507}