001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.lz77support;
020
021import java.io.IOException;
022import java.io.InputStream;
023import java.util.Arrays;
024
025import org.apache.commons.compress.compressors.CompressorInputStream;
026import org.apache.commons.compress.utils.ByteUtils;
027import org.apache.commons.compress.utils.IOUtils;
028
029/**
030 * Encapsulates code common to LZ77 decompressors.
031 *
032 * <p>Assumes the stream consists of blocks of literal data and
033 * back-references (called copies) in any order. Of course the first
034 * block must be a literal block for the scheme to work - unless the
035 * {@link #prefill prefill} method has been used to provide initial
036 * data that is never returned by {@link #read read} but only used for
037 * back-references.</p>
038 *
039 * <p>Subclasses must override the three-arg {@link #read read} method
040 * as the no-arg version delegates to it and the default
041 * implementation delegates to the no-arg version, leading to infinite
042 * mutual recursion and a {@code StackOverflowError} otherwise.</p>
043 *
044 * <p>The contract for subclasses' {@code read} implementation is:</p>
045 * <ul>
046 *
047 *  <li>keep track of the current state of the stream. Is it inside a
048 *  literal block or a back-reference or in-between blocks?</li>
049 *
050 *  <li>Use {@link #readOneByte} to access the underlying stream
051 *  directly.</li>
052 *
053 *  <li>If a new literal block starts, use {@link #startLiteral} to
054 *  tell this class about it and read the literal data using {@link
055 *  #readLiteral} until it returns {@code 0}. {@link
056 *  #hasMoreDataInBlock} will return {@code false} before the next
057 *  call to {@link #readLiteral} would return {@code 0}.</li>
058 *
059 *  <li>If a new back-reference starts, use {@link #startBackReference} to
060 *  tell this class about it and read the literal data using {@link
061 *  #readBackReference} until it returns {@code 0}. {@link
062 *  #hasMoreDataInBlock} will return {@code false} before the next
063 *  call to {@link #readBackReference} would return {@code 0}.</li>
064 *
065 *  <li>If the end of the stream has been reached, return {@code -1}
066 *  as this class' methods will never do so themselves.</li>
067 *
068 * </ul>
069 *
070 * <p>{@link #readOneByte} and {@link #readLiteral} update the counter
071 * for bytes read.</p>
072 *
073 * @since 1.14
074 */
075public abstract class AbstractLZ77CompressorInputStream extends CompressorInputStream {
076
077    /** Size of the window - must be bigger than the biggest offset expected. */
078    private final int windowSize;
079
080    /**
081     * Buffer to write decompressed bytes to for back-references, will
082     * be three times windowSize big.
083     *
084     * <p>Three times so we can slide the whole buffer a windowSize to
085     * the left once we've read twice windowSize and still have enough
086     * data inside of it to satisfy back-references.</p>
087     */
088    private final byte[] buf;
089
090    /** One behind the index of the last byte in the buffer that was written, i.e. the next position to write to */
091    private int writeIndex;
092
093    /** Index of the next byte to be read. */
094    private int readIndex;
095
096    /** The underlying stream to read compressed data from */
097    private final InputStream in;
098
099    /** Number of bytes still to be read from the current literal or back-reference. */
100    private long bytesRemaining;
101
102    /** Offset of the current back-reference. */
103    private int backReferenceOffset;
104
105    /** uncompressed size */
106    private int size = 0;
107
108    // used in no-arg read method
109    private final byte[] oneByte = new byte[1];
110
111    /**
112     * Supplier that delegates to {@link #readOneByte}.
113     */
114    protected final ByteUtils.ByteSupplier supplier = new ByteUtils.ByteSupplier() {
115        @Override
116        public int getAsByte() throws IOException {
117            return readOneByte();
118        }
119    };
120
121    /**
122     * Creates a new LZ77 input stream.
123     *
124     * @param is
125     *            An InputStream to read compressed data from
126     * @param windowSize
127     *            Size of the window kept for back-references, must be bigger than the biggest offset expected.
128     *
129     * @throws IOException if reading fails
130     */
131    public AbstractLZ77CompressorInputStream(final InputStream is, int windowSize) throws IOException {
132        this.in = is;
133        this.windowSize = windowSize;
134        buf = new byte[3 * windowSize];
135        writeIndex = readIndex = 0;
136        bytesRemaining = 0;
137    }
138
139    /** {@inheritDoc} */
140    @Override
141    public int read() throws IOException {
142        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
143    }
144
145    /** {@inheritDoc} */
146    @Override
147    public void close() throws IOException {
148        in.close();
149    }
150
151    /** {@inheritDoc} */
152    @Override
153    public int available() {
154        return writeIndex - readIndex;
155    }
156
157    /**
158     * Get the uncompressed size of the stream
159     *
160     * @return the uncompressed size
161     */
162    public int getSize() {
163        return size;
164    }
165
166    /**
167     * Adds some initial data to fill the window with.
168     *
169     * <p>This is used if the stream has been cut into blocks and
170     * back-references of one block may refer to data of the previous
171     * block(s). One such example is the LZ4 frame format using block
172     * dependency.</p>
173     *
174     * @param data the data to fill the window with.
175     * @throws IllegalStateException if the stream has already started to read data
176     */
177    public void prefill(byte[] data) {
178        if (writeIndex != 0) {
179            throw new IllegalStateException("the stream has already been read from, can't prefill anymore");
180        }
181        // we don't need more data than the big offset could refer to, so cap it
182        int len = Math.min(windowSize, data.length);
183        // we need the last data as we are dealing with *back*-references
184        System.arraycopy(data, data.length - len, buf, 0, len);
185        writeIndex += len;
186        readIndex += len;
187    }
188
189    /**
190     * Used by subclasses to signal the next block contains the given
191     * amount of literal data.
192     * @param length the length of the block
193     */
194    protected final void startLiteral(long length) {
195        bytesRemaining = length;
196    }
197
198    /**
199     * Is there still data remaining inside the current block?
200     * @return true if there is still data remaining inside the current block.
201     */
202    protected final boolean hasMoreDataInBlock() {
203        return bytesRemaining > 0;
204    }
205
206    /**
207     * Reads data from the current literal block.
208     * @param b buffer to write data to
209     * @param off offset to start writing to
210     * @param len maximum amount of data to read
211     * @return number of bytes read, may be 0. Will never return -1 as
212     * EOF-detection is the responsibility of the subclass
213     * @throws IOException if the underlying stream throws or signals
214     * an EOF before the amount of data promised for the block have
215     * been read
216     */
217    protected final int readLiteral(final byte[] b, final int off, final int len) throws IOException {
218        final int avail = available();
219        if (len > avail) {
220            tryToReadLiteral(len - avail);
221        }
222        return readFromBuffer(b, off, len);
223    }
224
225    private void tryToReadLiteral(int bytesToRead) throws IOException {
226        // min of "what is still inside the literal", "what does the user want" and "how muc can fit into the buffer"
227        final int reallyTryToRead = Math.min((int) Math.min(bytesToRead, bytesRemaining),
228                                             buf.length - writeIndex);
229        final int bytesRead = reallyTryToRead > 0
230            ? IOUtils.readFully(in, buf, writeIndex, reallyTryToRead)
231            : 0 /* happens for bytesRemaining == 0 */;
232        count(bytesRead);
233        if (reallyTryToRead != bytesRead) {
234            throw new IOException("Premature end of stream reading literal");
235        }
236        writeIndex += reallyTryToRead;
237        bytesRemaining -= reallyTryToRead;
238    }
239
240    private int readFromBuffer(final byte[] b, final int off, final int len) {
241        final int readable = Math.min(len, available());
242        if (readable > 0) {
243            System.arraycopy(buf, readIndex, b, off, readable);
244            readIndex += readable;
245            if (readIndex > 2 * windowSize) {
246                slideBuffer();
247            }
248        }
249        size += readable;
250        return readable;
251    }
252
253    private void slideBuffer() {
254        System.arraycopy(buf, windowSize, buf, 0, windowSize * 2);
255        writeIndex -= windowSize;
256        readIndex -= windowSize;
257    }
258
259    /**
260     * Used by subclasses to signal the next block contains a back-reference with the given coordinates.
261     * @param offset the offset of the back-reference
262     * @param length the length of the back-reference
263     */
264    protected final void startBackReference(int offset, long length) {
265        backReferenceOffset = offset;
266        bytesRemaining = length;
267    }
268
269    /**
270     * Reads data from the current back-reference.
271     * @param b buffer to write data to
272     * @param off offset to start writing to
273     * @param len maximum amount of data to read
274     * @return number of bytes read, may be 0. Will never return -1 as
275     * EOF-detection is the responsibility of the subclass
276     */
277    protected final int readBackReference(final byte[] b, final int off, final int len) {
278        final int avail = available();
279        if (len > avail) {
280            tryToCopy(len - avail);
281        }
282        return readFromBuffer(b, off, len);
283    }
284
285    private void tryToCopy(int bytesToCopy) {
286        // this will fit into the buffer without sliding and not
287        // require more than is available inside the back-reference
288        int copy = Math.min((int) Math.min(bytesToCopy, bytesRemaining),
289                            buf.length - writeIndex);
290        if (copy == 0) {
291            // NOP
292        } else if (backReferenceOffset == 1) { // pretty common special case
293            final byte last = buf[writeIndex - 1];
294            Arrays.fill(buf, writeIndex, writeIndex + copy, last);
295            writeIndex += copy;
296        } else if (copy < backReferenceOffset) {
297            System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, copy);
298            writeIndex += copy;
299        } else {
300            // back-reference overlaps with the bytes created from it
301            // like go back two bytes and then copy six (by copying
302            // the last two bytes three time).
303            final int fullRots = copy / backReferenceOffset;
304            for (int i = 0; i < fullRots; i++) {
305                System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, backReferenceOffset);
306                writeIndex += backReferenceOffset;
307            }
308
309            final int pad = copy - (backReferenceOffset * fullRots);
310            if (pad > 0) {
311                System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, pad);
312                writeIndex += pad;
313            }
314        }
315        bytesRemaining -= copy;
316    }
317
318    /**
319     * Reads a single byte from the real input stream and ensures the data is accounted for.
320     *
321     * @return the byte read as value between 0 and 255 or -1 if EOF has been reached.
322     * @throws IOException if the underlying stream throws
323     */
324    protected final int readOneByte() throws IOException {
325        final int b = in.read();
326        if (b != -1) {
327            count(1);
328            return b & 0xFF;
329        }
330        return -1;
331    }
332}