001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.lz77support; 020 021import java.io.IOException; 022import java.io.InputStream; 023import java.util.Arrays; 024 025import org.apache.commons.compress.compressors.CompressorInputStream; 026import org.apache.commons.compress.utils.ByteUtils; 027import org.apache.commons.compress.utils.IOUtils; 028 029/** 030 * Encapsulates code common to LZ77 decompressors. 031 * 032 * <p>Assumes the stream consists of blocks of literal data and 033 * back-references (called copies) in any order. Of course the first 034 * block must be a literal block for the scheme to work - unless the 035 * {@link #prefill prefill} method has been used to provide initial 036 * data that is never returned by {@link #read read} but only used for 037 * back-references.</p> 038 * 039 * <p>Subclasses must override the three-arg {@link #read read} method 040 * as the no-arg version delegates to it and the default 041 * implementation delegates to the no-arg version, leading to infinite 042 * mutual recursion and a {@code StackOverflowError} otherwise.</p> 043 * 044 * <p>The contract for subclasses' {@code read} implementation is:</p> 045 * <ul> 046 * 047 * <li>keep track of the current state of the stream. Is it inside a 048 * literal block or a back-reference or in-between blocks?</li> 049 * 050 * <li>Use {@link #readOneByte} to access the underlying stream 051 * directly.</li> 052 * 053 * <li>If a new literal block starts, use {@link #startLiteral} to 054 * tell this class about it and read the literal data using {@link 055 * #readLiteral} until it returns {@code 0}. {@link 056 * #hasMoreDataInBlock} will return {@code false} before the next 057 * call to {@link #readLiteral} would return {@code 0}.</li> 058 * 059 * <li>If a new back-reference starts, use {@link #startBackReference} to 060 * tell this class about it and read the literal data using {@link 061 * #readBackReference} until it returns {@code 0}. {@link 062 * #hasMoreDataInBlock} will return {@code false} before the next 063 * call to {@link #readBackReference} would return {@code 0}.</li> 064 * 065 * <li>If the end of the stream has been reached, return {@code -1} 066 * as this class' methods will never do so themselves.</li> 067 * 068 * </ul> 069 * 070 * <p>{@link #readOneByte} and {@link #readLiteral} update the counter 071 * for bytes read.</p> 072 * 073 * @since 1.14 074 */ 075public abstract class AbstractLZ77CompressorInputStream extends CompressorInputStream { 076 077 /** Size of the window - must be bigger than the biggest offset expected. */ 078 private final int windowSize; 079 080 /** 081 * Buffer to write decompressed bytes to for back-references, will 082 * be three times windowSize big. 083 * 084 * <p>Three times so we can slide the whole buffer a windowSize to 085 * the left once we've read twice windowSize and still have enough 086 * data inside of it to satisfy back-references.</p> 087 */ 088 private final byte[] buf; 089 090 /** One behind the index of the last byte in the buffer that was written, i.e. the next position to write to */ 091 private int writeIndex; 092 093 /** Index of the next byte to be read. */ 094 private int readIndex; 095 096 /** The underlying stream to read compressed data from */ 097 private final InputStream in; 098 099 /** Number of bytes still to be read from the current literal or back-reference. */ 100 private long bytesRemaining; 101 102 /** Offset of the current back-reference. */ 103 private int backReferenceOffset; 104 105 /** uncompressed size */ 106 private int size = 0; 107 108 // used in no-arg read method 109 private final byte[] oneByte = new byte[1]; 110 111 /** 112 * Supplier that delegates to {@link #readOneByte}. 113 */ 114 protected final ByteUtils.ByteSupplier supplier = new ByteUtils.ByteSupplier() { 115 @Override 116 public int getAsByte() throws IOException { 117 return readOneByte(); 118 } 119 }; 120 121 /** 122 * Creates a new LZ77 input stream. 123 * 124 * @param is 125 * An InputStream to read compressed data from 126 * @param windowSize 127 * Size of the window kept for back-references, must be bigger than the biggest offset expected. 128 * 129 * @throws IOException if reading fails 130 */ 131 public AbstractLZ77CompressorInputStream(final InputStream is, int windowSize) throws IOException { 132 this.in = is; 133 this.windowSize = windowSize; 134 buf = new byte[3 * windowSize]; 135 writeIndex = readIndex = 0; 136 bytesRemaining = 0; 137 } 138 139 /** {@inheritDoc} */ 140 @Override 141 public int read() throws IOException { 142 return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF; 143 } 144 145 /** {@inheritDoc} */ 146 @Override 147 public void close() throws IOException { 148 in.close(); 149 } 150 151 /** {@inheritDoc} */ 152 @Override 153 public int available() { 154 return writeIndex - readIndex; 155 } 156 157 /** 158 * Get the uncompressed size of the stream 159 * 160 * @return the uncompressed size 161 */ 162 public int getSize() { 163 return size; 164 } 165 166 /** 167 * Adds some initial data to fill the window with. 168 * 169 * <p>This is used if the stream has been cut into blocks and 170 * back-references of one block may refer to data of the previous 171 * block(s). One such example is the LZ4 frame format using block 172 * dependency.</p> 173 * 174 * @param data the data to fill the window with. 175 * @throws IllegalStateException if the stream has already started to read data 176 */ 177 public void prefill(byte[] data) { 178 if (writeIndex != 0) { 179 throw new IllegalStateException("the stream has already been read from, can't prefill anymore"); 180 } 181 // we don't need more data than the big offset could refer to, so cap it 182 int len = Math.min(windowSize, data.length); 183 // we need the last data as we are dealing with *back*-references 184 System.arraycopy(data, data.length - len, buf, 0, len); 185 writeIndex += len; 186 readIndex += len; 187 } 188 189 /** 190 * Used by subclasses to signal the next block contains the given 191 * amount of literal data. 192 * @param length the length of the block 193 */ 194 protected final void startLiteral(long length) { 195 bytesRemaining = length; 196 } 197 198 /** 199 * Is there still data remaining inside the current block? 200 * @return true if there is still data remaining inside the current block. 201 */ 202 protected final boolean hasMoreDataInBlock() { 203 return bytesRemaining > 0; 204 } 205 206 /** 207 * Reads data from the current literal block. 208 * @param b buffer to write data to 209 * @param off offset to start writing to 210 * @param len maximum amount of data to read 211 * @return number of bytes read, may be 0. Will never return -1 as 212 * EOF-detection is the responsibility of the subclass 213 * @throws IOException if the underlying stream throws or signals 214 * an EOF before the amount of data promised for the block have 215 * been read 216 */ 217 protected final int readLiteral(final byte[] b, final int off, final int len) throws IOException { 218 final int avail = available(); 219 if (len > avail) { 220 tryToReadLiteral(len - avail); 221 } 222 return readFromBuffer(b, off, len); 223 } 224 225 private void tryToReadLiteral(int bytesToRead) throws IOException { 226 // min of "what is still inside the literal", "what does the user want" and "how muc can fit into the buffer" 227 final int reallyTryToRead = Math.min((int) Math.min(bytesToRead, bytesRemaining), 228 buf.length - writeIndex); 229 final int bytesRead = reallyTryToRead > 0 230 ? IOUtils.readFully(in, buf, writeIndex, reallyTryToRead) 231 : 0 /* happens for bytesRemaining == 0 */; 232 count(bytesRead); 233 if (reallyTryToRead != bytesRead) { 234 throw new IOException("Premature end of stream reading literal"); 235 } 236 writeIndex += reallyTryToRead; 237 bytesRemaining -= reallyTryToRead; 238 } 239 240 private int readFromBuffer(final byte[] b, final int off, final int len) { 241 final int readable = Math.min(len, available()); 242 if (readable > 0) { 243 System.arraycopy(buf, readIndex, b, off, readable); 244 readIndex += readable; 245 if (readIndex > 2 * windowSize) { 246 slideBuffer(); 247 } 248 } 249 size += readable; 250 return readable; 251 } 252 253 private void slideBuffer() { 254 System.arraycopy(buf, windowSize, buf, 0, windowSize * 2); 255 writeIndex -= windowSize; 256 readIndex -= windowSize; 257 } 258 259 /** 260 * Used by subclasses to signal the next block contains a back-reference with the given coordinates. 261 * @param offset the offset of the back-reference 262 * @param length the length of the back-reference 263 */ 264 protected final void startBackReference(int offset, long length) { 265 backReferenceOffset = offset; 266 bytesRemaining = length; 267 } 268 269 /** 270 * Reads data from the current back-reference. 271 * @param b buffer to write data to 272 * @param off offset to start writing to 273 * @param len maximum amount of data to read 274 * @return number of bytes read, may be 0. Will never return -1 as 275 * EOF-detection is the responsibility of the subclass 276 */ 277 protected final int readBackReference(final byte[] b, final int off, final int len) { 278 final int avail = available(); 279 if (len > avail) { 280 tryToCopy(len - avail); 281 } 282 return readFromBuffer(b, off, len); 283 } 284 285 private void tryToCopy(int bytesToCopy) { 286 // this will fit into the buffer without sliding and not 287 // require more than is available inside the back-reference 288 int copy = Math.min((int) Math.min(bytesToCopy, bytesRemaining), 289 buf.length - writeIndex); 290 if (copy == 0) { 291 // NOP 292 } else if (backReferenceOffset == 1) { // pretty common special case 293 final byte last = buf[writeIndex - 1]; 294 Arrays.fill(buf, writeIndex, writeIndex + copy, last); 295 writeIndex += copy; 296 } else if (copy < backReferenceOffset) { 297 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, copy); 298 writeIndex += copy; 299 } else { 300 // back-reference overlaps with the bytes created from it 301 // like go back two bytes and then copy six (by copying 302 // the last two bytes three time). 303 final int fullRots = copy / backReferenceOffset; 304 for (int i = 0; i < fullRots; i++) { 305 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, backReferenceOffset); 306 writeIndex += backReferenceOffset; 307 } 308 309 final int pad = copy - (backReferenceOffset * fullRots); 310 if (pad > 0) { 311 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, pad); 312 writeIndex += pad; 313 } 314 } 315 bytesRemaining -= copy; 316 } 317 318 /** 319 * Reads a single byte from the real input stream and ensures the data is accounted for. 320 * 321 * @return the byte read as value between 0 and 255 or -1 if EOF has been reached. 322 * @throws IOException if the underlying stream throws 323 */ 324 protected final int readOneByte() throws IOException { 325 final int b = in.read(); 326 if (b != -1) { 327 count(1); 328 return b & 0xFF; 329 } 330 return -1; 331 } 332}