001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.lz4; 020 021import java.io.IOException; 022import java.io.OutputStream; 023import java.util.Arrays; 024import java.util.Deque; 025import java.util.Iterator; 026import java.util.LinkedList; 027 028import org.apache.commons.compress.compressors.CompressorOutputStream; 029import org.apache.commons.compress.compressors.lz77support.LZ77Compressor; 030import org.apache.commons.compress.compressors.lz77support.Parameters; 031import org.apache.commons.compress.utils.ByteUtils; 032 033/** 034 * CompressorOutputStream for the LZ4 block format. 035 * 036 * @see <a href="http://lz4.github.io/lz4/lz4_Block_format.html">LZ4 Block Format Description</a> 037 * @since 1.14 038 * @NotThreadSafe 039 */ 040public class BlockLZ4CompressorOutputStream extends CompressorOutputStream { 041 042 final static class Pair { 043 private static int lengths(final int litLength, final int brLength) { 044 final int l = Math.min(litLength, 15); 045 final int br = brLength < 4 ? 0 : (brLength < 19 ? brLength - 4 : 15); 046 return (l << BlockLZ4CompressorInputStream.SIZE_BITS) | br; 047 } 048 private static void writeLength(int length, final OutputStream out) throws IOException { 049 while (length >= 255) { 050 out.write(255); 051 length -= 255; 052 } 053 out.write(length); 054 } 055 private final Deque<byte[]> literals = new LinkedList<>(); 056 057 private int brOffset, brLength; 058 059 private boolean written; 060 061 byte[] addLiteral(final LZ77Compressor.LiteralBlock block) { 062 final byte[] copy = Arrays.copyOfRange(block.getData(), block.getOffset(), 063 block.getOffset() + block.getLength()); 064 literals.add(copy); 065 return copy; 066 } 067 068 private int backReferenceLength() { 069 return brLength; 070 } 071 072 boolean canBeWritten(final int lengthOfBlocksAfterThisPair) { 073 return hasBackReference() 074 && lengthOfBlocksAfterThisPair >= MIN_OFFSET_OF_LAST_BACK_REFERENCE + MIN_BACK_REFERENCE_LENGTH; 075 } 076 077 boolean hasBackReference() { 078 return brOffset > 0; 079 } 080 081 private boolean hasBeenWritten() { 082 return written; 083 } 084 085 int length() { 086 return literalLength() + brLength; 087 } 088 089 private int literalLength() { 090 return literals.stream().mapToInt(b -> b.length).sum(); 091 } 092 093 private void prependLiteral(final byte[] data) { 094 literals.addFirst(data); 095 } 096 097 private void prependTo(final Pair other) { 098 final Iterator<byte[]> listBackwards = literals.descendingIterator(); 099 while (listBackwards.hasNext()) { 100 other.prependLiteral(listBackwards.next()); 101 } 102 } 103 104 void setBackReference(final LZ77Compressor.BackReference block) { 105 if (hasBackReference()) { 106 throw new IllegalStateException(); 107 } 108 brOffset = block.getOffset(); 109 brLength = block.getLength(); 110 } 111 112 private Pair splitWithNewBackReferenceLengthOf(final int newBackReferenceLength) { 113 final Pair p = new Pair(); 114 p.literals.addAll(literals); 115 p.brOffset = brOffset; 116 p.brLength = newBackReferenceLength; 117 return p; 118 } 119 120 void writeTo(final OutputStream out) throws IOException { 121 final int litLength = literalLength(); 122 out.write(lengths(litLength, brLength)); 123 if (litLength >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) { 124 writeLength(litLength - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out); 125 } 126 for (final byte[] b : literals) { 127 out.write(b); 128 } 129 if (hasBackReference()) { 130 ByteUtils.toLittleEndian(out, brOffset, 2); 131 if (brLength - MIN_BACK_REFERENCE_LENGTH >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) { 132 writeLength(brLength - MIN_BACK_REFERENCE_LENGTH 133 - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out); 134 } 135 } 136 written = true; 137 } 138 } 139 private static final int MIN_BACK_REFERENCE_LENGTH = 4; 140 141 /* 142 143 The LZ4 block format has a few properties that make it less 144 straight-forward than one would hope: 145 146 * literal blocks and back-references must come in pairs (except 147 for the very last literal block), so consecutive literal 148 blocks created by the compressor must be merged into a single 149 block. 150 151 * the start of a literal/back-reference pair contains the length 152 of the back-reference (at least some part of it) so we can't 153 start writing the literal before we know how long the next 154 back-reference is going to be. 155 156 * there are special rules for the final blocks 157 158 > There are specific parsing rules to respect in order to remain 159 > compatible with assumptions made by the decoder : 160 > 161 > 1. The last 5 bytes are always literals 162 > 163 > 2. The last match must start at least 12 bytes before end of 164 > block. Consequently, a block with less than 13 bytes cannot be 165 > compressed. 166 167 which means any back-reference may need to get rewritten as a 168 literal block unless we know the next block is at least of 169 length 5 and the sum of this block's length and offset and the 170 next block's length is at least twelve. 171 172 */ 173 174 private static final int MIN_OFFSET_OF_LAST_BACK_REFERENCE = 12; 175 /** 176 * Returns a builder correctly configured for the LZ4 algorithm. 177 * @return a builder correctly configured for the LZ4 algorithm 178 */ 179 public static Parameters.Builder createParameterBuilder() { 180 final int maxLen = BlockLZ4CompressorInputStream.WINDOW_SIZE - 1; 181 return Parameters.builder(BlockLZ4CompressorInputStream.WINDOW_SIZE) 182 .withMinBackReferenceLength(MIN_BACK_REFERENCE_LENGTH) 183 .withMaxBackReferenceLength(maxLen) 184 .withMaxOffset(maxLen) 185 .withMaxLiteralLength(maxLen); 186 } 187 188 private final LZ77Compressor compressor; 189 190 private final OutputStream os; 191 192 // used in one-arg write method 193 private final byte[] oneByte = new byte[1]; 194 private boolean finished; 195 196 private final Deque<Pair> pairs = new LinkedList<>(); 197 198 // keeps track of the last window-size bytes (64k) in order to be 199 // able to expand back-references when needed 200 private final Deque<byte[]> expandedBlocks = new LinkedList<>(); 201 202 /** 203 * Creates a new LZ4 output stream. 204 * 205 * @param os 206 * An OutputStream to read compressed data from 207 */ 208 public BlockLZ4CompressorOutputStream(final OutputStream os) { 209 this(os, createParameterBuilder().build()); 210 } 211 212 /** 213 * Creates a new LZ4 output stream. 214 * 215 * @param os 216 * An OutputStream to read compressed data from 217 * @param params 218 * The parameters to use for LZ77 compression. 219 */ 220 public BlockLZ4CompressorOutputStream(final OutputStream os, final Parameters params) { 221 this.os = os; 222 compressor = new LZ77Compressor(params, 223 block -> { 224 switch (block.getType()) { 225 case LITERAL: 226 addLiteralBlock((LZ77Compressor.LiteralBlock) block); 227 break; 228 case BACK_REFERENCE: 229 addBackReference((LZ77Compressor.BackReference) block); 230 break; 231 case EOD: 232 writeFinalLiteralBlock(); 233 break; 234 } 235 }); 236 } 237 238 private void addBackReference(final LZ77Compressor.BackReference block) throws IOException { 239 final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength()); 240 last.setBackReference(block); 241 recordBackReference(block); 242 clearUnusedBlocksAndPairs(); 243 } 244 245 private void addLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException { 246 final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength()); 247 recordLiteral(last.addLiteral(block)); 248 clearUnusedBlocksAndPairs(); 249 } 250 251 private void clearUnusedBlocks() { 252 int blockLengths = 0; 253 int blocksToKeep = 0; 254 for (final byte[] b : expandedBlocks) { 255 blocksToKeep++; 256 blockLengths += b.length; 257 if (blockLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) { 258 break; 259 } 260 } 261 final int size = expandedBlocks.size(); 262 for (int i = blocksToKeep; i < size; i++) { 263 expandedBlocks.removeLast(); 264 } 265 } 266 267 private void clearUnusedBlocksAndPairs() { 268 clearUnusedBlocks(); 269 clearUnusedPairs(); 270 } 271 272 private void clearUnusedPairs() { 273 int pairLengths = 0; 274 int pairsToKeep = 0; 275 for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) { 276 final Pair p = it.next(); 277 pairsToKeep++; 278 pairLengths += p.length(); 279 if (pairLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) { 280 break; 281 } 282 } 283 final int size = pairs.size(); 284 for (int i = pairsToKeep; i < size; i++) { 285 final Pair p = pairs.peekFirst(); 286 if (!p.hasBeenWritten()) { 287 break; 288 } 289 pairs.removeFirst(); 290 } 291 } 292 293 @Override 294 public void close() throws IOException { 295 try { 296 finish(); 297 } finally { 298 os.close(); 299 } 300 } 301 302 private byte[] expand(final int offset, final int length) { 303 final byte[] expanded = new byte[length]; 304 if (offset == 1) { // surprisingly common special case 305 final byte[] block = expandedBlocks.peekFirst(); 306 final byte b = block[block.length - 1]; 307 if (b != 0) { // the fresh array contains 0s anyway 308 Arrays.fill(expanded, b); 309 } 310 } else { 311 expandFromList(expanded, offset, length); 312 } 313 return expanded; 314 } 315 316 private void expandFromList(final byte[] expanded, final int offset, final int length) { 317 int offsetRemaining = offset; 318 int lengthRemaining = length; 319 int writeOffset = 0; 320 while (lengthRemaining > 0) { 321 // find block that contains offsetRemaining 322 byte[] block = null; 323 final int copyLen; 324 final int copyOffset; 325 if (offsetRemaining > 0) { 326 int blockOffset = 0; 327 for (final byte[] b : expandedBlocks) { 328 if (b.length + blockOffset >= offsetRemaining) { 329 block = b; 330 break; 331 } 332 blockOffset += b.length; 333 } 334 if (block == null) { 335 // should not be possible 336 throw new IllegalStateException("Failed to find a block containing offset " + offset); 337 } 338 copyOffset = blockOffset + block.length - offsetRemaining; 339 copyLen = Math.min(lengthRemaining, block.length - copyOffset); 340 } else { 341 // offsetRemaining is negative or 0 and points into the expanded bytes 342 block = expanded; 343 copyOffset = -offsetRemaining; 344 copyLen = Math.min(lengthRemaining, writeOffset + offsetRemaining); 345 } 346 System.arraycopy(block, copyOffset, expanded, writeOffset, copyLen); 347 offsetRemaining -= copyLen; 348 lengthRemaining -= copyLen; 349 writeOffset += copyLen; 350 } 351 } 352 353 /** 354 * Compresses all remaining data and writes it to the stream, 355 * doesn't close the underlying stream. 356 * @throws IOException if an error occurs 357 */ 358 public void finish() throws IOException { 359 if (!finished) { 360 compressor.finish(); 361 finished = true; 362 } 363 } 364 365 /** 366 * Adds some initial data to fill the window with. 367 * 368 * @param data the data to fill the window with. 369 * @param off offset of real data into the array 370 * @param len amount of data 371 * @throws IllegalStateException if the stream has already started to write data 372 * @see LZ77Compressor#prefill 373 */ 374 public void prefill(final byte[] data, final int off, final int len) { 375 if (len > 0) { 376 final byte[] b = Arrays.copyOfRange(data, off, off + len); 377 compressor.prefill(b); 378 recordLiteral(b); 379 } 380 } 381 382 private void recordBackReference(final LZ77Compressor.BackReference block) { 383 expandedBlocks.addFirst(expand(block.getOffset(), block.getLength())); 384 } 385 386 private void recordLiteral(final byte[] b) { 387 expandedBlocks.addFirst(b); 388 } 389 390 private void rewriteLastPairs() { 391 final LinkedList<Pair> lastPairs = new LinkedList<>(); 392 final LinkedList<Integer> pairLength = new LinkedList<>(); 393 int offset = 0; 394 for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) { 395 final Pair p = it.next(); 396 if (p.hasBeenWritten()) { 397 break; 398 } 399 final int len = p.length(); 400 pairLength.addFirst(len); 401 lastPairs.addFirst(p); 402 offset += len; 403 if (offset >= MIN_OFFSET_OF_LAST_BACK_REFERENCE) { 404 break; 405 } 406 } 407 lastPairs.forEach(pairs::remove); 408 // lastPairs may contain between one and four Pairs: 409 // * the last pair may be a one byte literal 410 // * all other Pairs contain a back-reference which must be four bytes long at minimum 411 // we could merge them all into a single literal block but 412 // this may harm compression. For example compressing 413 // "bla.tar" from our tests yields a last block containing a 414 // back-reference of length > 2k and we'd end up with a last 415 // literal of that size rather than a 2k back-reference and a 416 // 12 byte literal at the end. 417 418 // Instead we merge all but the first of lastPairs into a new 419 // literal-only Pair "replacement" and look at the 420 // back-reference in the first of lastPairs and see if we can 421 // split it. We can split it if it is longer than 16 - 422 // replacement.length (i.e. the minimal length of four is kept 423 // while making sure the last literal is at least twelve bytes 424 // long). If we can't split it, we expand the first of the pairs 425 // as well. 426 427 // this is not optimal, we could get better compression 428 // results with more complex approaches as the last literal 429 // only needs to be five bytes long if the previous 430 // back-reference has an offset big enough 431 432 final int lastPairsSize = lastPairs.size(); 433 int toExpand = 0; 434 for (int i = 1; i < lastPairsSize; i++) { 435 toExpand += pairLength.get(i); 436 } 437 final Pair replacement = new Pair(); 438 if (toExpand > 0) { 439 replacement.prependLiteral(expand(toExpand, toExpand)); 440 } 441 final Pair splitCandidate = lastPairs.get(0); 442 final int stillNeeded = MIN_OFFSET_OF_LAST_BACK_REFERENCE - toExpand; 443 final int brLen = splitCandidate.hasBackReference() ? splitCandidate.backReferenceLength() : 0; 444 if (splitCandidate.hasBackReference() && brLen >= MIN_BACK_REFERENCE_LENGTH + stillNeeded) { 445 replacement.prependLiteral(expand(toExpand + stillNeeded, stillNeeded)); 446 pairs.add(splitCandidate.splitWithNewBackReferenceLengthOf(brLen - stillNeeded)); 447 } else { 448 if (splitCandidate.hasBackReference()) { 449 replacement.prependLiteral(expand(toExpand + brLen, brLen)); 450 } 451 splitCandidate.prependTo(replacement); 452 } 453 pairs.add(replacement); 454 } 455 456 @Override 457 public void write(final byte[] data, final int off, final int len) throws IOException { 458 compressor.compress(data, off, len); 459 } 460 461 @Override 462 public void write(final int b) throws IOException { 463 oneByte[0] = (byte) (b & 0xff); 464 write(oneByte); 465 } 466 467 private Pair writeBlocksAndReturnUnfinishedPair(final int length) throws IOException { 468 writeWritablePairs(length); 469 Pair last = pairs.peekLast(); 470 if (last == null || last.hasBackReference()) { 471 last = new Pair(); 472 pairs.addLast(last); 473 } 474 return last; 475 } 476 477 private void writeFinalLiteralBlock() throws IOException { 478 rewriteLastPairs(); 479 for (final Pair p : pairs) { 480 if (!p.hasBeenWritten()) { 481 p.writeTo(os); 482 } 483 } 484 pairs.clear(); 485 } 486 487 private void writeWritablePairs(final int lengthOfBlocksAfterLastPair) throws IOException { 488 int unwrittenLength = lengthOfBlocksAfterLastPair; 489 for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) { 490 final Pair p = it.next(); 491 if (p.hasBeenWritten()) { 492 break; 493 } 494 unwrittenLength += p.length(); 495 } 496 for (final Pair p : pairs) { 497 if (p.hasBeenWritten()) { 498 continue; 499 } 500 unwrittenLength -= p.length(); 501 if (!p.canBeWritten(unwrittenLength)) { 502 break; 503 } 504 p.writeTo(os); 505 } 506 } 507}