From c86b1cd405dee7e6d015961b9828cd279563b706 Mon Sep 17 00:00:00 2001 From: Dain Sundstrom Date: Mon, 20 May 2024 20:08:11 -0700 Subject: [PATCH] Replace implementation with Aircompressor code The core compression and decompression code has been replaced with the latest code from Aircompressor. The existing interfaces have been retained where possible. The notable exception is that the deprecated SnappyInputStream and SnappyOutputStream stream implementations have been removed. Any existing data in these formats should be converted to the specification-defined framed format using an older version of this library. --- .github/workflows/main.yml | 19 + README.md | 216 -------- pom.xml | 71 +-- .../snappy/AbstractSnappyInputStream.java | 307 ----------- .../snappy/AbstractSnappyOutputStream.java | 239 -------- .../java/org/iq80/snappy/BufferRecycler.java | 183 ------ .../org/iq80/snappy/CorruptionException.java | 21 +- src/main/java/org/iq80/snappy/Crc32C.java | 45 +- .../org/iq80/snappy/HadoopSnappyCodec.java | 175 ++++-- .../iq80/snappy/HadoopSnappyInputStream.java | 163 ++++++ .../iq80/snappy/HadoopSnappyOutputStream.java | 115 ++++ .../iq80/snappy/IncompatibleJvmException.java | 23 + src/main/java/org/iq80/snappy/Main.java | 76 --- src/main/java/org/iq80/snappy/SlowMemory.java | 76 --- src/main/java/org/iq80/snappy/Snappy.java | 112 ++-- .../org/iq80/snappy/SnappyCompressor.java | 519 ------------------ .../{Memory.java => SnappyConstants.java} | 24 +- .../org/iq80/snappy/SnappyDecompressor.java | 434 --------------- .../iq80/snappy/SnappyFramedInputStream.java | 272 ++++++++- .../iq80/snappy/SnappyFramedOutputStream.java | 214 +++++++- .../org/iq80/snappy/SnappyInputStream.java | 115 ---- .../org/iq80/snappy/SnappyInternalUtils.java | 96 +--- .../org/iq80/snappy/SnappyOutputStream.java | 128 ----- .../org/iq80/snappy/SnappyRawCompressor.java | 411 ++++++++++++++ .../iq80/snappy/SnappyRawDecompressor.java | 320 +++++++++++ 
.../java/org/iq80/snappy/UnsafeMemory.java | 104 ---- src/main/java/org/iq80/snappy/UnsafeUtil.java | 44 ++ .../java/org/iq80/snappy/BenchmarkDriver.java | 12 +- .../java/org/iq80/snappy/SnappyBench.java | 4 +- .../iq80/snappy/SnappyFramedStreamTest.java | 9 - .../org/iq80/snappy/SnappyStreamTest.java | 171 ------ src/test/java/org/iq80/snappy/SnappyTest.java | 99 ++-- 32 files changed, 1862 insertions(+), 2955 deletions(-) create mode 100644 .github/workflows/main.yml delete mode 100644 src/main/java/org/iq80/snappy/AbstractSnappyInputStream.java delete mode 100644 src/main/java/org/iq80/snappy/AbstractSnappyOutputStream.java delete mode 100644 src/main/java/org/iq80/snappy/BufferRecycler.java create mode 100644 src/main/java/org/iq80/snappy/HadoopSnappyInputStream.java create mode 100644 src/main/java/org/iq80/snappy/HadoopSnappyOutputStream.java create mode 100644 src/main/java/org/iq80/snappy/IncompatibleJvmException.java delete mode 100644 src/main/java/org/iq80/snappy/Main.java delete mode 100644 src/main/java/org/iq80/snappy/SlowMemory.java delete mode 100644 src/main/java/org/iq80/snappy/SnappyCompressor.java rename src/main/java/org/iq80/snappy/{Memory.java => SnappyConstants.java} (51%) delete mode 100644 src/main/java/org/iq80/snappy/SnappyDecompressor.java delete mode 100644 src/main/java/org/iq80/snappy/SnappyInputStream.java delete mode 100644 src/main/java/org/iq80/snappy/SnappyOutputStream.java create mode 100644 src/main/java/org/iq80/snappy/SnappyRawCompressor.java create mode 100644 src/main/java/org/iq80/snappy/SnappyRawDecompressor.java delete mode 100644 src/main/java/org/iq80/snappy/UnsafeMemory.java create mode 100644 src/main/java/org/iq80/snappy/UnsafeUtil.java delete mode 100644 src/test/java/org/iq80/snappy/SnappyStreamTest.java diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..78ec220 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,19 @@ +name: ci + +on: + - push + - 
pull_request + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v1 + - uses: actions/setup-java@v3 + with: + distribution: 'temurin' + java-version: 8 + - name: Maven Install + run: mvn install -B -V -DskipTests -Dair.check.skip-all + - name: Maven Tests + run: mvn install -B -P ci diff --git a/README.md b/README.md index 2a5b49c..60cb7c6 100644 --- a/README.md +++ b/README.md @@ -1,219 +1,3 @@ # 🚚 MOVED 🚚 ### __Future development of Snappy without JNI has moved to [aircompressor](https://github.com/airlift/aircompressor)__ - -
- -# Snappy in Java - -This is a rewrite (port) of [Snappy](http://code.google.com/p/snappy/) written in -pure Java. This compression code produces a byte-for-byte exact copy of the output -created by the original C++ code, and extremely fast. - -# Performance - -The Snappy micro-benchmark has been ported, and can be used to measure -the performance of this code against the excellent Snappy JNI wrapper from -[xerial](http://code.google.com/p/snappy-java/). As you can see in the results -below, the pure Java port is 20-30% faster for block compress, 0-10% slower -for block uncompress, and 0-5% slower for round-trip block compression. These -results were run with Java 7 on a Core i7, 64-bit Mac. - -As a second more independent test, the performance has been measured using the -Ning JVM compression benchmark against Snappy JNI, and the pure Java -[Ning LZF](https://github.com/ning/compress) codec. The -[results](http://dain.github.com/snappy/) show that the pure Java Snappy is -20-30% faster than JNI Snappy for compression, and is typically 10-20% slower -for decompression. Both, the pure Java Snappy and JNI Snappy implementations -are faster that the Ning LZF codec. These results were run with Java 6 on a -Core i7, 64-bit Mac. - -The difference in performance between these two tests is due to the difference -in JVM version; Java 7 is consistently 5-10% faster than Java 6 in the -compression code. As with all benchmarks your mileage will vary, so test with -your actual use case. - - - -### Block Compress -

-                        JNI      Java         JNI        Java
-Input        Size  Compress  Compress  Throughput  Throughput  Change
----------------------------------------------------------------------
-html       102400     76.4%     76.4%   294.9MB/s   384.8MB/s  +30.5%  html
-urls       702087     49.1%     49.1%   178.7MB/s   226.5MB/s  +26.8%  urls
-jpg        126958      0.1%      0.1%     2.7GB/s     3.2GB/s  +17.4%  jpg (not compressible)
-pdf         94330     17.9%     17.9%   642.4MB/s   910.3MB/s  +41.7%  pdf
-html4      409600     76.4%     76.4%   289.2MB/s   377.3MB/s  +30.5%  html4
-cp          24603     51.9%     51.9%   166.4MB/s   233.7MB/s  +40.5%  cp
-c           11150     57.6%     57.6%   177.1MB/s   295.4MB/s  +66.8%  c
-lsp          3721     51.6%     51.6%   245.5MB/s   278.0MB/s  +13.2%  lsp
-xls       1029744     58.7%     58.7%   263.0MB/s   292.5MB/s  +11.2%  xls
-txt1       152089     40.2%     40.2%   116.8MB/s   163.1MB/s  +39.7%  txt1
-txt2       125179     35.9%     35.9%   112.5MB/s   153.4MB/s  +36.3%  txt2
-txt3       426754     42.9%     42.9%   123.3MB/s   169.8MB/s  +37.6%  txt3
-txt4       481861     31.7%     31.7%   107.8MB/s   146.2MB/s  +35.6%  txt4
-bin        513216     81.8%     81.8%   413.1MB/s   497.8MB/s  +20.5%  bin
-sum         38240     48.1%     48.1%   162.4MB/s   213.9MB/s  +31.7%  sum
-man          4227     40.6%     40.6%   194.6MB/s   241.7MB/s  +24.2%  man
-pb         118588     76.8%     76.8%   363.7MB/s   450.3MB/s  +23.8%  pb
-gaviota    184320     61.7%     61.7%   166.7MB/s   253.7MB/s  +52.2%  gaviota
-
- - -### Block Uncompress -

-                        JNI      Java         JNI        Java
-Input        Size  Compress  Compress  Throughput  Throughput  Change
----------------------------------------------------------------------
-html       102400     76.4%     76.4%     1.5GB/s     1.3GB/s  -12.2%  html
-urls       702087     49.1%     49.1%   969.2MB/s   827.5MB/s  -14.6%  urls
-jpg        126958      0.1%      0.1%    18.6GB/s    19.4GB/s   +4.2%  jpg (not compressible)
-pdf         94330     17.9%     17.9%     4.1GB/s     3.7GB/s   -8.8%  pdf
-html4      409600     76.4%     76.4%     1.5GB/s     1.2GB/s  -16.8%  html4
-cp          24603     51.9%     51.9%   965.2MB/s   956.0MB/s   -1.0%  cp
-c           11150     57.6%     57.6%   989.1MB/s   924.9MB/s   -6.5%  c
-lsp          3721     51.6%     51.6%   991.6MB/s   964.8MB/s   -2.7%  lsp
-xls       1029744     58.7%     58.7%   798.4MB/s   747.3MB/s   -6.4%  xls
-txt1       152089     40.2%     40.2%   643.8MB/s   580.8MB/s   -9.8%  txt1
-txt2       125179     35.9%     35.9%   610.0MB/s   549.6MB/s   -9.9%  txt2
-txt3       426754     42.9%     42.9%   683.8MB/s   614.4MB/s  -10.2%  txt3
-txt4       481861     31.7%     31.7%   565.4MB/s   505.5MB/s  -10.6%  txt4
-bin        513216     81.8%     81.8%     1.5GB/s     1.2GB/s  -20.4%  bin
-sum         38240     48.1%     48.1%   838.1MB/s   771.6MB/s   -7.9%  sum
-man          4227     40.6%     40.6%   856.9MB/s   847.2MB/s   -1.1%  man
-pb         118588     76.8%     76.8%     1.7GB/s     1.5GB/s  -12.9%  pb
-gaviota    184320     61.7%     61.7%   769.1MB/s   693.4MB/s   -9.9%  gaviota
-
- - -### Block Round Trip -

-                        JNI      Java         JNI        Java
-Input        Size  Compress  Compress  Throughput  Throughput  Change
----------------------------------------------------------------------
-html       102400     76.4%     76.4%   300.3MB/s   287.1MB/s   -4.4%  html
-urls       702087     49.1%     49.1%   182.7MB/s   177.0MB/s   -3.2%  urls
-jpg        126958      0.1%      0.1%     2.6GB/s     2.6GB/s   +1.1%  jpg (not compressible)
-pdf         94330     17.9%     17.9%   695.3MB/s   680.0MB/s   -2.2%  pdf
-html4      409600     76.4%     76.4%   296.4MB/s   282.1MB/s   -4.8%  html4
-cp          24603     51.9%     51.9%   177.0MB/s   172.5MB/s   -2.5%  cp
-c           11150     57.6%     57.6%   221.7MB/s   218.3MB/s   -1.5%  c
-lsp          3721     51.6%     51.6%   217.3MB/s   216.3MB/s   -0.5%  lsp
-xls       1029744     58.7%     58.7%   213.3MB/s   209.9MB/s   -1.6%  xls
-txt1       152089     40.2%     40.2%   129.4MB/s   126.3MB/s   -2.4%  txt1
-txt2       125179     35.9%     35.9%   121.7MB/s   118.8MB/s   -2.4%  txt2
-txt3       426754     42.9%     42.9%   135.2MB/s   132.8MB/s   -1.8%  txt3
-txt4       481861     31.7%     31.7%   115.2MB/s   113.0MB/s   -1.9%  txt4
-bin        513216     81.8%     81.8%   371.2MB/s   350.7MB/s   -5.5%  bin
-sum         38240     48.1%     48.1%   164.2MB/s   160.0MB/s   -2.6%  sum
-man          4227     40.6%     40.6%   184.8MB/s   185.3MB/s   +0.3%  man
-pb         118588     76.8%     76.8%   344.1MB/s   326.3MB/s   -5.2%  pb
-gaviota    184320     61.7%     61.7%   188.0MB/s   185.2MB/s   -1.5%  gaviota
-
- -# Stream Format - -There is no defined stream format for Snappy, but there is an effort to create -a common format with the Google Snappy project. - -The stream format used in this library has a couple of unique features not -found in the other Snappy stream formats. Like the other formats, the user -input is broken into blocks and each block is compressed. If the compressed -block is smaller that the user input, the compressed block is written, -otherwise the uncompressed original is written. This dramatically improves the -speed of uncompressible input such as JPG images. Additionally, a checksum of -the user input data for each block is written to the stream. This safety check -assures that the stream has not been corrupted in transit or by a bad Snappy -implementation. Finally, like gzip, compressed Snappy files can be -concatenated together without issue, since the input stream will ignore a -Snappy stream header in the middle of a stream. This makes combining files in -Hadoop and S3 trivial. - -The the SnappyOutputStream javadocs contain formal definition of the stream -format. - -## Stream Performance - -The streaming mode performance can not be directly compared to other -compression algorithms since most formats do not contain a checksum. The basic -streaming code is significantly faster that the Snappy JNI library due to -the completely unoptimized stream implementation in Snappy JNI, but once the -check sum is enabled the performance drops off by about 20%. - -### Stream Compress (no checksums) -

-                        JNI      Java         JNI        Java
-Input        Size  Compress  Compress  Throughput  Throughput  Change
----------------------------------------------------------------------
-html       102400     76.4%     76.4%   275.8MB/s   373.5MB/s  +35.4%  html
-urls       702087     49.1%     49.1%   176.5MB/s   225.2MB/s  +27.6%  urls
-jpg        126958      0.1%     -0.0%     1.7GB/s     2.0GB/s  +15.8%  jpg (not compressible)
-pdf         94330     17.8%     16.0%   557.2MB/s   793.2MB/s  +42.4%  pdf
-html4      409600     76.4%     76.4%   281.0MB/s   369.9MB/s  +31.7%  html4
-cp          24603     51.8%     51.8%   151.7MB/s   214.3MB/s  +41.3%  cp
-c           11150     57.4%     57.5%   149.1MB/s   243.3MB/s  +63.1%  c
-lsp          3721     51.1%     51.2%   141.3MB/s   181.1MB/s  +28.2%  lsp
-xls       1029744     58.6%     58.6%   253.9MB/s   290.5MB/s  +14.4%  xls
-txt1       152089     40.2%     40.2%   114.8MB/s   159.4MB/s  +38.8%  txt1
-txt2       125179     35.9%     35.9%   110.0MB/s   150.4MB/s  +36.7%  txt2
-txt3       426754     42.9%     42.9%   121.0MB/s   167.9MB/s  +38.8%  txt3
-txt4       481861     31.6%     31.6%   105.1MB/s   143.2MB/s  +36.2%  txt4
-bin        513216     81.8%     81.8%   387.7MB/s   484.5MB/s  +25.0%  bin
-sum         38240     48.1%     48.1%   153.0MB/s   203.1MB/s  +32.8%  sum
-man          4227     40.2%     40.3%   125.9MB/s   171.9MB/s  +36.5%  man
-pb         118588     76.8%     76.8%   342.2MB/s   431.4MB/s  +26.1%  pb
-gaviota    184320     61.7%     61.7%   161.1MB/s   246.1MB/s  +52.7%  gaviota
-
- - -### Stream Uncompress (no checksums) -

-                        JNI      Java         JNI        Java
-Input        Size  Compress  Compress  Throughput  Throughput  Change
----------------------------------------------------------------------
-html       102400     76.4%     76.4%     1.2GB/s     1.2GB/s   +0.4%  html
-urls       702087     49.1%     49.1%   853.9MB/s   786.6MB/s   -7.9%  urls
-jpg        126958      0.1%     -0.0%     3.0GB/s    10.3GB/s +239.0%  jpg (not compressible)
-pdf         94330     17.8%     16.0%     2.0GB/s     3.4GB/s  +71.5%  pdf
-html4      409600     76.4%     76.4%     1.2GB/s     1.1GB/s   -8.4%  html4
-cp          24603     51.8%     51.8%   785.2MB/s   905.6MB/s  +15.3%  cp
-c           11150     57.4%     57.5%   778.9MB/s   889.7MB/s  +14.2%  c
-lsp          3721     51.1%     51.2%   739.0MB/s   905.5MB/s  +22.5%  lsp
-xls       1029744     58.6%     58.6%   730.3MB/s   718.8MB/s   -1.6%  xls
-txt1       152089     40.2%     40.2%   582.4MB/s   559.0MB/s   -4.0%  txt1
-txt2       125179     35.9%     35.9%   540.7MB/s   526.4MB/s   -2.6%  txt2
-txt3       426754     42.9%     42.9%   620.5MB/s   583.9MB/s   -5.9%  txt3
-txt4       481861     31.6%     31.6%   519.4MB/s   487.0MB/s   -6.2%  txt4
-bin        513216     81.8%     81.8%     1.2GB/s     1.1GB/s  -11.6%  bin
-sum         38240     48.1%     48.1%   693.4MB/s   742.4MB/s   +7.1%  sum
-man          4227     40.2%     40.3%   637.3MB/s   784.3MB/s  +23.1%  man
-pb         118588     76.8%     76.8%     1.4GB/s     1.4GB/s   +0.4%  pb
-gaviota    184320     61.7%     61.7%   688.5MB/s   668.2MB/s   -3.0%  gaviota
-
- - -### Stream RoundTrip (no checksums) -

-                        JNI      Java         JNI        Java
-Input        Size  Compress  Compress  Throughput  Throughput  Change
----------------------------------------------------------------------
-html       102400     76.4%     76.4%   223.8MB/s   272.5MB/s  +21.8%  html
-urls       702087     49.1%     49.1%   142.8MB/s   174.1MB/s  +22.0%  urls
-jpg        126958      0.1%     -0.0%     1.1GB/s     1.6GB/s  +52.1%  jpg (not compressible)
-pdf         94330     17.8%     16.0%   421.9MB/s   610.1MB/s  +44.6%  pdf
-html4      409600     76.4%     76.4%   226.2MB/s   275.5MB/s  +21.8%  html4
-cp          24603     51.8%     51.8%   125.3MB/s   160.3MB/s  +27.9%  cp
-c           11150     57.4%     57.5%   125.1MB/s   183.2MB/s  +46.5%  c
-lsp          3721     51.1%     51.2%   130.6MB/s   149.5MB/s  +14.5%  lsp
-xls       1029744     58.6%     58.6%   188.2MB/s   206.1MB/s   +9.5%  xls
-txt1       152089     40.2%     40.2%    95.3MB/s   123.3MB/s  +29.4%  txt1
-txt2       125179     35.9%     35.9%    91.4MB/s   116.8MB/s  +27.9%  txt2
-txt3       426754     42.9%     42.9%   101.3MB/s   130.3MB/s  +28.6%  txt3
-txt4       481861     31.6%     31.6%    87.9MB/s   111.1MB/s  +26.3%  txt4
-bin        513216     81.8%     81.8%   294.7MB/s   337.9MB/s  +14.7%  bin
-sum         38240     48.1%     48.1%   122.9MB/s   152.9MB/s  +24.3%  sum
-man          4227     40.2%     40.3%   113.0MB/s   139.1MB/s  +23.1%  man
-pb         118588     76.8%     76.8%   269.5MB/s   313.8MB/s  +16.4%  pb
-gaviota    184320     61.7%     61.7%   131.1MB/s   180.3MB/s  +37.6%  gaviota
-
diff --git a/pom.xml b/pom.xml index feb27ca..f7e9d2c 100644 --- a/pom.xml +++ b/pom.xml @@ -62,6 +62,11 @@ true + + maven_central + Maven Central + https://repo.maven.apache.org/maven2/ + @@ -78,29 +83,36 @@ + org.apache.hadoop - hadoop-core - 0.20.2 + hadoop-common + 3.4.0 true provided com.google.guava guava - 13.0.1 + 33.2.0-jre test org.xerial.snappy snappy-java - 1.0.4.1 + 1.1.10.4 test org.testng testng - 6.0.1 + 7.5.1 test @@ -110,7 +122,7 @@ org.apache.maven.plugins maven-enforcer-plugin - 1.0 + 3.4.1 enforce-versions @@ -123,7 +135,7 @@ 3.0.0 - 1.6 + 1.8 @@ -134,47 +146,6 @@ org.apache.maven.plugins maven-source-plugin - - - org.apache.maven.plugins - maven-jar-plugin - 2.3.2 - - - binary - package - - jar - - - bin - - - org.iq80.snappy.Main - - - - - - - - - org.skife.maven - really-executable-jar-maven-plugin - 1.0.3 - - - package - - really-executable-jar - - - bin - - - - - @@ -212,8 +183,8 @@ maven-compiler-plugin 2.3.2 - 1.6 - 1.6 + 1.8 + 1.8 diff --git a/src/main/java/org/iq80/snappy/AbstractSnappyInputStream.java b/src/main/java/org/iq80/snappy/AbstractSnappyInputStream.java deleted file mode 100644 index 322bbde..0000000 --- a/src/main/java/org/iq80/snappy/AbstractSnappyInputStream.java +++ /dev/null @@ -1,307 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.iq80.snappy; - -import java.io.EOFException; -import java.io.IOException; -import java.io.InputStream; -import java.util.Arrays; - -import static java.lang.Math.min; -import static org.iq80.snappy.SnappyInternalUtils.checkNotNull; -import static org.iq80.snappy.SnappyInternalUtils.checkPositionIndexes; -import static org.iq80.snappy.SnappyInternalUtils.readBytes; - -/** - * A common base class for frame based snappy input streams. - */ -abstract class AbstractSnappyInputStream - extends InputStream -{ - private final InputStream in; - private final byte[] frameHeader; - private final boolean verifyChecksums; - private final BufferRecycler recycler; - - /** - * A single frame read from the underlying {@link InputStream}. - */ - private byte[] input; - - /** - * The decompressed data from {@link #input}. - */ - private byte[] uncompressed; - - /** - * Indicates if this instance has been closed. - */ - private boolean closed; - - /** - * Indicates if we have reached the EOF on {@link #in}. - */ - private boolean eof; - - /** - * The position in {@link #input} to read to. - */ - private int valid; - - /** - * The next position to read from {@link #buffer}. - */ - private int position; - - /** - * Buffer is a reference to the real buffer of uncompressed data for the - * current block: uncompressed if the block is compressed, or input if it is - * not. - */ - private byte[] buffer; - - /** - * Creates a Snappy input stream to read data from the specified underlying - * input stream. 
- * - * @param in the underlying input stream - * @param verifyChecksums if true, checksums in input stream will be verified - * @param expectedHeader the expected stream header - */ - public AbstractSnappyInputStream(InputStream in, int maxBlockSize, int frameHeaderSize, boolean verifyChecksums, byte[] expectedHeader) - throws IOException - { - this.in = in; - this.verifyChecksums = verifyChecksums; - this.recycler = BufferRecycler.instance(); - allocateBuffersBasedOnSize(maxBlockSize + 5); - this.frameHeader = new byte[frameHeaderSize]; - - // stream must begin with stream header - byte[] actualHeader = new byte[expectedHeader.length]; - - int read = readBytes(in, actualHeader, 0, actualHeader.length); - if (read < expectedHeader.length) { - throw new EOFException("encountered EOF while reading stream header"); - } - if (!Arrays.equals(expectedHeader, actualHeader)) { - throw new IOException("invalid stream header"); - } - } - - private void allocateBuffersBasedOnSize(int size) - { - input = recycler.allocInputBuffer(size); - uncompressed = recycler.allocDecodeBuffer(size); - } - - @Override - public int read() - throws IOException - { - if (closed) { - return -1; - } - if (!ensureBuffer()) { - return -1; - } - return buffer[position++] & 0xFF; - } - - @Override - public int read(byte[] output, int offset, int length) - throws IOException - { - checkNotNull(output, "output is null"); - checkPositionIndexes(offset, offset + length, output.length); - if (closed) { - throw new IOException("Stream is closed"); - } - - if (length == 0) { - return 0; - } - if (!ensureBuffer()) { - return -1; - } - - int size = min(length, available()); - System.arraycopy(buffer, position, output, offset, size); - position += size; - return size; - } - - @Override - public int available() - throws IOException - { - if (closed) { - return 0; - } - return valid - position; - } - - @Override - public void close() - throws IOException - { - try { - in.close(); - } - finally { - if (!closed) 
{ - closed = true; - recycler.releaseInputBuffer(input); - recycler.releaseDecodeBuffer(uncompressed); - } - } - } - - enum FrameAction - { - RAW, SKIP, UNCOMPRESS - } - - public static final class FrameMetaData - { - final int length; - final FrameAction frameAction; - - /** - * @param frameAction - * @param length - */ - public FrameMetaData(FrameAction frameAction, int length) - { - this.frameAction = frameAction; - this.length = length; - } - } - - public static final class FrameData - { - final int checkSum; - final int offset; - - public FrameData(int checkSum, int offset) - { - this.checkSum = checkSum; - this.offset = offset; - } - } - - private boolean ensureBuffer() - throws IOException - { - if (available() > 0) { - return true; - } - if (eof) { - return false; - } - - if (!readBlockHeader()) { - eof = true; - return false; - } - - // get action based on header - FrameMetaData frameMetaData = getFrameMetaData(frameHeader); - - if (FrameAction.SKIP == frameMetaData.frameAction) { - SnappyInternalUtils.skip(in, frameMetaData.length); - return ensureBuffer(); - } - - if (frameMetaData.length > input.length) { - allocateBuffersBasedOnSize(frameMetaData.length); - } - - int actualRead = readBytes(in, input, 0, frameMetaData.length); - if (actualRead != frameMetaData.length) { - throw new EOFException("unexpectd EOF when reading frame"); - } - - FrameData frameData = getFrameData(frameHeader, input, actualRead); - - if (FrameAction.UNCOMPRESS == frameMetaData.frameAction) { - int uncompressedLength = Snappy.getUncompressedLength(input, - frameData.offset); - - if (uncompressedLength > uncompressed.length) { - uncompressed = recycler.allocDecodeBuffer(uncompressedLength); - } - - this.valid = Snappy.uncompress(input, frameData.offset, actualRead - - frameData.offset, uncompressed, 0); - this.buffer = uncompressed; - this.position = 0; - } - else { - // we need to start reading at the offset - this.position = frameData.offset; - this.buffer = input; - // valid 
is until the end of the read data, regardless of offset - // indicating where we start - this.valid = actualRead; - } - - if (verifyChecksums) { - int actualCrc32c = Crc32C.maskedCrc32c(buffer, position, valid - position); - if (frameData.checkSum != actualCrc32c) { - throw new IOException("Corrupt input: invalid checksum"); - } - } - - return true; - } - - /** - * Use the content of the frameHeader to describe what type of frame we have - * and the action to take. - */ - protected abstract FrameMetaData getFrameMetaData(byte[] frameHeader) - throws IOException; - - /** - * Take the frame header and the content of the frame to describe metadata - * about the content. - * - * @param frameHeader The frame header. - * @param content The content of the of the frame. Content begins at index {@code 0}. - * @param length The length of the content. - * @return Metadata about the content of the frame. - */ - protected abstract FrameData getFrameData(byte[] frameHeader, byte[] content, int length); - - private boolean readBlockHeader() - throws IOException - { - int read = readBytes(in, frameHeader, 0, frameHeader.length); - - if (read == -1) { - return false; - } - - if (read < frameHeader.length) { - throw new EOFException("encountered EOF while reading block header"); - } - - return true; - } -} diff --git a/src/main/java/org/iq80/snappy/AbstractSnappyOutputStream.java b/src/main/java/org/iq80/snappy/AbstractSnappyOutputStream.java deleted file mode 100644 index abaf3fa..0000000 --- a/src/main/java/org/iq80/snappy/AbstractSnappyOutputStream.java +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.iq80.snappy; - -import java.io.IOException; -import java.io.OutputStream; - -import static org.iq80.snappy.Crc32C.maskedCrc32c; -import static org.iq80.snappy.Snappy.maxCompressedLength; -import static org.iq80.snappy.SnappyInternalUtils.checkArgument; -import static org.iq80.snappy.SnappyInternalUtils.checkNotNull; -import static org.iq80.snappy.SnappyInternalUtils.checkPositionIndexes; - -/** - * This is a base class supporting both the {@link SnappyOutputStream} and - * {@link SnappyFramedOutputStream}. - *

- *

- * Delegates writing the header bytes and individual frames to the specific - * implementations. Implementations may also override the crc32 checksum - * calculation. - *

- * - * @since 0.4 - */ -abstract class AbstractSnappyOutputStream - extends OutputStream -{ - private final BufferRecycler recycler; - private final int blockSize; - private final byte[] buffer; - private final byte[] outputBuffer; - private final double minCompressionRatio; - - private final OutputStream out; - - private int position; - private boolean closed; - - /** - * @param out The underlying {@link OutputStream} to write to. Must not be {@code null}. - * @param blockSize The block size (of raw data) to compress before writing frames to out. - * @param minCompressionRatio Defines the minimum compression ratio ({@code compressedLength / rawLength}) that must be achieved to - * write the compressed data. This must be in (0, 1.0]. - */ - public AbstractSnappyOutputStream(OutputStream out, int blockSize, double minCompressionRatio) - throws IOException - { - this.out = checkNotNull(out, "out is null"); - checkArgument(minCompressionRatio > 0 && minCompressionRatio <= 1.0, "minCompressionRatio %1s must be between (0,1.0].", minCompressionRatio); - this.minCompressionRatio = minCompressionRatio; - this.recycler = BufferRecycler.instance(); - this.blockSize = blockSize; - this.buffer = recycler.allocOutputBuffer(blockSize); - this.outputBuffer = recycler.allocEncodingBuffer(maxCompressedLength(blockSize)); - - writeHeader(out); - } - - /** - * Writes the implementation specific header or "marker bytes" to - * out. - * - * @param out The underlying {@link OutputStream}. 
- */ - protected abstract void writeHeader(OutputStream out) - throws IOException; - - @Override - public void write(int b) - throws IOException - { - if (closed) { - throw new IOException("Stream is closed"); - } - if (position >= blockSize) { - flushBuffer(); - } - buffer[position++] = (byte) b; - } - - @Override - public void write(byte[] input, int offset, int length) - throws IOException - { - checkNotNull(input, "input is null"); - checkPositionIndexes(offset, offset + length, input.length); - if (closed) { - throw new IOException("Stream is closed"); - } - - int free = blockSize - position; - - // easy case: enough free space in buffer for entire input - if (free >= length) { - copyToBuffer(input, offset, length); - return; - } - - // fill partial buffer as much as possible and flush - if (position > 0) { - copyToBuffer(input, offset, free); - flushBuffer(); - offset += free; - length -= free; - } - - // write remaining full blocks directly from input array - while (length >= blockSize) { - writeCompressed(input, offset, blockSize); - offset += blockSize; - length -= blockSize; - } - - // copy remaining partial block into now-empty buffer - copyToBuffer(input, offset, length); - } - - @Override - public final void flush() - throws IOException - { - if (closed) { - throw new IOException("Stream is closed"); - } - flushBuffer(); - out.flush(); - } - - @Override - public final void close() - throws IOException - { - if (closed) { - return; - } - try { - flush(); - out.close(); - } - finally { - closed = true; - recycler.releaseOutputBuffer(outputBuffer); - recycler.releaseEncodeBuffer(buffer); - } - } - - private void copyToBuffer(byte[] input, int offset, int length) - { - System.arraycopy(input, offset, buffer, position, length); - position += length; - } - - /** - * Compresses and writes out any buffered data. This does nothing if there - * is no currently buffered data. 
- */ - private void flushBuffer() - throws IOException - { - if (position > 0) { - writeCompressed(buffer, 0, position); - position = 0; - } - } - - /** - * {@link #calculateCRC32C(byte[], int, int) Calculates} the crc, compresses - * the data, determines if the compression ratio is acceptable and calls - * {@link #writeBlock(OutputStream, byte[], int, int, boolean, int)} to - * actually write the frame. - * - * @param input The byte[] containing the raw data to be compressed. - * @param offset The offset into input where the data starts. - * @param length The amount of data in input. - */ - private void writeCompressed(byte[] input, int offset, int length) - throws IOException - { - // crc is based on the user supplied input data - int crc32c = calculateCRC32C(input, offset, length); - - int compressed = Snappy - .compress(input, offset, length, outputBuffer, 0); - - // only use the compressed data if compression ratio is <= the minCompressionRatio - if (((double) compressed / (double) length) <= minCompressionRatio) { - writeBlock(out, outputBuffer, 0, compressed, true, crc32c); - } - else { - // otherwise use the uncompressed data. - writeBlock(out, input, offset, length, false, crc32c); - } - } - - /** - * Calculates a CRC32C checksum over the data. - *

- * This can be overridden to provider alternative implementations (such as - * returning 0 if checksums are not desired). - *

- * - * @return The CRC32 checksum. - */ - protected int calculateCRC32C(byte[] data, int offset, int length) - { - return maskedCrc32c(data, offset, length); - } - - /** - * Write a frame (block) to out. - * - * @param out The {@link OutputStream} to write to. - * @param data The data to write. - * @param offset The offset in data to start at. - * @param length The length of data to use. - * @param compressed Indicates if data is the compressed or raw content. - * This is based on whether the compression ratio desired is - * reached. - * @param crc32c The calculated checksum. - */ - protected abstract void writeBlock(OutputStream out, byte[] data, int offset, int length, boolean compressed, int crc32c) - throws IOException; -} diff --git a/src/main/java/org/iq80/snappy/BufferRecycler.java b/src/main/java/org/iq80/snappy/BufferRecycler.java deleted file mode 100644 index 6dfcc0d..0000000 --- a/src/main/java/org/iq80/snappy/BufferRecycler.java +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.iq80.snappy; - -import java.lang.ref.SoftReference; - -/** - * Simple helper class to encapsulate details of basic buffer - * recycling scheme, which helps a lot (as per profiling) for - * smaller encoding cases. 
- * - * @author tatu - */ -class BufferRecycler -{ - private static final int MIN_ENCODING_BUFFER = 4000; - - private static final int MIN_OUTPUT_BUFFER = 8000; - - /** - * This ThreadLocal contains a {@link java.lang.ref.SoftReference} - * to a {@link BufferRecycler} used to provide a low-cost - * buffer recycling for buffers we need for encoding, decoding. - */ - protected static final ThreadLocal> recyclerRef = new ThreadLocal>(); - - private byte[] inputBuffer; - private byte[] outputBuffer; - - private byte[] decodingBuffer; - private byte[] encodingBuffer; - - private short[] encodingHash; - - /** - * Accessor to get thread-local recycler instance - */ - public static BufferRecycler instance() - { - SoftReference ref = recyclerRef.get(); - - BufferRecycler bufferRecycler; - if (ref == null) { - bufferRecycler = null; - } - else { - bufferRecycler = ref.get(); - } - - if (bufferRecycler == null) { - bufferRecycler = new BufferRecycler(); - recyclerRef.set(new SoftReference(bufferRecycler)); - } - return bufferRecycler; - } - - public void clear() - { - inputBuffer = null; - outputBuffer = null; - decodingBuffer = null; - encodingBuffer = null; - encodingHash = null; - } - - /////////////////////////////////////////////////////////////////////// - // Buffers for encoding (output) - /////////////////////////////////////////////////////////////////////// - - public byte[] allocEncodingBuffer(int minSize) - { - byte[] buf = encodingBuffer; - if (buf == null || buf.length < minSize) { - buf = new byte[Math.max(minSize, MIN_ENCODING_BUFFER)]; - } - else { - encodingBuffer = null; - } - return buf; - } - - public void releaseEncodeBuffer(byte[] buffer) - { - if (encodingBuffer == null || buffer.length > encodingBuffer.length) { - encodingBuffer = buffer; - } - } - - public byte[] allocOutputBuffer(int minSize) - { - byte[] buf = outputBuffer; - if (buf == null || buf.length < minSize) { - buf = new byte[Math.max(minSize, MIN_OUTPUT_BUFFER)]; - } - else { - 
outputBuffer = null; - } - return buf; - } - - public void releaseOutputBuffer(byte[] buffer) - { - if (outputBuffer == null || (buffer != null && buffer.length > outputBuffer.length)) { - outputBuffer = buffer; - } - } - - public short[] allocEncodingHash(int suggestedSize) - { - short[] buf = encodingHash; - if (buf == null || buf.length < suggestedSize) { - buf = new short[suggestedSize]; - } - else { - encodingHash = null; - } - return buf; - } - - public void releaseEncodingHash(short[] buffer) - { - if (encodingHash == null || (buffer != null && buffer.length > encodingHash.length)) { - encodingHash = buffer; - } - } - - /////////////////////////////////////////////////////////////////////// - // Buffers for decoding (input) - /////////////////////////////////////////////////////////////////////// - - public byte[] allocInputBuffer(int minSize) - { - byte[] buf = inputBuffer; - if (buf == null || buf.length < minSize) { - buf = new byte[Math.max(minSize, MIN_OUTPUT_BUFFER)]; - } - else { - inputBuffer = null; - } - return buf; - } - - public void releaseInputBuffer(byte[] buffer) - { - if (inputBuffer == null || (buffer != null && buffer.length > inputBuffer.length)) { - inputBuffer = buffer; - } - } - - public byte[] allocDecodeBuffer(int size) - { - byte[] buf = decodingBuffer; - if (buf == null || buf.length < size) { - buf = new byte[size]; - } - else { - decodingBuffer = null; - } - return buf; - } - - public void releaseDecodeBuffer(byte[] buffer) - { - if (decodingBuffer == null || (buffer != null && buffer.length > decodingBuffer.length)) { - decodingBuffer = buffer; - } - } -} diff --git a/src/main/java/org/iq80/snappy/CorruptionException.java b/src/main/java/org/iq80/snappy/CorruptionException.java index 24e797f..d91a637 100644 --- a/src/main/java/org/iq80/snappy/CorruptionException.java +++ b/src/main/java/org/iq80/snappy/CorruptionException.java @@ -1,8 +1,4 @@ /* - * Copyright (C) 2011 the original author or authors. 
- * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at @@ -20,22 +16,21 @@ public class CorruptionException extends RuntimeException { - public CorruptionException() - { - } + private final long offset; - public CorruptionException(String message) + public CorruptionException(long offset) { - super(message); + this(offset, "Malformed input"); } - public CorruptionException(String message, Throwable cause) + public CorruptionException(long offset, String reason) { - super(message, cause); + super(reason + ": offset=" + offset); + this.offset = offset; } - public CorruptionException(Throwable cause) + public long getOffset() { - super(cause); + return offset; } } diff --git a/src/main/java/org/iq80/snappy/Crc32C.java b/src/main/java/org/iq80/snappy/Crc32C.java index cced9b0..9679946 100644 --- a/src/main/java/org/iq80/snappy/Crc32C.java +++ b/src/main/java/org/iq80/snappy/Crc32C.java @@ -47,24 +47,15 @@ public static int maskedCrc32c(byte[] data, int offset, int length) * Return a masked representation of crc. *

* Motivation: it is problematic to compute the CRC of a string that - * contains embedded CRCs. Therefore we recommend that CRCs stored + * contains embedded CRCs. Therefore, we recommend that CRCs stored * somewhere (e.g., in files) should be masked before being stored. */ - public static int mask(int crc) + private static int mask(int crc) { // Rotate right by 15 bits and add a constant. return ((crc >>> 15) | (crc << 17)) + MASK_DELTA; } - /** - * Return the crc whose masked representation is masked_crc. - */ - public static int unmask(int maskedCrc) - { - int rot = maskedCrc - MASK_DELTA; - return ((rot >>> 17) | (rot << 15)); - } - /** * the current CRC value, bit-flipped */ @@ -73,17 +64,17 @@ public static int unmask(int maskedCrc) /** * Create a new PureJavaCrc32 object. */ - public Crc32C() + private Crc32C() { reset(); } - public int getMaskedValue() + private int getMaskedValue() { return mask(getIntValue()); } - public int getIntValue() + private int getIntValue() { return ~crc; } @@ -107,9 +98,13 @@ public void update(byte[] b, int off, int len) int localCrc = crc; while (len > 7) { int c0 = b[off++] ^ localCrc; - int c1 = b[off++] ^ (localCrc >>>= 8); - int c2 = b[off++] ^ (localCrc >>>= 8); - int c3 = b[off++] ^ (localCrc >>>= 8); + localCrc >>>= 8; + int c1 = b[off++] ^ localCrc; + localCrc >>>= 8; + int c2 = b[off++] ^ localCrc; + localCrc >>>= 8; + int c3 = b[off++] ^ localCrc; + localCrc = (T8_7[c0 & 0xff] ^ T8_6[c1 & 0xff]) ^ (T8_5[c2 & 0xff] ^ T8_4[c3 & 0xff]); @@ -137,7 +132,7 @@ public void update(int b) // java -cp build/test/classes/:build/classes/ \ // org.apache.hadoop.util.TestPureJavaCrc32\$Table 82F63B78 - static final int[] T8_0 = new int[] { + private static final int[] T8_0 = new int[] { 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB, 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, @@ -203,7 +198,7 @@ public void update(int b) 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, 0xBE2DA0A5, 
0x4C4623A6, 0x5F16D052, 0xAD7D5351 }; - static final int[] T8_1 = new int[] { + private static final int[] T8_1 = new int[] { 0x00000000, 0x13A29877, 0x274530EE, 0x34E7A899, 0x4E8A61DC, 0x5D28F9AB, 0x69CF5132, 0x7A6DC945, 0x9D14C3B8, 0x8EB65BCF, 0xBA51F356, 0xA9F36B21, @@ -269,7 +264,7 @@ public void update(int b) 0xD98EEDC6, 0xCA2C75B1, 0xFECBDD28, 0xED69455F, 0x97048C1A, 0x84A6146D, 0xB041BCF4, 0xA3E32483 }; - static final int[] T8_2 = new int[] { + private static final int[] T8_2 = new int[] { 0x00000000, 0xA541927E, 0x4F6F520D, 0xEA2EC073, 0x9EDEA41A, 0x3B9F3664, 0xD1B1F617, 0x74F06469, 0x38513EC5, 0x9D10ACBB, 0x773E6CC8, 0xD27FFEB6, @@ -335,7 +330,7 @@ public void update(int b) 0xE5F54FC1, 0x40B4DDBF, 0xAA9A1DCC, 0x0FDB8FB2, 0x7B2BEBDB, 0xDE6A79A5, 0x3444B9D6, 0x91052BA8 }; - static final int[] T8_3 = new int[] { + private static final int[] T8_3 = new int[] { 0x00000000, 0xDD45AAB8, 0xBF672381, 0x62228939, 0x7B2231F3, 0xA6679B4B, 0xC4451272, 0x1900B8CA, 0xF64463E6, 0x2B01C95E, 0x49234067, 0x9466EADF, @@ -401,7 +396,7 @@ public void update(int b) 0x31035088, 0xEC46FA30, 0x8E647309, 0x5321D9B1, 0x4A21617B, 0x9764CBC3, 0xF54642FA, 0x2803E842 }; - static final int[] T8_4 = new int[] { + private static final int[] T8_4 = new int[] { 0x00000000, 0x38116FAC, 0x7022DF58, 0x4833B0F4, 0xE045BEB0, 0xD854D11C, 0x906761E8, 0xA8760E44, 0xC5670B91, 0xFD76643D, 0xB545D4C9, 0x8D54BB65, @@ -467,7 +462,7 @@ public void update(int b) 0x081E60E7, 0x300F0F4B, 0x783CBFBF, 0x402DD013, 0xE85BDE57, 0xD04AB1FB, 0x9879010F, 0xA0686EA3 }; - static final int[] T8_5 = new int[] { + private static final int[] T8_5 = new int[] { 0x00000000, 0xEF306B19, 0xDB8CA0C3, 0x34BCCBDA, 0xB2F53777, 0x5DC55C6E, 0x697997B4, 0x8649FCAD, 0x6006181F, 0x8F367306, 0xBB8AB8DC, 0x54BAD3C5, @@ -533,7 +528,7 @@ public void update(int b) 0x37F2D291, 0xD8C2B988, 0xEC7E7252, 0x034E194B, 0x8507E5E6, 0x6A378EFF, 0x5E8B4525, 0xB1BB2E3C }; - static final int[] T8_6 = new int[] { + private static final int[] T8_6 = new 
int[] { 0x00000000, 0x68032CC8, 0xD0065990, 0xB8057558, 0xA5E0C5D1, 0xCDE3E919, 0x75E69C41, 0x1DE5B089, 0x4E2DFD53, 0x262ED19B, 0x9E2BA4C3, 0xF628880B, @@ -599,7 +594,7 @@ public void update(int b) 0x60F48DC6, 0x08F7A10E, 0xB0F2D456, 0xD8F1F89E, 0xC5144817, 0xAD1764DF, 0x15121187, 0x7D113D4F }; - static final int[] T8_7 = new int[] { + private static final int[] T8_7 = new int[] { 0x00000000, 0x493C7D27, 0x9278FA4E, 0xDB448769, 0x211D826D, 0x6821FF4A, 0xB3657823, 0xFA590504, 0x423B04DA, 0x0B0779FD, 0xD043FE94, 0x997F83B3, diff --git a/src/main/java/org/iq80/snappy/HadoopSnappyCodec.java b/src/main/java/org/iq80/snappy/HadoopSnappyCodec.java index 7ec8b81..aa942e5 100644 --- a/src/main/java/org/iq80/snappy/HadoopSnappyCodec.java +++ b/src/main/java/org/iq80/snappy/HadoopSnappyCodec.java @@ -17,69 +17,95 @@ */ package org.iq80.snappy; +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionInputStream; import org.apache.hadoop.io.compress.CompressionOutputStream; import org.apache.hadoop.io.compress.Compressor; import org.apache.hadoop.io.compress.Decompressor; +import org.apache.hadoop.io.compress.DoNotPool; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import static org.apache.hadoop.fs.CommonConfigurationKeys.IO_COMPRESSION_CODEC_SNAPPY_BUFFERSIZE_DEFAULT; +import static org.apache.hadoop.fs.CommonConfigurationKeys.IO_COMPRESSION_CODEC_SNAPPY_BUFFERSIZE_KEY; + public class HadoopSnappyCodec - implements CompressionCodec + implements Configurable, CompressionCodec { + private Configuration conf; + + @Override + public Configuration getConf() + { + return conf; + } + + @Override + public void setConf(Configuration conf) + { + this.conf = conf; + } + @Override public CompressionOutputStream createOutputStream(OutputStream outputStream) throws IOException { - return new 
SnappyCompressionOutputStream(outputStream); + return new HadoopSnappyOutputStream(outputStream, getBufferSize()); } @Override public CompressionOutputStream createOutputStream(OutputStream outputStream, Compressor compressor) throws IOException { - throw new UnsupportedOperationException("Snappy Compressor is not supported"); + if (!(compressor instanceof HadoopSnappyCompressor)) { + throw new IllegalArgumentException("Compressor is not the Snappy decompressor"); + } + return new HadoopSnappyOutputStream(outputStream, getBufferSize()); } @Override public Class getCompressorType() { - throw new UnsupportedOperationException("Snappy Compressor is not supported"); + return HadoopSnappyCompressor.class; } @Override public Compressor createCompressor() { - throw new UnsupportedOperationException("Snappy Compressor is not supported"); + return new HadoopSnappyCompressor(); } @Override public CompressionInputStream createInputStream(InputStream inputStream) throws IOException { - return new SnappyCompressionInputStream(inputStream); + return new HadoopSnappyInputStream(inputStream); } @Override - public CompressionInputStream createInputStream(InputStream inputStream, Decompressor decompressor) + public CompressionInputStream createInputStream(InputStream in, Decompressor decompressor) throws IOException { - throw new UnsupportedOperationException("Snappy Decompressor is not supported"); + if (!(decompressor instanceof HadoopSnappyDecompressor)) { + throw new IllegalArgumentException("Decompressor is not the Snappy decompressor"); + } + return new HadoopSnappyInputStream(in); } @Override public Class getDecompressorType() { - throw new UnsupportedOperationException("Snappy Decompressor is not supported"); + return HadoopSnappyDecompressor.class; } @Override public Decompressor createDecompressor() { - throw new UnsupportedOperationException("Snappy Decompressor is not supported"); + return new HadoopSnappyDecompressor(); } @Override @@ -88,72 +114,141 @@ public String 
getDefaultExtension() return ".snappy"; } - private static class SnappyCompressionOutputStream - extends CompressionOutputStream + private int getBufferSize() { - public SnappyCompressionOutputStream(OutputStream outputStream) - throws IOException + // Favor using the configured buffer size. This is not as critical for Snappy + // since Snappy always writes the compressed chunk size, so we always know the + // correct buffer size to create. + int maxUncompressedLength; + if (conf != null) { + maxUncompressedLength = conf.getInt(IO_COMPRESSION_CODEC_SNAPPY_BUFFERSIZE_KEY, IO_COMPRESSION_CODEC_SNAPPY_BUFFERSIZE_DEFAULT); + } + else { + maxUncompressedLength = IO_COMPRESSION_CODEC_SNAPPY_BUFFERSIZE_DEFAULT; + } + return maxUncompressedLength; + } + + /** + * No Hadoop code seems to actually use the compressor, so just return a dummy one so the createOutputStream method + * with a compressor can function. This interface can be implemented if needed. + */ + @DoNotPool + private static class HadoopSnappyCompressor + implements Compressor + { + @Override + public void setInput(byte[] b, int off, int len) { - super(new SnappyOutputStream(outputStream)); + throw new UnsupportedOperationException("Snappy block compressor is not supported"); } @Override - public void write(byte[] b, int off, int len) - throws IOException + public boolean needsInput() { - out.write(b, off, len); + throw new UnsupportedOperationException("Snappy block compressor is not supported"); + } + + @Override + public void setDictionary(byte[] b, int off, int len) + { + throw new UnsupportedOperationException("Snappy block compressor is not supported"); + } + + @Override + public long getBytesRead() + { + throw new UnsupportedOperationException("Snappy block compressor is not supported"); + } + + @Override + public long getBytesWritten() + { + throw new UnsupportedOperationException("Snappy block compressor is not supported"); } @Override public void finish() - throws IOException { - out.flush(); + throw 
new UnsupportedOperationException("Snappy block compressor is not supported"); } @Override - public void resetState() - throws IOException + public boolean finished() { - out.flush(); + throw new UnsupportedOperationException("Snappy block compressor is not supported"); } @Override - public void write(int b) - throws IOException + public int compress(byte[] b, int off, int len) { - out.write(b); + throw new UnsupportedOperationException("Snappy block compressor is not supported"); } + + @Override + public void reset() {} + + @Override + public void end() {} + + @Override + public void reinit(Configuration conf) {} } - private static class SnappyCompressionInputStream - extends CompressionInputStream + /** + * No Hadoop code seems to actually use the decompressor, so just return a dummy one so the createInputStream method + * with a decompressor can function. This interface can be implemented if needed. + */ + @DoNotPool + private static class HadoopSnappyDecompressor + implements Decompressor { - public SnappyCompressionInputStream(InputStream inputStream) - throws IOException + @Override + public void setInput(byte[] b, int off, int len) + { + throw new UnsupportedOperationException("Snappy block decompressor is not supported"); + } + + @Override + public boolean needsInput() { - super(new SnappyInputStream(inputStream)); + throw new UnsupportedOperationException("Snappy block decompressor is not supported"); } @Override - public int read(byte[] b, int off, int len) - throws IOException + public void setDictionary(byte[] b, int off, int len) { - return in.read(b, off, len); + throw new UnsupportedOperationException("Snappy block decompressor is not supported"); } @Override - public void resetState() - throws IOException + public boolean needsDictionary() { - throw new UnsupportedOperationException("resetState not supported for Snappy"); + throw new UnsupportedOperationException("Snappy block decompressor is not supported"); } @Override - public int read() - throws 
IOException + public boolean finished() { - return in.read(); + throw new UnsupportedOperationException("Snappy block decompressor is not supported"); } + + @Override + public int decompress(byte[] b, int off, int len) + { + throw new UnsupportedOperationException("Snappy block decompressor is not supported"); + } + + @Override + public int getRemaining() + { + throw new UnsupportedOperationException("Snappy block decompressor is not supported"); + } + + @Override + public void reset() {} + + @Override + public void end() {} } } diff --git a/src/main/java/org/iq80/snappy/HadoopSnappyInputStream.java b/src/main/java/org/iq80/snappy/HadoopSnappyInputStream.java new file mode 100644 index 0000000..c3e0706 --- /dev/null +++ b/src/main/java/org/iq80/snappy/HadoopSnappyInputStream.java @@ -0,0 +1,163 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.iq80.snappy; + +import org.apache.hadoop.io.compress.CompressionInputStream; + +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStream; + +import static org.iq80.snappy.SnappyConstants.SIZE_OF_LONG; + +class HadoopSnappyInputStream + extends CompressionInputStream +{ + private final InputStream in; + + private int uncompressedBlockLength; + private byte[] uncompressedChunk = new byte[0]; + private int uncompressedChunkOffset; + private int uncompressedChunkLength; + + private byte[] compressed = new byte[0]; + + public HadoopSnappyInputStream(InputStream in) + throws IOException + { + super(in); + this.in = in; + } + + @Override + public int read() + throws IOException + { + if (uncompressedChunkOffset >= uncompressedChunkLength) { + readNextChunk(uncompressedChunk, 0, uncompressedChunk.length); + if (uncompressedChunkLength == 0) { + return -1; + } + } + return uncompressedChunk[uncompressedChunkOffset++] & 0xFF; + } + + @Override + public int read(byte[] output, int offset, int length) + throws IOException + { + if (uncompressedChunkOffset >= uncompressedChunkLength) { + boolean directDecompress = readNextChunk(output, offset, length); + if (uncompressedChunkLength == 0) { + return -1; + } + if (directDecompress) { + uncompressedChunkOffset += uncompressedChunkLength; + return uncompressedChunkLength; + } + } + int size = Math.min(length, uncompressedChunkLength - uncompressedChunkOffset); + System.arraycopy(uncompressedChunk, uncompressedChunkOffset, output, offset, size); + uncompressedChunkOffset += size; + return size; + } + + @Override + public void resetState() + { + uncompressedBlockLength = 0; + uncompressedChunkOffset = 0; + uncompressedChunkLength = 0; + } + + private boolean readNextChunk(byte[] userBuffer, int userOffset, int userLength) + throws IOException + { + uncompressedBlockLength -= uncompressedChunkOffset; + uncompressedChunkOffset = 0; + uncompressedChunkLength = 0; + while 
(uncompressedBlockLength == 0) { + uncompressedBlockLength = readBigEndianInt(); + if (uncompressedBlockLength == -1) { + uncompressedBlockLength = 0; + return false; + } + } + + int compressedChunkLength = readBigEndianInt(); + if (compressedChunkLength == -1) { + return false; + } + + if (compressed.length < compressedChunkLength) { + // over allocate buffer which makes decompression easier + compressed = new byte[compressedChunkLength + SIZE_OF_LONG]; + } + readInput(compressedChunkLength, compressed); + + uncompressedChunkLength = Snappy.getUncompressedLength(compressed, 0); + if (uncompressedChunkLength > uncompressedBlockLength) { + throw new IOException("Chunk uncompressed size is greater than block size"); + } + + boolean directUncompress = true; + if (uncompressedChunkLength > userLength) { + if (uncompressedChunk.length < uncompressedChunkLength) { + // over allocate buffer which makes decompression easier + uncompressedChunk = new byte[uncompressedChunkLength + SIZE_OF_LONG]; + } + directUncompress = false; + userBuffer = uncompressedChunk; + userOffset = 0; + userLength = uncompressedChunk.length; + } + + int bytes = Snappy.uncompress(compressed, 0, compressedChunkLength, userBuffer, userOffset, userLength); + if (uncompressedChunkLength != bytes) { + throw new IOException("Expected to read " + uncompressedChunkLength + " bytes, but data only contained " + bytes + " bytes"); + } + return directUncompress; + } + + private void readInput(int length, byte[] buffer) + throws IOException + { + int offset = 0; + while (offset < length) { + int size = in.read(buffer, offset, length - offset); + if (size == -1) { + throw new EOFException("encountered EOF while reading block data"); + } + offset += size; + } + } + + private int readBigEndianInt() + throws IOException + { + int b1 = in.read(); + if (b1 < 0) { + return -1; + } + int b2 = in.read(); + int b3 = in.read(); + int b4 = in.read(); + + // If any of the other bits are negative, the stream it truncated + 
if ((b2 | b3 | b4) < 0) { + throw new IOException("Stream is truncated"); + } + return ((b1 << 24) + (b2 << 16) + (b3 << 8) + (b4)); + } +} diff --git a/src/main/java/org/iq80/snappy/HadoopSnappyOutputStream.java b/src/main/java/org/iq80/snappy/HadoopSnappyOutputStream.java new file mode 100644 index 0000000..9f3fdd3 --- /dev/null +++ b/src/main/java/org/iq80/snappy/HadoopSnappyOutputStream.java @@ -0,0 +1,115 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.iq80.snappy; + +import org.apache.hadoop.io.compress.CompressionOutputStream; + +import java.io.IOException; +import java.io.OutputStream; + +import static org.iq80.snappy.SnappyConstants.SIZE_OF_LONG; + +class HadoopSnappyOutputStream + extends CompressionOutputStream +{ + private final byte[] inputBuffer; + private final int inputMaxSize; + private int inputOffset; + + private final byte[] outputBuffer; + + public HadoopSnappyOutputStream(OutputStream out, int bufferSize) + { + super(out); + inputBuffer = new byte[bufferSize]; + // leave extra space free at end of buffers to make compression (slightly) faster + inputMaxSize = inputBuffer.length - compressionOverhead(bufferSize); + outputBuffer = new byte[Snappy.maxCompressedLength(inputMaxSize) + SIZE_OF_LONG]; + } + + @Override + public void write(int b) + throws IOException + { + inputBuffer[inputOffset++] = (byte) b; + if (inputOffset >= inputMaxSize) { + writeNextChunk(inputBuffer, 0, this.inputOffset); + } + } + + @Override + 
public void write(byte[] buffer, int offset, int length) + throws IOException + { + while (length > 0) { + int chunkSize = Math.min(length, inputMaxSize - inputOffset); + // favor writing directly from the user buffer to avoid the extra copy + if (inputOffset == 0 && length > inputMaxSize) { + writeNextChunk(buffer, offset, chunkSize); + } + else { + System.arraycopy(buffer, offset, inputBuffer, inputOffset, chunkSize); + inputOffset += chunkSize; + + if (inputOffset >= inputMaxSize) { + writeNextChunk(inputBuffer, 0, inputOffset); + } + } + length -= chunkSize; + offset += chunkSize; + } + } + + @Override + public void finish() + throws IOException + { + if (inputOffset > 0) { + writeNextChunk(inputBuffer, 0, this.inputOffset); + } + } + + @Override + public void resetState() + throws IOException + { + finish(); + } + + private void writeNextChunk(byte[] input, int inputOffset, int inputLength) + throws IOException + { + int compressedSize = Snappy.compress(input, inputOffset, inputLength, outputBuffer, 0); + + writeBigEndianInt(inputLength); + writeBigEndianInt(compressedSize); + out.write(outputBuffer, 0, compressedSize); + + this.inputOffset = 0; + } + + private void writeBigEndianInt(int value) + throws IOException + { + out.write(value >>> 24); + out.write(value >>> 16); + out.write(value >>> 8); + out.write(value); + } + + private static int compressionOverhead(int size) + { + return (size / 6) + 32; + } +} \ No newline at end of file diff --git a/src/main/java/org/iq80/snappy/IncompatibleJvmException.java b/src/main/java/org/iq80/snappy/IncompatibleJvmException.java new file mode 100644 index 0000000..e1dc6c8 --- /dev/null +++ b/src/main/java/org/iq80/snappy/IncompatibleJvmException.java @@ -0,0 +1,23 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.iq80.snappy; + +public class IncompatibleJvmException + extends RuntimeException +{ + public IncompatibleJvmException(String message) + { + super(message); + } +} diff --git a/src/main/java/org/iq80/snappy/Main.java b/src/main/java/org/iq80/snappy/Main.java deleted file mode 100644 index 9de9ed4..0000000 --- a/src/main/java/org/iq80/snappy/Main.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.iq80.snappy; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; - -public class Main -{ - public static void main(String[] args) - throws Exception - { - if ((args.length == 1) && (args[0].equals("-c"))) { - compress(); - } - else if ((args.length == 1) && (args[0].equals("-d"))) { - uncompress(); - } - else { - usage(); - } - } - - private static void usage() - { - System.err.println("Usage: java -jar snappy.jar OPTION"); - System.err.println("Compress or uncompress with Snappy."); - System.err.println(); - System.err.println(" -c compress from stdin to stdout"); - System.err.println(" -d uncompress from stdin to stdout"); - System.exit(100); - } - - private static void compress() - throws IOException - { - copy(System.in, new SnappyOutputStream(System.out)); - } - - private static void uncompress() - throws IOException - { - copy(new SnappyInputStream(System.in), System.out); - } - - private static void copy(InputStream in, OutputStream out) - throws IOException - { - byte[] buf = new byte[4096]; - while (true) { - int r = in.read(buf); - if (r == -1) { - out.close(); - in.close(); - return; - } - out.write(buf, 0, r); - } - } -} diff --git a/src/main/java/org/iq80/snappy/SlowMemory.java b/src/main/java/org/iq80/snappy/SlowMemory.java deleted file mode 100644 index f1f1336..0000000 --- a/src/main/java/org/iq80/snappy/SlowMemory.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.iq80.snappy; - -class SlowMemory - implements Memory -{ - @Override - public boolean fastAccessSupported() - { - return false; - } - - @Override - public int lookupShort(short[] data, int index) - { - return data[index] & 0xFFFF; - } - - @Override - public int loadByte(byte[] data, int index) - { - return data[index] & 0xFF; - } - - @Override - public int loadInt(byte[] data, int index) - { - return (data[index] & 0xff) | - (data[index + 1] & 0xff) << 8 | - (data[index + 2] & 0xff) << 16 | - (data[index + 3] & 0xff) << 24; - } - - @Override - public void copyLong(byte[] src, int srcIndex, byte[] dest, int destIndex) - { - for (int i = 0; i < 8; i++) { - dest[destIndex + i] = src[srcIndex + i]; - } - } - - @Override - public long loadLong(byte[] data, int index) - { - return (data[index] & 0xffL) | - (data[index + 1] & 0xffL) << 8 | - (data[index + 2] & 0xffL) << 16 | - (data[index + 3] & 0xffL) << 24 | - (data[index + 4] & 0xffL) << 32 | - (data[index + 5] & 0xffL) << 40 | - (data[index + 6] & 0xffL) << 48 | - (data[index + 7] & 0xffL) << 56; - } - - @Override - public void copyMemory(byte[] input, int inputIndex, byte[] output, int outputIndex, int length) - { - System.arraycopy(input, inputIndex, output, outputIndex, length); - } -} diff --git a/src/main/java/org/iq80/snappy/Snappy.java b/src/main/java/org/iq80/snappy/Snappy.java index ee071ef..d201f43 100644 --- a/src/main/java/org/iq80/snappy/Snappy.java +++ b/src/main/java/org/iq80/snappy/Snappy.java @@ -17,83 +17,60 @@ */ package org.iq80.snappy; -import java.io.IOException; 
-import java.io.InputStream; import java.util.Arrays; -import static org.iq80.snappy.SnappyFramed.HEADER_BYTES; -import static org.iq80.snappy.SnappyInternalUtils.checkArgument; -import static org.iq80.snappy.SnappyInternalUtils.checkNotNull; -import static org.iq80.snappy.SnappyOutputStream.STREAM_HEADER; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; +import static sun.misc.Unsafe.ARRAY_BYTE_BASE_OFFSET; public final class Snappy { - - private static final int MAX_HEADER_LENGTH = Math.max(STREAM_HEADER.length, HEADER_BYTES.length); - - private Snappy() - { - } - - /** - * Uses the stream marker bytes to determine if the {@link SnappyFramedInputStream} or - * {@link SnappyInputStream} should be used to decompress the content of source. - * - * @param source The compressed content to decompress. Must {@link InputStream#markSupported() - * support} {@link InputStream#mark(int).} - * @param verifyChecksums Indicates if the crc32-c checksums should be calculated and verified. - * @return An appropriate {@link InputStream} implementation to decompress the content. - * @throws IllegalArgumentException If source does not {@link InputStream#markSupported() - * support} mark/reset or does not contain the appropriate marker bytes for either implementation. 
- */ - @SuppressWarnings("deprecation") - public static InputStream determineSnappyInputStream(InputStream source, boolean verifyChecksums) - throws IOException - { - checkNotNull(source, "source is null"); - checkArgument(source.markSupported(), "source does not support mark/reset"); - - // read the header and then reset to start of stream - source.mark(MAX_HEADER_LENGTH); - byte[] buffer = new byte[MAX_HEADER_LENGTH]; - int read = SnappyInternalUtils.readBytes(source, buffer, 0, MAX_HEADER_LENGTH); - source.reset(); - - if (read != STREAM_HEADER.length || read != HEADER_BYTES.length) { - throw new IllegalArgumentException("invalid header"); - } - - if (buffer[0] == HEADER_BYTES[0]) { - checkArgument(Arrays.equals(Arrays.copyOf(buffer, HEADER_BYTES.length), HEADER_BYTES), "invalid header"); - return new SnappyFramedInputStream(source, verifyChecksums); - } - else { - checkArgument(Arrays.equals(Arrays.copyOf(buffer, STREAM_HEADER.length), STREAM_HEADER), "invalid header"); - return new SnappyInputStream(source, verifyChecksums); - } - } + private Snappy() {} public static int getUncompressedLength(byte[] compressed, int compressedOffset) throws CorruptionException { - return SnappyDecompressor.getUncompressedLength(compressed, compressedOffset); + long compressedAddress = ARRAY_BYTE_BASE_OFFSET + compressedOffset; + long compressedLimit = ARRAY_BYTE_BASE_OFFSET + compressed.length; + + return SnappyRawDecompressor.getUncompressedLength(compressed, compressedAddress, compressedLimit); } public static byte[] uncompress(byte[] compressed, int compressedOffset, int compressedSize) throws CorruptionException { - return SnappyDecompressor.uncompress(compressed, compressedOffset, compressedSize); + byte[] output = new byte[getUncompressedLength(compressed, compressedOffset)]; + int uncompressedSize = uncompress(compressed, compressedOffset, compressedSize, output, 0); + if (uncompressedSize != output.length) { + throw new CorruptionException(0, format("Recorded length is 
%s bytes but actual length after decompression is %s bytes ", + output.length, + uncompressedSize)); + } + return output; } public static int uncompress(byte[] compressed, int compressedOffset, int compressedSize, byte[] uncompressed, int uncompressedOffset) throws CorruptionException { - return SnappyDecompressor.uncompress(compressed, compressedOffset, compressedSize, uncompressed, uncompressedOffset); + return uncompress(compressed, compressedOffset, compressedSize, uncompressed, uncompressedOffset, uncompressed.length - uncompressedOffset); + } + + public static int uncompress(byte[] compressed, int compressedOffset, int compressedSize, byte[] uncompressed, int uncompressedOffset, int uncompressedLength) + { + verifyRange(compressed, compressedOffset, compressedSize); + verifyRange(uncompressed, uncompressedOffset, uncompressedLength); + + long inputAddress = ARRAY_BYTE_BASE_OFFSET + compressedOffset; + long inputLimit = inputAddress + compressedSize; + long outputAddress = ARRAY_BYTE_BASE_OFFSET + uncompressedOffset; + long outputLimit = outputAddress + uncompressedLength; /* bound writes by the validated caller-supplied length, not by the rest of the array */ + + return SnappyRawDecompressor.decompress(compressed, inputAddress, inputLimit, uncompressed, outputAddress, outputLimit); } public static int maxCompressedLength(int sourceLength) { - return SnappyCompressor.maxCompressedLength(sourceLength); + return SnappyRawCompressor.maxCompressedLength(sourceLength); } public static int compress( @@ -103,14 +80,18 @@ public static int compress( byte[] compressed, int compressedOffset) { - return SnappyCompressor.compress(uncompressed, - uncompressedOffset, - uncompressedLength, - compressed, - compressedOffset); - } + verifyRange(uncompressed, uncompressedOffset, uncompressedLength); + verifyRange(compressed, compressedOffset, compressed.length - compressedOffset); + + long inputAddress = ARRAY_BYTE_BASE_OFFSET + uncompressedOffset; + long inputLimit = inputAddress + uncompressedLength; + long outputAddress = 
ARRAY_BYTE_BASE_OFFSET + compressedOffset; + long outputLimit = outputAddress + compressed.length - compressedOffset; + short[] table = new short[SnappyRawCompressor.MAX_HASH_TABLE_SIZE]; + return SnappyRawCompressor.compress(uncompressed, inputAddress, inputLimit, compressed, outputAddress, outputLimit, table); + } public static byte[] compress(byte[] data) { byte[] compressedOut = new byte[maxCompressedLength(data.length)]; @@ -119,8 +100,11 @@ public static byte[] compress(byte[] data) return trimmedBuffer; } - static final int LITERAL = 0; - static final int COPY_1_BYTE_OFFSET = 1; // 3 bit length + 3 bits of offset in opcode - static final int COPY_2_BYTE_OFFSET = 2; - static final int COPY_4_BYTE_OFFSET = 3; + private static void verifyRange(byte[] data, int offset, int length) + { + requireNonNull(data, "data is null"); + if (offset < 0 || length < 0 || length > data.length - offset) { /* subtraction form: offset + length can overflow int and slip past the check */ + throw new IllegalArgumentException(format("Invalid offset or length (%s, %s) in array of length %s", offset, length, data.length)); + } + } } diff --git a/src/main/java/org/iq80/snappy/SnappyCompressor.java b/src/main/java/org/iq80/snappy/SnappyCompressor.java deleted file mode 100644 index 54ff780..0000000 --- a/src/main/java/org/iq80/snappy/SnappyCompressor.java +++ /dev/null @@ -1,519 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.iq80.snappy; - -import java.nio.ByteOrder; -import java.util.Arrays; - -import static org.iq80.snappy.Snappy.COPY_1_BYTE_OFFSET; -import static org.iq80.snappy.Snappy.COPY_2_BYTE_OFFSET; -import static org.iq80.snappy.Snappy.LITERAL; - -final class SnappyCompressor -{ - private static final boolean NATIVE_LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN; - - // *** DO NOT CHANGE THE VALUE OF kBlockSize *** - // - // New Compression code chops up the input into blocks of at most - // the following size. This ensures that back-references in the - // output never cross kBlockSize block boundaries. This can be - // helpful in implementing blocked decompression. However the - // decompression code should not rely on this guarantee since older - // compression code may not obey it. - private static final int BLOCK_LOG = 15; - private static final int BLOCK_SIZE = 1 << BLOCK_LOG; - - private static final int INPUT_MARGIN_BYTES = 15; - - private static final int MAX_HASH_TABLE_BITS = 14; - private static final int MAX_HASH_TABLE_SIZE = 1 << MAX_HASH_TABLE_BITS; - - public static int maxCompressedLength(int sourceLength) - { - // Compressed data can be defined as: - // compressed := item* literal* - // item := literal* copy - // - // The trailing literal sequence has a space blowup of at most 62/60 - // since a literal of length 60 needs one tag byte + one extra byte - // for length information. - // - // Item blowup is trickier to measure. Suppose the "copy" op copies - // 4 bytes of data. Because of a special check in the encoding code, - // we produce a 4-byte copy only if the offset is < 65536. Therefore - // the copy op takes 3 bytes to encode, and this type of item leads - // to at most the 62/60 blowup for representing literals. - // - // Suppose the "copy" op copies 5 bytes of data. 
If the offset is big - // enough, it will take 5 bytes to encode the copy op. Therefore the - // worst case here is a one-byte literal followed by a five-byte copy. - // I.e., 6 bytes of input turn into 7 bytes of "compressed" data. - // - // This last factor dominates the blowup, so the final estimate is: - return 32 + sourceLength + sourceLength / 6; - } - - public static int compress( - final byte[] uncompressed, - final int uncompressedOffset, - final int uncompressedLength, - final byte[] compressed, - final int compressedOffset) - { - // First write the uncompressed size to the output as a variable length int - int compressedIndex = writeUncompressedLength(compressed, compressedOffset, uncompressedLength); - - int hashTableSize = getHashTableSize(uncompressedLength); - BufferRecycler recycler = BufferRecycler.instance(); - short[] table = recycler.allocEncodingHash(hashTableSize); - - for (int read = 0; read < uncompressedLength; read += BLOCK_SIZE) { - // Get encoding table for compression - Arrays.fill(table, (short) 0); - - compressedIndex = compressFragment( - uncompressed, - uncompressedOffset + read, - Math.min(uncompressedLength - read, BLOCK_SIZE), - compressed, - compressedIndex, - table); - } - - recycler.releaseEncodingHash(table); - - return compressedIndex - compressedOffset; - } - - private static int compressFragment( - final byte[] input, - final int inputOffset, - final int inputSize, - final byte[] output, - int outputIndex, - final short[] table) - { - int ipIndex = inputOffset; - assert inputSize <= BLOCK_SIZE; - final int ipEndIndex = inputOffset + inputSize; - - int hashTableSize = getHashTableSize(inputSize); - // todo given that hashTableSize is required to be a power of 2, this is overly complex - final int shift = 32 - log2Floor(hashTableSize); - assert (hashTableSize & (hashTableSize - 1)) == 0 : "table must be power of two"; - assert 0xFFFFFFFF >>> shift == hashTableSize - 1; - - // Bytes in [nextEmitIndex, ipIndex) will be emitted 
as literal bytes. Or - // [nextEmitIndex, ipEndIndex) after the main loop. - int nextEmitIndex = ipIndex; - - if (inputSize >= INPUT_MARGIN_BYTES) { - final int ipLimit = inputOffset + inputSize - INPUT_MARGIN_BYTES; - while (ipIndex <= ipLimit) { - assert nextEmitIndex <= ipIndex; - - // The body of this loop calls EmitLiteral once and then EmitCopy one or - // more times. (The exception is that when we're close to exhausting - // the input we exit and emit a literal.) - // - // In the first iteration of this loop we're just starting, so - // there's nothing to copy, so calling EmitLiteral once is - // necessary. And we only start a new iteration when the - // current iteration has determined that a call to EmitLiteral will - // precede the next call to EmitCopy (if any). - // - // Step 1: Scan forward in the input looking for a 4-byte-long match. - // If we get close to exhausting the input exit and emit a final literal. - // - // Heuristic match skipping: If 32 bytes are scanned with no matches - // found, start looking only at every other byte. If 32 more bytes are - // scanned, look at every third byte, etc.. When a match is found, - // immediately go back to looking at every byte. This is a small loss - // (~5% performance, ~0.1% density) for compressible data due to more - // bookkeeping, but for non-compressible data (such as JPEG) it's a huge - // win since the compressor quickly "realizes" the data is incompressible - // and doesn't bother looking for matches everywhere. - // - // The "skip" variable keeps track of how many bytes there are since the - // last match; dividing it by 32 (ie. right-shifting by five) gives the - // number of bytes to move ahead for each iteration. 
- int skip = 32; - - int[] candidateResult = findCandidate(input, ipIndex, ipLimit, inputOffset, shift, table, skip); - ipIndex = candidateResult[0]; - int candidateIndex = candidateResult[1]; - skip = candidateResult[2]; - if (ipIndex + bytesBetweenHashLookups(skip) > ipLimit) { - break; - } - - // Step 2: A 4-byte match has been found. We'll later see if more - // than 4 bytes match. But, prior to the match, input - // bytes [nextEmit, ip) are unmatched. Emit them as "literal bytes." - assert nextEmitIndex + 16 <= ipEndIndex; - outputIndex = emitLiteral(output, outputIndex, input, nextEmitIndex, ipIndex - nextEmitIndex, true); - - // Step 3: Call EmitCopy, and then see if another EmitCopy could - // be our next move. Repeat until we find no match for the - // input immediately after what was consumed by the last EmitCopy call. - // - // If we exit this loop normally then we need to call EmitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can exit - // this loop via goto if we get close to exhausting the input. 
- int[] indexes = emitCopies(input, inputOffset, inputSize, ipIndex, output, outputIndex, table, shift, candidateIndex); - ipIndex = indexes[0]; - outputIndex = indexes[1]; - nextEmitIndex = ipIndex; - } - } - - // goto emitRemainder hack - if (nextEmitIndex < ipEndIndex) { - // Emit the remaining bytes as a literal - outputIndex = emitLiteral(output, outputIndex, input, nextEmitIndex, ipEndIndex - nextEmitIndex, false); - } - return outputIndex; - } - - private static int[] findCandidate(byte[] input, int ipIndex, int ipLimit, int inputOffset, int shift, short[] table, int skip) - { - - int candidateIndex = 0; - for (ipIndex += 1; ipIndex + bytesBetweenHashLookups(skip) <= ipLimit; ipIndex += bytesBetweenHashLookups(skip++)) { - // hash the 4 bytes starting at the input pointer - int currentInt = SnappyInternalUtils.loadInt(input, ipIndex); - int hash = hashBytes(currentInt, shift); - - // get the position of a 4 bytes sequence with the same hash - candidateIndex = inputOffset + table[hash]; - assert candidateIndex >= 0; - assert candidateIndex < ipIndex; - - // update the hash to point to the current position - table[hash] = (short) (ipIndex - inputOffset); - - // if the 4 byte sequence a the candidate index matches the sequence at the - // current position, proceed to the next phase - if (currentInt == SnappyInternalUtils.loadInt(input, candidateIndex)) { - break; - } - } - return new int[] {ipIndex, candidateIndex, skip}; - } - - private static int bytesBetweenHashLookups(int skip) - { - return (skip >>> 5); - } - - private static int[] emitCopies( - byte[] input, - final int inputOffset, - final int inputSize, - int ipIndex, - byte[] output, - int outputIndex, - short[] table, - int shift, - int candidateIndex) - { - // Step 3: Call EmitCopy, and then see if another EmitCopy could - // be our next move. Repeat until we find no match for the - // input immediately after what was consumed by the last EmitCopy call. 
- // - // If we exit this loop normally then we need to call EmitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can exit - // this loop via goto if we get close to exhausting the input. - int inputBytes; - do { - // We have a 4-byte match at ip, and no need to emit any - // "literal bytes" prior to ip. - int matched = 4 + findMatchLength(input, candidateIndex + 4, input, ipIndex + 4, inputOffset + inputSize); - int offset = ipIndex - candidateIndex; - assert SnappyInternalUtils.equals(input, ipIndex, input, candidateIndex, matched); - ipIndex += matched; - - // emit the copy operation for this chunk - outputIndex = emitCopy(output, outputIndex, offset, matched); - - // are we done? - if (ipIndex >= inputOffset + inputSize - INPUT_MARGIN_BYTES) { - return new int[] {ipIndex, outputIndex}; - } - - // We could immediately start working at ip now, but to improve - // compression we first update table[Hash(ip - 1, ...)]. 
- int prevInt; - if (SnappyInternalUtils.HAS_UNSAFE) { - long foo = SnappyInternalUtils.loadLong(input, ipIndex - 1); - prevInt = (int) foo; - inputBytes = (int) (foo >>> 8); - } - else { - prevInt = SnappyInternalUtils.loadInt(input, ipIndex - 1); - inputBytes = SnappyInternalUtils.loadInt(input, ipIndex); - } - - // add hash starting with previous byte - int prevHash = hashBytes(prevInt, shift); - table[prevHash] = (short) (ipIndex - inputOffset - 1); - - // update hash of current byte - int curHash = hashBytes(inputBytes, shift); - - candidateIndex = inputOffset + table[curHash]; - table[curHash] = (short) (ipIndex - inputOffset); - - } while (inputBytes == SnappyInternalUtils.loadInt(input, candidateIndex)); - return new int[] {ipIndex, outputIndex}; - } - - private static int emitLiteral( - byte[] output, - int outputIndex, - byte[] literal, - final int literalIndex, - final int length, - final boolean allowFastPath) - { - SnappyInternalUtils.checkPositionIndexes(literalIndex, literalIndex + length, literal.length); - - int n = length - 1; // Zero-length literals are disallowed - if (n < 60) { - // Size fits in tag byte - output[outputIndex++] = (byte) (LITERAL | n << 2); - - // The vast majority of copies are below 16 bytes, for which a - // call to memcpy is overkill. This fast path can sometimes - // copy up to 15 bytes too much, but that is okay in the - // main loop, since we have a bit to go on for both sides: - // - // - The input will always have kInputMarginBytes = 15 extra - // available bytes, as long as we're in the main loop, and - // if not, allowFastPath = false. - // - The output will always have 32 spare bytes (see - // MaxCompressedLength). 
- if (allowFastPath && length <= 16) { - SnappyInternalUtils.copyLong(literal, literalIndex, output, outputIndex); - SnappyInternalUtils.copyLong(literal, literalIndex + 8, output, outputIndex + 8); - outputIndex += length; - return outputIndex; - } - } - else if (n < (1 << 8)) { - output[outputIndex++] = (byte) (LITERAL | 59 + 1 << 2); - output[outputIndex++] = (byte) (n); - } - else if (n < (1 << 16)) { - output[outputIndex++] = (byte) (LITERAL | 59 + 2 << 2); - output[outputIndex++] = (byte) (n); - output[outputIndex++] = (byte) (n >>> 8); - } - else if (n < (1 << 24)) { - output[outputIndex++] = (byte) (LITERAL | 59 + 3 << 2); - output[outputIndex++] = (byte) (n); - output[outputIndex++] = (byte) (n >>> 8); - output[outputIndex++] = (byte) (n >>> 16); - } - else { - output[outputIndex++] = (byte) (LITERAL | 59 + 4 << 2); - output[outputIndex++] = (byte) (n); - output[outputIndex++] = (byte) (n >>> 8); - output[outputIndex++] = (byte) (n >>> 16); - output[outputIndex++] = (byte) (n >>> 24); - } - - SnappyInternalUtils.checkPositionIndexes(literalIndex, literalIndex + length, literal.length); - - System.arraycopy(literal, literalIndex, output, outputIndex, length); - outputIndex += length; - return outputIndex; - } - - private static int emitCopyLessThan64( - byte[] output, - int outputIndex, - int offset, - int length) - { - assert offset >= 0; - assert length <= 64; - assert length >= 4; - assert offset < 65536; - - if ((length < 12) && (offset < 2048)) { - int lenMinus4 = length - 4; - assert (lenMinus4 < 8); // Must fit in 3 bits - output[outputIndex++] = (byte) (COPY_1_BYTE_OFFSET | ((lenMinus4) << 2) | ((offset >>> 8) << 5)); - output[outputIndex++] = (byte) (offset); - } - else { - output[outputIndex++] = (byte) (COPY_2_BYTE_OFFSET | ((length - 1) << 2)); - output[outputIndex++] = (byte) (offset); - output[outputIndex++] = (byte) (offset >>> 8); - } - return outputIndex; - } - - private static int emitCopy( - byte[] output, - int outputIndex, - int offset, 
- int length) - { - // Emit 64 byte copies but make sure to keep at least four bytes reserved - while (length >= 68) { - outputIndex = emitCopyLessThan64(output, outputIndex, offset, 64); - length -= 64; - } - - // Emit an extra 60 byte copy if have too much data to fit in one copy - if (length > 64) { - outputIndex = emitCopyLessThan64(output, outputIndex, offset, 60); - length -= 60; - } - - // Emit remainder - outputIndex = emitCopyLessThan64(output, outputIndex, offset, length); - return outputIndex; - } - - private static int findMatchLength( - byte[] s1, - int s1Index, - byte[] s2, - final int s2Index, - int s2Limit) - { - assert (s2Limit >= s2Index); - - if (SnappyInternalUtils.HAS_UNSAFE) { - int matched = 0; - - while (s2Index + matched <= s2Limit - 4 && SnappyInternalUtils.loadInt(s2, s2Index + matched) == SnappyInternalUtils.loadInt(s1, s1Index + matched)) { - matched += 4; - } - - if (NATIVE_LITTLE_ENDIAN && s2Index + matched <= s2Limit - 4) { - int x = SnappyInternalUtils.loadInt(s2, s2Index + matched) ^ SnappyInternalUtils.loadInt(s1, s1Index + matched); - int matchingBits = Integer.numberOfTrailingZeros(x); - matched += matchingBits >> 3; - } - else { - while (s2Index + matched < s2Limit && s1[s1Index + matched] == s2[s2Index + matched]) { - ++matched; - } - } - return matched; - } - else { - int length = s2Limit - s2Index; - for (int matched = 0; matched < length; matched++) { - if (s1[s1Index + matched] != s2[s2Index + matched]) { - return matched; - } - } - return length; - } - } - - private static int getHashTableSize(int inputSize) - { - // Use smaller hash table when input.size() is smaller, since we - // fill the table, incurring O(hash table size) overhead for - // compression, and if the input is short, we won't need that - // many hash table entries anyway. 
- assert (MAX_HASH_TABLE_SIZE >= 256); - - int hashTableSize = 256; - while (hashTableSize < MAX_HASH_TABLE_SIZE && hashTableSize < inputSize) { - hashTableSize <<= 1; - } - assert 0 == (hashTableSize & (hashTableSize - 1)) : "hash must be power of two"; - assert hashTableSize <= MAX_HASH_TABLE_SIZE : "hash table too large"; - return hashTableSize; - -// // todo should be faster but is not -// int newHashTableSize; -// if (inputSize < 256) { -// newHashTableSize = 256; -// } else if (inputSize > kMaxHashTableSize) { -// newHashTableSize = kMaxHashTableSize; -// } else { -// int leadingZeros = Integer.numberOfLeadingZeros(inputSize - 1); -// newHashTableSize = 1 << (32 - leadingZeros); -// } -// -// assert 0 == (newHashTableSize & (newHashTableSize - 1)) : "hash must be power of two"; -// assert newHashTableSize <= kMaxHashTableSize : "hash table too large"; -// return newHashTableSize; - } - - // Any hash function will produce a valid compressed bitstream, but a good - // hash function reduces the number of collisions and thus yields better - // compression for compressible input, and more speed for incompressible - // input. Of course, it doesn't hurt if the hash function is reasonably fast - // either, as it gets called a lot. - private static int hashBytes(int bytes, int shift) - { - int kMul = 0x1e35a7bd; - return (bytes * kMul) >>> shift; - } - - private static int log2Floor(int n) - { - return n == 0 ? -1 : 31 ^ Integer.numberOfLeadingZeros(n); - } - - /** - * Writes the uncompressed length as variable length integer. 
- */ - private static int writeUncompressedLength(byte[] compressed, int compressedOffset, int uncompressedLength) - { - int highBitMask = 0x80; - if (uncompressedLength < (1 << 7) && uncompressedLength >= 0) { - compressed[compressedOffset++] = (byte) (uncompressedLength); - } - else if (uncompressedLength < (1 << 14) && uncompressedLength > 0) { - compressed[compressedOffset++] = (byte) (uncompressedLength | highBitMask); - compressed[compressedOffset++] = (byte) (uncompressedLength >>> 7); - } - else if (uncompressedLength < (1 << 21) && uncompressedLength > 0) { - compressed[compressedOffset++] = (byte) (uncompressedLength | highBitMask); - compressed[compressedOffset++] = (byte) ((uncompressedLength >>> 7) | highBitMask); - compressed[compressedOffset++] = (byte) (uncompressedLength >>> 14); - } - else if (uncompressedLength < (1 << 28) && uncompressedLength > 0) { - compressed[compressedOffset++] = (byte) (uncompressedLength | highBitMask); - compressed[compressedOffset++] = (byte) ((uncompressedLength >>> 7) | highBitMask); - compressed[compressedOffset++] = (byte) ((uncompressedLength >>> 14) | highBitMask); - compressed[compressedOffset++] = (byte) (uncompressedLength >>> 21); - } - else { - compressed[compressedOffset++] = (byte) (uncompressedLength | highBitMask); - compressed[compressedOffset++] = (byte) ((uncompressedLength >>> 7) | highBitMask); - compressed[compressedOffset++] = (byte) ((uncompressedLength >>> 14) | highBitMask); - compressed[compressedOffset++] = (byte) ((uncompressedLength >>> 21) | highBitMask); - compressed[compressedOffset++] = (byte) (uncompressedLength >>> 28); - } - return compressedOffset; - } -} diff --git a/src/main/java/org/iq80/snappy/Memory.java b/src/main/java/org/iq80/snappy/SnappyConstants.java similarity index 51% rename from src/main/java/org/iq80/snappy/Memory.java rename to src/main/java/org/iq80/snappy/SnappyConstants.java index 53972d1..838322c 100644 --- a/src/main/java/org/iq80/snappy/Memory.java +++ 
b/src/main/java/org/iq80/snappy/SnappyConstants.java @@ -1,8 +1,4 @@ /* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at @@ -17,19 +13,15 @@ */ package org.iq80.snappy; -interface Memory +final class SnappyConstants { - boolean fastAccessSupported(); - - int lookupShort(short[] data, int index); - - int loadByte(byte[] data, int index); - - int loadInt(byte[] data, int index); - - void copyLong(byte[] src, int srcIndex, byte[] dest, int destIndex); + static final int SIZE_OF_SHORT = 2; + static final int SIZE_OF_INT = 4; + static final int SIZE_OF_LONG = 8; - long loadLong(byte[] data, int index); + static final int LITERAL = 0; + static final int COPY_1_BYTE_OFFSET = 1; // 3 bit length + 3 bits of offset in opcode + static final int COPY_2_BYTE_OFFSET = 2; - void copyMemory(byte[] input, int inputIndex, byte[] output, int outputIndex, int length); + private SnappyConstants() {} } diff --git a/src/main/java/org/iq80/snappy/SnappyDecompressor.java b/src/main/java/org/iq80/snappy/SnappyDecompressor.java deleted file mode 100644 index 0b1b01b..0000000 --- a/src/main/java/org/iq80/snappy/SnappyDecompressor.java +++ /dev/null @@ -1,434 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.iq80.snappy; - -import static org.iq80.snappy.SnappyInternalUtils.copyLong; -import static org.iq80.snappy.SnappyInternalUtils.loadByte; -import static org.iq80.snappy.SnappyInternalUtils.lookupShort; - -final class SnappyDecompressor -{ - private static final int MAX_INCREMENT_COPY_OVERFLOW = 20; - - public static int getUncompressedLength(byte[] compressed, int compressedOffset) - throws CorruptionException - { - return readUncompressedLength(compressed, compressedOffset)[0]; - } - - public static byte[] uncompress(byte[] compressed, int compressedOffset, int compressedSize) - throws CorruptionException - { - // Read the uncompressed length from the front of the compressed input - int[] varInt = readUncompressedLength(compressed, compressedOffset); - int expectedLength = varInt[0]; - compressedOffset += varInt[1]; - compressedSize -= varInt[1]; - - // allocate the uncompressed buffer - byte[] uncompressed = new byte[expectedLength]; - - // Process the entire input - int uncompressedSize = decompressAllTags( - compressed, - compressedOffset, - compressedSize, - uncompressed, - 0); - - if (!(expectedLength == uncompressedSize)) { - throw new CorruptionException(String.format("Recorded length is %s bytes but actual length after decompression is %s bytes ", - expectedLength, - uncompressedSize)); - } - - return uncompressed; - } - - public static int uncompress(byte[] compressed, int compressedOffset, int compressedSize, byte[] uncompressed, int uncompressedOffset) - throws CorruptionException - { - // Read the uncompressed length 
from the front of the compressed input - int[] varInt = readUncompressedLength(compressed, compressedOffset); - int expectedLength = varInt[0]; - compressedOffset += varInt[1]; - compressedSize -= varInt[1]; - - SnappyInternalUtils.checkArgument(expectedLength <= uncompressed.length - uncompressedOffset, - "Uncompressed length %s must be less than %s", expectedLength, uncompressed.length - uncompressedOffset); - - // Process the entire input - int uncompressedSize = decompressAllTags( - compressed, - compressedOffset, - compressedSize, - uncompressed, - uncompressedOffset); - - if (!(expectedLength == uncompressedSize)) { - throw new CorruptionException(String.format("Recorded length is %s bytes but actual length after decompression is %s bytes ", - expectedLength, - uncompressedSize)); - } - - return expectedLength; - } - - private static int decompressAllTags( - final byte[] input, - final int inputOffset, - final int inputSize, - final byte[] output, - final int outputOffset) - throws CorruptionException - { - final int outputLimit = output.length; - - final int ipLimit = inputOffset + inputSize; - int opIndex = outputOffset; - int ipIndex = inputOffset; - - while (ipIndex < ipLimit - 5) { - int opCode = loadByte(input, ipIndex++); - int entry = lookupShort(opLookupTable, opCode); - int trailerBytes = entry >>> 11; - int trailer = readTrailer(input, ipIndex, trailerBytes); - - // advance the ipIndex past the op codes - ipIndex += entry >>> 11; - int length = entry & 0xff; - - if ((opCode & 0x3) == Snappy.LITERAL) { - int literalLength = length + trailer; - copyLiteral(input, ipIndex, output, opIndex, literalLength); - ipIndex += literalLength; - opIndex += literalLength; - } - else { - // copyOffset/256 is encoded in bits 8..10. By just fetching - // those bits, we get copyOffset (since the bit-field starts at - // bit 8). 
- int copyOffset = entry & 0x700; - copyOffset += trailer; - - // inline to force hot-spot to keep inline - // - // Equivalent to incrementalCopy (below) except that it can write up to ten extra - // bytes after the end of the copy, and that it is faster. - // - // The main part of this loop is a simple copy of eight bytes at a time until - // we've copied (at least) the requested amount of bytes. However, if op and - // src are less than eight bytes apart (indicating a repeating pattern of - // length < 8), we first need to expand the pattern in order to get the correct - // results. For instance, if the buffer looks like this, with the eight-byte - // and patterns marked as intervals: - // - // abxxxxxxxxxxxx - // [------] src - // [------] op - // - // a single eight-byte copy from to will repeat the pattern once, - // after which we can move two bytes without moving : - // - // ababxxxxxxxxxx - // [------] src - // [------] op - // - // and repeat the exercise until the two no longer overlap. - // - // This allows us to do very well in the special case of one single byte - // repeated many times, without taking a big hit for more general cases. - // - // The worst case of extra writing past the end of the match occurs when - // op - src == 1 and len == 1; the last copy will read from byte positions - // [0..7] and write to [4..11], whereas it was only supposed to write to - // position 1. Thus, ten excess bytes. - { - int spaceLeft = outputLimit - opIndex; - int srcIndex = opIndex - copyOffset; - if (srcIndex < outputOffset) { - throw new CorruptionException("Invalid copy offset for opcode starting at " + (ipIndex - trailerBytes - 1)); - } - - if (length <= 16 && copyOffset >= 8 && spaceLeft >= 16) { - // Fast path, used for the majority (70-80%) of dynamic invocations. 
- copyLong(output, srcIndex, output, opIndex); - copyLong(output, srcIndex + 8, output, opIndex + 8); - } - else if (spaceLeft >= length + MAX_INCREMENT_COPY_OVERFLOW) { - incrementalCopyFastPath(output, srcIndex, opIndex, length); - } - else { - incrementalCopy(output, srcIndex, output, opIndex, length); - } - } - opIndex += length; - } - } - - - for (; ipIndex < ipLimit; ) { - int[] result = decompressTagSlow(input, ipIndex, output, outputLimit, outputOffset, opIndex); - ipIndex = result[0]; - opIndex = result[1]; - } - - return opIndex - outputOffset; - } - - /** - * This is a second copy of the inner loop of decompressTags used when near the end - * of the input. The key difference is the reading of the trailer bytes. The fast - * code does a blind read of the next 4 bytes as an int, and this code assembles - * the int byte-by-byte to assure that the array is not over run. The reason this - * code path is separate is the if condition to choose between these two seemingly - * small differences costs like 10-20% of the throughput. I'm hoping in future - * versions of hot-spot this code can be integrated into the main loop but for now - * it is worth the extra maintenance pain to get the extra 10-20%. 
- */ - private static int[] decompressTagSlow(byte[] input, int ipIndex, byte[] output, int outputLimit, int outputOffset, int opIndex) - throws CorruptionException - { - // read the op code - int opCode = loadByte(input, ipIndex++); - int entry = lookupShort(opLookupTable, opCode); - int trailerBytes = entry >>> 11; - // - // Key difference here - // - int trailer = 0; - switch (trailerBytes) { - case 4: - trailer = (input[ipIndex + 3] & 0xff) << 24; - case 3: - trailer |= (input[ipIndex + 2] & 0xff) << 16; - case 2: - trailer |= (input[ipIndex + 1] & 0xff) << 8; - case 1: - trailer |= (input[ipIndex] & 0xff); - } - - // advance the ipIndex past the op codes - ipIndex += trailerBytes; - int length = entry & 0xff; - - if ((opCode & 0x3) == Snappy.LITERAL) { - int literalLength = length + trailer; - copyLiteral(input, ipIndex, output, opIndex, literalLength); - ipIndex += literalLength; - opIndex += literalLength; - } - else { - // copyOffset/256 is encoded in bits 8..10. By just fetching - // those bits, we get copyOffset (since the bit-field starts at - // bit 8). - int copyOffset = entry & 0x700; - copyOffset += trailer; - - // inline to force hot-spot to keep inline - { - int spaceLeft = outputLimit - opIndex; - int srcIndex = opIndex - copyOffset; - - if (srcIndex < outputOffset) { - throw new CorruptionException("Invalid copy offset for opcode starting at " + (ipIndex - trailerBytes - 1)); - } - - if (length <= 16 && copyOffset >= 8 && spaceLeft >= 16) { - // Fast path, used for the majority (70-80%) of dynamic invocations. 
- copyLong(output, srcIndex, output, opIndex); - copyLong(output, srcIndex + 8, output, opIndex + 8); - } - else if (spaceLeft >= length + MAX_INCREMENT_COPY_OVERFLOW) { - incrementalCopyFastPath(output, srcIndex, opIndex, length); - } - else { - incrementalCopy(output, srcIndex, output, opIndex, length); - } - } - opIndex += length; - } - return new int[] {ipIndex, opIndex}; - } - - private static int readTrailer(byte[] data, int index, int bytes) - { - return SnappyInternalUtils.loadInt(data, index) & wordmask[bytes]; - } - - private static void copyLiteral(byte[] input, int ipIndex, byte[] output, int opIndex, int length) - throws CorruptionException - { - assert length > 0; - assert ipIndex >= 0; - assert opIndex >= 0; - - int spaceLeft = output.length - opIndex; - int readableBytes = input.length - ipIndex; - - if (readableBytes < length || spaceLeft < length) { - throw new CorruptionException("Corrupt literal length"); - } - - if (length <= 16 && spaceLeft >= 16 && readableBytes >= 16) { - copyLong(input, ipIndex, output, opIndex); - copyLong(input, ipIndex + 8, output, opIndex + 8); - } - else { - int fastLength = length & 0xFFFFFFF8; - if (fastLength <= 64) { - // copy long-by-long - for (int i = 0; i < fastLength; i += 8) { - copyLong(input, ipIndex + i, output, opIndex + i); - } - - // copy byte-by-byte - int slowLength = length & 0x7; - // NOTE: This is not a manual array copy. We are copying an overlapping region - // and we want input data to repeat as it is recopied. see incrementalCopy below. - //noinspection ManualArrayCopy - for (int i = 0; i < slowLength; i += 1) { - output[opIndex + fastLength + i] = input[ipIndex + fastLength + i]; - } - } - else { - SnappyInternalUtils.copyMemory(input, ipIndex, output, opIndex, length); - } - } - } - - /** - * Copy "len" bytes from "src" to "op", one byte at a time. Used for - * handling COPY operations where the input and output regions may - * overlap. 
For example, suppose: - * src == "ab" - * op == src + 2 - * len == 20 - *

- * After incrementalCopy, the result will have - * eleven copies of "ab" - * ababababababababababab - * Note that this does not match the semantics of either memcpy() - * or memmove(). - */ - private static void incrementalCopy(byte[] src, int srcIndex, byte[] op, int opIndex, int length) - { - do { - op[opIndex++] = src[srcIndex++]; - } while (--length > 0); - } - - private static void incrementalCopyFastPath(byte[] output, int srcIndex, int opIndex, int length) - { - int copiedLength = 0; - while ((opIndex + copiedLength) - srcIndex < 8) { - copyLong(output, srcIndex, output, opIndex + copiedLength); - copiedLength += (opIndex + copiedLength) - srcIndex; - } - - for (int i = 0; i < length - copiedLength; i += 8) { - copyLong(output, srcIndex + i, output, opIndex + copiedLength + i); - } - } - - // Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits - private static final int[] wordmask = new int[] { - 0, 0xff, 0xffff, 0xffffff, 0xffffffff - }; - - // Data stored per entry in lookup table: - // Range Bits-used Description - // ------------------------------------ - // 1..64 0..7 Literal/copy length encoded in opcode byte - // 0..7 8..10 Copy offset encoded in opcode byte / 256 - // 0..4 11..13 Extra bytes after opcode - // - // We use eight bits for the length even though 7 would have sufficed - // because of efficiency reasons: - // (1) Extracting a byte is faster than a bit-field - // (2) It properly aligns copy offset so we do not need a <<8 - private static final short[] opLookupTable = new short[] { - 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002, - 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004, - 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006, - 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008, - 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a, - 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c, - 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 
0x100e, 0x200e, - 0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010, - 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012, - 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014, - 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016, - 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018, - 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a, - 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c, - 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e, - 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020, - 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022, - 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024, - 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026, - 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028, - 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a, - 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c, - 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e, - 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030, - 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032, - 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034, - 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036, - 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038, - 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a, - 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c, - 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e, - 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040 - }; - - /** - * Reads the variable length integer encoded a the specified offset, and - * returns this length with the number of bytes read. 
- */ - private static int[] readUncompressedLength(byte[] compressed, int compressedOffset) - throws CorruptionException - { - int result; - int bytesRead = 0; - { - int b = compressed[compressedOffset + bytesRead++] & 0xFF; - result = b & 0x7f; - if ((b & 0x80) != 0) { - b = compressed[compressedOffset + bytesRead++] & 0xFF; - result |= (b & 0x7f) << 7; - if ((b & 0x80) != 0) { - b = compressed[compressedOffset + bytesRead++] & 0xFF; - result |= (b & 0x7f) << 14; - if ((b & 0x80) != 0) { - b = compressed[compressedOffset + bytesRead++] & 0xFF; - result |= (b & 0x7f) << 21; - if ((b & 0x80) != 0) { - b = compressed[compressedOffset + bytesRead++] & 0xFF; - result |= (b & 0x7f) << 28; - if ((b & 0x80) != 0) { - throw new CorruptionException("last byte of compressed length int has high bit set"); - } - } - } - } - } - } - return new int[] {result, bytesRead}; - } -} diff --git a/src/main/java/org/iq80/snappy/SnappyFramedInputStream.java b/src/main/java/org/iq80/snappy/SnappyFramedInputStream.java index a3de8c5..467b21c 100644 --- a/src/main/java/org/iq80/snappy/SnappyFramedInputStream.java +++ b/src/main/java/org/iq80/snappy/SnappyFramedInputStream.java @@ -17,29 +17,221 @@ */ package org.iq80.snappy; + +import java.io.EOFException; import java.io.IOException; import java.io.InputStream; +import java.util.Arrays; -import static org.iq80.snappy.SnappyFramed.COMPRESSED_DATA_FLAG; -import static org.iq80.snappy.SnappyFramed.HEADER_BYTES; -import static org.iq80.snappy.SnappyFramed.STREAM_IDENTIFIER_FLAG; -import static org.iq80.snappy.SnappyFramed.UNCOMPRESSED_DATA_FLAG; import static org.iq80.snappy.SnappyFramedOutputStream.MAX_BLOCK_SIZE; +import static java.lang.Math.min; /** * Implements the x-snappy-framed as an {@link InputStream}. 
*/ -public class SnappyFramedInputStream - extends AbstractSnappyInputStream +public final class SnappyFramedInputStream + extends InputStream { + private final InputStream in; + private final byte[] frameHeader; + private final boolean verifyChecksums; + + /** + * A single frame read from the underlying {@link InputStream}. + */ + private byte[] input = new byte[0]; + /** + * The decompressed data from {@link #input}. + */ + private byte[] uncompressed = new byte[0]; + /** + * Indicates if this instance has been closed. + */ + private boolean closed; + /** + * Indicates if we have reached the EOF on {@link #in}. + */ + private boolean eof; + /** + * The position in {@link #input} to read to. + */ + private int valid; + /** + * The next position to read from {@link #buffer}. + */ + private int position; + /** + * Buffer is a reference to the real buffer of uncompressed data for the + * current block: uncompressed if the block is compressed, or input if it is + * not. + */ + private byte[] buffer; + + public SnappyFramedInputStream(InputStream in) + throws IOException + { + this(in, true); + } + public SnappyFramedInputStream(InputStream in, boolean verifyChecksums) throws IOException { - super(in, MAX_BLOCK_SIZE, 4, verifyChecksums, HEADER_BYTES); + this.in = in; + this.verifyChecksums = verifyChecksums; + allocateBuffersBasedOnSize(MAX_BLOCK_SIZE + 5); + this.frameHeader = new byte[4]; + + // stream must begin with stream header + byte[] actualHeader = new byte[SnappyFramed.HEADER_BYTES.length]; + + int read = SnappyInternalUtils.readBytes(in, actualHeader, 0, actualHeader.length); + if (read < SnappyFramed.HEADER_BYTES.length) { + throw new EOFException("encountered EOF while reading stream header"); + } + if (!Arrays.equals(SnappyFramed.HEADER_BYTES, actualHeader)) { + throw new IOException("invalid stream header"); + } + } + + @Override + public int read() + throws IOException + { + if (closed) { + return -1; + } + if (!ensureBuffer()) { + return -1; + } + 
return buffer[position++] & 0xFF; + } + + @Override + public int read(byte[] output, int offset, int length) + throws IOException + { + SnappyInternalUtils.checkNotNull(output, "output is null"); + SnappyInternalUtils.checkPositionIndexes(offset, offset + length, output.length); + if (closed) { + throw new IOException("Stream is closed"); + } + + if (length == 0) { + return 0; + } + if (!ensureBuffer()) { + return -1; + } + + int size = min(length, available()); + System.arraycopy(buffer, position, output, offset, size); + position += size; + return size; + } + + @Override + public int available() + throws IOException + { + if (closed) { + return 0; + } + return valid - position; } @Override - protected FrameMetaData getFrameMetaData(byte[] frameHeader) + public void close() + throws IOException + { + try { + in.close(); + } + finally { + if (!closed) { + closed = true; + } + } + } + + private boolean ensureBuffer() + throws IOException + { + if (available() > 0) { + return true; + } + if (eof) { + return false; + } + + if (!readBlockHeader()) { + eof = true; + return false; + } + + // get action based on header + FrameMetaData frameMetaData = getFrameMetaData(frameHeader); + + if (FrameAction.SKIP == frameMetaData.frameAction) { + SnappyInternalUtils.skip(in, frameMetaData.length); + return ensureBuffer(); + } + + if (frameMetaData.length > input.length) { + allocateBuffersBasedOnSize(frameMetaData.length); + } + + int actualRead = SnappyInternalUtils.readBytes(in, input, 0, frameMetaData.length); + if (actualRead != frameMetaData.length) { + throw new EOFException("unexpected EOF when reading frame"); + } + + FrameData frameData = getFrameData(input); + + if (FrameAction.UNCOMPRESS == frameMetaData.frameAction) { + int uncompressedLength = Snappy.getUncompressedLength(input, frameData.offset); + + if (uncompressedLength > uncompressed.length) { + uncompressed = new byte[uncompressedLength]; + } + + this.valid = Snappy.uncompress(input, frameData.offset, 
actualRead - frameData.offset, uncompressed, 0); + this.buffer = uncompressed; + this.position = 0; + } + else { + // we need to start reading at the offset + this.position = frameData.offset; + this.buffer = input; + // valid is until the end of the read data, regardless of offset + // indicating where we start + this.valid = actualRead; + } + + if (verifyChecksums) { + int actualCrc32c = Crc32C.maskedCrc32c(buffer, position, valid - position); + if (frameData.checkSum != actualCrc32c) { + throw new IOException("Corrupt input: invalid checksum"); + } + } + + return true; + } + + private void allocateBuffersBasedOnSize(int size) + { + if (input.length < size) { + input = new byte[size]; + } + if (uncompressed.length < size) { + uncompressed = new byte[size]; + } + } + + /** + * Use the content of the frameHeader to describe what type of frame we have + * and the action to take. + */ + private static FrameMetaData getFrameMetaData(byte[] frameHeader) throws IOException { int length = (frameHeader[1] & 0xFF); @@ -50,15 +242,15 @@ protected FrameMetaData getFrameMetaData(byte[] frameHeader) FrameAction frameAction; int flag = frameHeader[0] & 0xFF; switch (flag) { - case COMPRESSED_DATA_FLAG: + case SnappyFramed.COMPRESSED_DATA_FLAG: frameAction = FrameAction.UNCOMPRESS; minLength = 5; break; - case UNCOMPRESSED_DATA_FLAG: + case SnappyFramed.UNCOMPRESSED_DATA_FLAG: frameAction = FrameAction.RAW; minLength = 5; break; - case STREAM_IDENTIFIER_FLAG: + case SnappyFramed.STREAM_IDENTIFIER_FLAG: if (length != 6) { throw new IOException("stream identifier chunk with invalid length: " + length); } @@ -83,8 +275,13 @@ protected FrameMetaData getFrameMetaData(byte[] frameHeader) return new FrameMetaData(frameAction, length); } - @Override - protected FrameData getFrameData(byte[] frameHeader, byte[] content, int length) + /** + * Extract frame data + * + * @param content The content of the frame. Content begins at index {@code 0}. 
+ * @return Metadata about the content of the frame. + */ + private static FrameData getFrameData(byte[] content) { // crc is contained in the frame content int crc32c = (content[3] & 0xFF) << 24 | @@ -94,4 +291,53 @@ protected FrameData getFrameData(byte[] frameHeader, byte[] content, int length) return new FrameData(crc32c, 4); } + + private boolean readBlockHeader() + throws IOException + { + int read = SnappyInternalUtils.readBytes(in, frameHeader, 0, frameHeader.length); + + if (read == -1) { + return false; + } + + if (read < frameHeader.length) { + throw new EOFException("encountered EOF while reading block header"); + } + + return true; + } + + private enum FrameAction + { + RAW, SKIP, UNCOMPRESS + } + + private static final class FrameMetaData + { + final int length; + final FrameAction frameAction; + + /** + * @param frameAction + * @param length + */ + public FrameMetaData(FrameAction frameAction, int length) + { + this.frameAction = frameAction; + this.length = length; + } + } + + private static final class FrameData + { + final int checkSum; + final int offset; + + public FrameData(int checkSum, int offset) + { + this.checkSum = checkSum; + this.offset = offset; + } + } } diff --git a/src/main/java/org/iq80/snappy/SnappyFramedOutputStream.java b/src/main/java/org/iq80/snappy/SnappyFramedOutputStream.java index e625ea7..fa18f40 100644 --- a/src/main/java/org/iq80/snappy/SnappyFramedOutputStream.java +++ b/src/main/java/org/iq80/snappy/SnappyFramedOutputStream.java @@ -20,16 +20,11 @@ import java.io.IOException; import java.io.OutputStream; -import static org.iq80.snappy.SnappyFramed.COMPRESSED_DATA_FLAG; -import static org.iq80.snappy.SnappyFramed.HEADER_BYTES; -import static org.iq80.snappy.SnappyFramed.UNCOMPRESSED_DATA_FLAG; -import static org.iq80.snappy.SnappyInternalUtils.checkArgument; - /** * Implements the x-snappy-framed as an {@link OutputStream}. 
*/ public final class SnappyFramedOutputStream - extends AbstractSnappyOutputStream + extends OutputStream { /** * We place an additional restriction that the uncompressed data in @@ -41,42 +36,217 @@ public final class SnappyFramedOutputStream public static final int DEFAULT_BLOCK_SIZE = MAX_BLOCK_SIZE; public static final double DEFAULT_MIN_COMPRESSION_RATIO = 0.85d; + private final int blockSize; + private final byte[] buffer; + private final byte[] outputBuffer; + private final double minCompressionRatio; + private final OutputStream out; + private final boolean writeChecksums; + + private int position; + private boolean closed; + /** + * Creates a Snappy output stream to write data to the specified underlying output stream. + * + * @param out the underlying output stream + */ public SnappyFramedOutputStream(OutputStream out) throws IOException { - this(out, DEFAULT_BLOCK_SIZE, DEFAULT_MIN_COMPRESSION_RATIO); + this(out, true); + } + + /** + * Creates a Snappy output stream to write data to the specified underlying output stream. + * + * @param out the underlying output stream + */ + public SnappyFramedOutputStream(OutputStream out, int blockSize, double minCompressionRatio) + throws IOException + { + this(out, true, blockSize, minCompressionRatio); + } + + /** + * Creates a Snappy output stream with block checksums disabled. This is only useful for + * apples-to-apples benchmarks with other compressors that do not perform block checksums. 
+ * + * @param out the underlying output stream + */ + public static SnappyFramedOutputStream newChecksumFreeBenchmarkOutputStream(OutputStream out) + throws IOException + { + return new SnappyFramedOutputStream(out, false); + } + + private SnappyFramedOutputStream(OutputStream out, boolean writeChecksums) + throws IOException + { + this(out, writeChecksums, DEFAULT_BLOCK_SIZE, DEFAULT_MIN_COMPRESSION_RATIO); } - public SnappyFramedOutputStream(OutputStream out, int blockSize, - double minCompressionRatio) + private SnappyFramedOutputStream(OutputStream out, boolean writeChecksums, int blockSize, double minCompressionRatio) throws IOException { - super(out, blockSize, minCompressionRatio); - checkArgument(blockSize > 0 && blockSize <= MAX_BLOCK_SIZE, "blockSize must be in (0, 65536]", blockSize); + this.out = SnappyInternalUtils.checkNotNull(out, "out is null"); + this.writeChecksums = writeChecksums; + SnappyInternalUtils.checkArgument(minCompressionRatio > 0 && minCompressionRatio <= 1.0, "minCompressionRatio %1s must be between (0,1.0].", minCompressionRatio); + this.minCompressionRatio = minCompressionRatio; + this.blockSize = blockSize; + this.buffer = new byte[blockSize]; + this.outputBuffer = new byte[Snappy.maxCompressedLength(blockSize)]; + + out.write(SnappyFramed.HEADER_BYTES); + SnappyInternalUtils.checkArgument(blockSize > 0 && blockSize <= MAX_BLOCK_SIZE, "blockSize must be in (0, 65536]", blockSize); } @Override - protected void writeHeader(OutputStream out) + public void write(int b) throws IOException { - out.write(HEADER_BYTES); + if (closed) { + throw new IOException("Stream is closed"); + } + if (position >= blockSize) { + flushBuffer(); + } + buffer[position++] = (byte) b; + } + + @Override + public void write(byte[] input, int offset, int length) + throws IOException + { + SnappyInternalUtils.checkNotNull(input, "input is null"); + SnappyInternalUtils.checkPositionIndexes(offset, offset + length, input.length); + if (closed) { + throw new 
IOException("Stream is closed"); + } + + int free = blockSize - position; + + // easy case: enough free space in buffer for entire input + if (free >= length) { + copyToBuffer(input, offset, length); + return; + } + + // fill partial buffer as much as possible and flush + if (position > 0) { + copyToBuffer(input, offset, free); + flushBuffer(); + offset += free; + length -= free; + } + + // write remaining full blocks directly from input array + while (length >= blockSize) { + writeCompressed(input, offset, blockSize); + offset += blockSize; + length -= blockSize; + } + + // copy remaining partial block into now-empty buffer + copyToBuffer(input, offset, length); + } + + @Override + public void flush() + throws IOException + { + if (closed) { + throw new IOException("Stream is closed"); + } + flushBuffer(); + out.flush(); + } + + @Override + public void close() + throws IOException + { + if (closed) { + return; + } + try { + flush(); + out.close(); + } + finally { + closed = true; + } + } + + private void copyToBuffer(byte[] input, int offset, int length) + { + System.arraycopy(input, offset, buffer, position, length); + position += length; } /** - * Each chunk consists first a single byte of chunk identifier, then a - * three-byte little-endian length of the chunk in bytes (from 0 to - * 16777215, inclusive), and then the data if any. The four bytes of chunk - * header is not counted in the data length. + * Compresses and writes out any buffered data. This does nothing if there + * is no currently buffered data. 
*/ - @Override - protected void writeBlock(OutputStream out, byte[] data, int offset, int length, boolean compressed, int crc32c) + private void flushBuffer() + throws IOException + { + if (position > 0) { + writeCompressed(buffer, 0, position); + position = 0; + } + } + + /** + * {@link Crc32C#maskedCrc32c(byte[], int, int) Calculates} the crc, compresses + * the data, determines if the compression ratio is acceptable and calls + * {@link #writeBlock(OutputStream, byte[], int, int, boolean, int)} to + * actually write the frame. + * + * @param input The byte[] containing the raw data to be compressed. + * @param offset The offset into input where the data starts. + * @param length The amount of data in input. + */ + private void writeCompressed(byte[] input, int offset, int length) + throws IOException + { + // crc is based on the user supplied input data + int crc32c = writeChecksums ? Crc32C.maskedCrc32c(input, offset, length) : 0; + + int compressed = Snappy.compress(input, + offset, + length, + outputBuffer, + 0); + + // only use the compressed data if compression ratio is <= the minCompressionRatio + if (((double) compressed / (double) length) <= minCompressionRatio) { + writeBlock(out, outputBuffer, 0, compressed, true, crc32c); + } + else { + // otherwise use the uncompressed data. + writeBlock(out, input, offset, length, false, crc32c); + } + } + + /** + * Write a frame (block) to out. + * + * @param out The {@link OutputStream} to write to. + * @param data The data to write. + * @param offset The offset in data to start at. + * @param length The length of data to use. + * @param compressed Indicates if data is the compressed or raw content. + * This is based on whether the compression ratio desired is + * reached. + * @param crc32c The calculated checksum. + */ + private static void writeBlock(OutputStream out, byte[] data, int offset, int length, boolean compressed, int crc32c) throws IOException { - out.write(compressed ? 
COMPRESSED_DATA_FLAG : UNCOMPRESSED_DATA_FLAG); + out.write(compressed ? SnappyFramed.COMPRESSED_DATA_FLAG : SnappyFramed.UNCOMPRESSED_DATA_FLAG); - // the length written out to the header is both the checksum and the - // frame + // the length written out to the header is both the checksum and the frame int headerLength = length + 4; // write length diff --git a/src/main/java/org/iq80/snappy/SnappyInputStream.java b/src/main/java/org/iq80/snappy/SnappyInputStream.java deleted file mode 100644 index 19afb92..0000000 --- a/src/main/java/org/iq80/snappy/SnappyInputStream.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.iq80.snappy; - -import java.io.IOException; -import java.io.InputStream; -import java.util.Arrays; - -import static java.lang.String.format; -import static org.iq80.snappy.SnappyOutputStream.MAX_BLOCK_SIZE; -import static org.iq80.snappy.SnappyOutputStream.STREAM_HEADER; - -/** - * This class implements an input stream for reading Snappy compressed data - * of the format produced by {@link SnappyOutputStream}. - *

- * NOTE:This implementation cannot read compressed data produced - * by {@link SnappyFramedOutputStream}. - *

- * - * @deprecated Prefer the use of {@link SnappyFramedInputStream} which implements - * the standard {@code x-snappy-framed} specification. - */ -@Deprecated -public class SnappyInputStream - extends AbstractSnappyInputStream -{ - private static final int HEADER_LENGTH = 7; - - /** - * Creates a Snappy input stream to read data from the specified underlying input stream. - * - * @param in the underlying input stream - */ - public SnappyInputStream(InputStream in) - throws IOException - { - this(in, true); - } - - /** - * Creates a Snappy input stream to read data from the specified underlying input stream. - * - * @param in the underlying input stream - * @param verifyChecksums if true, checksums in input stream will be verified - */ - public SnappyInputStream(InputStream in, boolean verifyChecksums) - throws IOException - { - super(in, MAX_BLOCK_SIZE, HEADER_LENGTH, verifyChecksums, STREAM_HEADER); - } - - @Override - protected FrameMetaData getFrameMetaData(byte[] frameHeader) - throws IOException - { - int x = frameHeader[0] & 0xFF; - - int a = frameHeader[1] & 0xFF; - int b = frameHeader[2] & 0xFF; - int length = (a << 8) | b; - - FrameAction action; - switch (x) { - case 0x00: - action = FrameAction.RAW; - break; - case 0x01: - action = FrameAction.UNCOMPRESS; - break; - case 's': - if (!Arrays.equals(STREAM_HEADER, frameHeader)) { - throw new IOException(format("invalid compressed flag in header: 0x%02x", x)); - } - action = FrameAction.SKIP; - length = 0; - break; - default: - throw new IOException(format("invalid compressed flag in header: 0x%02x", x)); - } - - if (((length <= 0) || (length > MAX_BLOCK_SIZE)) && action != FrameAction.SKIP) { - throw new IOException("invalid block size in header: " + length); - } - - return new FrameMetaData(action, length); - } - - @Override - protected FrameData getFrameData(byte[] frameHeader, byte[] content, int length) - { - // crc is contained in the frame header - int crc32c = (frameHeader[3] & 0xFF) << 24 | - 
(frameHeader[4] & 0xFF) << 16 | - (frameHeader[5] & 0xFF) << 8 | - (frameHeader[6] & 0xFF); - - return new FrameData(crc32c, 0); - } -} diff --git a/src/main/java/org/iq80/snappy/SnappyInternalUtils.java b/src/main/java/org/iq80/snappy/SnappyInternalUtils.java index b2b00c6..3249194 100644 --- a/src/main/java/org/iq80/snappy/SnappyInternalUtils.java +++ b/src/main/java/org/iq80/snappy/SnappyInternalUtils.java @@ -19,100 +19,10 @@ import java.io.IOException; import java.io.InputStream; -import java.nio.ByteOrder; final class SnappyInternalUtils { - private SnappyInternalUtils() - { - } - - private static final Memory memory; - - static { - // Try to only load one implementation of Memory to assure the call sites are monomorphic (fast) - Memory memoryInstance = null; - - // TODO enable UnsafeMemory on big endian machines - // - // The current UnsafeMemory code assumes the machine is little endian, and will - // not work correctly on big endian CPUs. For now, we will disable UnsafeMemory on - // big endian machines. This will make the code significantly slower on big endian. - // In the future someone should add the necessary flip bytes calls to make this - // work efficiently on big endian machines. 
- if (ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN) { - try { - Class unsafeMemoryClass = SnappyInternalUtils.class.getClassLoader().loadClass("org.iq80.snappy.UnsafeMemory").asSubclass(Memory.class); - Memory unsafeMemory = unsafeMemoryClass.newInstance(); - if (unsafeMemory.loadInt(new byte[4], 0) == 0) { - memoryInstance = unsafeMemory; - } - } - catch (Throwable ignored) { - } - } - if (memoryInstance == null) { - try { - Class slowMemoryClass = SnappyInternalUtils.class.getClassLoader().loadClass("org.iq80.snappy.SlowMemory").asSubclass(Memory.class); - Memory slowMemory = slowMemoryClass.newInstance(); - if (slowMemory.loadInt(new byte[4], 0) == 0) { - memoryInstance = slowMemory; - } - else { - throw new AssertionError("SlowMemory class is broken!"); - } - } - catch (Throwable ignored) { - throw new AssertionError("Could not find SlowMemory class"); - } - } - memory = memoryInstance; - } - - static final boolean HAS_UNSAFE = memory.fastAccessSupported(); - - static boolean equals(byte[] left, int leftIndex, byte[] right, int rightIndex, int length) - { - checkPositionIndexes(leftIndex, leftIndex + length, left.length); - checkPositionIndexes(rightIndex, rightIndex + length, right.length); - - for (int i = 0; i < length; i++) { - if (left[leftIndex + i] != right[rightIndex + i]) { - return false; - } - } - return true; - } - - public static int lookupShort(short[] data, int index) - { - return memory.lookupShort(data, index); - } - - public static int loadByte(byte[] data, int index) - { - return memory.loadByte(data, index); - } - - static int loadInt(byte[] data, int index) - { - return memory.loadInt(data, index); - } - - static void copyLong(byte[] src, int srcIndex, byte[] dest, int destIndex) - { - memory.copyLong(src, srcIndex, dest, destIndex); - } - - static long loadLong(byte[] data, int index) - { - return memory.loadLong(data, index); - } - - static void copyMemory(byte[] input, int inputIndex, byte[] output, int outputIndex, int length) - { 
- memory.copyMemory(input, inputIndex, output, outputIndex, length); - } + private SnappyInternalUtils() {} // // Copied from Guava Preconditions @@ -140,7 +50,7 @@ static void checkPositionIndexes(int start, int end, int size) } } - static String badPositionIndexes(int start, int end, int size) + private static String badPositionIndexes(int start, int end, int size) { if (start < 0 || start > size) { return badPositionIndex(start, size, "start index"); @@ -152,7 +62,7 @@ static String badPositionIndexes(int start, int end, int size) return String.format("end index (%s) must not be less than start index (%s)", end, start); } - static String badPositionIndex(int index, int size, String desc) + private static String badPositionIndex(int index, int size, String desc) { if (index < 0) { return String.format("%s (%s) must not be negative", desc, index); diff --git a/src/main/java/org/iq80/snappy/SnappyOutputStream.java b/src/main/java/org/iq80/snappy/SnappyOutputStream.java deleted file mode 100644 index d6b3afc..0000000 --- a/src/main/java/org/iq80/snappy/SnappyOutputStream.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.iq80.snappy; - -import java.io.IOException; -import java.io.OutputStream; - -/** - * This class implements an output stream for writing Snappy compressed data. - * The output format is the stream header "snappy\0" followed by one or more - * compressed blocks of data, each of which is preceded by a seven byte header. - *

- * The first byte of the header is a flag indicating if the block is compressed - * or not. A value of 0x00 means uncompressed, and 0x01 means compressed. - *

- * The second and third bytes are the size of the block in the stream as a big - * endian number. This value is never zero as empty blocks are never written. - * The maximum allowed length is 32k (1 << 15). - *

- * The remaining four byes are crc32c checksum of the user input data masked - * with the following function: {@code ((crc >>> 15) | (crc << 17)) + 0xa282ead8 } - *

- * An uncompressed block is simply copied from the input, thus guaranteeing - * that the output is never larger than the input (not including the header). - *

- * NOTE:This data produced by this class is not compatible with the - * {@code x-snappy-framed} specification. It can only be read by - * {@link SnappyInputStream}. - *

- * - * @deprecated Use {@link SnappyFramedOutputStream} which implements - * the standard {@code x-snappy-framed} specification. - */ -@Deprecated -public class SnappyOutputStream - extends AbstractSnappyOutputStream -{ - static final byte[] STREAM_HEADER = new byte[] {'s', 'n', 'a', 'p', 'p', 'y', 0}; - - // the header format requires the max block size to fit in 15 bits -- do not change! - static final int MAX_BLOCK_SIZE = 1 << 15; - - /** - * Write out the uncompressed content if the compression ratio (compressed length / raw length) exceeds this value. - */ - public static final double MIN_COMPRESSION_RATIO = 7.0 / 8.0; - - private final boolean calculateChecksum; - - /** - * Creates a Snappy output stream to write data to the specified underlying output stream. - * - * @param out the underlying output stream - */ - public SnappyOutputStream(OutputStream out) - throws IOException - { - this(out, true); - } - - private SnappyOutputStream(OutputStream out, boolean calculateChecksum) - throws IOException - { - super(out, MAX_BLOCK_SIZE, MIN_COMPRESSION_RATIO); - this.calculateChecksum = calculateChecksum; - } - - /** - * Creates a Snappy output stream with block checksums disabled. This is only useful for - * apples-to-apples benchmarks with other compressors that do not perform block checksums. - * - * @param out the underlying output stream - */ - public static SnappyOutputStream newChecksumFreeBenchmarkOutputStream(OutputStream out) - throws IOException - { - return new SnappyOutputStream(out, false); - } - - @Override - protected void writeHeader(OutputStream out) - throws IOException - { - out.write(STREAM_HEADER); - } - - @Override - protected int calculateCRC32C(byte[] data, int offset, int length) - { - return calculateChecksum ? 
super.calculateCRC32C(data, offset, length) : 0; - } - - @Override - protected void writeBlock(OutputStream out, byte[] data, int offset, int length, boolean compressed, int crc32c) - throws IOException - { - // write compressed flag - out.write(compressed ? 0x01 : 0x00); - - // write length - out.write(length >>> 8); - out.write(length); - - // write crc32c of user input data - out.write(crc32c >>> 24); - out.write(crc32c >>> 16); - out.write(crc32c >>> 8); - out.write(crc32c); - - // write data - out.write(data, offset, length); - } -} diff --git a/src/main/java/org/iq80/snappy/SnappyRawCompressor.java b/src/main/java/org/iq80/snappy/SnappyRawCompressor.java new file mode 100644 index 0000000..b9ee582 --- /dev/null +++ b/src/main/java/org/iq80/snappy/SnappyRawCompressor.java @@ -0,0 +1,411 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.iq80.snappy; + +import java.util.Arrays; + +import static org.iq80.snappy.SnappyConstants.COPY_1_BYTE_OFFSET; +import static org.iq80.snappy.SnappyConstants.COPY_2_BYTE_OFFSET; +import static org.iq80.snappy.SnappyConstants.SIZE_OF_INT; +import static org.iq80.snappy.SnappyConstants.SIZE_OF_LONG; +import static org.iq80.snappy.SnappyConstants.SIZE_OF_SHORT; +import static org.iq80.snappy.UnsafeUtil.UNSAFE; + +final class SnappyRawCompressor +{ + // The size of a compression block. 
Note that many parts of the compression + // code assume that BLOCK_SIZE <= 65536; in particular, the hash table + // can only store 16-bit offsets, and EmitCopy() also assumes the offset + // is 65535 bytes or less. Note also that if you change this, it will + // affect the framing format (see framing_format.txt). + // + // Note that there might be older data around that is compressed with larger + // block sizes, so the decompression code should not rely on the + // non-existence of long back-references. + private static final int BLOCK_LOG = 16; + private static final int BLOCK_SIZE = 1 << BLOCK_LOG; + + private static final int INPUT_MARGIN_BYTES = 15; + + private static final int MAX_HASH_TABLE_BITS = 14; + public static final int MAX_HASH_TABLE_SIZE = 1 << MAX_HASH_TABLE_BITS; + + private SnappyRawCompressor() {} + + public static int maxCompressedLength(int sourceLength) + { + // Compressed data can be defined as: + // compressed := item* literal* + // item := literal* copy + // + // The trailing literal sequence has a space blowup of at most 62/60 + // since a literal of length 60 needs one tag byte + one extra byte + // for length information. + // + // Item blowup is trickier to measure. Suppose the "copy" op copies + // 4 bytes of data. Because of a special check in the encoding code, + // we produce a 4-byte copy only if the offset is < 65536. Therefore + // the copy op takes 3 bytes to encode, and this type of item leads + // to at most the 62/60 blowup for representing literals. + // + // Suppose the "copy" op copies 5 bytes of data. If the offset is big + // enough, it will take 5 bytes to encode the copy op. Therefore the + // worst case here is a one-byte literal followed by a five-byte copy. + // I.e., 6 bytes of input turn into 7 bytes of "compressed" data.
+ // + // This last factor dominates the blowup, so the final estimate is: + return 32 + sourceLength + sourceLength / 6; + } + + // suppress warnings is required to use assert + @SuppressWarnings("IllegalToken") + public static int compress( + final Object inputBase, + final long inputAddress, + final long inputLimit, + final Object outputBase, + final long outputAddress, + final long outputLimit, + final short[] table) + { + // The compression code assumes output is larger than the max compression size (with 32 bytes of + // extra padding), and does not check bounds for writing to output. + int maxCompressedLength = maxCompressedLength((int) (inputLimit - inputAddress)); + if (outputLimit - outputAddress < maxCompressedLength) { + throw new IllegalArgumentException("Output buffer must be at least " + maxCompressedLength + " bytes"); + } + + // First write the uncompressed size to the output as a variable length int + long output = writeUncompressedLength(outputBase, outputAddress, (int) (inputLimit - inputAddress)); + + for (long blockAddress = inputAddress; blockAddress < inputLimit; blockAddress += BLOCK_SIZE) { + final long blockLimit = Math.min(inputLimit, blockAddress + BLOCK_SIZE); + long input = blockAddress; + assert blockLimit - blockAddress <= BLOCK_SIZE; + + int blockHashTableSize = getHashTableSize((int) (blockLimit - blockAddress)); + Arrays.fill(table, 0, blockHashTableSize, (short) 0); + + // todo given that hashTableSize is required to be a power of 2, this is overly complex + final int shift = 32 - log2Floor(blockHashTableSize); + assert (blockHashTableSize & (blockHashTableSize - 1)) == 0 : "table must be power of two"; + assert 0xFFFFFFFF >>> shift == blockHashTableSize - 1; + + // Bytes in [nextEmitAddress, input) will be emitted as literal bytes. Or + // [nextEmitAddress, inputLimit) after the main loop. 
+ long nextEmitAddress = input; + + final long fastInputLimit = blockLimit - INPUT_MARGIN_BYTES; + while (input <= fastInputLimit) { + assert nextEmitAddress <= input; + + // The body of this loop emits a literal once and then emits a copy one + // or more times. (The exception is that when we're close to exhausting + // the input we exit and emit a literal.) + // + // In the first iteration of this loop we're just starting, so + // there's nothing to copy, so we must emit a literal once. And we + // only start a new iteration when the current iteration has determined + // that a literal will precede the next copy (if any). + // + // Step 1: Scan forward in the input looking for a 4-byte-long match. + // If we get close to exhausting the input exit and emit a final literal. + // + // Heuristic match skipping: If 32 bytes are scanned with no matches + // found, start looking only at every other byte. If 32 more bytes are + // scanned, look at every third byte, etc.. When a match is found, + // immediately go back to looking at every byte. This is a small loss + // (~5% performance, ~0.1% density) for compressible data due to more + // bookkeeping, but for non-compressible data (such as JPEG) it's a huge + // win since the compressor quickly "realizes" the data is incompressible + // and doesn't bother looking for matches everywhere. + // + // The "skip" variable keeps track of how many bytes there are since the + // last match; dividing it by 32 (ie. right-shifting by five) gives the + // number of bytes to move ahead for each iteration. 
+ int skip = 32; + + long candidateIndex = 0; + for (input += 1; input + (skip >>> 5) <= fastInputLimit; input += ((skip++) >>> 5)) { + // hash the 4 bytes starting at the input pointer + int currentInt = UNSAFE.getInt(inputBase, input); + int hash = hashBytes(currentInt, shift); + + // get the position of a 4 bytes sequence with the same hash + candidateIndex = blockAddress + (table[hash] & 0xFFFF); + assert candidateIndex >= 0; + assert candidateIndex < input; + + // update the hash to point to the current position + table[hash] = (short) (input - blockAddress); + + // if the 4 byte sequence at the candidate index matches the sequence at the + // current position, proceed to the next phase + if (currentInt == UNSAFE.getInt(inputBase, candidateIndex)) { + break; + } + } + if (input + (skip >>> 5) > fastInputLimit) { + break; + } + + // Step 2: A 4-byte match has been found. We'll later see if more + // than 4 bytes match. But, prior to the match, input + // bytes [nextEmit, ip) are unmatched. Emit them as "literal bytes." + assert nextEmitAddress + 16 <= blockLimit; + + int literalLength = (int) (input - nextEmitAddress); + output = emitLiteralLength(outputBase, output, literalLength); + + // Fast copy can use 8 extra bytes of input and output, which is safe because: + // - The input will always have INPUT_MARGIN_BYTES = 15 extra available bytes + // - The output will always have 32 spare bytes (see MaxCompressedLength). + output = fastCopy(inputBase, nextEmitAddress, outputBase, output, literalLength); + + // Step 3: Call EmitCopy, and then see if another EmitCopy could + // be our next move. Repeat until we find no match for the + // input immediately after what was consumed by the last EmitCopy call. + // + // If we exit this loop normally then we need to call EmitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop.
We also can exit + // this loop via break if we get close to exhausting the input. + int inputBytes; + do { + // We have a 4-byte match at input, and no need to emit any + // "literal bytes" prior to input. + assert (blockLimit >= input + SIZE_OF_INT); + + // determine match length + int matched = count(inputBase, input + SIZE_OF_INT, candidateIndex + SIZE_OF_INT, blockLimit); + matched += SIZE_OF_INT; + + // Emit the copy operation for this chunk + output = emitCopy(outputBase, output, input, candidateIndex, matched); + input += matched; + + // are we done? + if (input >= fastInputLimit) { + break; + } + + // We could immediately start working at input now, but to improve + // compression we first update table[Hash(ip - 1, ...)]. + long longValue = UNSAFE.getLong(inputBase, input - 1); + int prevInt = (int) longValue; + inputBytes = (int) (longValue >>> 8); + + // add hash starting with previous byte + int prevHash = hashBytes(prevInt, shift); + table[prevHash] = (short) (input - blockAddress - 1); + + // update hash of current byte + int curHash = hashBytes(inputBytes, shift); + + candidateIndex = blockAddress + (table[curHash] & 0xFFFF); + table[curHash] = (short) (input - blockAddress); + } while (inputBytes == UNSAFE.getInt(inputBase, candidateIndex)); + nextEmitAddress = input; + } + + // Emit the remaining bytes as a literal + if (nextEmitAddress < blockLimit) { + int literalLength = (int) (blockLimit - nextEmitAddress); + output = emitLiteralLength(outputBase, output, literalLength); + UNSAFE.copyMemory(inputBase, nextEmitAddress, outputBase, output, literalLength); + output += literalLength; + } + } + + return (int) (output - outputAddress); + } + + private static int count(Object inputBase, final long start, long matchStart, long matchLimit) + { + long current = start; + + // first, compare long at a time + while (current < matchLimit - (SIZE_OF_LONG - 1)) { + long diff = UNSAFE.getLong(inputBase, matchStart) ^ UNSAFE.getLong(inputBase, current); + if
(diff != 0) { + current += Long.numberOfTrailingZeros(diff) >> 3; + return (int) (current - start); + } + + current += SIZE_OF_LONG; + matchStart += SIZE_OF_LONG; + } + + if (current < matchLimit - (SIZE_OF_INT - 1) && UNSAFE.getInt(inputBase, matchStart) == UNSAFE.getInt(inputBase, current)) { + current += SIZE_OF_INT; + matchStart += SIZE_OF_INT; + } + + if (current < matchLimit - (SIZE_OF_SHORT - 1) && UNSAFE.getShort(inputBase, matchStart) == UNSAFE.getShort(inputBase, current)) { + current += SIZE_OF_SHORT; + matchStart += SIZE_OF_SHORT; + } + + if (current < matchLimit && UNSAFE.getByte(inputBase, matchStart) == UNSAFE.getByte(inputBase, current)) { + ++current; + } + + return (int) (current - start); + } + + private static long emitLiteralLength(Object outputBase, long output, int literalLength) + { + int n = literalLength - 1; // Zero-length literals are disallowed + if (n < 60) { + // Size fits in tag byte + UNSAFE.putByte(outputBase, output++, (byte) (n << 2)); + } + else { + int bytes; + if (n < (1 << 8)) { + UNSAFE.putByte(outputBase, output++, (byte) (59 + 1 << 2)); + bytes = 1; + } + else if (n < (1 << 16)) { + UNSAFE.putByte(outputBase, output++, (byte) (59 + 2 << 2)); + bytes = 2; + } + else if (n < (1 << 24)) { + UNSAFE.putByte(outputBase, output++, (byte) (59 + 3 << 2)); + bytes = 3; + } + else { + UNSAFE.putByte(outputBase, output++, (byte) (59 + 4 << 2)); + bytes = 4; + } + // System is assumed to be little endian, so low bytes will be zero for the smaller numbers + UNSAFE.putInt(outputBase, output, n); + output += bytes; + } + return output; + } + + private static long fastCopy(final Object inputBase, long input, final Object outputBase, long output, final int literalLength) + { + final long outputLimit = output + literalLength; + do { + UNSAFE.putLong(outputBase, output, UNSAFE.getLong(inputBase, input)); + input += SIZE_OF_LONG; + output += SIZE_OF_LONG; + } + while (output < outputLimit); + return outputLimit; + } + + private static long 
emitCopy(Object outputBase, long output, long input, long matchIndex, int matchLength) + { + long offset = input - matchIndex; + + // Emit 64 byte copies but make sure to keep at least four bytes reserved + while (matchLength >= 68) { + UNSAFE.putByte(outputBase, output++, (byte) (COPY_2_BYTE_OFFSET + ((64 - 1) << 2))); + UNSAFE.putShort(outputBase, output, (short) offset); + output += SIZE_OF_SHORT; + matchLength -= 64; + } + + // Emit an extra 60 byte copy if we have too much data to fit in one copy + // length < 68 + if (matchLength > 64) { + UNSAFE.putByte(outputBase, output++, (byte) (COPY_2_BYTE_OFFSET + ((60 - 1) << 2))); + UNSAFE.putShort(outputBase, output, (short) offset); + output += SIZE_OF_SHORT; + matchLength -= 60; + } + + // Emit remainder + if ((matchLength < 12) && (offset < 2048)) { + int lenMinus4 = matchLength - 4; + UNSAFE.putByte(outputBase, output++, (byte) (COPY_1_BYTE_OFFSET + ((lenMinus4) << 2) + ((offset >>> 8) << 5))); + UNSAFE.putByte(outputBase, output++, (byte) (offset)); + } + else { + UNSAFE.putByte(outputBase, output++, (byte) (COPY_2_BYTE_OFFSET + ((matchLength - 1) << 2))); + UNSAFE.putShort(outputBase, output, (short) offset); + output += SIZE_OF_SHORT; + } + return output; + } + + @SuppressWarnings("IllegalToken") + private static int getHashTableSize(int inputSize) + { + // Use smaller hash table when input.size() is smaller, since we + // fill the table, incurring O(hash table size) overhead for + // compression, and if the input is short, we won't need that + // many hash table entries anyway.
+ assert (MAX_HASH_TABLE_SIZE >= 256); + + // smallest power of 2 larger than inputSize + int target = Integer.highestOneBit(inputSize - 1) << 1; + + // keep it between MIN_TABLE_SIZE and MAX_TABLE_SIZE + return Math.max(Math.min(target, MAX_HASH_TABLE_SIZE), 256); + } + + // Any hash function will produce a valid compressed stream, but a good + // hash function reduces the number of collisions and thus yields better + // compression for compressible input, and more speed for incompressible + // input. Of course, it doesn't hurt if the hash function is reasonably fast + // either, as it gets called a lot. + private static int hashBytes(int value, int shift) + { + return (value * 0x1e35a7bd) >>> shift; + } + + private static int log2Floor(int n) + { + return n == 0 ? -1 : 31 ^ Integer.numberOfLeadingZeros(n); + } + + private static final int HIGH_BIT_MASK = 0x80; + + /** + * Writes the uncompressed length as variable length integer. + */ + private static long writeUncompressedLength(Object outputBase, long outputAddress, int uncompressedLength) + { + if (uncompressedLength < (1 << 7) && uncompressedLength >= 0) { + UNSAFE.putByte(outputBase, outputAddress++, (byte) (uncompressedLength)); + } + else if (uncompressedLength < (1 << 14) && uncompressedLength > 0) { + UNSAFE.putByte(outputBase, outputAddress++, (byte) (uncompressedLength | HIGH_BIT_MASK)); + UNSAFE.putByte(outputBase, outputAddress++, (byte) (uncompressedLength >>> 7)); + } + else if (uncompressedLength < (1 << 21) && uncompressedLength > 0) { + UNSAFE.putByte(outputBase, outputAddress++, (byte) (uncompressedLength | HIGH_BIT_MASK)); + UNSAFE.putByte(outputBase, outputAddress++, (byte) ((uncompressedLength >>> 7) | HIGH_BIT_MASK)); + UNSAFE.putByte(outputBase, outputAddress++, (byte) (uncompressedLength >>> 14)); + } + else if (uncompressedLength < (1 << 28) && uncompressedLength > 0) { + UNSAFE.putByte(outputBase, outputAddress++, (byte) (uncompressedLength | HIGH_BIT_MASK)); + 
UNSAFE.putByte(outputBase, outputAddress++, (byte) ((uncompressedLength >>> 7) | HIGH_BIT_MASK)); + UNSAFE.putByte(outputBase, outputAddress++, (byte) ((uncompressedLength >>> 14) | HIGH_BIT_MASK)); + UNSAFE.putByte(outputBase, outputAddress++, (byte) (uncompressedLength >>> 21)); + } + else { + UNSAFE.putByte(outputBase, outputAddress++, (byte) (uncompressedLength | HIGH_BIT_MASK)); + UNSAFE.putByte(outputBase, outputAddress++, (byte) ((uncompressedLength >>> 7) | HIGH_BIT_MASK)); + UNSAFE.putByte(outputBase, outputAddress++, (byte) ((uncompressedLength >>> 14) | HIGH_BIT_MASK)); + UNSAFE.putByte(outputBase, outputAddress++, (byte) ((uncompressedLength >>> 21) | HIGH_BIT_MASK)); + UNSAFE.putByte(outputBase, outputAddress++, (byte) (uncompressedLength >>> 28)); + } + return outputAddress; + } +} diff --git a/src/main/java/org/iq80/snappy/SnappyRawDecompressor.java b/src/main/java/org/iq80/snappy/SnappyRawDecompressor.java new file mode 100644 index 0000000..4769170 --- /dev/null +++ b/src/main/java/org/iq80/snappy/SnappyRawDecompressor.java @@ -0,0 +1,320 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.iq80.snappy; + +import static org.iq80.snappy.SnappyConstants.LITERAL; +import static org.iq80.snappy.SnappyConstants.SIZE_OF_INT; +import static org.iq80.snappy.SnappyConstants.SIZE_OF_LONG; +import static org.iq80.snappy.UnsafeUtil.UNSAFE; + +final class SnappyRawDecompressor +{ + private static final int[] DEC_32_TABLE = {4, 1, 2, 1, 4, 4, 4, 4}; + private static final int[] DEC_64_TABLE = {0, 0, 0, -1, 0, 1, 2, 3}; + + private SnappyRawDecompressor() {} + + public static int getUncompressedLength(Object compressed, long compressedAddress, long compressedLimit) + { + return readUncompressedLength(compressed, compressedAddress, compressedLimit)[0]; + } + + public static int decompress( + final Object inputBase, + final long inputAddress, + final long inputLimit, + final Object outputBase, + final long outputAddress, + final long outputLimit) + { + // Read the uncompressed length from the front of the input + long input = inputAddress; + int[] varInt = readUncompressedLength(inputBase, input, inputLimit); + int expectedLength = varInt[0]; + input += varInt[1]; + + SnappyInternalUtils.checkArgument(expectedLength <= (outputLimit - outputAddress), + "Uncompressed length %s must be less than %s", expectedLength, (outputLimit - outputAddress)); + + // Process the entire input + int uncompressedSize = uncompressAll( + inputBase, + input, + inputLimit, + outputBase, + outputAddress, + outputLimit); + + if (!(expectedLength == uncompressedSize)) { + throw new CorruptionException(0, String.format("Recorded length is %s bytes but actual length after decompression is %s bytes ", + expectedLength, + uncompressedSize)); + } + + return expectedLength; + } + + private static int uncompressAll( + final Object inputBase, + final long inputAddress, + final long inputLimit, + final Object outputBase, + final long outputAddress, + final long outputLimit) + { + final long fastOutputLimit = outputLimit - SIZE_OF_LONG; // maximum offset in output buffer to which it's 
safe to write long-at-a-time + + long output = outputAddress; + long input = inputAddress; + + while (input < inputLimit) { + int opCode = UNSAFE.getByte(inputBase, input++) & 0xFF; + int entry = opLookupTable[opCode] & 0xFFFF; + + int trailerBytes = entry >>> 11; + int trailer = 0; + if (input + SIZE_OF_INT < inputLimit) { + trailer = UNSAFE.getInt(inputBase, input) & wordmask[trailerBytes]; + } + else { + if (input + trailerBytes > inputLimit) { + throw new CorruptionException(input - inputAddress); + } + switch (trailerBytes) { + case 4: + trailer = (UNSAFE.getByte(inputBase, input + 3) & 0xff) << 24; + case 3: + trailer |= (UNSAFE.getByte(inputBase, input + 2) & 0xff) << 16; + case 2: + trailer |= (UNSAFE.getByte(inputBase, input + 1) & 0xff) << 8; + case 1: + trailer |= (UNSAFE.getByte(inputBase, input) & 0xff); + } + } + if (trailer < 0) { + throw new CorruptionException(input - inputAddress); + } + input += trailerBytes; + + int length = entry & 0xff; + if (length == 0) { + continue; + } + + if ((opCode & 0x3) == LITERAL) { + int literalLength = length + trailer; + if (literalLength < 0) { + throw new CorruptionException(input - inputAddress); + } + + // copy literal + long literalOutputLimit = output + literalLength; + if (literalOutputLimit > fastOutputLimit || input + literalLength > inputLimit - SIZE_OF_LONG) { + if (literalOutputLimit > outputLimit || input + literalLength > inputLimit) { + throw new CorruptionException(input - inputAddress); + } + + // slow, precise copy + UNSAFE.copyMemory(inputBase, input, outputBase, output, literalLength); + input += literalLength; + output += literalLength; + } + else { + // fast copy. 
We may over-copy but there's enough room in input and output to not overrun them + do { + UNSAFE.putLong(outputBase, output, UNSAFE.getLong(inputBase, input)); + input += SIZE_OF_LONG; + output += SIZE_OF_LONG; + } + while (output < literalOutputLimit); + input -= (output - literalOutputLimit); // adjust index if we over-copied + output = literalOutputLimit; + } + } + else { + // matchOffset/256 is encoded in bits 8..10. By just fetching + // those bits, we get matchOffset (since the bit-field starts at + // bit 8). + int matchOffset = entry & 0x700; + matchOffset += trailer; + if (matchOffset < 0) { + throw new CorruptionException(input - inputAddress); + } + + long matchAddress = output - matchOffset; + if (matchAddress < outputAddress || output + length > outputLimit) { + throw new CorruptionException(input - inputAddress); + } + long matchOutputLimit = output + length; + if (matchOutputLimit > outputLimit) { + throw new CorruptionException(input - inputAddress); + } + + if (output > fastOutputLimit) { + // slow match copy + while (output < matchOutputLimit) { + UNSAFE.putByte(outputBase, output++, UNSAFE.getByte(outputBase, matchAddress++)); + } + } + else { + // copy repeated sequence + if (matchOffset < SIZE_OF_LONG) { + // 8 bytes apart so that we can copy long-at-a-time below + int increment32 = DEC_32_TABLE[matchOffset]; + int decrement64 = DEC_64_TABLE[matchOffset]; + + UNSAFE.putByte(outputBase, output, UNSAFE.getByte(outputBase, matchAddress)); + UNSAFE.putByte(outputBase, output + 1, UNSAFE.getByte(outputBase, matchAddress + 1)); + UNSAFE.putByte(outputBase, output + 2, UNSAFE.getByte(outputBase, matchAddress + 2)); + UNSAFE.putByte(outputBase, output + 3, UNSAFE.getByte(outputBase, matchAddress + 3)); + output += SIZE_OF_INT; + matchAddress += increment32; + + UNSAFE.putInt(outputBase, output, UNSAFE.getInt(outputBase, matchAddress)); + output += SIZE_OF_INT; + matchAddress -= decrement64; + } + else { + UNSAFE.putLong(outputBase, output, 
UNSAFE.getLong(outputBase, matchAddress)); + matchAddress += SIZE_OF_LONG; + output += SIZE_OF_LONG; + } + + if (matchOutputLimit > fastOutputLimit) { + while (output < fastOutputLimit) { + UNSAFE.putLong(outputBase, output, UNSAFE.getLong(outputBase, matchAddress)); + matchAddress += SIZE_OF_LONG; + output += SIZE_OF_LONG; + } + + while (output < matchOutputLimit) { + UNSAFE.putByte(outputBase, output++, UNSAFE.getByte(outputBase, matchAddress++)); + } + } + else { + while (output < matchOutputLimit) { + UNSAFE.putLong(outputBase, output, UNSAFE.getLong(outputBase, matchAddress)); + matchAddress += SIZE_OF_LONG; + output += SIZE_OF_LONG; + } + } + } + output = matchOutputLimit; // correction in case we over-copied + } + } + + return (int) (output - outputAddress); + } + + // Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits + private static final int[] wordmask = new int[] { + 0, 0xff, 0xffff, 0xffffff, 0xffffffff + }; + + // Data stored per entry in lookup table: + // Range Bits-used Description + // ------------------------------------ + // 1..64 0..7 Literal/copy length encoded in opcode byte + // 0..7 8..10 Copy offset encoded in opcode byte / 256 + // 0..4 11..13 Extra bytes after opcode + // + // We use eight bits for the length even though 7 would have sufficed + // because of efficiency reasons: + // (1) Extracting a byte is faster than a bit-field + // (2) It properly aligns copy offset so we do not need a <<8 + private static final short[] opLookupTable = new short[] { + 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002, + 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004, + 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006, + 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008, + 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a, + 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c, + 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e, + 0x000f, 
0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010, + 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012, + 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014, + 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016, + 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018, + 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a, + 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c, + 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e, + 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020, + 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022, + 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024, + 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026, + 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028, + 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a, + 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c, + 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e, + 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030, + 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032, + 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034, + 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036, + 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038, + 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a, + 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c, + 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e, + 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040 + }; + + /** + * Reads the variable length integer encoded at the specified offset, and + * returns this length with the number of bytes read.
+ */ + private static int[] readUncompressedLength(Object compressed, long compressedAddress, long compressedLimit) + { + int result; + int bytesRead = 0; + { + int b = getUnsignedByteSafe(compressed, compressedAddress + bytesRead, compressedLimit); + bytesRead++; + result = b & 0x7f; + if ((b & 0x80) != 0) { + b = getUnsignedByteSafe(compressed, compressedAddress + bytesRead, compressedLimit); + bytesRead++; + result |= (b & 0x7f) << 7; + if ((b & 0x80) != 0) { + b = getUnsignedByteSafe(compressed, compressedAddress + bytesRead, compressedLimit); + bytesRead++; + result |= (b & 0x7f) << 14; + if ((b & 0x80) != 0) { + b = getUnsignedByteSafe(compressed, compressedAddress + bytesRead, compressedLimit); + bytesRead++; + result |= (b & 0x7f) << 21; + if ((b & 0x80) != 0) { + b = getUnsignedByteSafe(compressed, compressedAddress + bytesRead, compressedLimit); + bytesRead++; + result |= (b & 0x7f) << 28; + if ((b & 0x80) != 0) { + throw new CorruptionException(compressedAddress + bytesRead, "last byte of compressed length int has high bit set"); + } + } + } + } + } + } + if (result < 0) { + throw new CorruptionException(compressedAddress, "negative compressed length"); + } + return new int[] {result, bytesRead}; + } + + private static int getUnsignedByteSafe(Object base, long address, long limit) + { + if (address >= limit) { + throw new CorruptionException(limit - address, "Input is truncated"); + } + return UNSAFE.getByte(base, address) & 0xFF; + } +} diff --git a/src/main/java/org/iq80/snappy/UnsafeMemory.java b/src/main/java/org/iq80/snappy/UnsafeMemory.java deleted file mode 100644 index 478ca97..0000000 --- a/src/main/java/org/iq80/snappy/UnsafeMemory.java +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.iq80.snappy; - -import sun.misc.Unsafe; - -import java.lang.reflect.Field; - -class UnsafeMemory - implements Memory -{ - private static final Unsafe unsafe; - - static { - try { - Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe"); - theUnsafe.setAccessible(true); - unsafe = (Unsafe) theUnsafe.get(null); - // It seems not all Unsafe implementations implement the following method. - new UnsafeMemory().copyMemory(new byte[1], 0, new byte[1], 0, 1); - } - catch (Exception e) { - throw new RuntimeException(e); - } - } - - private static final long BYTE_ARRAY_OFFSET = unsafe.arrayBaseOffset(byte[].class); - private static final long SHORT_ARRAY_OFFSET = unsafe.arrayBaseOffset(short[].class); - private static final long SHORT_ARRAY_STRIDE = unsafe.arrayIndexScale(short[].class); - - @Override - public boolean fastAccessSupported() - { - return true; - } - - @Override - public int lookupShort(short[] data, int index) - { - assert index >= 0; - assert index <= data.length; - return unsafe.getShort(data, SHORT_ARRAY_OFFSET + (index * SHORT_ARRAY_STRIDE)) & 0xFFFF; - } - - @Override - public int loadByte(byte[] data, int index) - { - assert index >= 0; - assert index <= data.length; - return unsafe.getByte(data, BYTE_ARRAY_OFFSET + index) & 0xFF; - } - - @Override - public int loadInt(byte[] data, int index) - { - assert index >= 0; - assert index + 4 <= data.length; - return 
unsafe.getInt(data, BYTE_ARRAY_OFFSET + index); - } - - @Override - public void copyLong(byte[] src, int srcIndex, byte[] dest, int destIndex) - { - assert srcIndex >= 0; - assert srcIndex + 8 <= src.length; - assert destIndex >= 0; - assert destIndex + 8 <= dest.length; - long value = unsafe.getLong(src, BYTE_ARRAY_OFFSET + srcIndex); - unsafe.putLong(dest, (BYTE_ARRAY_OFFSET + destIndex), value); - } - - @Override - public long loadLong(byte[] data, int index) - { - assert index > 0; - assert index + 4 < data.length; - return unsafe.getLong(data, BYTE_ARRAY_OFFSET + index); - } - - @Override - public void copyMemory(byte[] input, int inputIndex, byte[] output, int outputIndex, int length) - { - assert inputIndex >= 0; - assert inputIndex + length <= input.length; - assert outputIndex >= 0; - assert outputIndex + length <= output.length; - unsafe.copyMemory(input, BYTE_ARRAY_OFFSET + inputIndex, output, BYTE_ARRAY_OFFSET + outputIndex, length); - } -} diff --git a/src/main/java/org/iq80/snappy/UnsafeUtil.java b/src/main/java/org/iq80/snappy/UnsafeUtil.java new file mode 100644 index 0000000..f102c01 --- /dev/null +++ b/src/main/java/org/iq80/snappy/UnsafeUtil.java @@ -0,0 +1,44 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.iq80.snappy; + +import sun.misc.Unsafe; + +import java.lang.reflect.Field; +import java.nio.ByteOrder; + +import static java.lang.String.format; + +final class UnsafeUtil +{ + public static final Unsafe UNSAFE; + + private UnsafeUtil() {} + + static { + ByteOrder order = ByteOrder.nativeOrder(); + if (!order.equals(ByteOrder.LITTLE_ENDIAN)) { + throw new IncompatibleJvmException(format("Snappy requires a little endian platform (found %s)", order)); + } + + try { + Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe"); + theUnsafe.setAccessible(true); + UNSAFE = (Unsafe) theUnsafe.get(null); + } + catch (Exception e) { + throw new IncompatibleJvmException("Snappy requires access to sun.misc.Unsafe"); + } + } +} \ No newline at end of file diff --git a/src/test/java/org/iq80/snappy/BenchmarkDriver.java b/src/test/java/org/iq80/snappy/BenchmarkDriver.java index a4c0034..5d7f2a5 100644 --- a/src/test/java/org/iq80/snappy/BenchmarkDriver.java +++ b/src/test/java/org/iq80/snappy/BenchmarkDriver.java @@ -225,7 +225,7 @@ public long compress(TestData testData, long iterations) long start = System.nanoTime(); while (iterations-- > 0) { rawOut.reset(); - SnappyOutputStream out = SnappyOutputStream.newChecksumFreeBenchmarkOutputStream(rawOut); + SnappyFramedOutputStream out = SnappyFramedOutputStream.newChecksumFreeBenchmarkOutputStream(rawOut); out.write(contents); out.close(); } @@ -248,7 +248,7 @@ public long uncompress(TestData testData, long iterations) byte[] contents = testData.getContents(); ByteArrayOutputStream compressedStream = new ByteArrayOutputStream(Snappy.maxCompressedLength(contents.length)); - SnappyOutputStream out = SnappyOutputStream.newChecksumFreeBenchmarkOutputStream(compressedStream); + SnappyFramedOutputStream out = SnappyFramedOutputStream.newChecksumFreeBenchmarkOutputStream(compressedStream); out.write(contents); out.close(); byte[] compressed = compressedStream.toByteArray(); @@ -259,7 +259,7 @@ public long 
uncompress(TestData testData, long iterations) long start = System.nanoTime(); while (iterations-- > 0) { ByteArrayInputStream compIn = new ByteArrayInputStream(compressed); - SnappyInputStream in = new SnappyInputStream(compIn, false); + SnappyFramedInputStream in = new SnappyFramedInputStream(compIn, false); while (in.read(inputBuffer) >= 0) { } @@ -287,12 +287,12 @@ public long roundTrip(TestData testData, long iterations) long start = System.nanoTime(); while (iterations-- > 0) { compressedStream.reset(); - SnappyOutputStream out = SnappyOutputStream.newChecksumFreeBenchmarkOutputStream(compressedStream); + SnappyFramedOutputStream out = SnappyFramedOutputStream.newChecksumFreeBenchmarkOutputStream(compressedStream); out.write(contents); out.close(); ByteArrayInputStream compIn = new ByteArrayInputStream(compressedStream.getBuffer(), 0, compressedStream.size()); - SnappyInputStream in = new SnappyInputStream(compIn, false); + SnappyFramedInputStream in = new SnappyFramedInputStream(compIn, false); while (in.read(inputBuffer) >= 0) { } @@ -314,7 +314,7 @@ public double getCompressionRatio(TestData testData) int compressedSize; try { ByteArrayOutputStream rawOut = new ByteArrayOutputStream(Snappy.maxCompressedLength(contents.length)); - SnappyOutputStream out = SnappyOutputStream.newChecksumFreeBenchmarkOutputStream(rawOut); + SnappyFramedOutputStream out = SnappyFramedOutputStream.newChecksumFreeBenchmarkOutputStream(rawOut); out.write(contents); out.close(); diff --git a/src/test/java/org/iq80/snappy/SnappyBench.java b/src/test/java/org/iq80/snappy/SnappyBench.java index 8674c56..46aa22a 100644 --- a/src/test/java/org/iq80/snappy/SnappyBench.java +++ b/src/test/java/org/iq80/snappy/SnappyBench.java @@ -116,11 +116,11 @@ public void verify() byte[] contents = testData.getContents(); ByteArrayOutputStream rawOut = new ByteArrayOutputStream(Snappy.maxCompressedLength(contents.length)); - SnappyOutputStream out = new SnappyOutputStream(rawOut); + 
SnappyFramedOutputStream out = new SnappyFramedOutputStream(rawOut); out.write(contents); out.close(); - SnappyInputStream in = new SnappyInputStream(new ByteArrayInputStream(rawOut.toByteArray())); + SnappyFramedInputStream in = new SnappyFramedInputStream(new ByteArrayInputStream(rawOut.toByteArray())); byte[] uncompressed = ByteStreams.toByteArray(in); if (!Arrays.equals(uncompressed, testData.getContents())) { diff --git a/src/test/java/org/iq80/snappy/SnappyFramedStreamTest.java b/src/test/java/org/iq80/snappy/SnappyFramedStreamTest.java index 67e4e2e..5a2e004 100644 --- a/src/test/java/org/iq80/snappy/SnappyFramedStreamTest.java +++ b/src/test/java/org/iq80/snappy/SnappyFramedStreamTest.java @@ -18,8 +18,6 @@ package org.iq80.snappy; import com.google.common.base.Charsets; -import org.testng.annotations.AfterTest; -import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; import java.io.ByteArrayInputStream; @@ -44,13 +42,6 @@ public class SnappyFramedStreamTest extends AbstractSnappyStreamTest { - @BeforeTest - @AfterTest - public void resetBufferRecycler() - { - BufferRecycler.instance().clear(); - } - @Override protected OutputStream createOutputStream(OutputStream target) throws IOException diff --git a/src/test/java/org/iq80/snappy/SnappyStreamTest.java b/src/test/java/org/iq80/snappy/SnappyStreamTest.java deleted file mode 100644 index 4f26f6d..0000000 --- a/src/test/java/org/iq80/snappy/SnappyStreamTest.java +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.iq80.snappy; - -import com.google.common.base.Charsets; -import org.testng.annotations.Test; - -import java.io.ByteArrayInputStream; -import java.io.EOFException; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.nio.ByteBuffer; -import java.util.Arrays; - -import static com.google.common.io.ByteStreams.toByteArray; -import static com.google.common.primitives.UnsignedBytes.toInt; -import static org.iq80.snappy.SnappyOutputStream.STREAM_HEADER; -import static org.testng.Assert.assertEquals; - -public class SnappyStreamTest - extends AbstractSnappyStreamTest -{ - @Override - protected OutputStream createOutputStream(OutputStream target) - throws IOException - { - return new SnappyOutputStream(target); - } - - @Override - protected InputStream createInputStream(InputStream source, boolean verifyCheckSums) - throws IOException - { - return new SnappyInputStream(source, verifyCheckSums); - } - - @Override - protected byte[] getMarkerFrame() - { - return STREAM_HEADER; - } - - @Test - public void testSimple() - throws Exception - { - byte[] original = "aaaaaaaaaaaabbbbbbbaaaaaa".getBytes(Charsets.UTF_8); - - byte[] compressed = compress(original); - byte[] uncompressed = uncompress(compressed); - - assertEquals(uncompressed, original); - assertEquals(compressed.length, 33); // 7 byte stream header, 7 byte block header, 19 bytes compressed data - assertEquals(Arrays.copyOf(compressed, 7), STREAM_HEADER); // stream header - assertEquals(toInt(compressed[7]), 0x01); // flag: compressed - 
assertEquals(toInt(compressed[8]), 0x00); // length: 19 = 0x0013 - assertEquals(toInt(compressed[9]), 0x13); - assertEquals(toInt(compressed[10]), 0x92); // crc32c: 0x9274cda8 - assertEquals(toInt(compressed[11]), 0x74); - assertEquals(toInt(compressed[12]), 0xCD); - assertEquals(toInt(compressed[13]), 0xA8); - } - - @Test - public void testUncompressable() - throws Exception - { - byte[] random = getRandom(1, 5000); - int crc32c = Crc32C.maskedCrc32c(random); - - byte[] compressed = compress(random); - byte[] uncompressed = uncompress(compressed); - - assertEquals(uncompressed, random); - assertEquals(compressed.length, random.length + 7 + 7); - assertEquals(toInt(compressed[7]), 0x00); // flag: uncompressed - assertEquals(toInt(compressed[8]), 0x13); // length: 5000 = 0x1388 - assertEquals(toInt(compressed[9]), 0x88); - assertEquals(ByteBuffer.wrap(compressed, 10, 4).getInt(), crc32c); // crc: see above - } - - @Test - public void testEmptyCompression() - throws Exception - { - byte[] empty = new byte[0]; - assertEquals(compress(empty), STREAM_HEADER); - assertEquals(uncompress(STREAM_HEADER), empty); - } - - @Test(expectedExceptions = EOFException.class, expectedExceptionsMessageRegExp = ".*block header.*") - public void testShortBlockHeader() - throws Exception - { - uncompressBlock(new byte[] {0}); - } - - @Test(expectedExceptions = EOFException.class, expectedExceptionsMessageRegExp = ".*reading frame.*") - public void testShortBlockData() - throws Exception - { - uncompressBlock(new byte[] {0, 0, 4, 0, 0, 0, 0, 'x', 'x'}); // flag = 0, size = 4, crc32c = 0, block data = [x, x] - } - - @Test(expectedExceptions = IOException.class, expectedExceptionsMessageRegExp = "invalid compressed flag in header: 0x41") - public void testInvalidBlockHeaderCompressedFlag() - throws Exception - { - uncompressBlock(new byte[] {'A', 0, 1, 0, 0, 0, 0, 0}); // flag = 'A', block size = 1, crc32c = 0 - } - - @Test(expectedExceptions = IOException.class, 
expectedExceptionsMessageRegExp = "invalid block size in header: 0") - public void testInvalidBlockSizeZero() - throws Exception - { - uncompressBlock(new byte[] {0, 0, 0, 0, 0, 0, 0}); // flag = '0', block size = 0, crc32c = 0 - } - - @Test(expectedExceptions = IOException.class, expectedExceptionsMessageRegExp = "invalid block size in header: 55555") - public void testInvalidBlockSizeLarge() - throws Exception - { - uncompressBlock(new byte[] {0, (byte) 0xD9, 0x03, 0, 0, 0, 0}); // flag = 0, block size = 55555, crc32c = 0 - } - - @Test(expectedExceptions = IOException.class, expectedExceptionsMessageRegExp = "Corrupt input: invalid checksum") - public void testInvalidChecksum() - throws Exception - { - uncompressBlock(new byte[] {0, 0, 1, 0, 0, 0, 0, 'a'}); // flag = 0, size = 4, crc32c = 0, block data = [a] - } - - @Test - public void testInvalidChecksumIgnoredWhenVerificationDisabled() - throws Exception - { - byte[] block = {0, 0, 1, 0, 0, 0, 0, 'a'}; // flag = 0, size = 4, crc32c = 0, block data = [a] - ByteArrayInputStream inputData = new ByteArrayInputStream(blockToStream(block)); - assertEquals(toByteArray(createInputStream(inputData, false)), new byte[] {'a'}); - } - - private byte[] uncompressBlock(byte[] block) - throws IOException - { - return uncompress(blockToStream(block)); - } - - private static byte[] blockToStream(byte[] block) - { - byte[] stream = new byte[STREAM_HEADER.length + block.length]; - System.arraycopy(STREAM_HEADER, 0, stream, 0, STREAM_HEADER.length); - System.arraycopy(block, 0, stream, STREAM_HEADER.length, block.length); - return stream; - } -} diff --git a/src/test/java/org/iq80/snappy/SnappyTest.java b/src/test/java/org/iq80/snappy/SnappyTest.java index c8a49aa..2219240 100644 --- a/src/test/java/org/iq80/snappy/SnappyTest.java +++ b/src/test/java/org/iq80/snappy/SnappyTest.java @@ -22,16 +22,15 @@ import org.testng.annotations.Test; import java.io.File; -import java.util.Arrays; import java.util.Random; public class SnappyTest 
{ private static final File TEST_DATA_DIR = new File("testdata"); - private RandomGenerator randomGenerator = new RandomGenerator(0.5); + private final RandomGenerator randomGenerator = new RandomGenerator(0.5); @Test - public void testByteForByteOutputSyntheticData() + public void testNativeCompatibleSyntheticData() throws Exception { for (int i = 1; i < 65 * 1024; i++) { @@ -39,13 +38,14 @@ public void testByteForByteOutputSyntheticData() verifyCompression(i); } catch (Error e) { + e.printStackTrace(); Assert.fail(i + " byte block", e); } } } @Test - public void testByteForByteTestData() + public void testNativeCompatibleTestData() throws Exception { for (File testFile : getTestFiles()) { @@ -54,12 +54,36 @@ public void testByteForByteTestData() verifyCompression(data, 0, data.length); } catch (Throwable e) { + e.printStackTrace(); Assert.fail("Testdata: " + testFile.getName(), e); - } } } + @Test(expectedExceptions = CorruptionException.class, expectedExceptionsMessageRegExp = "Malformed input: offset=5") + public void testInvalidLiteralLength() + { + byte[] data = { + // Encoded uncompressed length 1024 + -128, 8, + // op-code + (byte) 252, + // Trailer value Integer.MAX_VALUE + (byte) 0b1111_1111, (byte) 0b1111_1111, (byte) 0b1111_1111, (byte) 0b0111_1111, + // Some arbitrary data + 0, 0, 0, 0, 0, 0, 0, 0 + }; + + Snappy.uncompress(data, 0, data.length, new byte[1024], 0, 1024); + } + + @Test(expectedExceptions = CorruptionException.class, expectedExceptionsMessageRegExp = "negative compressed length: offset=16") + public void testNegativeLength() + { + byte[] data = {(byte) 255, (byte) 255, (byte) 255, (byte) 255, 0b0000_1000}; + Snappy.getUncompressedLength(data, 0); + } + private void verifyCompression(int size) throws Exception { @@ -72,16 +96,8 @@ private void verifyCompression(int size) private static void verifyCompression(byte[] input, int position, int size) throws Exception { - byte[] nativeCompressed = new 
byte[org.xerial.snappy.Snappy.maxCompressedLength(size)]; byte[] javaCompressed = new byte[Snappy.maxCompressedLength(size)]; - int nativeCompressedSize = org.xerial.snappy.Snappy.compress( - input, - position, - size, - nativeCompressed, - 0); - int javaCompressedSize = Snappy.compress( input, position, @@ -89,29 +105,34 @@ private static void verifyCompression(byte[] input, int position, int size) javaCompressed, 0); - // verify outputs are exactly the same - String failureMessage = "Invalid compressed output for input size " + size + " at offset " + position; - if (!SnappyInternalUtils.equals(javaCompressed, 0, nativeCompressed, 0, nativeCompressedSize)) { - if (nativeCompressedSize < 100) { - Assert.assertEquals( - Arrays.toString(Arrays.copyOf(javaCompressed, nativeCompressedSize)), - Arrays.toString(Arrays.copyOf(nativeCompressed, nativeCompressedSize)), - failureMessage - ); - } - else { - Assert.fail(failureMessage); - } - } - Assert.assertEquals(javaCompressedSize, nativeCompressedSize); - - // verify the contents can be uncompressed + // Verify Java codec decompresses Java compressed data byte[] uncompressed = new byte[size]; - Snappy.uncompress(javaCompressed, 0, javaCompressedSize, uncompressed, 0); + int uncompressedSize = Snappy.uncompress(javaCompressed, 0, javaCompressedSize, uncompressed, 0); + Assert.assertEquals(uncompressedSize, size, "Size mismatch"); + Assert.assertTrue(arraysEqual(input, position, uncompressed, 0, size), "Data mismatch"); - if (!SnappyInternalUtils.equals(uncompressed, 0, input, position, size)) { - Assert.fail("Invalid uncompressed output for input size " + size + " at offset " + position); - } + // Verify Native codec decompresses Java compressed data + byte[] nativeUncompressed = new byte[size]; + int nativeUncompressedSize = org.xerial.snappy.Snappy.uncompress( + javaCompressed, + 0, + javaCompressedSize, + nativeUncompressed, + 0); + Assert.assertEquals(nativeUncompressedSize, size, "Size mismatch"); + 
Assert.assertTrue(arraysEqual(input, position, nativeUncompressed, 0, size), "Data mismatch"); + + // Verify Java codec decompresses Native compressed data + byte[] nativeCompressed = new byte[org.xerial.snappy.Snappy.maxCompressedLength(size)]; + int nativeCompressedSize = org.xerial.snappy.Snappy.compress( + input, + position, + size, + nativeCompressed, + 0); + uncompressedSize = Snappy.uncompress(nativeCompressed, 0, nativeCompressedSize, uncompressed, 0); + Assert.assertEquals(uncompressedSize, size, "Size mismatch"); + Assert.assertTrue(arraysEqual(input, position, uncompressed, 0, size), "Data mismatch"); } public static class RandomGenerator @@ -177,4 +198,14 @@ static File[] getTestFiles() Assert.assertTrue(testFiles != null && testFiles.length > 0, "No test files at " + TEST_DATA_DIR.getAbsolutePath()); return testFiles; } + + private static boolean arraysEqual(byte[] left, int leftIndex, byte[] right, int rightIndex, int length) + { + for (int i = 0; i < length; i++) { + if (left[leftIndex + i] != right[rightIndex + i]) { + return false; + } + } + return true; + } }