From 01cfd80828d5e33b07275755ce2437aafcb0db67 Mon Sep 17 00:00:00 2001 From: Sean Bartell Date: Sun, 11 Oct 2020 13:59:53 -0500 Subject: [PATCH 1/6] Describe Dictionary-in-Stream format --- .../zstd_dict_in_stream_format.md | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 contrib/dict_in_stream/zstd_dict_in_stream_format.md diff --git a/contrib/dict_in_stream/zstd_dict_in_stream_format.md b/contrib/dict_in_stream/zstd_dict_in_stream_format.md new file mode 100644 index 00000000000..b1f5c1b1685 --- /dev/null +++ b/contrib/dict_in_stream/zstd_dict_in_stream_format.md @@ -0,0 +1,63 @@ +# Zstandard Dictionary-in-Stream Format + +### Version + +0.1.0 (2020-10-11): initial version + +## Introduction + +This document defines a format for including a Zstandard dictionary inside a +compressed Zstandard stream. When combined with the +[Seekable Format](../seekable_format) or other formats that use multiple +Zstandard frames, this format can help reduce the size of each frame without +requiring an external dictionary. This format can also be used to create a +stand-alone stream from a stream that uses an external dictionary, without +needing to recompress the stream. + +### Usage + +This format is used by tools such as [Wget-AT] to compress WARC files. + +[Wget-AT]: https://github.com/ArchiveTeam/wget-lua/releases/tag/v1.20.3-at.20200401.01 + +## Format + +The format consists of a skippable frame containing the dictionary, followed by +a normal Zstandard stream. All compressed frames in the stream must have been +compressed using the dictionary. + +### Dictionary Frame Format + +The dictionary frame is a [Zstandard skippable frame], structured as follows: + +|`Magic_Number`|`Frame_Size`|`Compressed_or_Uncompressed_Dictionary` | +|--------------|------------|------------------------------------------| +| 4 bytes | 4 bytes | n bytes | + +__`Magic_Number`__ + +Little-endian value: 0x184D2A5D. 
+Since it is legal for other Zstandard skippable frames to use the same +magic number, it is not recommended for a decoder to recognize frames +solely on this. + +__`Frame_Size`__ + +Little-endian, the total size of the skippable frame, not including the +`Magic_Number` or `Frame_Size`. + +__`Compressed_or_Uncompressed_Dictionary`__ + +The dictionary data, which may optionally be compressed. + +If uncompressed, this data must conform to the [Dictionary Format]. In +particular, it must start with little-endian 0xEC30A437. + +Otherwise, this data must be a single Zstandard compressed frame that +decompresses into data in the Dictionary Format. In particular, the compressed +data must start with little-endian 0xFD2FB528. The frame __must__ include a +`Frame_Content_Size` field. The dictionary data __must not__ contain any +skippable frames. + +[Dictionary Format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format +[Zstandard skippable frame]: https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#skippable-frames From 5f011328d6635b57e19cc285cdd576748b5a7feb Mon Sep 17 00:00:00 2001 From: Sean Bartell Date: Sun, 11 Oct 2020 14:49:00 -0500 Subject: [PATCH 2/6] Add dictionary-in-stream header --- contrib/dict_in_stream/zstd_dict_in_stream.h | 95 ++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 contrib/dict_in_stream/zstd_dict_in_stream.h diff --git a/contrib/dict_in_stream/zstd_dict_in_stream.h b/contrib/dict_in_stream/zstd_dict_in_stream.h new file mode 100644 index 00000000000..2a4b285e323 --- /dev/null +++ b/contrib/dict_in_stream/zstd_dict_in_stream.h @@ -0,0 +1,95 @@ +/* + * Copyright 2020 Sean Bartell. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses. + */ +#ifndef ZSTD_DICT_IN_STREAM_H +#define ZSTD_DICT_IN_STREAM_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include /* size_t */ +#include "zstd.h" + +#define ZSTD_DICT_IN_STREAM_MAGIC 0x184D2A5D +#define ZSTD_DICT_IN_STREAM_HEADER_SIZE 8 + +/*-**************************************************************************** +* Dictionary in Stream - Decompression HowTo +* +* 1. Read the header (ZSTD_DICT_IN_STREAM_HEADER_SIZE bytes). +* 2. Use ZSTD_dict_in_stream_getDataSize on the header, +* to check how many data bytes to read. +* 3. Read data bytes. +* 4. Use ZSTD_dict_in_stream_getDictSize on the data, +* to check the size of the decompressed dictionary. +* 5. Use ZSTD_dict_in_stream_getDict on the data, +* to decompress the dictionary. +* +* Instead of steps 4-5, you can also use ZSTD_dict_in_stream_createCDict or +* ZSTD_dict_in_stream_createDDict. +* +* ****************************************************************************/ + +/*! ZSTD_dict_in_stream_getDataSize() : + * Given a dict_in_stream header, of size ZSTD_DICT_IN_STREAM_HEADER_SIZE, + * determine how many bytes of dictionary data follow the header. + * Returns 0 if this is not a dict_in_stream header. */ +ZSTDLIB_API size_t ZSTD_dict_in_stream_getDataSize(const void* src, size_t srcSize); + +/*! ZSTD_dict_in_stream_getDictSize() : + * Given the (possibly compressed) dictionary data that follows a + * dict_in_stream header, determine the decompressed dictionary size. + * Returns 0 if this is not a valid dictionary. */ +ZSTDLIB_API size_t ZSTD_dict_in_stream_getDictSize(const void* src, size_t srcSize); + +/*! ZSTD_dict_in_stream_getDict() : + * Given the (possibly compressed) dictionary data that follows a + * dict_in_stream header, decompress the dictionary. Returns 0 on error. */ +ZSTDLIB_API size_t ZSTD_dict_in_stream_getDict(void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + +/*! 
ZSTD_dict_in_stream_createCDict() : + * Given the (possibly compressed) dictionary data that follows a + * dict_in_stream header, load the dictionary as a CDict. + * Returns 0 on error or if this is not a valid dictionary. */ +ZSTDLIB_API ZSTD_CDict* ZSTD_dict_in_stream_createCDict(const void* src, size_t srcSize); + +/*! ZSTD_dict_in_stream_createDDict() : + * Given the (possibly compressed) dictionary data that follows a + * dict_in_stream header, load the dictionary as a DDict. + * Returns 0 on error or if this is not a valid dictionary. */ +ZSTDLIB_API ZSTD_DDict* ZSTD_dict_in_stream_createDDict(const void* src, size_t srcSize); + +/*-**************************************************************************** +* Dictionary in Stream - Compression HowTo +* +* 1. Use ZSTD_dict_in_stream_maxFrameSize on the dictionary, +* to determine the maximum possible size of the dictionary frame. +* 2. Use ZSTD_dict_in_stream_createFrame to create the frame. +* 3. Write the resulting frame at the beginning of the file. +* +* ****************************************************************************/ + +/*! ZSTD_dict_in_stream_maxFrameSize() : + * Determine the maximum possible size of the dictionary frame needed to store + * a dictionary. */ +ZSTDLIB_API size_t ZSTD_dict_in_stream_maxFrameSize(const void* dict, size_t dictSize); + +/*! ZSTD_dict_in_stream_createFrame() : + * Create a dictionary frame from a dictionary, with optional compression. + * compressionLevel can be 0 to disable compression. 
*/ +ZSTDLIB_API size_t ZSTD_dict_in_stream_createFrame(void* dst, size_t dstCapacity, + const void* dict, size_t dictSize, + int compressionLevel); + +#if defined(__cplusplus) +} +#endif + +#endif // ZSTD_DICT_IN_STREAM_H From f524167c0ae00133dd3c29125b78fa2957483dfc Mon Sep 17 00:00:00 2001 From: Sean Bartell Date: Sun, 11 Oct 2020 15:59:48 -0500 Subject: [PATCH 3/6] Add dictionary-in-stream compression code --- Makefile | 1 + contrib/dict_in_stream/examples/Makefile | 42 +++++ .../examples/seekable_compression.c | 173 ++++++++++++++++++ contrib/dict_in_stream/zstd_dict_in_stream.h | 7 +- .../zstd_dict_in_stream_format.md | 2 +- contrib/dict_in_stream/zstddis_compress.c | 47 +++++ contrib/seekable_format/zstd_seekable.h | 2 + contrib/seekable_format/zstdseek_compress.c | 10 + 8 files changed, 281 insertions(+), 3 deletions(-) create mode 100644 contrib/dict_in_stream/examples/Makefile create mode 100644 contrib/dict_in_stream/examples/seekable_compression.c create mode 100644 contrib/dict_in_stream/zstddis_compress.c diff --git a/Makefile b/Makefile index 1735ab865b9..87d4fe84603 100644 --- a/Makefile +++ b/Makefile @@ -121,6 +121,7 @@ man: contrib: lib $(MAKE) -C contrib/pzstd all $(MAKE) -C contrib/seekable_format/examples all + $(MAKE) -C contrib/dict_in_stream/examples all $(MAKE) -C contrib/largeNbDicts all cd contrib/single_file_libs/ ; ./build_decoder_test.sh cd contrib/single_file_libs/ ; ./build_library_test.sh diff --git a/contrib/dict_in_stream/examples/Makefile b/contrib/dict_in_stream/examples/Makefile new file mode 100644 index 00000000000..f6e644b4227 --- /dev/null +++ b/contrib/dict_in_stream/examples/Makefile @@ -0,0 +1,42 @@ +# ################################################################ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. 
+# +# This source code is licensed under both the BSD-style license (found in the +# LICENSE file in the root directory of this source tree) and the GPLv2 (found +# in the COPYING file in the root directory of this source tree). +# ################################################################ + +# This Makefile presumes libzstd is built, using `make` in / or /lib/ + +ZSTDLIB_PATH = ../../../lib +ZSTDLIB_NAME = libzstd.a +ZSTDLIB = $(ZSTDLIB_PATH)/$(ZSTDLIB_NAME) + +CPPFLAGS += -I.. -I../../seekable_format -I../../../lib -I../../../lib/common + +CFLAGS ?= -O3 +CFLAGS += -g + +DIS_OBJS = ../zstddis_compress.c +SEEKABLE_OBJS = ../../seekable_format/zstdseek_compress.c ../../seekable_format/zstdseek_decompress.c + +.PHONY: default all clean test + +default: all + +all: seekable_compression seekable_decompression + +$(ZSTDLIB): + make -C $(ZSTDLIB_PATH) $(ZSTDLIB_NAME) + +seekable_compression : seekable_compression.c $(DIS_OBJS) $(SEEKABLE_OBJS) $(ZSTDLIB) + $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ + +seekable_decompression : seekable_decompression.c $(DIS_OBJS) $(SEEKABLE_OBJS) $(ZSTDLIB) + $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ + +clean: + @rm -f core *.o tmp* result* *.zst \ + seekable_compression seekable_decompression + @echo Cleaning completed diff --git a/contrib/dict_in_stream/examples/seekable_compression.c b/contrib/dict_in_stream/examples/seekable_compression.c new file mode 100644 index 00000000000..2a3a127640c --- /dev/null +++ b/contrib/dict_in_stream/examples/seekable_compression.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2017-present, Facebook, Inc. + * Copyright (c) 2020 Sean Bartell + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + */ + +#include // malloc, free, exit, atoi +#include // fprintf, perror, feof, fopen, etc. 
+#include // strlen, memset, strcat +#define ZSTD_STATIC_LINKING_ONLY +#include // presumes zstd library is installed + +#include "zstd_dict_in_stream.h" +#include "zstd_seekable.h" + +static void* malloc_orDie(size_t size) +{ + void* const buff = malloc(size); + if (buff) return buff; + /* error */ + perror("malloc:"); + exit(1); +} + +static FILE* fopen_orDie(const char *filename, const char *instruction) +{ + FILE* const inFile = fopen(filename, instruction); + if (inFile) return inFile; + /* error */ + perror(filename); + exit(3); +} + +static size_t fread_orDie(void* buffer, size_t sizeToRead, FILE* file) +{ + size_t const readSize = fread(buffer, 1, sizeToRead, file); + if (readSize == sizeToRead) return readSize; /* good */ + if (feof(file)) return readSize; /* good, reached end of file */ + /* error */ + perror("fread"); + exit(4); +} + +static size_t fwrite_orDie(const void* buffer, size_t sizeToWrite, FILE* file) +{ + size_t const writtenSize = fwrite(buffer, 1, sizeToWrite, file); + if (writtenSize == sizeToWrite) return sizeToWrite; /* good */ + /* error */ + perror("fwrite"); + exit(5); +} + +static size_t fclose_orDie(FILE* file) +{ + if (!fclose(file)) return 0; + /* error */ + perror("fclose"); + exit(6); +} + +static size_t fsize_orDie(FILE* file) +{ + if (fseek(file, 0, SEEK_END)) { + perror("fseek"); + exit(7); + } + long result = ftell(file); + if (result < 0) { + perror("ftell"); + exit(8); + } + if (fseek(file, 0, SEEK_SET)) { + perror("fseek"); + exit(9); + } + return result; +} + +static void compressFile_orDie(const char* dictName, const char* fname, const char* outName, int cLevel, unsigned frameSize) +{ + FILE* const fdict = fopen_orDie(dictName, "rb"); + FILE* const fin = fopen_orDie(fname, "rb"); + FILE* const fout = fopen_orDie(outName, "wb"); + size_t const dictSize = fsize_orDie(fdict); + void* const dict = malloc_orDie(dictSize); + size_t const buffInSize = ZSTD_CStreamInSize(); /* can always read one full block */ + void* const 
buffIn = malloc_orDie(buffInSize); + size_t const buffOutSize = ZSTD_CStreamOutSize(); /* can always flush a full block */ + void* const buffOut = malloc_orDie(buffOutSize); + + ZSTD_seekable_CStream* const cstream = ZSTD_seekable_createCStream(); + if (cstream==NULL) { fprintf(stderr, "ZSTD_seekable_createCStream() error \n"); exit(10); } + size_t const initResult = ZSTD_seekable_initCStream(cstream, cLevel, 1, frameSize); + if (ZSTD_isError(initResult)) { fprintf(stderr, "ZSTD_seekable_initCStream() error : %s \n", ZSTD_getErrorName(initResult)); exit(11); } + + fread_orDie(dict, dictSize, fdict); + ZSTD_CDict* cdict = ZSTD_createCDict(dict, dictSize, cLevel); + ZSTD_seekable_refCDict(cstream, cdict); + + size_t dictFrameSize = ZSTD_dict_in_stream_maxFrameSize(dict, dictSize); + if (ZSTD_isError(dictFrameSize)) { fprintf(stderr, "ZSTD_dict_in_stream_maxFrameSize() error : %s \n", ZSTD_getErrorName(dictFrameSize)); exit(14); } + void* const dictFrame = malloc_orDie(dictFrameSize); + dictFrameSize = ZSTD_dict_in_stream_createFrame(dictFrame, dictFrameSize, dict, dictSize, 5); + if (ZSTD_isError(dictFrameSize)) { fprintf(stderr, "ZSTD_dict_in_stream_createFrame() error : %s \n", ZSTD_getErrorName(dictFrameSize)); exit(15); } + fwrite_orDie(dictFrame, dictFrameSize, fout); + ZSTD_seekable_logFrame(ZSTD_seekable_getFrameLog(cstream), dictFrameSize, 0, 0); + + size_t read, toRead = buffInSize; + while( (read = fread_orDie(buffIn, toRead, fin)) ) { + ZSTD_inBuffer input = { buffIn, read, 0 }; + while (input.pos < input.size) { + ZSTD_outBuffer output = { buffOut, buffOutSize, 0 }; + toRead = ZSTD_seekable_compressStream(cstream, &output , &input); /* toRead is guaranteed to be <= ZSTD_CStreamInSize() */ + if (ZSTD_isError(toRead)) { fprintf(stderr, "ZSTD_seekable_compressStream() error : %s \n", ZSTD_getErrorName(toRead)); exit(12); } + if (toRead > buffInSize) toRead = buffInSize; /* Safely handle case when `buffInSize` is manually changed to a value < 
ZSTD_CStreamInSize()*/ + fwrite_orDie(buffOut, output.pos, fout); + } + } + + while (1) { + ZSTD_outBuffer output = { buffOut, buffOutSize, 0 }; + size_t const remainingToFlush = ZSTD_seekable_endStream(cstream, &output); /* close stream */ + if (ZSTD_isError(remainingToFlush)) { fprintf(stderr, "ZSTD_seekable_endStream() error : %s \n", ZSTD_getErrorName(remainingToFlush)); exit(13); } + fwrite_orDie(buffOut, output.pos, fout); + if (!remainingToFlush) break; + } + + ZSTD_seekable_freeCStream(cstream); + ZSTD_freeCDict(cdict); + fclose_orDie(fout); + fclose_orDie(fin); + fclose_orDie(fdict); + free(dict); + free(dictFrame); + free(buffIn); + free(buffOut); +} + +static char* createOutFilename_orDie(const char* filename) +{ + size_t const inL = strlen(filename); + size_t const outL = inL + 5; + void* outSpace = malloc_orDie(outL); + memset(outSpace, 0, outL); + strcat(outSpace, filename); + strcat(outSpace, ".zst"); + return (char*)outSpace; +} + +int main(int argc, const char** argv) { + const char* const exeName = argv[0]; + if (argc!=4) { + printf("wrong arguments\n"); + printf("usage:\n"); + printf("%s DICT_FILE FILE FRAME_SIZE\n", exeName); + return 1; + } + + { const char* const dictFileName = argv[1]; + const char* const inFileName = argv[2]; + unsigned const frameSize = (unsigned)atoi(argv[3]); + + char* const outFileName = createOutFilename_orDie(inFileName); + compressFile_orDie(dictFileName, inFileName, outFileName, 5, frameSize); + free(outFileName); + } + + return 0; +} diff --git a/contrib/dict_in_stream/zstd_dict_in_stream.h b/contrib/dict_in_stream/zstd_dict_in_stream.h index 2a4b285e323..e7f11db7b59 100644 --- a/contrib/dict_in_stream/zstd_dict_in_stream.h +++ b/contrib/dict_in_stream/zstd_dict_in_stream.h @@ -78,12 +78,15 @@ ZSTDLIB_API ZSTD_DDict* ZSTD_dict_in_stream_createDDict(const void* src, size_t /*! ZSTD_dict_in_stream_maxFrameSize() : * Determine the maximum possible size of the dictionary frame needed to store - * a dictionary. 
*/ + * a dictionary. + * Returns an error code if it fails (which can be tested using ZSTD_isError()). */ ZSTDLIB_API size_t ZSTD_dict_in_stream_maxFrameSize(const void* dict, size_t dictSize); /*! ZSTD_dict_in_stream_createFrame() : * Create a dictionary frame from a dictionary, with optional compression. - * compressionLevel can be 0 to disable compression. */ + * compressionLevel can be 0 to disable compression. + * Returns the size of the frame, + * or an error code if it fails (which can be tested using ZSTD_isError()). */ ZSTDLIB_API size_t ZSTD_dict_in_stream_createFrame(void* dst, size_t dstCapacity, const void* dict, size_t dictSize, int compressionLevel); diff --git a/contrib/dict_in_stream/zstd_dict_in_stream_format.md b/contrib/dict_in_stream/zstd_dict_in_stream_format.md index b1f5c1b1685..89e076d4188 100644 --- a/contrib/dict_in_stream/zstd_dict_in_stream_format.md +++ b/contrib/dict_in_stream/zstd_dict_in_stream_format.md @@ -39,7 +39,7 @@ __`Magic_Number`__ Little-endian value: 0x184D2A5D. Since it is legal for other Zstandard skippable frames to use the same magic number, it is not recommended for a decoder to recognize frames -solely on this. +using this field alone. __`Frame_Size`__ diff --git a/contrib/dict_in_stream/zstddis_compress.c b/contrib/dict_in_stream/zstddis_compress.c new file mode 100644 index 00000000000..824d4154c26 --- /dev/null +++ b/contrib/dict_in_stream/zstddis_compress.c @@ -0,0 +1,47 @@ +/* + * Copyright 2020 Sean Bartell. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */
+
+#include <string.h>   /* memcpy */
+
+#define ZSTD_STATIC_LINKING_ONLY
+#include "zstd.h"
+#include "zstd_errors.h"
+#include "mem.h"
+#include "zstd_dict_in_stream.h"
+
+#undef ERROR
+#define ERROR(name) ((size_t)-ZSTD_error_##name)
+
+/* ZSTD_dict_in_stream_maxFrameSize() :
+ * Worst-case size of a dictionary frame: the frame header plus the
+ * dictionary stored uncompressed. `dict` is part of the signature but is
+ * not read here.
+ * Returns an error code (testable with ZSTD_isError()) if the sum would
+ * overflow size_t. */
+size_t ZSTD_dict_in_stream_maxFrameSize(const void* dict, size_t dictSize)
+{
+    size_t const result = dictSize + ZSTD_DICT_IN_STREAM_HEADER_SIZE;
+    (void)dict;
+    if (result < dictSize)   /* unsigned wrap-around */
+        return ERROR(parameter_outOfBound);
+    return result;
+}
+
+/* ZSTD_dict_in_stream_createFrame() :
+ * Writes a dictionary frame into `dst`: the magic number, the data size as
+ * a 32-bit little-endian value, then the dictionary data.
+ * - compressionLevel != 0 : the data is the output of ZSTD_compress().
+ * - compressionLevel == 0, or compression failed (for example because the
+ *   compressed form does not fit in `dst`) : the dictionary is copied
+ *   verbatim, provided `dst` has room for it.
+ * Returns the total frame size (header + data), or an error code
+ * (testable with ZSTD_isError()). */
+size_t ZSTD_dict_in_stream_createFrame(void* dst, size_t dstCapacity,
+                                       const void* dict, size_t dictSize,
+                                       int compressionLevel)
+{
+    if (dstCapacity < ZSTD_DICT_IN_STREAM_HEADER_SIZE)
+        return ERROR(dstSize_tooSmall);
+    BYTE *dataDst = (BYTE*)dst + ZSTD_DICT_IN_STREAM_HEADER_SIZE;
+    size_t const dataCapacity = dstCapacity - ZSTD_DICT_IN_STREAM_HEADER_SIZE;
+    size_t dataSize = ERROR(dstSize_tooSmall);
+    if (compressionLevel != 0)
+        dataSize = ZSTD_compress(dataDst, dataCapacity, dict, dictSize, compressionLevel);
+    if (ZSTD_isError(dataSize) && dataCapacity >= dictSize) {
+        /* Fall back to storing the dictionary uncompressed. The size of the
+         * stored data (not the capacity) must be updated, otherwise the
+         * error code in dataSize is returned below. */
+        if (dictSize != 0)
+            memcpy(dataDst, dict, dictSize);
+        dataSize = dictSize;
+    }
+    if (ZSTD_isError(dataSize))
+        return dataSize;
+    /* Frame_Size is a 32-bit field; refuse to truncate silently. */
+    if (dataSize > 0xFFFFFFFFU)
+        return ERROR(parameter_outOfBound);
+    MEM_writeLE32(dst, ZSTD_DICT_IN_STREAM_MAGIC);
+    MEM_writeLE32((BYTE*)dst + 4, (U32)dataSize);
+    return dataSize + ZSTD_DICT_IN_STREAM_HEADER_SIZE;
+}
diff --git a/contrib/seekable_format/zstd_seekable.h b/contrib/seekable_format/zstd_seekable.h index 7ffd1ba0a72..462a0deb8af 100644 --- a/contrib/seekable_format/zstd_seekable.h +++ b/contrib/seekable_format/zstd_seekable.h @@ -83,6 +83,7 @@ ZSTDLIB_API size_t ZSTD_seekable_initCStream(ZSTD_seekable_CStream* zcs, int com ZSTDLIB_API size_t ZSTD_seekable_compressStream(ZSTD_seekable_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input); ZSTDLIB_API size_t ZSTD_seekable_endFrame(ZSTD_seekable_CStream* zcs, ZSTD_outBuffer* output); ZSTDLIB_API size_t ZSTD_seekable_endStream(ZSTD_seekable_CStream* zcs, ZSTD_outBuffer* output); +ZSTDLIB_API size_t ZSTD_seekable_refCDict(ZSTD_seekable_CStream* zcs, const
ZSTD_CDict* cdict); /*= Raw seek table API * These functions allow for the seek table to be constructed directly. @@ -106,6 +107,7 @@ ZSTDLIB_API ZSTD_frameLog* ZSTD_seekable_createFrameLog(int checksumFlag); ZSTDLIB_API size_t ZSTD_seekable_freeFrameLog(ZSTD_frameLog* fl); ZSTDLIB_API size_t ZSTD_seekable_logFrame(ZSTD_frameLog* fl, unsigned compressedSize, unsigned decompressedSize, unsigned checksum); ZSTDLIB_API size_t ZSTD_seekable_writeSeekTable(ZSTD_frameLog* fl, ZSTD_outBuffer* output); +ZSTDLIB_API ZSTD_frameLog* ZSTD_seekable_getFrameLog(ZSTD_seekable_CStream* zcs); /*-**************************************************************************** * Seekable decompression - HowTo diff --git a/contrib/seekable_format/zstdseek_compress.c b/contrib/seekable_format/zstdseek_compress.c index 5a75714fac5..40ccf2ab658 100644 --- a/contrib/seekable_format/zstdseek_compress.c +++ b/contrib/seekable_format/zstdseek_compress.c @@ -168,6 +168,16 @@ size_t ZSTD_seekable_initCStream(ZSTD_seekable_CStream* zcs, return ZSTD_initCStream(zcs->cstream, compressionLevel); } +size_t ZSTD_seekable_refCDict(ZSTD_seekable_CStream* zcs, const ZSTD_CDict* cdict) +{ + return ZSTD_CCtx_refCDict(zcs->cstream, cdict); +} + +ZSTD_frameLog* ZSTD_seekable_getFrameLog(ZSTD_seekable_CStream* zcs) +{ + return &zcs->framelog; +} + size_t ZSTD_seekable_logFrame(ZSTD_frameLog* fl, unsigned compressedSize, unsigned decompressedSize, From c6121a55029a3a2929ab4ed249915bf3e68c0cb8 Mon Sep 17 00:00:00 2001 From: Sean Bartell Date: Sun, 11 Oct 2020 16:29:24 -0500 Subject: [PATCH 4/6] Add dictionary-in-stream decompression code --- contrib/dict_in_stream/examples/Makefile | 2 +- .../examples/seekable_decompression.c | 153 ++++++++++++++++++ contrib/dict_in_stream/zstd_dict_in_stream.h | 22 +-- contrib/dict_in_stream/zstddis_decompress.c | 99 ++++++++++++ contrib/seekable_format/zstd_seekable.h | 1 + contrib/seekable_format/zstdseek_decompress.c | 5 + 6 files changed, 271 insertions(+), 11 deletions(-) 
create mode 100644 contrib/dict_in_stream/examples/seekable_decompression.c create mode 100644 contrib/dict_in_stream/zstddis_decompress.c diff --git a/contrib/dict_in_stream/examples/Makefile b/contrib/dict_in_stream/examples/Makefile index f6e644b4227..0933973ba4a 100644 --- a/contrib/dict_in_stream/examples/Makefile +++ b/contrib/dict_in_stream/examples/Makefile @@ -18,7 +18,7 @@ CPPFLAGS += -I.. -I../../seekable_format -I../../../lib -I../../../lib/common CFLAGS ?= -O3 CFLAGS += -g -DIS_OBJS = ../zstddis_compress.c +DIS_OBJS = ../zstddis_compress.c ../zstddis_decompress.c SEEKABLE_OBJS = ../../seekable_format/zstdseek_compress.c ../../seekable_format/zstdseek_decompress.c .PHONY: default all clean test diff --git a/contrib/dict_in_stream/examples/seekable_decompression.c b/contrib/dict_in_stream/examples/seekable_decompression.c new file mode 100644 index 00000000000..4842390d097 --- /dev/null +++ b/contrib/dict_in_stream/examples/seekable_decompression.c @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2017-present, Facebook, Inc. + * Copyright (c) 2020 Sean Bartell + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + */ + + +#include // malloc, exit +#include // fprintf, perror, feof +#include // strerror +#include // errno +#define ZSTD_STATIC_LINKING_ONLY +#include // presumes zstd library is installed +#include + +#include "zstd_dict_in_stream.h" +#include "zstd_seekable.h" + +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) + +static void* malloc_orDie(size_t size) +{ + void* const buff = malloc(size); + if (buff) return buff; + /* error */ + perror("malloc"); + exit(1); +} + +static void* realloc_orDie(void* ptr, size_t size) +{ + ptr = realloc(ptr, size); + if (ptr) return ptr; + /* error */ + perror("realloc"); + exit(1); +} + +static FILE* fopen_orDie(const char *filename, const char *instruction) +{ + FILE* const inFile = fopen(filename, instruction); + if (inFile) return inFile; + /* error */ + perror(filename); + exit(3); +} + +static size_t fread_orDie(void* buffer, size_t sizeToRead, FILE* file) +{ + size_t const readSize = fread(buffer, 1, sizeToRead, file); + if (readSize == sizeToRead) return readSize; /* good */ + if (feof(file)) return readSize; /* good, reached end of file */ + /* error */ + perror("fread"); + exit(4); +} + +static size_t fwrite_orDie(const void* buffer, size_t sizeToWrite, FILE* file) +{ + size_t const writtenSize = fwrite(buffer, 1, sizeToWrite, file); + if (writtenSize == sizeToWrite) return sizeToWrite; /* good */ + /* error */ + perror("fwrite"); + exit(5); +} + +static size_t fclose_orDie(FILE* file) +{ + if (!fclose(file)) return 0; + /* error */ + perror("fclose"); + exit(6); +} + +static void fseek_orDie(FILE* file, long int offset, int origin) { + if (!fseek(file, offset, origin)) { + if (!fflush(file)) return; + } + /* error */ + perror("fseek"); + exit(7); +} + + +static void decompressFile_orDie(const char* fname, off_t startOffset, off_t endOffset) +{ + FILE* const fin = fopen_orDie(fname, "rb"); + FILE* const fout = stdout; + size_t const buffOutSize = ZSTD_DStreamOutSize(); /* Guarantee to successfully flush at least one complete compressed block in all circumstances. 
*/ + void* const buffOut = malloc_orDie(buffOutSize); + + ZSTD_seekable* const seekable = ZSTD_seekable_create(); + if (seekable==NULL) { fprintf(stderr, "ZSTD_seekable_create() error \n"); exit(10); } + + char header[ZSTD_DICT_IN_STREAM_HEADER_SIZE]; + size_t const headerSize = fread_orDie(header, ZSTD_DICT_IN_STREAM_HEADER_SIZE, fin); + size_t const dictDataSize = ZSTD_dict_in_stream_getDataSize(header, headerSize); + if (ZSTD_isError(dictDataSize)) { fprintf(stderr, "ZSTD_dict_in_stream_getDataSize() error : %s \n", ZSTD_getErrorName(dictDataSize)); exit(13); } + void* const dictData = malloc_orDie(dictDataSize); + size_t const actualDictDataSize = fread_orDie(dictData, dictDataSize, fin); + ZSTD_DDict *ddict = ZSTD_dict_in_stream_createDDict(dictData, actualDictDataSize); + if (!ddict) { fprintf(stderr, "ZSTD_dict_in_stream_createDDict() error\n"); exit(14); } + free(dictData); + + size_t const initResult = ZSTD_seekable_initFile(seekable, fin); + if (ZSTD_isError(initResult)) { fprintf(stderr, "ZSTD_seekable_init() error : %s \n", ZSTD_getErrorName(initResult)); exit(11); } + size_t const refResult = ZSTD_seekable_refDDict(seekable, ddict); + if (ZSTD_isError(refResult)) { fprintf(stderr, "ZSTD_seekable_refDDict() error : %s \n", ZSTD_getErrorName(refResult)); exit(15); } + + while (startOffset < endOffset) { + size_t const result = ZSTD_seekable_decompress(seekable, buffOut, MIN(endOffset - startOffset, buffOutSize), startOffset); + + if (ZSTD_isError(result)) { + fprintf(stderr, "ZSTD_seekable_decompress() error : %s \n", + ZSTD_getErrorName(result)); + exit(12); + } + fwrite_orDie(buffOut, result, fout); + startOffset += result; + } + + ZSTD_freeDDict(ddict); + ZSTD_seekable_free(seekable); + fclose_orDie(fin); + fclose_orDie(fout); + free(buffOut); +} + + +int main(int argc, const char** argv) +{ + const char* const exeName = argv[0]; + + if (argc!=4) { + fprintf(stderr, "wrong arguments\n"); + fprintf(stderr, "usage:\n"); + fprintf(stderr, "%s FILE START 
END\n", exeName); + return 1; + } + + { + const char* const inFilename = argv[1]; + off_t const startOffset = atoll(argv[2]); + off_t const endOffset = atoll(argv[3]); + decompressFile_orDie(inFilename, startOffset, endOffset); + } + + return 0; +} diff --git a/contrib/dict_in_stream/zstd_dict_in_stream.h b/contrib/dict_in_stream/zstd_dict_in_stream.h index e7f11db7b59..9aead637200 100644 --- a/contrib/dict_in_stream/zstd_dict_in_stream.h +++ b/contrib/dict_in_stream/zstd_dict_in_stream.h @@ -39,31 +39,33 @@ extern "C" { /*! ZSTD_dict_in_stream_getDataSize() : * Given a dict_in_stream header, of size ZSTD_DICT_IN_STREAM_HEADER_SIZE, * determine how many bytes of dictionary data follow the header. - * Returns 0 if this is not a dict_in_stream header. */ + * Returns an error code if this is an invalid header + * (which can be tested using ZSTD_isError()). */ ZSTDLIB_API size_t ZSTD_dict_in_stream_getDataSize(const void* src, size_t srcSize); /*! ZSTD_dict_in_stream_getDictSize() : * Given the (possibly compressed) dictionary data that follows a * dict_in_stream header, determine the decompressed dictionary size. - * Returns 0 if this is not a valid dictionary. */ + * Returns an error code if this is invalid dictionary data + * (which can be tested using ZSTD_isError()). */ ZSTDLIB_API size_t ZSTD_dict_in_stream_getDictSize(const void* src, size_t srcSize); /*! ZSTD_dict_in_stream_getDict() : * Given the (possibly compressed) dictionary data that follows a - * dict_in_stream header, decompress the dictionary. Returns 0 on error. */ + * dict_in_stream header, decompress the dictionary. + * Returns an error code on error (which can be tested using ZSTD_isError()). */ ZSTDLIB_API size_t ZSTD_dict_in_stream_getDict(void* dst, size_t dstCapacity, const void* src, size_t srcSize); /*! ZSTD_dict_in_stream_createCDict() : - * Given the (possibly compressed) dictionary data that follows a - * dict_in_stream header, load the dictionary as a CDict. 
- * Returns 0 on error or if this is not a valid dictionary. */ -ZSTDLIB_API ZSTD_CDict* ZSTD_dict_in_stream_createCDict(const void* src, size_t srcSize); + * Convenience function to load the dictionary as a CDict. + * Returns NULL on error or if this is not a valid dictionary. */ +ZSTDLIB_API ZSTD_CDict* ZSTD_dict_in_stream_createCDict(const void* src, size_t srcSize, + int compressionLevel); /*! ZSTD_dict_in_stream_createDDict() : - * Given the (possibly compressed) dictionary data that follows a - * dict_in_stream header, load the dictionary as a DDict. - * Returns 0 on error or if this is not a valid dictionary. */ + * Convenience function to load the dictionary as a DDict. + * Returns NULL on error or if this is not a valid dictionary. */ ZSTDLIB_API ZSTD_DDict* ZSTD_dict_in_stream_createDDict(const void* src, size_t srcSize); /*-**************************************************************************** diff --git a/contrib/dict_in_stream/zstddis_decompress.c b/contrib/dict_in_stream/zstddis_decompress.c new file mode 100644 index 00000000000..ea5fe071760 --- /dev/null +++ b/contrib/dict_in_stream/zstddis_decompress.c @@ -0,0 +1,99 @@ +/* + * Copyright 2020 Sean Bartell. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */
+
+#include <stdlib.h>   // malloc, free
+#include <string.h>   // memcpy
+
+#define ZSTD_STATIC_LINKING_ONLY
+#include "zstd.h"
+#include "zstd_errors.h"
+#include "mem.h"
+#include "zstd_dict_in_stream.h"
+
+#undef ERROR
+#define ERROR(name) ((size_t)-ZSTD_error_##name)
+
+/* ZSTD_dict_in_stream_getDataSize() :
+ * Checks the header of a dictionary frame and returns the number of data
+ * bytes that follow it. Both header fields are written little-endian
+ * (see MEM_writeLE32() in the compressor), so they are read with
+ * MEM_readLE32() rather than in host byte order.
+ * Returns an error code if `src` is shorter than the header or the magic
+ * number does not match. */
+size_t ZSTD_dict_in_stream_getDataSize(const void* src, size_t srcSize)
+{
+    if (srcSize < ZSTD_DICT_IN_STREAM_HEADER_SIZE)
+        return ERROR(srcSize_wrong);
+    if (MEM_readLE32(src) != ZSTD_DICT_IN_STREAM_MAGIC)
+        return ERROR(prefix_unknown);
+    return MEM_readLE32((const BYTE*)src + 4);
+}
+
+/* ZSTD_dict_in_stream_getDictSize() :
+ * Returns the size of the dictionary once decompressed.
+ * - Data starting with ZSTD_MAGIC_DICTIONARY is an uncompressed
+ *   dictionary: its size is srcSize.
+ * - Anything else is handed to ZSTD_getFrameContentSize(); a frame that
+ *   does not declare its content size is rejected.
+ * Returns an error code on failure (testable with ZSTD_isError()). */
+size_t ZSTD_dict_in_stream_getDictSize(const void* src, size_t srcSize)
+{
+    if (srcSize < 4)
+        return ERROR(srcSize_wrong);
+    if (MEM_readLE32(src) == ZSTD_MAGIC_DICTIONARY)
+        return srcSize;
+    unsigned long long const result = ZSTD_getFrameContentSize(src, srcSize);
+    if (result == ZSTD_CONTENTSIZE_UNKNOWN)
+        return ERROR(dictionary_corrupted);
+    if (result == ZSTD_CONTENTSIZE_ERROR)
+        return ERROR(GENERIC);
+    if ((size_t)result != result)   /* does not fit in size_t */
+        return ERROR(frameParameter_windowTooLarge);
+    return (size_t)result;
+}
+
+/* ZSTD_dict_in_stream_getDict() :
+ * Copies (uncompressed data) or decompresses (single compressed frame) the
+ * dictionary into `dst`.
+ * The compressed form must be exactly one Zstandard frame spanning all of
+ * `src`; anything else is rejected.
+ * Returns the number of bytes written to `dst`, or an error code
+ * (testable with ZSTD_isError()). */
+size_t ZSTD_dict_in_stream_getDict(void* dst, size_t dstCapacity,
+                                   const void* src, size_t srcSize)
+{
+    if (srcSize < 4)
+        return ERROR(srcSize_wrong);
+    U32 const magic = MEM_readLE32(src);
+    if (magic == ZSTD_MAGIC_DICTIONARY) {
+        if (dstCapacity < srcSize)
+            return ERROR(dstSize_tooSmall);
+        memcpy(dst, src, srcSize);
+        return srcSize;
+    }
+    if (magic != ZSTD_MAGICNUMBER)
+        return ERROR(prefix_unknown);
+    if (ZSTD_findFrameCompressedSize(src, srcSize) != srcSize)
+        return ERROR(srcSize_wrong);
+    return ZSTD_decompress(dst, dstCapacity, src, srcSize);
+}
+
+ZSTD_CDict* ZSTD_dict_in_stream_createCDict(const void* src, size_t srcSize, + int compressionLevel) +{ + size_t size = ZSTD_dict_in_stream_getDictSize(src, srcSize); + if (ZSTD_isError(size)) + return NULL; + void* buffer = malloc(size); + if (!buffer) + return NULL; + size_t actualSize = ZSTD_dict_in_stream_getDict(buffer, size, src, srcSize); + if (actualSize != size) { + free(buffer); + return NULL; + } +
ZSTD_CDict* result = ZSTD_createCDict(buffer, actualSize, compressionLevel); + /* NOTE(review): releasing the scratch buffer here assumes ZSTD_createCDict() copied the dictionary content -- confirm against zstd.h */ free(buffer); + /* whatever ZSTD_createCDict() returned is handed back unchanged */ return result; +} + +/* ZSTD_dict_in_stream_createDDict() : sizes the dictionary with ZSTD_dict_in_stream_getDictSize(), extracts it into a malloc'd scratch buffer with ZSTD_dict_in_stream_getDict(), then builds a ZSTD_DDict from that buffer. Returns NULL if sizing fails, malloc() fails, or the extracted size differs from the expected size; otherwise returns the result of ZSTD_createDDict(). */ ZSTD_DDict* ZSTD_dict_in_stream_createDDict(const void* src, size_t srcSize) +{ + size_t size = ZSTD_dict_in_stream_getDictSize(src, srcSize); + if (ZSTD_isError(size)) + return NULL; + void* buffer = malloc(size); + if (!buffer) + return NULL; + size_t actualSize = ZSTD_dict_in_stream_getDict(buffer, size, src, srcSize); + /* size is known not to be an error code here, so this comparison also rejects any error code returned by getDict */ if (actualSize != size) { + free(buffer); + return NULL; + } + ZSTD_DDict* result = ZSTD_createDDict(buffer, actualSize); + /* NOTE(review): assumes ZSTD_createDDict() copied the dictionary content -- confirm against zstd.h */ free(buffer); + return result; +} diff --git a/contrib/seekable_format/zstd_seekable.h b/contrib/seekable_format/zstd_seekable.h index 462a0deb8af..f814565b500 100644 --- a/contrib/seekable_format/zstd_seekable.h +++ b/contrib/seekable_format/zstd_seekable.h @@ -161,6 +161,7 @@ ZSTDLIB_API size_t ZSTD_seekable_initBuff(ZSTD_seekable* zs, const void* src, si ZSTDLIB_API size_t ZSTD_seekable_initFile(ZSTD_seekable* zs, FILE* src); ZSTDLIB_API size_t ZSTD_seekable_decompress(ZSTD_seekable* zs, void* dst, size_t dstSize, unsigned long long offset); ZSTDLIB_API size_t ZSTD_seekable_decompressFrame(ZSTD_seekable* zs, void* dst, size_t dstSize, unsigned frameIndex); +ZSTDLIB_API size_t ZSTD_seekable_refDDict(ZSTD_seekable* zs, const ZSTD_DDict* ddict); #define ZSTD_SEEKABLE_FRAMEINDEX_TOOLARGE (0ULL-2) /*===== Seek Table access functions =====*/ diff --git a/contrib/seekable_format/zstdseek_decompress.c b/contrib/seekable_format/zstdseek_decompress.c index abfd1e90271..7e455f0c119 100644 --- a/contrib/seekable_format/zstdseek_decompress.c +++ b/contrib/seekable_format/zstdseek_decompress.c @@ -377,6 +377,11 @@ size_t ZSTD_seekable_initAdvanced(ZSTD_seekable* zs, ZSTD_seekable_customFile sr return 0; } +/* ZSTD_seekable_refDDict() : forwards `ddict` to ZSTD_DCtx_refDDict() on the seekable object's `dstream` member and returns that call's result. */ size_t ZSTD_seekable_refDDict(ZSTD_seekable* zs, const ZSTD_DDict* ddict) +{ + return ZSTD_DCtx_refDDict(zs->dstream, ddict); +} + size_t ZSTD_seekable_decompress(ZSTD_seekable* zs, void* dst, size_t len, unsigned long
long offset) { U32 targetFrame = ZSTD_seekable_offsetToFrameIndex(zs, offset); From 9134b4819044d6572277ca0d369dcfd7e0784214 Mon Sep 17 00:00:00 2001 From: Sean Bartell Date: Sun, 11 Oct 2020 19:19:48 -0500 Subject: [PATCH 5/6] Fix dictionary-in-stream compression code --- contrib/dict_in_stream/zstddis_compress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/dict_in_stream/zstddis_compress.c b/contrib/dict_in_stream/zstddis_compress.c index 824d4154c26..b7553e97aa9 100644 --- a/contrib/dict_in_stream/zstddis_compress.c +++ b/contrib/dict_in_stream/zstddis_compress.c @@ -37,7 +37,7 @@ size_t ZSTD_dict_in_stream_createFrame(void* dst, size_t dstCapacity, dataSize = ZSTD_compress(dataDst, dataCapacity, dict, dictSize, compressionLevel); if (ZSTD_isError(dataSize) && dataCapacity >= dictSize) { memcpy(dataDst, dict, dictSize); - dataCapacity = dictSize; + dataSize = dictSize; } if (ZSTD_isError(dataSize)) return dataSize; From 0d62e19becde38ce4e198269320ddb9779f20cd8 Mon Sep 17 00:00:00 2001 From: Sean Bartell Date: Mon, 12 Oct 2020 17:14:39 -0500 Subject: [PATCH 6/6] dict_in_stream: dict frames must not use dicts --- contrib/dict_in_stream/examples/seekable_compression.c | 2 +- contrib/dict_in_stream/zstd_dict_in_stream_format.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/dict_in_stream/examples/seekable_compression.c b/contrib/dict_in_stream/examples/seekable_compression.c index 2a3a127640c..259859f9e34 100644 --- a/contrib/dict_in_stream/examples/seekable_compression.c +++ b/contrib/dict_in_stream/examples/seekable_compression.c @@ -165,7 +165,7 @@ int main(int argc, const char** argv) { unsigned const frameSize = (unsigned)atoi(argv[3]); char* const outFileName = createOutFilename_orDie(inFileName); - compressFile_orDie(dictFileName, inFileName, outFileName, 5, frameSize); + compressFile_orDie(dictFileName, inFileName, outFileName, 10, frameSize); free(outFileName); } diff --git 
a/contrib/dict_in_stream/zstd_dict_in_stream_format.md b/contrib/dict_in_stream/zstd_dict_in_stream_format.md index 89e076d4188..fb488282024 100644 --- a/contrib/dict_in_stream/zstd_dict_in_stream_format.md +++ b/contrib/dict_in_stream/zstd_dict_in_stream_format.md @@ -56,8 +56,8 @@ particular, it must start with little-endian 0xEC30A437. Otherwise, this data must be a single Zstandard compressed frame that decompresses into data in the Dictionary Format. In particular, the compressed data must start with little-endian 0xFD2FB528. The frame __must__ include a -`Frame_Content_Size` field. This field __must not__ contain any skippable -frames. +`Frame_Content_Size` field, and __must not__ require a dictionary. The +`Compressed_or_Uncompressed_Dictionary` field __must not__ contain any skippable frames. [Dictionary Format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format [Zstandard skippable frame]: https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#skippable-frames