From 886720442f712b6e94c13075edaec1f224c1ae1a Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Sun, 29 Dec 2024 02:13:57 -0800 Subject: [PATCH 01/18] initial implementation (incomplete) needs to take care of long lengths > 65535 --- lib/compress/zstd_compress.c | 264 ++++++++++++++++++++++++++++++++--- tests/Makefile | 2 +- 2 files changed, 249 insertions(+), 17 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 04b6bb9f111..d91fae619ad 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -15,7 +15,9 @@ #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ #include "../common/mem.h" #include "../common/error_private.h" +#include "compiler.h" #include "hist.h" /* HIST_countFast_wksp */ +#include "zstd_internal.h" #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ #include "../common/fse.h" #include "../common/huf.h" @@ -7103,15 +7105,226 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, return cSize; } + +#if defined(__AVX2__) + +#include /* AVX2 intrinsics */ + +/* + * Convert 2 sequences per iteration, using AVX2 intrinsics: + * - offset -> offBase = offset + 2 + * - litLength -> (U16) litLength + * - matchLength -> (U16)(matchLength - 3) + * - rep is ignored + * Store only 8 bytes per SeqDef (offBase[4], litLength[2], mlBase[2]). + * + * At the end, instead of extracting two __m128i, + * we use _mm256_permute4x64_epi64(..., 0xE8) to move lane2 into lane1, + * then store the lower 16 bytes in one go. + */ +void convertSequences_noRepcodes( + SeqDef* dstSeqs, + const ZSTD_Sequence* inSeqs, + size_t nbSequences) +{ + /* + * addition: + * For each 128-bit half: (offset+2, litLength+0, matchLength-3, rep+0) + */ + const __m256i addition = _mm256_setr_epi32( + ZSTD_REP_NUM, 0, -MINMATCH, 0, /* for sequence i */ + ZSTD_REP_NUM, 0, -MINMATCH, 0 /* for sequence i+1 */ + ); + + /* + * shuffle mask for byte-level rearrangement in each 128-bit half: + * + * Input layout (after addition) per 128-bit half: + * [ offset+2 (4 bytes) | litLength (4 bytes) | matchLength (4 bytes) | rep (4 bytes) ] + * We only need: + * offBase (4 bytes) = offset+2 + * litLength (2 bytes) = low 2 bytes of litLength + * mlBase (2 bytes) = low 2 bytes of (matchLength) + * => Bytes [0..3, 4..5, 8..9], zero the rest. + */ + const __m256i mask = _mm256_setr_epi8( + /* For the lower 128 bits => sequence i */ + 0, 1, 2, 3, /* offset+2 */ + 4, 5, /* litLength (16 bits) */ + 8, 9, /* matchLength (16 bits) */ + (char)0x80, (char)0x80, (char)0x80, (char)0x80, + (char)0x80, (char)0x80, (char)0x80, (char)0x80, + + /* For the upper 128 bits => sequence i+1 */ + 16,17,18,19, /* offset+2 */ + 20,21, /* litLength */ + 24,25, /* matchLength */ + (char)0x80, (char)0x80, (char)0x80, (char)0x80, + (char)0x80, (char)0x80, (char)0x80, (char)0x80 + ); + + /* + * Next, we'll use _mm256_permute4x64_epi64(vshf, 0xE8). + * Explanation of 0xE8 = 11101000b => [lane0, lane2, lane2, lane3]. + * So the lower 128 bits become [lane0, lane2] => combining seq0 and seq1. + */ +#define PERM_LANE_0X_E8 0xE8 /* [0,2,2,3] in lane indices */ + + size_t i = 0; + /* Process 2 sequences per loop iteration */ + for (; i + 1 < nbSequences; i += 2) { + /* 1) Load 2 ZSTD_Sequence (32 bytes) */ + __m256i vin = _mm256_loadu_si256((__m256i const*)&inSeqs[i]); + + /* 2) Add {2, 0, -3, 0} in each 128-bit half */ + __m256i vadd = _mm256_add_epi32(vin, addition); + + /* 3) Shuffle bytes so each half gives us the 8 bytes we need */ + __m256i vshf = _mm256_shuffle_epi8(vadd, mask); + /* + * Now: + * Lane0 = seq0's 8 bytes + * Lane1 = 0 + * Lane2 = seq1's 8 bytes + * Lane3 = 0 + */ + + /* 4) Permute 64-bit lanes => move Lane2 down into Lane1. */ + __m256i vperm = _mm256_permute4x64_epi64(vshf, PERM_LANE_0X_E8); + /* + * Now the lower 16 bytes (Lane0+Lane1) = [seq0, seq1]. + * The upper 16 bytes are [Lane2, Lane3] = [seq1, 0], but we won't use them. + */ + + /* 5) Store only the lower 16 bytes => 2 SeqDef (8 bytes each) */ + _mm_storeu_si128((__m128i *)&dstSeqs[i], _mm256_castsi256_si128(vperm)); + /* + * This writes out 16 bytes total: + * - offset 0..7 => seq0 (offBase, litLength, mlBase) + * - offset 8..15 => seq1 (offBase, litLength, mlBase) + */ + } + + /* Handle leftover if nbSequences is odd */ + if (i < nbSequences) { + /* Fallback: process last sequence */ + assert(i == nbSequences - 1); + dstSeqs[i].offBase = OFFSET_TO_OFFBASE(inSeqs[i].offset); + /* note: doesn't work if one length is > 65535 */ + dstSeqs[i].litLength = (U16)inSeqs[i].litLength; + dstSeqs[i].mlBase = (U16)(inSeqs[i].matchLength - MINMATCH); + } +} + +#elif defined(__SSSE3__) + +#include /* SSSE3 intrinsics: _mm_shuffle_epi8 */ +#include /* SSE2 intrinsics: _mm_add_epi32, etc. */ + +/* + * Convert sequences with SSE. + * - offset -> offBase = offset + 2 + * - litLength (32-bit) -> (U16) litLength + * - matchLength (32-bit) -> (U16)(matchLength - 3) + * - rep is discarded. + * + * We shuffle so that only the first 8 bytes in the final 128-bit + * register are used. We still store 16 bytes (low 8 are good, high 8 are "don't care"). + */ +static void convertSequences_noRepcodes(SeqDef* dstSeqs, + const ZSTD_Sequence* inSeqs, + size_t nbSequences) +{ + /* + addition = { offset+2, litLength+0, matchLength-3, rep+0 } + setr means the first argument is placed in the lowest 32 bits, + second in next-lower 32 bits, etc. + */ + const __m128i addition = _mm_setr_epi32(2, 0, -3, 0); + + /* + Shuffle mask: we reorder bytes after the addition. + + Input layout in 128-bit register (after addition): + Bytes: [ 0..3 | 4..7 | 8..11 | 12..15 ] + Fields: offset+2 litLength matchLength rep + + We want in output: + Bytes: [ 0..3 | 4..5 | 6..7 | 8..15 ignore ] + Fields: offset+2 (U16)litLength (U16)(matchLength) + + _mm_shuffle_epi8 picks bytes from the source. A byte of 0x80 means “zero out”. + So we want: + out[0] = in[0], out[1] = in[1], out[2] = in[2], out[3] = in[3], // offset+2 (4 bytes) + out[4] = in[4], out[5] = in[5], // (U16) litLength + out[6] = in[8], out[7] = in[9], // (U16) matchLength + out[8..15] = 0x80 => won't matter if we only care about first 8 bytes + */ + const __m128i mask = _mm_setr_epi8( + 0, 1, 2, 3, /* offset (4 bytes) */ + 4, 5, /* litLength (2 bytes) */ + 8, 9, /* matchLength (2 bytes) */ + (char)0x80, (char)0x80, (char)0x80, (char)0x80, + (char)0x80, (char)0x80, (char)0x80, (char)0x80 + ); + size_t i; + + for (i = 0; i + 1 < nbSequences; i += 2) { + /*-------------------------*/ + /* Process inSeqs[i] */ + /*-------------------------*/ + __m128i vin0 = _mm_loadu_si128((const __m128i *)(const void*)&inSeqs[i]); + __m128i vadd0 = _mm_add_epi32(vin0, addition); + __m128i vshf0 = _mm_shuffle_epi8(vadd0, mask); + _mm_storel_epi64((__m128i *)(void*)&dstSeqs[i], vshf0); + + /*-------------------------*/ + /* Process inSeqs[i + 1] */ + /*-------------------------*/ + __m128i vin1 = _mm_loadu_si128((__m128i const *)(const void*)&inSeqs[i + 1]); + __m128i vadd1 = _mm_add_epi32(vin1, addition); + __m128i vshf1 = _mm_shuffle_epi8(vadd1, mask); + _mm_storel_epi64((__m128i *)(void*)&dstSeqs[i + 1], vshf1); + } + + /* Handle leftover if nbSequences is odd */ + if (i < nbSequences) { + /* Fallback: process last sequence */ + assert(i == nbSequences - 1); + dstSeqs[i].offBase = OFFSET_TO_OFFBASE(inSeqs[i].offset); + /* note: doesn't work if one length is > 65535 */ + dstSeqs[i].litLength = (U16)inSeqs[i].litLength; + dstSeqs[i].mlBase = (U16)(inSeqs[i].matchLength - MINMATCH); + } + +} + +#else /* no SSE */ + +FORCE_INLINE_TEMPLATE void convertSequences_noRepcodes(SeqDef* dstSeqs, + const ZSTD_Sequence* const inSeqs, size_t nbSequences) +{ + size_t n; + for (n=0; n 65535 */ + dstSeqs[n].litLength = (U16)inSeqs[n].litLength; + dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH); + } +} + +#endif + /* + * Precondition: Sequences must end on an explicit Block Delimiter * @return: 0 on success, or an error code. * Note: Sequence validation functionality has been disabled (removed). * This is helpful to generate a lean main pipeline, improving performance. * It may be re-inserted later. */ -size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, - const ZSTD_Sequence* const inSeqs, size_t nbSequences, - int repcodeResolution) +static size_t ZSTD_convertBlockSequences_internal(ZSTD_CCtx* cctx, + const ZSTD_Sequence* const inSeqs, size_t nbSequences, + int repcodeResolution) { Repcodes_t updatedRepcodes; size_t seqNb = 0; @@ -7129,21 +7342,26 @@ size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, assert(inSeqs[nbSequences-1].offset == 0); /* Convert Sequences from public format to internal format */ - for (seqNb = 0; seqNb < nbSequences - 1 ; seqNb++) { - U32 const litLength = inSeqs[seqNb].litLength; - U32 const matchLength = inSeqs[seqNb].matchLength; - U32 offBase; + if (!repcodeResolution) { + convertSequences_noRepcodes(cctx->seqStore.sequencesStart, inSeqs, nbSequences); + cctx->seqStore.sequences += nbSequences; + } else { + for (seqNb = 0; seqNb < nbSequences - 1 ; seqNb++) { + U32 const litLength = inSeqs[seqNb].litLength; + U32 const matchLength = inSeqs[seqNb].matchLength; + U32 offBase; - if (!repcodeResolution) { - offBase = OFFSET_TO_OFFBASE(inSeqs[seqNb].offset); - } else { - U32 const ll0 = (litLength == 0); - offBase = ZSTD_finalizeOffBase(inSeqs[seqNb].offset, updatedRepcodes.rep, ll0); - ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); - } + if (!repcodeResolution) { + offBase = OFFSET_TO_OFFBASE(inSeqs[seqNb].offset); + } else { + U32 const ll0 = (litLength == 0); + offBase = ZSTD_finalizeOffBase(inSeqs[seqNb].offset, updatedRepcodes.rep, ll0); + ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } - DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); - ZSTD_storeSeqOnly(&cctx->seqStore, litLength, offBase, matchLength); + DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + ZSTD_storeSeqOnly(&cctx->seqStore, litLength, offBase, matchLength); + } } /* If we skipped repcode search while parsing, we need to update repcodes now */ @@ -7172,6 +7390,20 @@ size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, return 0; } +static size_t ZSTD_convertBlockSequences_noRepcode(ZSTD_CCtx* cctx, + const ZSTD_Sequence* const inSeqs, size_t nbSequences) +{ + return ZSTD_convertBlockSequences_internal(cctx, inSeqs, nbSequences, 0); +} + +size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, + const ZSTD_Sequence* const inSeqs, size_t nbSequences, + int repcodeResolution) +{ + (void)repcodeResolution; + return ZSTD_convertBlockSequences_internal(cctx, inSeqs, nbSequences, 0); +} + typedef struct { size_t nbSequences; size_t blockSize; diff --git a/tests/Makefile b/tests/Makefile index 982181de8fb..abb0b2b1d3f 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -148,7 +148,7 @@ fullbench32: CPPFLAGS += -m32 $(FULLBENCHS) : CPPFLAGS += $(MULTITHREAD_CPP) -Wno-deprecated-declarations $(FULLBENCHS) : LDFLAGS += $(MULTITHREAD_LD) $(FULLBENCHS) : DEBUGFLAGS = -DNDEBUG # turn off assert() for speed measurements -$(FULLBENCHS) : DEBUGLEVEL ?= 0 # turn off assert() for speed measurements +$(FULLBENCHS) : DEBUGLEVEL = 0 # turn off assert() for speed measurements $(FULLBENCHS) : $(ZSTD_FILES) $(FULLBENCHS) : $(PRGDIR)/datagen.c $(PRGDIR)/lorem.c $(PRGDIR)/util.c $(PRGDIR)/timefn.c $(PRGDIR)/benchfn.c fullbench.c $(LINK.c) $^ -o $@$(EXT) From d1f0e5fb9738073150e7e5c25b03444b5a6a5389 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 7 Jan 2025 15:51:38 -0800 Subject: [PATCH 02/18] fullbench can run a verification function compressSequencesAndLiterals: fixed long lengths in scalar mode --- lib/compress/zstd_compress.c | 29 +++++++++++-- tests/fullbench.c | 80 +++++++++++++++++++++++++----------- 2 files changed, 80 insertions(+), 29 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index d91fae619ad..c8dc86ccf94 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -7205,7 +7205,7 @@ void convertSequences_noRepcodes( */ } - /* Handle leftover if nbSequences is odd */ + /* Handle leftover if @nbSequences is odd */ if (i < nbSequences) { /* Fallback: process last sequence */ assert(i == nbSequences - 1); @@ -7301,16 +7301,23 @@ static void convertSequences_noRepcodes(SeqDef* dstSeqs, #else /* no SSE */ -FORCE_INLINE_TEMPLATE void convertSequences_noRepcodes(SeqDef* dstSeqs, +static size_t +convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* const inSeqs, size_t nbSequences) { + size_t longLen = 0; size_t n; for (n=0; n 65535 */ dstSeqs[n].litLength = (U16)inSeqs[n].litLength; dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH); + if (UNLIKELY(inSeqs[n].litLength > 65535)) { + assert(longLen == 0); + longLen = n + nbSequences + 1; + } } + return longLen; } #endif @@ -7343,8 +7350,22 @@ static size_t ZSTD_convertBlockSequences_internal(ZSTD_CCtx* cctx, /* Convert Sequences from public format to internal format */ if (!repcodeResolution) { - convertSequences_noRepcodes(cctx->seqStore.sequencesStart, inSeqs, nbSequences); - cctx->seqStore.sequences += nbSequences; + size_t const longl = convertSequences_noRepcodes(cctx->seqStore.sequencesStart, inSeqs, nbSequences-1); + cctx->seqStore.sequences = cctx->seqStore.sequencesStart + nbSequences-1; + if (longl) { + DEBUGLOG(5, "long length"); + assert(cctx->seqStore.longLengthType == ZSTD_llt_none); + if (longl <= nbSequences-1) { + DEBUGLOG(5, "long match length detected at pos %zu", longl-1); + cctx->seqStore.longLengthType = ZSTD_llt_matchLength; + cctx->seqStore.longLengthPos = (U32)(longl-1); + } else { + DEBUGLOG(5, "long literals length detected at pos %zu", longl-nbSequences); + assert(longl <= 2* (nbSequences-1)); + cctx->seqStore.longLengthType = ZSTD_llt_literalLength; + cctx->seqStore.longLengthPos = (U32)(longl-nbSequences); + } + } } else { for (seqNb = 0; seqNb < nbSequences - 1 ; seqNb++) { U32 const litLength = inSeqs[seqNb].litLength; diff --git a/tests/fullbench.c b/tests/fullbench.c index 5683eca25a0..739e55ef2ee 100644 --- a/tests/fullbench.c +++ b/tests/fullbench.c @@ -687,6 +687,23 @@ local_convertSequences(const void* input, size_t inputSize, return nbSeqs; } +static size_t +check_compressedSequences(const void* compressed, size_t cSize, const void* orig, size_t origSize) +{ + size_t decSize; + int diff; + void* decompressed = malloc(origSize); + if (decompressed == NULL) return 2; + + decSize = ZSTD_decompress(decompressed, origSize, compressed, cSize); + if (decSize != origSize) { free(decompressed); DISPLAY("ZSTD_decompress failed (%zu) ", decSize); return 1; } + + diff = memcmp(decompressed, orig, origSize); + if (diff) { free(decompressed); return 1; } + + free(decompressed); + return 0; +} static PrepResult prepCopy(const void* src, size_t srcSize, int cLevel) { @@ -714,40 +731,43 @@ static PrepResult prepShorterDstCapacity(const void* src, size_t srcSize, int cL * List of Scenarios *********************************************************/ -/* if PrepFunction_f returns 0, benchmarking is cancelled */ +/* if PrepFunction_f returns PrepResult.prepBuffSize == 0, benchmarking is cancelled */ typedef PrepResult (*PrepFunction_f)(const void* src, size_t srcSize, int cLevel); typedef size_t (*BenchedFunction_f)(const void* src, size_t srcSize, void* dst, size_t dstSize, void* opaque); +/* must return 0, otherwise verification is considered failed */ +typedef size_t (*VerifFunction_f)(const void* processed, size_t procSize, const void* input, size_t inputSize); typedef struct { const char* name; PrepFunction_f preparation_f; - BenchedFunction_f benchedFunction; + BenchedFunction_f benched_f; + VerifFunction_f verif_f; /* optional */ } BenchScenario; static BenchScenario kScenarios[] = { - { "compress", NULL, local_ZSTD_compress }, - { "decompress", prepDecompress, local_ZSTD_decompress }, - { "compress_freshCCtx", NULL, local_ZSTD_compress_freshCCtx }, - { "decompressDCtx", prepDecompress, local_ZSTD_decompressDCtx }, - { "compressContinue", NULL, local_ZSTD_compressContinue }, - { "compressContinue_extDict", NULL, local_ZSTD_compressContinue_extDict }, - { "decompressContinue", prepDecompress, local_ZSTD_decompressContinue }, - { "compressStream", NULL, local_ZSTD_compressStream }, - { "compressStream_freshCCtx", NULL, local_ZSTD_compressStream_freshCCtx }, - { "decompressStream", prepDecompress, local_ZSTD_decompressStream }, - { "compress2", NULL, local_ZSTD_compress2 }, - { "compressStream2, end", NULL, local_ZSTD_compressStream2_end }, - { "compressStream2, end & short", prepShorterDstCapacity, local_ZSTD_compressStream2_end }, - { "compressStream2, continue", NULL, local_ZSTD_compressStream2_continue }, - { "compressStream2, -T2, continue", NULL, local_ZSTD_compress_generic_T2_continue }, - { "compressStream2, -T2, end", NULL, local_ZSTD_compress_generic_T2_end }, - { "compressSequences", prepSequences, local_compressSequences }, - { "compressSequencesAndLiterals", prepSequencesAndLiterals, local_compressSequencesAndLiterals }, - { "convertSequences (1st block)", prepConvertSequences, local_convertSequences }, + { "compress", NULL, local_ZSTD_compress, NULL }, + { "decompress", prepDecompress, local_ZSTD_decompress, NULL }, + { "compress_freshCCtx", NULL, local_ZSTD_compress_freshCCtx, NULL }, + { "decompressDCtx", prepDecompress, local_ZSTD_decompressDCtx, NULL }, + { "compressContinue", NULL, local_ZSTD_compressContinue, NULL }, + { "compressContinue_extDict", NULL, local_ZSTD_compressContinue_extDict, NULL }, + { "decompressContinue", prepDecompress, local_ZSTD_decompressContinue, NULL }, + { "compressStream", NULL, local_ZSTD_compressStream, NULL }, + { "compressStream_freshCCtx", NULL, local_ZSTD_compressStream_freshCCtx, NULL }, + { "decompressStream", prepDecompress, local_ZSTD_decompressStream, NULL }, + { "compress2", NULL, local_ZSTD_compress2, NULL }, + { "compressStream2, end", NULL, local_ZSTD_compressStream2_end, NULL }, + { "compressStream2, end & short", prepShorterDstCapacity, local_ZSTD_compressStream2_end, NULL }, + { "compressStream2, continue", NULL, local_ZSTD_compressStream2_continue, NULL }, + { "compressStream2, -T2, continue", NULL, local_ZSTD_compress_generic_T2_continue, NULL }, + { "compressStream2, -T2, end", NULL, local_ZSTD_compress_generic_T2_end, NULL }, + { "compressSequences", prepSequences, local_compressSequences, check_compressedSequences }, + { "compressSequencesAndLiterals", prepSequencesAndLiterals, local_compressSequencesAndLiterals, check_compressedSequences }, + { "convertSequences (1st block)", prepConvertSequences, local_convertSequences, NULL }, #ifndef ZSTD_DLL_IMPORT - { "decodeLiteralsHeader (1st block)", prepLiterals, local_ZSTD_decodeLiteralsHeader }, - { "decodeLiteralsBlock (1st block)", prepLiterals, local_ZSTD_decodeLiteralsBlock }, - { "decodeSeqHeaders (1st block)", prepSequences1stBlock, local_ZSTD_decodeSeqHeaders }, + { "decodeLiteralsHeader (1st block)", prepLiterals, local_ZSTD_decodeLiteralsHeader, NULL }, + { "decodeLiteralsBlock (1st block)", prepLiterals, local_ZSTD_decodeLiteralsBlock, NULL }, + { "decodeSeqHeaders (1st block)", prepSequences1stBlock, local_ZSTD_decodeSeqHeaders, NULL }, #endif }; #define NB_SCENARIOS (sizeof(kScenarios) / sizeof(kScenarios[0])) @@ -767,13 +787,15 @@ static int benchMem(unsigned scenarioID, const char* benchName; BMK_benchFn_t benchFunction; PrepFunction_f prep_f; + VerifFunction_f verif_f; int errorcode = 0; if (scenarioID >= NB_SCENARIOS) return 0; /* scenario doesn't exist */ benchName = kScenarios[scenarioID].name; - benchFunction = kScenarios[scenarioID].benchedFunction; + benchFunction = kScenarios[scenarioID].benched_f; prep_f = kScenarios[scenarioID].preparation_f; + verif_f = kScenarios[scenarioID].verif_f; if (prep_f == NULL) prep_f = prepCopy; /* default */ /* Initialization */ @@ -857,6 +879,14 @@ static int benchMem(unsigned scenarioID, scenarioID, benchName, (double)origSrcSize * TIMELOOP_NANOSEC / bestResult.nanoSecPerRun / MB_UNIT, (unsigned)newResult.sumOfReturn ); + + if (verif_f) { + size_t const vRes = verif_f(dst, newResult.sumOfReturn, origSrc, origSrcSize); + if (vRes) { + DISPLAY(" validation failed ! (%zu)\n", vRes); + break; + } + } } if ( BMK_isCompleted_TimedFn(tfs) ) break; From 8d621645891a8ec8a114fe09e94f967f2049352b Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 7 Jan 2025 16:42:36 -0800 Subject: [PATCH 03/18] control long length within AVX2 implementation --- lib/compress/zstd_compress.c | 150 ++++++++++++++--------------------- 1 file changed, 60 insertions(+), 90 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index c8dc86ccf94..a5298031303 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -7121,8 +7121,12 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, * At the end, instead of extracting two __m128i, * we use _mm256_permute4x64_epi64(..., 0xE8) to move lane2 into lane1, * then store the lower 16 bytes in one go. + * + * @returns 0 on succes, with no long length detected + * @returns > 0 if there is one long length (> 65535), + * indicating the position, and type. */ -void convertSequences_noRepcodes( +size_t convertSequences_noRepcodes( SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs, size_t nbSequences) @@ -7136,6 +7140,9 @@ void convertSequences_noRepcodes( ZSTD_REP_NUM, 0, -MINMATCH, 0 /* for sequence i+1 */ ); + /* limit: check if there is a long length */ + const __m256i limit = _mm256_set1_epi32(65535); + /* * shuffle mask for byte-level rearrangement in each 128-bit half: * @@ -7170,16 +7177,20 @@ void convertSequences_noRepcodes( */ #define PERM_LANE_0X_E8 0xE8 /* [0,2,2,3] in lane indices */ - size_t i = 0; + size_t longLen = 0, i = 0; /* Process 2 sequences per loop iteration */ for (; i + 1 < nbSequences; i += 2) { - /* 1) Load 2 ZSTD_Sequence (32 bytes) */ + /* Load 2 ZSTD_Sequence (32 bytes) */ __m256i vin = _mm256_loadu_si256((__m256i const*)&inSeqs[i]); - /* 2) Add {2, 0, -3, 0} in each 128-bit half */ + /* Add {2, 0, -3, 0} in each 128-bit half */ __m256i vadd = _mm256_add_epi32(vin, addition); - /* 3) Shuffle bytes so each half gives us the 8 bytes we need */ + /* Check for long length */ + __m256i cmp = _mm256_cmpgt_epi32(vadd, limit); // 0xFFFFFFFF for element > 65535 + int cmp_res = _mm256_movemask_epi8(cmp); + + /* Shuffle bytes so each half gives us the 8 bytes we need */ __m256i vshf = _mm256_shuffle_epi8(vadd, mask); /* * Now: @@ -7189,105 +7200,47 @@ void convertSequences_noRepcodes( * Lane3 = 0 */ - /* 4) Permute 64-bit lanes => move Lane2 down into Lane1. */ + /* Permute 64-bit lanes => move Lane2 down into Lane1. */ __m256i vperm = _mm256_permute4x64_epi64(vshf, PERM_LANE_0X_E8); /* * Now the lower 16 bytes (Lane0+Lane1) = [seq0, seq1]. * The upper 16 bytes are [Lane2, Lane3] = [seq1, 0], but we won't use them. */ - /* 5) Store only the lower 16 bytes => 2 SeqDef (8 bytes each) */ + /* Store only the lower 16 bytes => 2 SeqDef (8 bytes each) */ _mm_storeu_si128((__m128i *)&dstSeqs[i], _mm256_castsi256_si128(vperm)); /* * This writes out 16 bytes total: * - offset 0..7 => seq0 (offBase, litLength, mlBase) * - offset 8..15 => seq1 (offBase, litLength, mlBase) */ - } - /* Handle leftover if @nbSequences is odd */ - if (i < nbSequences) { - /* Fallback: process last sequence */ - assert(i == nbSequences - 1); - dstSeqs[i].offBase = OFFSET_TO_OFFBASE(inSeqs[i].offset); - /* note: doesn't work if one length is > 65535 */ - dstSeqs[i].litLength = (U16)inSeqs[i].litLength; - dstSeqs[i].mlBase = (U16)(inSeqs[i].matchLength - MINMATCH); + /* check (unlikely) long lengths > 65535 + * indices for lengths correspond to bits [4..7], [8..11], [20..23], [24..27] + * => combined mask = 0x0FF00FF0 + */ + if (UNLIKELY((cmp_res & 0x0FF00FF0) != 0)) { + /* long length detected: let's figure out which one*/ + if (inSeqs[i].matchLength > 65535+MINMATCH) { + assert(longLen == 0); + longLen = i + 1; + } + if (inSeqs[i].litLength > 65535) { + assert(longLen == 0); + longLen = i + nbSequences + 1; + } + if (inSeqs[i+1].matchLength > 65535+MINMATCH) { + assert(longLen == 0); + longLen = i + 1 + 1; + } + if (inSeqs[i+1].litLength > 65535) { + assert(longLen == 0); + longLen = i + 1 + nbSequences + 1; + } + } } -} - -#elif defined(__SSSE3__) - -#include /* SSSE3 intrinsics: _mm_shuffle_epi8 */ -#include /* SSE2 intrinsics: _mm_add_epi32, etc. */ -/* - * Convert sequences with SSE. - * - offset -> offBase = offset + 2 - * - litLength (32-bit) -> (U16) litLength - * - matchLength (32-bit) -> (U16)(matchLength - 3) - * - rep is discarded. - * - * We shuffle so that only the first 8 bytes in the final 128-bit - * register are used. We still store 16 bytes (low 8 are good, high 8 are "don't care"). - */ -static void convertSequences_noRepcodes(SeqDef* dstSeqs, - const ZSTD_Sequence* inSeqs, - size_t nbSequences) -{ - /* - addition = { offset+2, litLength+0, matchLength-3, rep+0 } - setr means the first argument is placed in the lowest 32 bits, - second in next-lower 32 bits, etc. - */ - const __m128i addition = _mm_setr_epi32(2, 0, -3, 0); - - /* - Shuffle mask: we reorder bytes after the addition. - - Input layout in 128-bit register (after addition): - Bytes: [ 0..3 | 4..7 | 8..11 | 12..15 ] - Fields: offset+2 litLength matchLength rep - - We want in output: - Bytes: [ 0..3 | 4..5 | 6..7 | 8..15 ignore ] - Fields: offset+2 (U16)litLength (U16)(matchLength) - - _mm_shuffle_epi8 picks bytes from the source. A byte of 0x80 means “zero out”. - So we want: - out[0] = in[0], out[1] = in[1], out[2] = in[2], out[3] = in[3], // offset+2 (4 bytes) - out[4] = in[4], out[5] = in[5], // (U16) litLength - out[6] = in[8], out[7] = in[9], // (U16) matchLength - out[8..15] = 0x80 => won't matter if we only care about first 8 bytes - */ - const __m128i mask = _mm_setr_epi8( - 0, 1, 2, 3, /* offset (4 bytes) */ - 4, 5, /* litLength (2 bytes) */ - 8, 9, /* matchLength (2 bytes) */ - (char)0x80, (char)0x80, (char)0x80, (char)0x80, - (char)0x80, (char)0x80, (char)0x80, (char)0x80 - ); - size_t i; - - for (i = 0; i + 1 < nbSequences; i += 2) { - /*-------------------------*/ - /* Process inSeqs[i] */ - /*-------------------------*/ - __m128i vin0 = _mm_loadu_si128((const __m128i *)(const void*)&inSeqs[i]); - __m128i vadd0 = _mm_add_epi32(vin0, addition); - __m128i vshf0 = _mm_shuffle_epi8(vadd0, mask); - _mm_storel_epi64((__m128i *)(void*)&dstSeqs[i], vshf0); - - /*-------------------------*/ - /* Process inSeqs[i + 1] */ - /*-------------------------*/ - __m128i vin1 = _mm_loadu_si128((__m128i const *)(const void*)&inSeqs[i + 1]); - __m128i vadd1 = _mm_add_epi32(vin1, addition); - __m128i vshf1 = _mm_shuffle_epi8(vadd1, mask); - _mm_storel_epi64((__m128i *)(void*)&dstSeqs[i + 1], vshf1); - } - - /* Handle leftover if nbSequences is odd */ + /* Handle leftover if @nbSequences is odd */ if (i < nbSequences) { /* Fallback: process last sequence */ assert(i == nbSequences - 1); @@ -7295,11 +7248,24 @@ static void convertSequences_noRepcodes(SeqDef* dstSeqs, /* note: doesn't work if one length is > 65535 */ dstSeqs[i].litLength = (U16)inSeqs[i].litLength; dstSeqs[i].mlBase = (U16)(inSeqs[i].matchLength - MINMATCH); + if (UNLIKELY(inSeqs[i].matchLength > 65535+MINMATCH)) { + assert(longLen == 0); + longLen = i + 1; + } + if (UNLIKELY(inSeqs[i].litLength > 65535)) { + assert(longLen == 0); + longLen = i + nbSequences + 1; + } } + return longLen; } -#else /* no SSE */ +/* the vector implementation could also be ported to SSSE3, + * but since this implementation is targeting modern systems >= Sapphire Rapid, + * it's not useful to develop and maintain code for older platforms (before AVX2) */ + +#else /* no AVX2 */ static size_t convertSequences_noRepcodes(SeqDef* dstSeqs, @@ -7312,6 +7278,10 @@ convertSequences_noRepcodes(SeqDef* dstSeqs, /* note: doesn't work if one length is > 65535 */ dstSeqs[n].litLength = (U16)inSeqs[n].litLength; dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH); + if (UNLIKELY(inSeqs[n].matchLength > 65535+MINMATCH)) { + assert(longLen == 0); + longLen = n + 1; + } if (UNLIKELY(inSeqs[n].litLength > 65535)) { assert(longLen == 0); longLen = n + nbSequences + 1; From bfc58f5ba24a3c27edfbc61288e09d2837235456 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 7 Jan 2025 17:04:12 -0800 Subject: [PATCH 04/18] generalize validation function --- tests/fullbench.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/fullbench.c b/tests/fullbench.c index 739e55ef2ee..3cc5c234117 100644 --- a/tests/fullbench.c +++ b/tests/fullbench.c @@ -745,22 +745,22 @@ typedef struct { } BenchScenario; static BenchScenario kScenarios[] = { - { "compress", NULL, local_ZSTD_compress, NULL }, + { "compress", NULL, local_ZSTD_compress, check_compressedSequences }, { "decompress", prepDecompress, local_ZSTD_decompress, NULL }, - { "compress_freshCCtx", NULL, local_ZSTD_compress_freshCCtx, NULL }, + { "compress_freshCCtx", NULL, local_ZSTD_compress_freshCCtx, check_compressedSequences }, { "decompressDCtx", prepDecompress, local_ZSTD_decompressDCtx, NULL }, - { "compressContinue", NULL, local_ZSTD_compressContinue, NULL }, + { "compressContinue", NULL, local_ZSTD_compressContinue, check_compressedSequences }, { "compressContinue_extDict", NULL, local_ZSTD_compressContinue_extDict, NULL }, { "decompressContinue", prepDecompress, local_ZSTD_decompressContinue, NULL }, - { "compressStream", NULL, local_ZSTD_compressStream, NULL }, - { "compressStream_freshCCtx", NULL, local_ZSTD_compressStream_freshCCtx, NULL }, + { "compressStream", NULL, local_ZSTD_compressStream, check_compressedSequences }, + { "compressStream_freshCCtx", NULL, local_ZSTD_compressStream_freshCCtx, check_compressedSequences }, { "decompressStream", prepDecompress, local_ZSTD_decompressStream, NULL }, - { "compress2", NULL, local_ZSTD_compress2, NULL }, - { "compressStream2, end", NULL, local_ZSTD_compressStream2_end, NULL }, - { "compressStream2, end & short", prepShorterDstCapacity, local_ZSTD_compressStream2_end, NULL }, - { "compressStream2, continue", NULL, local_ZSTD_compressStream2_continue, NULL }, - { "compressStream2, -T2, continue", NULL, local_ZSTD_compress_generic_T2_continue, NULL }, - { "compressStream2, -T2, end", NULL, local_ZSTD_compress_generic_T2_end, NULL }, + { "compress2", NULL, local_ZSTD_compress2, check_compressedSequences }, + { "compressStream2, end", NULL, local_ZSTD_compressStream2_end, check_compressedSequences }, + { "compressStream2, end & short", prepShorterDstCapacity, local_ZSTD_compressStream2_end, check_compressedSequences }, + { "compressStream2, continue", NULL, local_ZSTD_compressStream2_continue, check_compressedSequences }, + { "compressStream2, -T2, continue", NULL, local_ZSTD_compress_generic_T2_continue, check_compressedSequences }, + { "compressStream2, -T2, end", NULL, local_ZSTD_compress_generic_T2_end, check_compressedSequences }, { "compressSequences", prepSequences, local_compressSequences, check_compressedSequences }, { "compressSequencesAndLiterals", prepSequencesAndLiterals, local_compressSequencesAndLiterals, check_compressedSequences }, { "convertSequences (1st block)", prepConvertSequences, local_convertSequences, NULL }, From 8eb2587432d70359f26ff98fd12db7e8c9be7515 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 7 Jan 2025 19:29:06 -0800 Subject: [PATCH 05/18] added benchmark for get1BlockSummary() --- lib/compress/zstd_compress.c | 10 ++-------- lib/compress/zstd_compress_internal.h | 7 +++++++ tests/fullbench.c | 21 +++++++++++++++++++++ 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index a5298031303..d5e3d1c7856 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -7395,13 +7395,7 @@ size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, return ZSTD_convertBlockSequences_internal(cctx, inSeqs, nbSequences, 0); } -typedef struct { - size_t nbSequences; - size_t blockSize; - size_t litSize; -} BlockSummary; - -static BlockSummary get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) +BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) { size_t blockSize = 0; size_t litSize = 0; @@ -7456,7 +7450,7 @@ ZSTD_compressSequencesAndLiterals_internal(ZSTD_CCtx* cctx, while (nbSequences) { size_t compressedSeqsSize, cBlockSize, conversionStatus; - BlockSummary const block = get1BlockSummary(inSeqs, nbSequences); + BlockSummary const block = ZSTD_get1BlockSummary(inSeqs, nbSequences); U32 const lastBlock = (block.nbSequences == nbSequences); FORWARD_IF_ERROR(block.nbSequences, "Error while trying to determine nb of sequences for a block"); assert(block.nbSequences <= nbSequences); diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index 2be67a1240a..ca5e2a4c5bf 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -1525,6 +1525,13 @@ size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, const ZSTD_Sequence* const inSeqs, size_t nbSequences, int const repcodeResolution); +typedef struct { + size_t nbSequences; + size_t blockSize; + size_t litSize; +} BlockSummary; + +BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs); /* ============================================================== * Private declarations diff --git a/tests/fullbench.c b/tests/fullbench.c index 3cc5c234117..7d77bd176d4 100644 --- a/tests/fullbench.c +++ b/tests/fullbench.c @@ -705,6 +705,26 @@ check_compressedSequences(const void* compressed, size_t cSize, const void* orig return 0; } +static size_t +local_get1BlockSummary(const void* input, size_t inputSize, + void* dst, size_t dstCapacity, + void* payload) +{ + const char* ip = input; + size_t const blockSize = MEM_read32(ip); + size_t const nbSeqs = MEM_read32(ip+=4); + const ZSTD_Sequence* seqs = (const ZSTD_Sequence*)(const void*)(ip+=4); + ZSTD_CCtx_reset(g_zcc, ZSTD_reset_session_and_parameters); + ZSTD_resetSeqStore(&g_zcc->seqStore); + ZSTD_CCtx_setParameter(g_zcc, ZSTD_c_blockDelimiters, ZSTD_sf_explicitBlockDelimiters); + assert(8 + nbSeqs * sizeof(ZSTD_Sequence) == inputSize); (void)inputSize; + (void)dst; (void)dstCapacity; + (void)payload; (void)blockSize; + + (void)ZSTD_get1BlockSummary(seqs, nbSeqs); + return nbSeqs; +} + static PrepResult prepCopy(const void* src, size_t srcSize, int cLevel) { PrepResult r = PREPRESULT_INIT; @@ -764,6 +784,7 @@ static BenchScenario kScenarios[] = { { "compressSequences", prepSequences, local_compressSequences, check_compressedSequences }, { "compressSequencesAndLiterals", prepSequencesAndLiterals, local_compressSequencesAndLiterals, check_compressedSequences }, { "convertSequences (1st block)", prepConvertSequences, local_convertSequences, NULL }, + { "get1BlockSummary (1st block)", prepConvertSequences, local_get1BlockSummary, NULL }, #ifndef ZSTD_DLL_IMPORT { "decodeLiteralsHeader (1st block)", prepLiterals, local_ZSTD_decodeLiteralsHeader, NULL }, { "decodeLiteralsBlock (1st block)", prepLiterals, local_ZSTD_decodeLiteralsBlock, NULL }, From b6a4d5a8ba29bc873c95098103f57f987cfacd23 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 7 Jan 2025 19:34:06 -0800 Subject: [PATCH 06/18] minor +10% speed improvement for scalar ZSTD_get1BlockSummary() --- lib/compress/zstd_compress.c | 61 ++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 3 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index d5e3d1c7856..11933fb3c15 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -7395,14 +7395,68 @@ size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, return ZSTD_convertBlockSequences_internal(cctx, inSeqs, nbSequences, 0); } +#if 0 && defined(__AVX2__) + +/* C90-compatible alignment macro (GCC/Clang). Adjust for other compilers if needed. */ +#if defined(__GNUC__) +# define ALIGNED32 __attribute__((aligned(32))) +#else +# define ALIGNED32 +#endif + BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) { - size_t blockSize = 0; + size_t i; + __m256i sumVec; /* accumulates match+lit in 32-bit lanes */ + __m256i mask; /* shuffling control */ + ALIGNED32 int tmp[8]; /* temporary buffer for reduction */ + uint64_t sum; + int k; + + sumVec = _mm256_setzero_si256(); + mask = _mm256_setr_epi32( + 1,5, /* match(0), match(1) */ + 2,6, /* lit(0), lit(1) */ + 1,5, /* match(0), match(1) */ + 2,6 /* lit(0), lit(1) */ + ); + + /* Process 2 structs (32 bytes) at a time */ + for (i = 0; i + 2 <= count; i += 2) { + /* Load two consecutive MyStructs (8×4 = 32 bytes) */ + __m256i data = _mm256_loadu_si256((const __m256i*)&arr[i]); + /* Shuffle out lanes 1,2,5,6 => match(0), match(1), lit(0), lit(1), repeated */ + __m256i selected = _mm256_permutevar8x32_epi32(data, mask); + /* Accumulate in sumVec */ + sumVec = _mm256_add_epi32(sumVec, selected); + } + + /* Horizontal reduction of sumVec */ + _mm256_store_si256((__m256i*)tmp, sumVec); + sum = 0; + for (k = 0; k < 8; k++) { + sum += (uint64_t)tmp[k]; /* each lane is match+lit from pairs, repeated twice */ + } + + /* Handle the leftover (if count is odd) */ + for (; i < count; i++) { + sum += arr[i].matchLength; + sum += arr[i].litLength; + } + + return sum; +} + +#else + +BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) +{ + size_t totalMatchSize = 0; size_t litSize = 0; size_t n; assert(seqs); for (n=0; n Date: Tue, 7 Jan 2025 23:32:05 -0800 Subject: [PATCH 07/18] AVX2 version of ZSTD_get1BlockSummary() --- lib/compress/zstd_compress.c | 69 ++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 30 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 11933fb3c15..445c5613afa 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -7395,56 +7395,65 @@ size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, return ZSTD_convertBlockSequences_internal(cctx, inSeqs, nbSequences, 0); } -#if 0 && defined(__AVX2__) +#if defined(__AVX2__) /* C90-compatible alignment macro (GCC/Clang). Adjust for other compilers if needed. */ #if defined(__GNUC__) # define ALIGNED32 __attribute__((aligned(32))) +#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */ +# define ALIGNED32 alignas(32) #else + /* this compiler will require its own alignment instruction */ # define ALIGNED32 #endif BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) { size_t i; - __m256i sumVec; /* accumulates match+lit in 32-bit lanes */ - __m256i mask; /* shuffling control */ - ALIGNED32 int tmp[8]; /* temporary buffer for reduction */ - uint64_t sum; - int k; - - sumVec = _mm256_setzero_si256(); - mask = _mm256_setr_epi32( - 1,5, /* match(0), match(1) */ - 2,6, /* lit(0), lit(1) */ - 1,5, /* match(0), match(1) */ - 2,6 /* lit(0), lit(1) */ - ); + __m256i const zeroVec = _mm256_setzero_si256(); + __m256i sumVec = zeroVec; /* accumulates match+lit in 32-bit lanes */ + __m256i shuffle32; /* shuffling control */ + ALIGNED32 U32 tmp[8]; /* temporary buffer for reduction */ + size_t mSum = 0, lSum = 0; /* Process 2 structs (32 bytes) at a time */ - for (i = 0; i + 2 <= count; i += 2) { - /* Load two consecutive MyStructs (8×4 = 32 bytes) */ - __m256i data = _mm256_loadu_si256((const __m256i*)&arr[i]); - /* Shuffle out lanes 1,2,5,6 => match(0), match(1), lit(0), lit(1), repeated */ - __m256i selected = _mm256_permutevar8x32_epi32(data, mask); + for (i = 0; i + 2 <= nbSeqs; i += 2) { + /* Load two consecutive ZSTD_Sequence (8×4 = 32 bytes) */ + __m256i data = _mm256_loadu_si256((const __m256i*)&seqs[i]); + /* check end of block signal */ + __m256i cmp = _mm256_cmpeq_epi32(data, zeroVec); + int cmp_res = _mm256_movemask_epi8(cmp); + /* indices for match lengths correspond to bits [8..11], [24..27] + * => combined mask = 0x0F000F00 */ + if (cmp_res & 0x0F000F00) break; /* Accumulate in sumVec */ - sumVec = _mm256_add_epi32(sumVec, selected); + sumVec = _mm256_add_epi32(sumVec, data); } - /* Horizontal reduction of sumVec */ + /* Horizontal reduction */ _mm256_store_si256((__m256i*)tmp, sumVec); - sum = 0; - for (k = 0; k < 8; k++) { - sum += (uint64_t)tmp[k]; /* each lane is match+lit from pairs, repeated twice */ - } + lSum = tmp[1] + tmp[5]; + mSum = tmp[2] + tmp[6]; - /* Handle the leftover (if count is odd) */ - for (; i < count; i++) { - sum += arr[i].matchLength; - sum += arr[i].litLength; + /* Handle the leftover */ + for (; i < nbSeqs; i++) { + lSum += seqs[i].litLength; + mSum += seqs[i].matchLength; + if (seqs[i].matchLength == 0) break; /* end of block */ } - return sum; + if (i==nbSeqs) { + /* reaching end of sequences: end of block signal was not present */ + BlockSummary bs; + bs.nbSequences = ERROR(externalSequences_invalid); + return bs; + } + { BlockSummary bs; + bs.nbSequences = i+1; + bs.blockSize = lSum + mSum; + bs.litSize = lSum; + return bs; + } } #else From cd53924eff684146b67e890d7b48158c37eca32c Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 7 Jan 2025 23:34:19 -0800 Subject: [PATCH 08/18] removed erroneous #includes that were automatically added by the editor without notification --- lib/compress/zstd_compress.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 445c5613afa..beff0aae6fe 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -15,9 +15,7 @@ #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ #include "../common/mem.h" #include "../common/error_private.h" -#include "compiler.h" #include "hist.h" /* HIST_countFast_wksp */ -#include "zstd_internal.h" #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ #include "../common/fse.h" #include "../common/huf.h" From db3d48823a75a12a5ad9221e5a39191ff0044d3a Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 7 Jan 2025 23:40:49 -0800 Subject: [PATCH 09/18] no need for specialized variant the branch is not in the hot loop --- lib/compress/zstd_compress.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index beff0aae6fe..8e49169b186 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -7297,7 +7297,7 @@ convertSequences_noRepcodes(SeqDef* dstSeqs, * This is helpful to generate a lean main pipeline, improving performance. * It may be re-inserted later. */ -static size_t ZSTD_convertBlockSequences_internal(ZSTD_CCtx* cctx, +size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, const ZSTD_Sequence* const inSeqs, size_t nbSequences, int repcodeResolution) { @@ -7379,20 +7379,6 @@ static size_t ZSTD_convertBlockSequences_internal(ZSTD_CCtx* cctx, return 0; } -static size_t ZSTD_convertBlockSequences_noRepcode(ZSTD_CCtx* cctx, - const ZSTD_Sequence* const inSeqs, size_t nbSequences) -{ - return ZSTD_convertBlockSequences_internal(cctx, inSeqs, nbSequences, 0); -} - -size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, - const ZSTD_Sequence* const inSeqs, size_t nbSequences, - int repcodeResolution) -{ - (void)repcodeResolution; - return ZSTD_convertBlockSequences_internal(cctx, inSeqs, nbSequences, 0); -} - #if defined(__AVX2__) /* C90-compatible alignment macro (GCC/Clang). Adjust for other compilers if needed. */ From 4aaf9cefe9bdda1fafb5f6a5ba13294d2b478bd7 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 7 Jan 2025 23:45:15 -0800 Subject: [PATCH 10/18] fix minor conversion warning --- lib/compress/zstd_compress.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 8e49169b186..19fb584fbd6 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -7157,15 +7157,15 @@ size_t convertSequences_noRepcodes( 0, 1, 2, 3, /* offset+2 */ 4, 5, /* litLength (16 bits) */ 8, 9, /* matchLength (16 bits) */ - (char)0x80, (char)0x80, (char)0x80, (char)0x80, - (char)0x80, (char)0x80, (char)0x80, (char)0x80, + (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, + (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, /* For the upper 128 bits => sequence i+1 */ 16,17,18,19, /* offset+2 */ 20,21, /* litLength */ 24,25, /* matchLength */ - (char)0x80, (char)0x80, (char)0x80, (char)0x80, - (char)0x80, (char)0x80, (char)0x80, (char)0x80 + (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, + (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80 ); /* From 57a45541927180724712b651ed1fb3e125105f30 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 7 Jan 2025 23:59:01 -0800 Subject: [PATCH 11/18] removed unused variable --- lib/compress/zstd_compress.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 19fb584fbd6..e5b0cd80ee0 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -7396,7 +7396,6 @@ BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) size_t i; __m256i const zeroVec = _mm256_setzero_si256(); __m256i sumVec = zeroVec; /* accumulates match+lit in 32-bit lanes */ - __m256i shuffle32; /* shuffling control */ ALIGNED32 U32 tmp[8]; /* temporary buffer for reduction */ size_t mSum = 0, lSum = 0; From aa2cdf964f93d96113c09028ad7354ca2debc849 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Wed, 8 Jan 2025 10:51:57 -0800 Subject: [PATCH 12/18] added compilation-time checks to ensure AVX2 code is valid since it depends on a specific definition of ZSTD_Sequence structure. --- lib/compress/zstd_compress.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index e5b0cd80ee0..f6cde3d09df 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -7176,6 +7176,17 @@ size_t convertSequences_noRepcodes( #define PERM_LANE_0X_E8 0xE8 /* [0,2,2,3] in lane indices */ size_t longLen = 0, i = 0; + + /* AVX permutation depends on the specific definition of target structures */ + ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16); + ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, offset) == 0); + ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) == 4); + ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8); + ZSTD_STATIC_ASSERT(sizeof(SeqDef) == 8); + ZSTD_STATIC_ASSERT(offsetof(SeqDef, offBase) == 0); + ZSTD_STATIC_ASSERT(offsetof(SeqDef, litLength) == 4); + ZSTD_STATIC_ASSERT(offsetof(SeqDef, mlBase) == 6); + /* Process 2 sequences per loop iteration */ for (; i + 1 < nbSequences; i += 2) { /* Load 2 ZSTD_Sequence (32 bytes) */ @@ -7398,6 +7409,7 @@ BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) __m256i sumVec = zeroVec; /* accumulates match+lit in 32-bit lanes */ ALIGNED32 U32 tmp[8]; /* temporary buffer for reduction */ size_t mSum = 0, lSum = 0; + ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16); /* Process 2 structs (32 bytes) at a time */ for (i = 0; i + 2 <= nbSeqs; i += 2) { @@ -7408,6 +7420,7 @@ BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) int cmp_res = _mm256_movemask_epi8(cmp); /* indices for match lengths correspond to bits [8..11], [24..27] * => combined mask = 0x0F000F00 */ + ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8); if (cmp_res & 0x0F000F00) break; /* Accumulate in sumVec */ sumVec = _mm256_add_epi32(sumVec, data); From e3181cfd325db59dbdeadcaf91b8187f49c5546c Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Wed, 8 Jan 2025 14:25:03 -0800 Subject: [PATCH 13/18] minor code doc update --- lib/compress/zstd_compress.c | 16 ++++++++-------- lib/zstd.h | 5 +++-- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index f6cde3d09df..bdf0e828704 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -7196,8 +7196,8 @@ size_t convertSequences_noRepcodes( __m256i vadd = _mm256_add_epi32(vin, addition); /* Check for long length */ - __m256i cmp = _mm256_cmpgt_epi32(vadd, limit); // 0xFFFFFFFF for element > 65535 - int cmp_res = _mm256_movemask_epi8(cmp); + __m256i ll_cmp = _mm256_cmpgt_epi32(vadd, limit); /* 0xFFFFFFFF for element > 65535 */ + int ll_res = _mm256_movemask_epi8(ll_cmp); /* Shuffle bytes so each half gives us the 8 bytes we need */ __m256i vshf = _mm256_shuffle_epi8(vadd, mask); @@ -7228,7 +7228,7 @@ size_t convertSequences_noRepcodes( * indices for lengths correspond to bits [4..7], [8..11], [20..23], [24..27] * => combined mask = 0x0FF00FF0 */ - if (UNLIKELY((cmp_res & 0x0FF00FF0) != 0)) { + if (UNLIKELY((ll_res & 0x0FF00FF0) != 0)) { /* long length detected: let's figure out which one*/ if (inSeqs[i].matchLength > 65535+MINMATCH) { assert(longLen == 0); @@ -7251,12 +7251,12 @@ size_t convertSequences_noRepcodes( /* Handle leftover if @nbSequences is odd */ if (i < nbSequences) { - /* Fallback: process last sequence */ + /* process last sequence */ assert(i == nbSequences - 1); dstSeqs[i].offBase = OFFSET_TO_OFFBASE(inSeqs[i].offset); - /* note: doesn't work if one length is > 65535 */ dstSeqs[i].litLength = (U16)inSeqs[i].litLength; dstSeqs[i].mlBase = (U16)(inSeqs[i].matchLength - MINMATCH); + /* check (unlikely) long lengths > 65535 */ if (UNLIKELY(inSeqs[i].matchLength > 65535+MINMATCH)) { assert(longLen == 0); longLen = i + 1; @@ -7271,8 +7271,8 @@ size_t convertSequences_noRepcodes( } /* the vector implementation could also be ported to SSSE3, - * but since this implementation is targeting modern systems >= Sapphire Rapid, - * it's not useful to develop and maintain code for older platforms (before AVX2) */ + * but since this implementation is targeting modern systems (>= Sapphire Rapid), + * it's not useful to develop and maintain code for older pre-AVX2 platforms */ #else /* no AVX2 */ @@ -7284,9 +7284,9 @@ convertSequences_noRepcodes(SeqDef* dstSeqs, size_t n; for (n=0; n 65535 */ dstSeqs[n].litLength = (U16)inSeqs[n].litLength; dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH); + /* check for long length > 65535 */ if (UNLIKELY(inSeqs[n].matchLength > 65535+MINMATCH)) { assert(longLen == 0); longLen = n + 1; diff --git a/lib/zstd.h b/lib/zstd.h index 907a377d186..b8c0644a7ec 100644 --- a/lib/zstd.h +++ b/lib/zstd.h @@ -1696,7 +1696,8 @@ ZSTD_compressSequences(ZSTD_CCtx* cctx, * - Not compatible with frame checksum, which must be disabled * - If any block is incompressible, will fail and return an error * - @litSize must be == sum of all @.litLength fields in @inSeqs. Any discrepancy will generate an error. - * - the buffer @literals must have a size @litCapacity which is larger than @litSize by at least 8 bytes. + * - @litBufCapacity is the size of the underlying buffer into which literals are written, starting at address @literals. + * @litBufCapacity must be at least 8 bytes larger than @litSize. * - @decompressedSize must be correct, and correspond to the sum of all Sequences. Any discrepancy will generate an error. * @return : final compressed size, or a ZSTD error code. */ @@ -1704,7 +1705,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_compressSequencesAndLiterals(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const ZSTD_Sequence* inSeqs, size_t nbSequences, - const void* literals, size_t litSize, size_t litCapacity, + const void* literals, size_t litSize, size_t litBufCapacity, size_t decompressedSize); From 6f8e6f3c97c8e95527a29c4667edca6b793f798f Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 14 Jan 2025 14:44:02 -0800 Subject: [PATCH 14/18] create new compilation macro ZSTD_ARCH_X86_AVX2 --- lib/common/compiler.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/common/compiler.h b/lib/common/compiler.h index b6cbcee0366..f636b7f2551 100644 --- a/lib/common/compiler.h +++ b/lib/common/compiler.h @@ -224,6 +224,9 @@ /* compile time determination of SIMD support */ #if !defined(ZSTD_NO_INTRINSICS) +# if defined(__AVX2__) +# define ZSTD_ARCH_X86_AVX2 +# endif # if defined(__SSE2__) || defined(_M_AMD64) || (defined (_M_IX86) && defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) # define ZSTD_ARCH_X86_SSE2 # endif @@ -231,6 +234,9 @@ # define ZSTD_ARCH_ARM_NEON # endif # +# if defined(ZSTD_ARCH_X86_AVX2) +# include +# endif # if defined(ZSTD_ARCH_X86_SSE2) # include # elif defined(ZSTD_ARCH_ARM_NEON) From debe3d20d9ea0aaa45fbb692302347d0ece9f2c0 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 14 Jan 2025 14:54:02 -0800 Subject: [PATCH 15/18] removed unused branch --- lib/compress/zstd_compress.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index bdf0e828704..55622cbd5c0 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -7349,18 +7349,12 @@ size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, for (seqNb = 0; seqNb < nbSequences - 1 ; seqNb++) { U32 const litLength = inSeqs[seqNb].litLength; U32 const matchLength = inSeqs[seqNb].matchLength; - U32 offBase; - - if (!repcodeResolution) { - offBase = OFFSET_TO_OFFBASE(inSeqs[seqNb].offset); - } else { - U32 const ll0 = (litLength == 0); - offBase = ZSTD_finalizeOffBase(inSeqs[seqNb].offset, updatedRepcodes.rep, ll0); - ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); - } + U32 const ll0 = (litLength == 0); + U32 const offBase = ZSTD_finalizeOffBase(inSeqs[seqNb].offset, updatedRepcodes.rep, ll0); DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); ZSTD_storeSeqOnly(&cctx->seqStore, litLength, offBase, matchLength); + ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); } } From 2f3ee8b5309958a2bc1fc7477e703fd8195a31ea Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 14 Jan 2025 14:56:10 -0800 Subject: [PATCH 16/18] changed code compilation test to employ ZSTD_ARCH_X86_AVX2 --- lib/compress/zstd_compress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 55622cbd5c0..8e5f369bb12 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -7384,7 +7384,7 @@ size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, return 0; } -#if defined(__AVX2__) +#if defined(ZSTD_ARCH_X86_AVX2) /* C90-compatible alignment macro (GCC/Clang). Adjust for other compilers if needed. */ #if defined(__GNUC__) From 8bff69af869fca1cc44172c2ae5d5f995322509b Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 14 Jan 2025 15:54:10 -0800 Subject: [PATCH 17/18] Alignment instruction ZSTD_ALIGNED() in common/compiler.h --- lib/common/compiler.h | 15 ++++++++++++++- lib/compress/zstd_compress.c | 12 +----------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/lib/common/compiler.h b/lib/common/compiler.h index f636b7f2551..2a8002288ac 100644 --- a/lib/common/compiler.h +++ b/lib/common/compiler.h @@ -281,7 +281,7 @@ #endif /*-************************************************************** -* Alignment check +* Alignment *****************************************************************/ /* @return 1 if @u is a 2^n value, 0 otherwise @@ -315,6 +315,19 @@ MEM_STATIC int ZSTD_isPower2(size_t u) { # endif #endif /* ZSTD_ALIGNOF */ +#ifndef ZSTD_ALIGNED +/* C90-compatible alignment macro (GCC/Clang). Adjust for other compilers if needed. */ +# if defined(__GNUC__) +# define ZSTD_ALIGNED(a) __attribute__((aligned(a))) +# elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */ +# define ZSTD_ALIGNED(a) alignas(a) +# else + /* this compiler will require its own alignment instruction */ +# define ZSTD_ALIGNED(...) +# endif +#endif /* ZSTD_ALIGNED */ + + /*-************************************************************** * Sanitizer *****************************************************************/ diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 8e5f369bb12..eba2d07dcf5 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -7386,22 +7386,12 @@ size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, #if defined(ZSTD_ARCH_X86_AVX2) -/* C90-compatible alignment macro (GCC/Clang). Adjust for other compilers if needed. */ -#if defined(__GNUC__) -# define ALIGNED32 __attribute__((aligned(32))) -#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */ -# define ALIGNED32 alignas(32) -#else - /* this compiler will require its own alignment instruction */ -# define ALIGNED32 -#endif - BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) { size_t i; __m256i const zeroVec = _mm256_setzero_si256(); __m256i sumVec = zeroVec; /* accumulates match+lit in 32-bit lanes */ - ALIGNED32 U32 tmp[8]; /* temporary buffer for reduction */ + ZSTD_ALIGNED(32) U32 tmp[8]; /* temporary buffer for reduction */ size_t mSum = 0, lSum = 0; ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16); From 87f0a4fbe0a1ffcaab4618f2aa76545e225acf07 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 14 Jan 2025 15:57:05 -0800 Subject: [PATCH 18/18] restore full equation do not solve the equation, even though some members cancel each other, this is done for clarity, we'll let the compiler do the resolution at compile time. --- lib/compress/zstd_compress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index eba2d07dcf5..e26e78a8f7f 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -7342,7 +7342,7 @@ size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, DEBUGLOG(5, "long literals length detected at pos %zu", longl-nbSequences); assert(longl <= 2* (nbSequences-1)); cctx->seqStore.longLengthType = ZSTD_llt_literalLength; - cctx->seqStore.longLengthPos = (U32)(longl-nbSequences); + cctx->seqStore.longLengthPos = (U32)(longl-(nbSequences-1)-1); } } } else {