From b9d531a07d8734577c3870a71a037cc48b26a5e5 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Wed, 24 Jul 2024 15:49:01 +0400 Subject: [PATCH 1/4] make block size configurable --- libsql-sqlite3/src/vectorIndex.c | 21 ++++++----- libsql-sqlite3/src/vectorIndexInt.h | 2 ++ libsql-sqlite3/src/vectordiskann.c | 55 +++++++++++++++++++++++++---- 3 files changed, 63 insertions(+), 15 deletions(-) diff --git a/libsql-sqlite3/src/vectorIndex.c b/libsql-sqlite3/src/vectorIndex.c index 67bfbb916b..47d2d0e32b 100644 --- a/libsql-sqlite3/src/vectorIndex.c +++ b/libsql-sqlite3/src/vectorIndex.c @@ -383,18 +383,19 @@ static struct VectorColumnType VECTOR_COLUMN_TYPES[] = { struct VectorParamName { const char *zName; int tag; - int type; // 0 - enum, 1 - integer, 2 - float + int type; // 0 - string enum, 1 - integer, 2 - float const char *zValueStr; u64 value; }; static struct VectorParamName VECTOR_PARAM_NAMES[] = { - { "type", VECTOR_INDEX_TYPE_PARAM_ID, 0, "diskann", VECTOR_INDEX_TYPE_DISKANN }, - { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "cosine", VECTOR_METRIC_TYPE_COS }, - { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "l2", VECTOR_METRIC_TYPE_L2 }, - { "alpha", VECTOR_PRUNING_ALPHA_PARAM_ID, 2, 0, 0 }, - { "search_l", VECTOR_SEARCH_L_PARAM_ID, 1, 0, 0 }, - { "insert_l", VECTOR_INSERT_L_PARAM_ID, 2, 0, 0 }, + { "type", VECTOR_INDEX_TYPE_PARAM_ID, 0, "diskann", VECTOR_INDEX_TYPE_DISKANN }, + { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "cosine", VECTOR_METRIC_TYPE_COS }, + { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "l2", VECTOR_METRIC_TYPE_L2 }, + { "alpha", VECTOR_PRUNING_ALPHA_PARAM_ID, 2, 0, 0 }, + { "search_l", VECTOR_SEARCH_L_PARAM_ID, 1, 0, 0 }, + { "insert_l", VECTOR_INSERT_L_PARAM_ID, 1, 0, 0 }, + { "max_edges", VECTOR_MAX_EDGES_PARAM_ID, 1, 0, 0 }, }; static int parseVectorIdxParam(const char *zParam, VectorIdxParams *pParams, const char **pErrMsg) { @@ -414,11 +415,15 @@ static int parseVectorIdxParam(const char *zParam, VectorIdxParams *pParams, con continue; 
} if( VECTOR_PARAM_NAMES[i].type == 1 ){ - u64 value = sqlite3Atoi(zValue); + int value = sqlite3Atoi(zValue); if( value == 0 ){ *pErrMsg = "invalid representation of integer vector index parameter"; return -1; } + if( value < 0 ){ + *pErrMsg = "integer vector index parameter must be positive"; + return -1; + } if( vectorIdxParamsPutU64(pParams, VECTOR_PARAM_NAMES[i].tag, value) != 0 ){ *pErrMsg = "unable to serialize integer vector index parameter"; return -1; diff --git a/libsql-sqlite3/src/vectorIndexInt.h b/libsql-sqlite3/src/vectorIndexInt.h index a2c1ccfb17..7357009362 100644 --- a/libsql-sqlite3/src/vectorIndexInt.h +++ b/libsql-sqlite3/src/vectorIndexInt.h @@ -130,6 +130,8 @@ typedef u8 MetricType; #define VECTOR_SEARCH_L_PARAM_ID 9 #define VECTOR_SEARCH_L_DEFAULT 200 +#define VECTOR_MAX_EDGES_PARAM_ID 10 + /* total amount of vector index parameters */ #define VECTOR_PARAM_IDS_COUNT 9 diff --git a/libsql-sqlite3/src/vectordiskann.c b/libsql-sqlite3/src/vectordiskann.c index 1401961a93..e559668e8b 100644 --- a/libsql-sqlite3/src/vectordiskann.c +++ b/libsql-sqlite3/src/vectordiskann.c @@ -49,6 +49,7 @@ */ #ifndef SQLITE_OMIT_VECTOR +#include "math.h" #include "sqliteInt.h" #include "vectorIndexInt.h" @@ -64,6 +65,11 @@ // stack simplify memory managment code and also doesn't impose very strict limits here since 128 bytes for column names should be enough for almost all use cases #define DISKANN_SQL_RENDER_LIMIT 128 +// limit to the maximum size of DiskANN block (128 MB) +// even with 1MB we can store tens of thousands of nodes in several GBs - which is already too much +// but we are "generous" here and allow user to store up to 128MB blobs +#define DISKANN_MAX_BLOCK_SZ 134217728 + /* * Due to historical reasons parameter for index block size were stored as u16 value and divided by 512 (2^9) * So, we will make inverse transform before initializing index from stored parameters @@ -263,8 +269,16 @@ void blobSpotFree(BlobSpot *pBlobSpot) { ** Layout specific 
utilities **************************************************************************/ +int nodeEdgeOverhead(int nEdgeVectorSize){ + return nEdgeVectorSize + VECTOR_EDGE_METADATA_SIZE; +} + +int nodeOverhead(int nNodeVectorSize){ + return nNodeVectorSize + VECTOR_NODE_METADATA_SIZE; +} + int nodeEdgesMaxCount(const DiskAnnIndex *pIndex){ - unsigned int nMaxEdges = (pIndex->nBlockSize - pIndex->nNodeVectorSize - VECTOR_NODE_METADATA_SIZE) / (pIndex->nEdgeVectorSize + VECTOR_EDGE_METADATA_SIZE); + unsigned int nMaxEdges = (pIndex->nBlockSize - nodeOverhead(pIndex->nNodeVectorSize)) / nodeEdgeOverhead(pIndex->nEdgeVectorSize); assert( nMaxEdges > 0); return nMaxEdges; } @@ -419,6 +433,8 @@ int diskAnnCreateIndex( VectorIdxParams *pParams ){ int rc; + int type, dims; + u64 maxEdgesParam, blockSizeBytes; char *zSql; char columnSqlDefs[DISKANN_SQL_RENDER_LIMIT]; // definition of columns (e.g. index_key INTEGER BINARY, index_key1 TEXT, ...) char columnSqlNames[DISKANN_SQL_RENDER_LIMIT]; // just column names (e.g. index_key, index_key1, index_key2, ...) 
@@ -431,16 +447,34 @@ int diskAnnCreateIndex( if( vectorIdxParamsPutU64(pParams, VECTOR_INDEX_TYPE_PARAM_ID, VECTOR_INDEX_TYPE_DISKANN) != 0 ){ return SQLITE_ERROR; } + type = vectorIdxParamsGetU64(pParams, VECTOR_TYPE_PARAM_ID); + if( type == 0 ){ + return SQLITE_ERROR; + } + dims = vectorIdxParamsGetU64(pParams, VECTOR_DIM_PARAM_ID); + if( dims == 0 ){ + return SQLITE_ERROR; + } + assert( 0 < dims && dims <= MAX_VECTOR_SZ ); + + maxEdgesParam = vectorIdxParamsGetU64(pParams, VECTOR_MAX_EDGES_PARAM_ID); + if( maxEdgesParam == 0 ){ + // 3 D**(1/2) gives good recall values (90%+) + // we also want to keep disk overhead at moderate level - 50x of the disk size increase is the current upper bound + maxEdgesParam = MIN(3 * ((int)(sqrt(dims)) + 1), (50 * nodeOverhead(vectorDataSize(type, dims))) / nodeEdgeOverhead(vectorDataSize(type, dims)) + 1); + } + blockSizeBytes = nodeOverhead(vectorDataSize(type, dims)) + maxEdgesParam * (u64)nodeEdgeOverhead(vectorDataSize(type, dims)); + if( blockSizeBytes > DISKANN_MAX_BLOCK_SZ ){ + return SQLITE_ERROR; + } + if( vectorIdxParamsPutU64(pParams, VECTOR_BLOCK_SIZE_PARAM_ID, MAX(256, blockSizeBytes)) != 0 ){ + return SQLITE_ERROR; + } if( vectorIdxParamsGetU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID) == 0 ){ if( vectorIdxParamsPutU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID, VECTOR_METRIC_TYPE_COS) != 0 ){ return SQLITE_ERROR; } } - if( vectorIdxParamsGetU64(pParams, VECTOR_BLOCK_SIZE_PARAM_ID) == 0 ){ - if( vectorIdxParamsPutU64(pParams, VECTOR_BLOCK_SIZE_PARAM_ID, VECTOR_BLOCK_SIZE_DEFAULT) != 0 ){ - return SQLITE_ERROR; - } - } if( vectorIdxParamsGetF64(pParams, VECTOR_PRUNING_ALPHA_PARAM_ID) == 0 ){ if( vectorIdxParamsPutF64(pParams, VECTOR_PRUNING_ALPHA_PARAM_ID, VECTOR_PRUNING_ALPHA_DEFAULT) != 0 ){ return SQLITE_ERROR; @@ -1430,6 +1464,7 @@ int diskAnnOpenIndex( DiskAnnIndex **ppIndex /* OUT: Index */ ){ DiskAnnIndex *pIndex; + u64 nBlockSize; pIndex = sqlite3DbMallocRaw(db, sizeof(DiskAnnIndex)); if( pIndex == NULL ){ return 
SQLITE_NOMEM; @@ -1442,9 +1477,15 @@ int diskAnnOpenIndex( diskAnnCloseIndex(pIndex); return SQLITE_NOMEM_BKPT; } + nBlockSize = vectorIdxParamsGetU64(pParams, VECTOR_BLOCK_SIZE_PARAM_ID); + // preserve backward compatibility: treat block size > 128 literally, but <= 128 with shift + if( nBlockSize <= 128 ){ + nBlockSize <<= DISKANN_BLOCK_SIZE_SHIFT; + } + pIndex->nFormatVersion = vectorIdxParamsGetU64(pParams, VECTOR_FORMAT_PARAM_ID); pIndex->nDistanceFunc = vectorIdxParamsGetU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID); - pIndex->nBlockSize = vectorIdxParamsGetU64(pParams, VECTOR_BLOCK_SIZE_PARAM_ID) << DISKANN_BLOCK_SIZE_SHIFT; + pIndex->nBlockSize = nBlockSize; pIndex->nNodeVectorType = vectorIdxParamsGetU64(pParams, VECTOR_TYPE_PARAM_ID); pIndex->nVectorDims = vectorIdxParamsGetU64(pParams, VECTOR_DIM_PARAM_ID); pIndex->pruningAlpha = vectorIdxParamsGetF64(pParams, VECTOR_PRUNING_ALPHA_PARAM_ID); From ea1bb70e806eca5402402b8a5580870436a41065 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Wed, 24 Jul 2024 15:49:15 +0400 Subject: [PATCH 2/4] add simple test which mentions all index parameters --- libsql-sqlite3/test/libsql_vector_index.test | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/libsql-sqlite3/test/libsql_vector_index.test b/libsql-sqlite3/test/libsql_vector_index.test index a180ff0a8a..5210628f11 100644 --- a/libsql-sqlite3/test/libsql_vector_index.test +++ b/libsql-sqlite3/test/libsql_vector_index.test @@ -250,6 +250,13 @@ do_execsql_test vector-transaction { SELECT * FROM vector_top_k('t_transaction_idx', vector('[1,2]'), 2); } {3 4 1 2} +do_execsql_test vector-all-params { + CREATE TABLE t_all_params ( emb FLOAT32(2) ); + CREATE INDEX t_all_params_idx ON t_all_params(libsql_vector_idx(emb, 'type=diskann', 'metric=cos', 'alpha=1.2', 'search_l=200', 'insert_l=70', 'max_edges=6')); + INSERT INTO t_all_params VALUES (vector('[1,2]')), (vector('[3,4]')); + SELECT * FROM vector_top_k('t_all_params_idx', vector('[1,2]'), 2); +} {1 2} + proc 
error_messages {sql} { set ret "" catch { From 675cdd4710fac562570e895db6f757592fa39aac Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Thu, 25 Jul 2024 12:10:18 +0400 Subject: [PATCH 3/4] rename max_edges to max_neighbors --- libsql-sqlite3/src/vectorIndex.c | 14 +++++++------- libsql-sqlite3/src/vectorIndexInt.h | 2 +- libsql-sqlite3/src/vectordiskann.c | 10 +++++----- libsql-sqlite3/test/libsql_vector_index.test | 2 +- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/libsql-sqlite3/src/vectorIndex.c b/libsql-sqlite3/src/vectorIndex.c index 47d2d0e32b..901d44fa83 100644 --- a/libsql-sqlite3/src/vectorIndex.c +++ b/libsql-sqlite3/src/vectorIndex.c @@ -389,13 +389,13 @@ struct VectorParamName { }; static struct VectorParamName VECTOR_PARAM_NAMES[] = { - { "type", VECTOR_INDEX_TYPE_PARAM_ID, 0, "diskann", VECTOR_INDEX_TYPE_DISKANN }, - { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "cosine", VECTOR_METRIC_TYPE_COS }, - { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "l2", VECTOR_METRIC_TYPE_L2 }, - { "alpha", VECTOR_PRUNING_ALPHA_PARAM_ID, 2, 0, 0 }, - { "search_l", VECTOR_SEARCH_L_PARAM_ID, 1, 0, 0 }, - { "insert_l", VECTOR_INSERT_L_PARAM_ID, 1, 0, 0 }, - { "max_edges", VECTOR_MAX_EDGES_PARAM_ID, 1, 0, 0 }, + { "type", VECTOR_INDEX_TYPE_PARAM_ID, 0, "diskann", VECTOR_INDEX_TYPE_DISKANN }, + { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "cosine", VECTOR_METRIC_TYPE_COS }, + { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "l2", VECTOR_METRIC_TYPE_L2 }, + { "alpha", VECTOR_PRUNING_ALPHA_PARAM_ID, 2, 0, 0 }, + { "search_l", VECTOR_SEARCH_L_PARAM_ID, 1, 0, 0 }, + { "insert_l", VECTOR_INSERT_L_PARAM_ID, 1, 0, 0 }, + { "max_neighbors", VECTOR_MAX_NEIGHBORS_PARAM_ID, 1, 0, 0 }, }; static int parseVectorIdxParam(const char *zParam, VectorIdxParams *pParams, const char **pErrMsg) { diff --git a/libsql-sqlite3/src/vectorIndexInt.h b/libsql-sqlite3/src/vectorIndexInt.h index 7357009362..c87c47a5ab 100644 --- a/libsql-sqlite3/src/vectorIndexInt.h +++ 
b/libsql-sqlite3/src/vectorIndexInt.h @@ -130,7 +130,7 @@ typedef u8 MetricType; #define VECTOR_SEARCH_L_PARAM_ID 9 #define VECTOR_SEARCH_L_DEFAULT 200 -#define VECTOR_MAX_EDGES_PARAM_ID 10 +#define VECTOR_MAX_NEIGHBORS_PARAM_ID 10 /* total amount of vector index parameters */ #define VECTOR_PARAM_IDS_COUNT 9 diff --git a/libsql-sqlite3/src/vectordiskann.c b/libsql-sqlite3/src/vectordiskann.c index e559668e8b..4ca75262b7 100644 --- a/libsql-sqlite3/src/vectordiskann.c +++ b/libsql-sqlite3/src/vectordiskann.c @@ -434,7 +434,7 @@ int diskAnnCreateIndex( ){ int rc; int type, dims; - u64 maxEdgesParam, blockSizeBytes; + u64 maxNeighborsParam, blockSizeBytes; char *zSql; char columnSqlDefs[DISKANN_SQL_RENDER_LIMIT]; // definition of columns (e.g. index_key INTEGER BINARY, index_key1 TEXT, ...) char columnSqlNames[DISKANN_SQL_RENDER_LIMIT]; // just column names (e.g. index_key, index_key1, index_key2, ...) @@ -457,13 +457,13 @@ int diskAnnCreateIndex( } assert( 0 < dims && dims <= MAX_VECTOR_SZ ); - maxEdgesParam = vectorIdxParamsGetU64(pParams, VECTOR_MAX_EDGES_PARAM_ID); - if( maxEdgesParam == 0 ){ + maxNeighborsParam = vectorIdxParamsGetU64(pParams, VECTOR_MAX_NEIGHBORS_PARAM_ID); + if( maxNeighborsParam == 0 ){ // 3 D**(1/2) gives good recall values (90%+) // we also want to keep disk overhead at moderate level - 50x of the disk size increase is the current upper bound - maxEdgesParam = MIN(3 * ((int)(sqrt(dims)) + 1), (50 * nodeOverhead(vectorDataSize(type, dims))) / nodeEdgeOverhead(vectorDataSize(type, dims)) + 1); + maxNeighborsParam = MIN(3 * ((int)(sqrt(dims)) + 1), (50 * nodeOverhead(vectorDataSize(type, dims))) / nodeEdgeOverhead(vectorDataSize(type, dims)) + 1); } - blockSizeBytes = nodeOverhead(vectorDataSize(type, dims)) + maxEdgesParam * (u64)nodeEdgeOverhead(vectorDataSize(type, dims)); + blockSizeBytes = nodeOverhead(vectorDataSize(type, dims)) + maxNeighborsParam * (u64)nodeEdgeOverhead(vectorDataSize(type, dims)); if( blockSizeBytes > 
DISKANN_MAX_BLOCK_SZ ){ return SQLITE_ERROR; } diff --git a/libsql-sqlite3/test/libsql_vector_index.test b/libsql-sqlite3/test/libsql_vector_index.test index 5210628f11..447deb6d39 100644 --- a/libsql-sqlite3/test/libsql_vector_index.test +++ b/libsql-sqlite3/test/libsql_vector_index.test @@ -252,7 +252,7 @@ do_execsql_test vector-transaction { do_execsql_test vector-all-params { CREATE TABLE t_all_params ( emb FLOAT32(2) ); - CREATE INDEX t_all_params_idx ON t_all_params(libsql_vector_idx(emb, 'type=diskann', 'metric=cos', 'alpha=1.2', 'search_l=200', 'insert_l=70', 'max_edges=6')); + CREATE INDEX t_all_params_idx ON t_all_params(libsql_vector_idx(emb, 'type=diskann', 'metric=cos', 'alpha=1.2', 'search_l=200', 'insert_l=70', 'max_neighbors=6')); INSERT INTO t_all_params VALUES (vector('[1,2]')), (vector('[3,4]')); SELECT * FROM vector_top_k('t_all_params_idx', vector('[1,2]'), 2); } {1 2} From 86624ef6f37d2807edc890621687983a407cb975 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Thu, 25 Jul 2024 19:28:36 +0400 Subject: [PATCH 4/4] build bundles --- .../SQLite3MultipleCiphers/src/sqlite3.c | 78 +++++++++++++++---- libsql-ffi/bundled/src/sqlite3.c | 78 +++++++++++++++---- 2 files changed, 126 insertions(+), 30 deletions(-) diff --git a/libsql-ffi/bundled/SQLite3MultipleCiphers/src/sqlite3.c b/libsql-ffi/bundled/SQLite3MultipleCiphers/src/sqlite3.c index 362008e23f..8aa37499ae 100644 --- a/libsql-ffi/bundled/SQLite3MultipleCiphers/src/sqlite3.c +++ b/libsql-ffi/bundled/SQLite3MultipleCiphers/src/sqlite3.c @@ -85036,6 +85036,8 @@ typedef u8 MetricType; #define VECTOR_SEARCH_L_PARAM_ID 9 #define VECTOR_SEARCH_L_DEFAULT 200 +#define VECTOR_MAX_NEIGHBORS_PARAM_ID 10 + /* total amount of vector index parameters */ #define VECTOR_PARAM_IDS_COUNT 9 @@ -209459,6 +209461,7 @@ SQLITE_PRIVATE void sqlite3RegisterVectorFunctions(void){ */ #ifndef SQLITE_OMIT_VECTOR +/* #include "math.h" */ /* #include "sqliteInt.h" */ /* #include "vectorIndexInt.h" */ @@ -209474,6 
+209477,11 @@ SQLITE_PRIVATE void sqlite3RegisterVectorFunctions(void){ // stack simplify memory managment code and also doesn't impose very strict limits here since 128 bytes for column names should be enough for almost all use cases #define DISKANN_SQL_RENDER_LIMIT 128 +// limit to the maximum size of DiskANN block (128 MB) +// even with 1MB we can store tens of thousands of nodes in several GBs - which is already too much +// but we are "generous" here and allow user to store up to 128MB blobs +#define DISKANN_MAX_BLOCK_SZ 134217728 + /* * Due to historical reasons parameter for index block size were stored as u16 value and divided by 512 (2^9) * So, we will make inverse transform before initializing index from stored parameters @@ -209673,8 +209681,16 @@ void blobSpotFree(BlobSpot *pBlobSpot) { ** Layout specific utilities **************************************************************************/ +int nodeEdgeOverhead(int nEdgeVectorSize){ + return nEdgeVectorSize + VECTOR_EDGE_METADATA_SIZE; +} + +int nodeOverhead(int nNodeVectorSize){ + return nNodeVectorSize + VECTOR_NODE_METADATA_SIZE; +} + int nodeEdgesMaxCount(const DiskAnnIndex *pIndex){ - unsigned int nMaxEdges = (pIndex->nBlockSize - pIndex->nNodeVectorSize - VECTOR_NODE_METADATA_SIZE) / (pIndex->nEdgeVectorSize + VECTOR_EDGE_METADATA_SIZE); + unsigned int nMaxEdges = (pIndex->nBlockSize - nodeOverhead(pIndex->nNodeVectorSize)) / nodeEdgeOverhead(pIndex->nEdgeVectorSize); assert( nMaxEdges > 0); return nMaxEdges; } @@ -209829,6 +209845,8 @@ int diskAnnCreateIndex( VectorIdxParams *pParams ){ int rc; + int type, dims; + u64 maxNeighborsParam, blockSizeBytes; char *zSql; char columnSqlDefs[DISKANN_SQL_RENDER_LIMIT]; // definition of columns (e.g. index_key INTEGER BINARY, index_key1 TEXT, ...) char columnSqlNames[DISKANN_SQL_RENDER_LIMIT]; // just column names (e.g. index_key, index_key1, index_key2, ...) 
@@ -209841,16 +209859,34 @@ int diskAnnCreateIndex( if( vectorIdxParamsPutU64(pParams, VECTOR_INDEX_TYPE_PARAM_ID, VECTOR_INDEX_TYPE_DISKANN) != 0 ){ return SQLITE_ERROR; } + type = vectorIdxParamsGetU64(pParams, VECTOR_TYPE_PARAM_ID); + if( type == 0 ){ + return SQLITE_ERROR; + } + dims = vectorIdxParamsGetU64(pParams, VECTOR_DIM_PARAM_ID); + if( dims == 0 ){ + return SQLITE_ERROR; + } + assert( 0 < dims && dims <= MAX_VECTOR_SZ ); + + maxNeighborsParam = vectorIdxParamsGetU64(pParams, VECTOR_MAX_NEIGHBORS_PARAM_ID); + if( maxNeighborsParam == 0 ){ + // 3 D**(1/2) gives good recall values (90%+) + // we also want to keep disk overhead at moderate level - 50x of the disk size increase is the current upper bound + maxNeighborsParam = MIN(3 * ((int)(sqrt(dims)) + 1), (50 * nodeOverhead(vectorDataSize(type, dims))) / nodeEdgeOverhead(vectorDataSize(type, dims)) + 1); + } + blockSizeBytes = nodeOverhead(vectorDataSize(type, dims)) + maxNeighborsParam * (u64)nodeEdgeOverhead(vectorDataSize(type, dims)); + if( blockSizeBytes > DISKANN_MAX_BLOCK_SZ ){ + return SQLITE_ERROR; + } + if( vectorIdxParamsPutU64(pParams, VECTOR_BLOCK_SIZE_PARAM_ID, MAX(256, blockSizeBytes)) != 0 ){ + return SQLITE_ERROR; + } if( vectorIdxParamsGetU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID) == 0 ){ if( vectorIdxParamsPutU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID, VECTOR_METRIC_TYPE_COS) != 0 ){ return SQLITE_ERROR; } } - if( vectorIdxParamsGetU64(pParams, VECTOR_BLOCK_SIZE_PARAM_ID) == 0 ){ - if( vectorIdxParamsPutU64(pParams, VECTOR_BLOCK_SIZE_PARAM_ID, VECTOR_BLOCK_SIZE_DEFAULT) != 0 ){ - return SQLITE_ERROR; - } - } if( vectorIdxParamsGetF64(pParams, VECTOR_PRUNING_ALPHA_PARAM_ID) == 0 ){ if( vectorIdxParamsPutF64(pParams, VECTOR_PRUNING_ALPHA_PARAM_ID, VECTOR_PRUNING_ALPHA_DEFAULT) != 0 ){ return SQLITE_ERROR; @@ -210840,6 +210876,7 @@ int diskAnnOpenIndex( DiskAnnIndex **ppIndex /* OUT: Index */ ){ DiskAnnIndex *pIndex; + u64 nBlockSize; pIndex = sqlite3DbMallocRaw(db, sizeof(DiskAnnIndex)); if( 
pIndex == NULL ){ return SQLITE_NOMEM; @@ -210852,9 +210889,15 @@ int diskAnnOpenIndex( diskAnnCloseIndex(pIndex); return SQLITE_NOMEM_BKPT; } + nBlockSize = vectorIdxParamsGetU64(pParams, VECTOR_BLOCK_SIZE_PARAM_ID); + // preserve backward compatibility: treat block size > 128 literally, but <= 128 with shift + if( nBlockSize <= 128 ){ + nBlockSize <<= DISKANN_BLOCK_SIZE_SHIFT; + } + pIndex->nFormatVersion = vectorIdxParamsGetU64(pParams, VECTOR_FORMAT_PARAM_ID); pIndex->nDistanceFunc = vectorIdxParamsGetU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID); - pIndex->nBlockSize = vectorIdxParamsGetU64(pParams, VECTOR_BLOCK_SIZE_PARAM_ID) << DISKANN_BLOCK_SIZE_SHIFT; + pIndex->nBlockSize = nBlockSize; pIndex->nNodeVectorType = vectorIdxParamsGetU64(pParams, VECTOR_TYPE_PARAM_ID); pIndex->nVectorDims = vectorIdxParamsGetU64(pParams, VECTOR_DIM_PARAM_ID); pIndex->pruningAlpha = vectorIdxParamsGetF64(pParams, VECTOR_PRUNING_ALPHA_PARAM_ID); @@ -211810,18 +211853,19 @@ static struct VectorColumnType VECTOR_COLUMN_TYPES[] = { struct VectorParamName { const char *zName; int tag; - int type; // 0 - enum, 1 - integer, 2 - float + int type; // 0 - string enum, 1 - integer, 2 - float const char *zValueStr; u64 value; }; static struct VectorParamName VECTOR_PARAM_NAMES[] = { - { "type", VECTOR_INDEX_TYPE_PARAM_ID, 0, "diskann", VECTOR_INDEX_TYPE_DISKANN }, - { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "cosine", VECTOR_METRIC_TYPE_COS }, - { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "l2", VECTOR_METRIC_TYPE_L2 }, - { "alpha", VECTOR_PRUNING_ALPHA_PARAM_ID, 2, 0, 0 }, - { "search_l", VECTOR_SEARCH_L_PARAM_ID, 1, 0, 0 }, - { "insert_l", VECTOR_INSERT_L_PARAM_ID, 2, 0, 0 }, + { "type", VECTOR_INDEX_TYPE_PARAM_ID, 0, "diskann", VECTOR_INDEX_TYPE_DISKANN }, + { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "cosine", VECTOR_METRIC_TYPE_COS }, + { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "l2", VECTOR_METRIC_TYPE_L2 }, + { "alpha", VECTOR_PRUNING_ALPHA_PARAM_ID, 2, 0, 0 }, + { "search_l", 
VECTOR_SEARCH_L_PARAM_ID, 1, 0, 0 }, + { "insert_l", VECTOR_INSERT_L_PARAM_ID, 1, 0, 0 }, + { "max_neighbors", VECTOR_MAX_NEIGHBORS_PARAM_ID, 1, 0, 0 }, }; static int parseVectorIdxParam(const char *zParam, VectorIdxParams *pParams, const char **pErrMsg) { @@ -211841,11 +211885,15 @@ static int parseVectorIdxParam(const char *zParam, VectorIdxParams *pParams, con continue; } if( VECTOR_PARAM_NAMES[i].type == 1 ){ - u64 value = sqlite3Atoi(zValue); + int value = sqlite3Atoi(zValue); if( value == 0 ){ *pErrMsg = "invalid representation of integer vector index parameter"; return -1; } + if( value < 0 ){ + *pErrMsg = "integer vector index parameter must be positive"; + return -1; + } if( vectorIdxParamsPutU64(pParams, VECTOR_PARAM_NAMES[i].tag, value) != 0 ){ *pErrMsg = "unable to serialize integer vector index parameter"; return -1; diff --git a/libsql-ffi/bundled/src/sqlite3.c b/libsql-ffi/bundled/src/sqlite3.c index 362008e23f..8aa37499ae 100644 --- a/libsql-ffi/bundled/src/sqlite3.c +++ b/libsql-ffi/bundled/src/sqlite3.c @@ -85036,6 +85036,8 @@ typedef u8 MetricType; #define VECTOR_SEARCH_L_PARAM_ID 9 #define VECTOR_SEARCH_L_DEFAULT 200 +#define VECTOR_MAX_NEIGHBORS_PARAM_ID 10 + /* total amount of vector index parameters */ #define VECTOR_PARAM_IDS_COUNT 9 @@ -209459,6 +209461,7 @@ SQLITE_PRIVATE void sqlite3RegisterVectorFunctions(void){ */ #ifndef SQLITE_OMIT_VECTOR +/* #include "math.h" */ /* #include "sqliteInt.h" */ /* #include "vectorIndexInt.h" */ @@ -209474,6 +209477,11 @@ SQLITE_PRIVATE void sqlite3RegisterVectorFunctions(void){ // stack simplify memory managment code and also doesn't impose very strict limits here since 128 bytes for column names should be enough for almost all use cases #define DISKANN_SQL_RENDER_LIMIT 128 +// limit to the maximum size of DiskANN block (128 MB) +// even with 1MB we can store tens of thousands of nodes in several GBs - which is already too much +// but we are "generous" here and allow user to store up to 128MB blobs 
+#define DISKANN_MAX_BLOCK_SZ 134217728 + /* * Due to historical reasons parameter for index block size were stored as u16 value and divided by 512 (2^9) * So, we will make inverse transform before initializing index from stored parameters @@ -209673,8 +209681,16 @@ void blobSpotFree(BlobSpot *pBlobSpot) { ** Layout specific utilities **************************************************************************/ +int nodeEdgeOverhead(int nEdgeVectorSize){ + return nEdgeVectorSize + VECTOR_EDGE_METADATA_SIZE; +} + +int nodeOverhead(int nNodeVectorSize){ + return nNodeVectorSize + VECTOR_NODE_METADATA_SIZE; +} + int nodeEdgesMaxCount(const DiskAnnIndex *pIndex){ - unsigned int nMaxEdges = (pIndex->nBlockSize - pIndex->nNodeVectorSize - VECTOR_NODE_METADATA_SIZE) / (pIndex->nEdgeVectorSize + VECTOR_EDGE_METADATA_SIZE); + unsigned int nMaxEdges = (pIndex->nBlockSize - nodeOverhead(pIndex->nNodeVectorSize)) / nodeEdgeOverhead(pIndex->nEdgeVectorSize); assert( nMaxEdges > 0); return nMaxEdges; } @@ -209829,6 +209845,8 @@ int diskAnnCreateIndex( VectorIdxParams *pParams ){ int rc; + int type, dims; + u64 maxNeighborsParam, blockSizeBytes; char *zSql; char columnSqlDefs[DISKANN_SQL_RENDER_LIMIT]; // definition of columns (e.g. index_key INTEGER BINARY, index_key1 TEXT, ...) char columnSqlNames[DISKANN_SQL_RENDER_LIMIT]; // just column names (e.g. index_key, index_key1, index_key2, ...) 
@@ -209841,16 +209859,34 @@ int diskAnnCreateIndex( if( vectorIdxParamsPutU64(pParams, VECTOR_INDEX_TYPE_PARAM_ID, VECTOR_INDEX_TYPE_DISKANN) != 0 ){ return SQLITE_ERROR; } + type = vectorIdxParamsGetU64(pParams, VECTOR_TYPE_PARAM_ID); + if( type == 0 ){ + return SQLITE_ERROR; + } + dims = vectorIdxParamsGetU64(pParams, VECTOR_DIM_PARAM_ID); + if( dims == 0 ){ + return SQLITE_ERROR; + } + assert( 0 < dims && dims <= MAX_VECTOR_SZ ); + + maxNeighborsParam = vectorIdxParamsGetU64(pParams, VECTOR_MAX_NEIGHBORS_PARAM_ID); + if( maxNeighborsParam == 0 ){ + // 3 D**(1/2) gives good recall values (90%+) + // we also want to keep disk overhead at moderate level - 50x of the disk size increase is the current upper bound + maxNeighborsParam = MIN(3 * ((int)(sqrt(dims)) + 1), (50 * nodeOverhead(vectorDataSize(type, dims))) / nodeEdgeOverhead(vectorDataSize(type, dims)) + 1); + } + blockSizeBytes = nodeOverhead(vectorDataSize(type, dims)) + maxNeighborsParam * (u64)nodeEdgeOverhead(vectorDataSize(type, dims)); + if( blockSizeBytes > DISKANN_MAX_BLOCK_SZ ){ + return SQLITE_ERROR; + } + if( vectorIdxParamsPutU64(pParams, VECTOR_BLOCK_SIZE_PARAM_ID, MAX(256, blockSizeBytes)) != 0 ){ + return SQLITE_ERROR; + } if( vectorIdxParamsGetU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID) == 0 ){ if( vectorIdxParamsPutU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID, VECTOR_METRIC_TYPE_COS) != 0 ){ return SQLITE_ERROR; } } - if( vectorIdxParamsGetU64(pParams, VECTOR_BLOCK_SIZE_PARAM_ID) == 0 ){ - if( vectorIdxParamsPutU64(pParams, VECTOR_BLOCK_SIZE_PARAM_ID, VECTOR_BLOCK_SIZE_DEFAULT) != 0 ){ - return SQLITE_ERROR; - } - } if( vectorIdxParamsGetF64(pParams, VECTOR_PRUNING_ALPHA_PARAM_ID) == 0 ){ if( vectorIdxParamsPutF64(pParams, VECTOR_PRUNING_ALPHA_PARAM_ID, VECTOR_PRUNING_ALPHA_DEFAULT) != 0 ){ return SQLITE_ERROR; @@ -210840,6 +210876,7 @@ int diskAnnOpenIndex( DiskAnnIndex **ppIndex /* OUT: Index */ ){ DiskAnnIndex *pIndex; + u64 nBlockSize; pIndex = sqlite3DbMallocRaw(db, sizeof(DiskAnnIndex)); if( 
pIndex == NULL ){ return SQLITE_NOMEM; @@ -210852,9 +210889,15 @@ int diskAnnOpenIndex( diskAnnCloseIndex(pIndex); return SQLITE_NOMEM_BKPT; } + nBlockSize = vectorIdxParamsGetU64(pParams, VECTOR_BLOCK_SIZE_PARAM_ID); + // preserve backward compatibility: treat block size > 128 literally, but <= 128 with shift + if( nBlockSize <= 128 ){ + nBlockSize <<= DISKANN_BLOCK_SIZE_SHIFT; + } + pIndex->nFormatVersion = vectorIdxParamsGetU64(pParams, VECTOR_FORMAT_PARAM_ID); pIndex->nDistanceFunc = vectorIdxParamsGetU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID); - pIndex->nBlockSize = vectorIdxParamsGetU64(pParams, VECTOR_BLOCK_SIZE_PARAM_ID) << DISKANN_BLOCK_SIZE_SHIFT; + pIndex->nBlockSize = nBlockSize; pIndex->nNodeVectorType = vectorIdxParamsGetU64(pParams, VECTOR_TYPE_PARAM_ID); pIndex->nVectorDims = vectorIdxParamsGetU64(pParams, VECTOR_DIM_PARAM_ID); pIndex->pruningAlpha = vectorIdxParamsGetF64(pParams, VECTOR_PRUNING_ALPHA_PARAM_ID); @@ -211810,18 +211853,19 @@ static struct VectorColumnType VECTOR_COLUMN_TYPES[] = { struct VectorParamName { const char *zName; int tag; - int type; // 0 - enum, 1 - integer, 2 - float + int type; // 0 - string enum, 1 - integer, 2 - float const char *zValueStr; u64 value; }; static struct VectorParamName VECTOR_PARAM_NAMES[] = { - { "type", VECTOR_INDEX_TYPE_PARAM_ID, 0, "diskann", VECTOR_INDEX_TYPE_DISKANN }, - { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "cosine", VECTOR_METRIC_TYPE_COS }, - { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "l2", VECTOR_METRIC_TYPE_L2 }, - { "alpha", VECTOR_PRUNING_ALPHA_PARAM_ID, 2, 0, 0 }, - { "search_l", VECTOR_SEARCH_L_PARAM_ID, 1, 0, 0 }, - { "insert_l", VECTOR_INSERT_L_PARAM_ID, 2, 0, 0 }, + { "type", VECTOR_INDEX_TYPE_PARAM_ID, 0, "diskann", VECTOR_INDEX_TYPE_DISKANN }, + { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "cosine", VECTOR_METRIC_TYPE_COS }, + { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "l2", VECTOR_METRIC_TYPE_L2 }, + { "alpha", VECTOR_PRUNING_ALPHA_PARAM_ID, 2, 0, 0 }, + { "search_l", 
VECTOR_SEARCH_L_PARAM_ID, 1, 0, 0 }, + { "insert_l", VECTOR_INSERT_L_PARAM_ID, 1, 0, 0 }, + { "max_neighbors", VECTOR_MAX_NEIGHBORS_PARAM_ID, 1, 0, 0 }, }; static int parseVectorIdxParam(const char *zParam, VectorIdxParams *pParams, const char **pErrMsg) { @@ -211841,11 +211885,15 @@ static int parseVectorIdxParam(const char *zParam, VectorIdxParams *pParams, con continue; } if( VECTOR_PARAM_NAMES[i].type == 1 ){ - u64 value = sqlite3Atoi(zValue); + int value = sqlite3Atoi(zValue); if( value == 0 ){ *pErrMsg = "invalid representation of integer vector index parameter"; return -1; } + if( value < 0 ){ + *pErrMsg = "integer vector index parameter must be positive"; + return -1; + } if( vectorIdxParamsPutU64(pParams, VECTOR_PARAM_NAMES[i].tag, value) != 0 ){ *pErrMsg = "unable to serialize integer vector index parameter"; return -1;